1 #define PETSCMAT_DLL 2 3 4 /* 5 Factorization code for BAIJ format. 6 */ 7 8 #include "../src/mat/impls/baij/seq/baij.h" 9 #include "../src/mat/blockinvert.h" 10 #include "petscbt.h" 11 #include "../src/mat/utils/freespace.h" 12 13 #undef __FUNCT__ 14 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 15 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 16 { 17 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 18 PetscErrorCode ierr; 19 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 20 PetscInt *diag = a->diag; 21 MatScalar *aa=a->a,*v; 22 PetscScalar s1,*x,*b; 23 24 PetscFunctionBegin; 25 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 26 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 27 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28 29 /* forward solve the U^T */ 30 for (i=0; i<n; i++) { 31 32 v = aa + diag[i]; 33 /* multiply by the inverse of the block diagonal */ 34 s1 = (*v++)*x[i]; 35 vi = aj + diag[i] + 1; 36 nz = ai[i+1] - diag[i] - 1; 37 while (nz--) { 38 x[*vi++] -= (*v++)*s1; 39 } 40 x[i] = s1; 41 } 42 /* backward solve the L^T */ 43 for (i=n-1; i>=0; i--){ 44 v = aa + diag[i] - 1; 45 vi = aj + diag[i] - 1; 46 nz = diag[i] - ai[i]; 47 s1 = x[i]; 48 while (nz--) { 49 x[*vi--] -= (*v--)*s1; 50 } 51 } 52 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 53 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 54 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 55 PetscFunctionReturn(0); 56 } 57 58 #undef __FUNCT__ 59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 61 { 62 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 63 PetscErrorCode ierr; 64 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 65 PetscInt *diag = a->diag,oidx; 66 MatScalar *aa=a->a,*v; 67 PetscScalar s1,s2,x1,x2; 68 PetscScalar *x,*b; 69 70 PetscFunctionBegin; 71 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 72 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 73 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 74 75 /* forward solve the U^T */ 76 idx = 0; 77 for (i=0; i<n; i++) { 78 79 v = aa + 4*diag[i]; 80 /* multiply by the inverse of the block diagonal */ 81 x1 = x[idx]; x2 = x[1+idx]; 82 s1 = v[0]*x1 + v[1]*x2; 83 s2 = v[2]*x1 + v[3]*x2; 84 v += 4; 85 86 vi = aj + diag[i] + 1; 87 nz = ai[i+1] - diag[i] - 1; 88 while (nz--) { 89 oidx = 2*(*vi++); 90 x[oidx] -= v[0]*s1 + v[1]*s2; 91 x[oidx+1] -= v[2]*s1 + v[3]*s2; 92 v += 4; 93 } 94 x[idx] = s1;x[1+idx] = s2; 95 idx += 2; 96 } 97 /* backward solve the L^T */ 98 for (i=n-1; i>=0; i--){ 99 v = aa + 4*diag[i] - 4; 100 vi = aj + diag[i] - 1; 101 nz = diag[i] - ai[i]; 102 idt = 2*i; 103 s1 = x[idt]; s2 = x[1+idt]; 104 while (nz--) { 105 idx = 2*(*vi--); 106 x[idx] -= v[0]*s1 + v[1]*s2; 107 x[idx+1] -= v[2]*s1 + v[3]*s2; 108 v -= 4; 109 } 110 } 111 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 112 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 113 ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 114 PetscFunctionReturn(0); 115 } 116 117 #undef __FUNCT__ 118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 119 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 120 { 121 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 122 PetscErrorCode ierr; 123 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 124 PetscInt *diag = a->diag,oidx; 125 MatScalar *aa=a->a,*v; 126 PetscScalar s1,s2,s3,x1,x2,x3; 127 PetscScalar *x,*b; 128 129 PetscFunctionBegin; 130 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 131 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 132 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 133 134 /* forward solve the U^T */ 135 idx = 0; 136 for (i=0; i<n; i++) { 137 138 v = aa + 9*diag[i]; 139 /* multiply by the inverse of the block diagonal */ 140 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 141 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 142 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 143 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 144 v += 9; 145 146 vi = aj + diag[i] + 1; 147 nz = ai[i+1] - diag[i] - 1; 148 while (nz--) { 149 oidx = 3*(*vi++); 150 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 151 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 152 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 153 v += 9; 154 } 155 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 156 idx += 3; 157 } 158 /* backward solve the L^T */ 159 for (i=n-1; i>=0; i--){ 160 v = aa + 9*diag[i] - 9; 161 vi = aj + diag[i] - 1; 162 nz = diag[i] - ai[i]; 163 idt = 3*i; 164 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 165 while (nz--) { 166 idx = 3*(*vi--); 167 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 168 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 169 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 170 v -= 9; 171 } 172 } 173 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 174 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 175 ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 176 PetscFunctionReturn(0); 177 } 178 179 #undef __FUNCT__ 180 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 181 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 182 { 183 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 184 PetscErrorCode ierr; 185 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 186 PetscInt *diag = a->diag,oidx; 187 MatScalar *aa=a->a,*v; 188 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 189 PetscScalar *x,*b; 190 191 PetscFunctionBegin; 192 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 193 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 194 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 195 196 /* forward solve the U^T */ 197 idx = 0; 198 for (i=0; i<n; i++) { 199 200 v = aa + 16*diag[i]; 201 /* multiply by the inverse of the block diagonal */ 202 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 203 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 204 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 205 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 206 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 207 v += 16; 208 209 vi = aj + diag[i] + 1; 210 nz = ai[i+1] - diag[i] - 1; 211 while (nz--) { 212 oidx = 4*(*vi++); 213 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 214 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 215 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 216 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 217 v += 16; 218 } 219 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 220 idx += 4; 221 } 222 /* backward solve the L^T */ 223 for (i=n-1; i>=0; i--){ 224 v = aa + 16*diag[i] - 16; 225 vi = aj + diag[i] - 1; 226 nz = diag[i] - ai[i]; 227 idt = 4*i; 228 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 229 while (nz--) { 230 idx = 4*(*vi--); 231 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 232 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 233 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 234 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 235 v -= 16; 236 } 237 } 238 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 239 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 240 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 241 PetscFunctionReturn(0); 242 } 243 244 #undef __FUNCT__ 245 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 246 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 247 { 248 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 249 PetscErrorCode ierr; 250 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 251 PetscInt *diag = a->diag,oidx; 252 MatScalar *aa=a->a,*v; 253 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 254 PetscScalar *x,*b; 255 256 PetscFunctionBegin; 257 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 258 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 259 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 260 261 /* forward solve the U^T */ 262 idx = 0; 263 for (i=0; i<n; i++) { 264 265 v = aa + 25*diag[i]; 266 /* multiply by the inverse of the block diagonal */ 267 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 268 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 269 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 270 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 271 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 272 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 273 v += 25; 274 275 vi = aj + diag[i] + 1; 276 nz = ai[i+1] - diag[i] - 1; 277 while (nz--) { 278 oidx = 5*(*vi++); 279 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 280 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 281 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 282 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 283 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 284 v += 25; 285 } 286 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 287 idx += 5; 288 } 289 /* backward solve the L^T */ 290 for (i=n-1; i>=0; i--){ 291 v = aa + 25*diag[i] - 25; 292 vi = aj + diag[i] - 1; 293 nz = diag[i] - ai[i]; 294 idt = 5*i; 295 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 296 while (nz--) { 297 idx = 5*(*vi--); 298 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 299 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 300 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 301 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 302 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 303 v -= 25; 304 } 305 } 306 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 307 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 308 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 309 PetscFunctionReturn(0); 310 } 311 312 #undef __FUNCT__ 313 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 314 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 315 { 316 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 317 PetscErrorCode ierr; 318 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 319 PetscInt *diag = a->diag,oidx; 320 MatScalar *aa=a->a,*v; 321 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 322 PetscScalar *x,*b; 323 324 PetscFunctionBegin; 325 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 326 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 327 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 328 329 /* forward solve the U^T */ 330 idx = 0; 331 for (i=0; i<n; i++) { 332 333 v = aa + 36*diag[i]; 334 /* multiply by the inverse of the block diagonal */ 335 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 336 x6 = x[5+idx]; 337 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 338 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 339 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 340 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 341 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 342 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 343 v += 36; 344 345 vi = aj + diag[i] + 1; 346 nz = ai[i+1] - diag[i] - 1; 347 while (nz--) { 348 oidx = 6*(*vi++); 349 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 350 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 351 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 352 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 353 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 354 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 355 v += 36; 356 } 357 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 358 x[5+idx] = s6; 359 idx += 6; 360 } 361 /* backward solve the L^T */ 362 for (i=n-1; i>=0; i--){ 363 v = aa + 36*diag[i] - 36; 364 vi = aj + diag[i] - 1; 365 nz = diag[i] - ai[i]; 366 idt = 6*i; 367 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 368 s6 = x[5+idt]; 369 while (nz--) { 370 idx = 6*(*vi--); 371 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 372 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 373 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 374 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 375 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 376 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 377 v -= 36; 378 } 379 } 380 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 381 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 382 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 383 PetscFunctionReturn(0); 384 } 385 386 #undef __FUNCT__ 387 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 388 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 389 { 390 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 391 PetscErrorCode ierr; 392 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 393 PetscInt *diag = a->diag,oidx; 394 MatScalar *aa=a->a,*v; 395 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 396 PetscScalar *x,*b; 397 398 PetscFunctionBegin; 399 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 400 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 401 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 402 403 /* forward solve the U^T */ 404 idx = 0; 405 for (i=0; i<n; i++) { 406 407 v = aa + 49*diag[i]; 408 /* multiply by the inverse of the block diagonal */ 409 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 410 x6 = x[5+idx]; x7 = x[6+idx]; 411 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 412 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 413 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 414 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 415 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 416 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 417 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 418 v += 49; 419 420 vi = aj + diag[i] + 1; 421 nz = ai[i+1] - diag[i] - 1; 422 while (nz--) { 423 oidx = 7*(*vi++); 424 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 425 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 426 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 427 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 428 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 429 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 430 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 431 v += 49; 432 } 433 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 434 x[5+idx] = s6;x[6+idx] = s7; 435 idx += 7; 436 } 437 /* backward solve the L^T */ 438 for (i=n-1; i>=0; i--){ 439 v = aa + 49*diag[i] - 49; 440 vi = aj + diag[i] - 1; 441 nz = diag[i] - ai[i]; 442 idt = 7*i; 443 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 444 s6 = x[5+idt];s7 = x[6+idt]; 445 while (nz--) { 446 idx = 7*(*vi--); 447 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 448 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 449 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 450 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 451 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 452 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 453 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 454 v -= 49; 455 } 456 } 457 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 458 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 459 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 460 PetscFunctionReturn(0); 461 } 462 463 /*---------------------------------------------------------------------------------------------*/ 464 #undef __FUNCT__ 465 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 466 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 467 { 468 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 469 IS iscol=a->col,isrow=a->row; 470 PetscErrorCode ierr; 471 const PetscInt *r,*c,*rout,*cout; 472 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 473 PetscInt *diag = a->diag; 474 MatScalar *aa=a->a,*v; 475 PetscScalar s1,*x,*b,*t; 476 477 PetscFunctionBegin; 478 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 479 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 480 t = a->solve_work; 481 482 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 483 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 484 485 /* copy the b into temp work space according to permutation */ 486 for (i=0; i<n; i++) { 487 t[i] = b[c[i]]; 488 } 489 490 /* forward solve the U^T */ 491 for (i=0; i<n; i++) { 492 493 v = aa + diag[i]; 494 /* multiply by the inverse of the block diagonal */ 495 s1 = (*v++)*t[i]; 496 vi = aj + diag[i] + 1; 497 nz = ai[i+1] - diag[i] - 1; 498 while (nz--) { 499 t[*vi++] -= (*v++)*s1; 500 } 501 t[i] = s1; 502 } 503 /* backward solve the L^T */ 504 for (i=n-1; i>=0; i--){ 505 v = aa + diag[i] - 1; 506 vi = aj + diag[i] - 1; 507 nz = diag[i] - ai[i]; 508 s1 = t[i]; 509 while (nz--) { 510 t[*vi--] -= (*v--)*s1; 511 } 512 } 513 514 /* copy t into x according to permutation */ 515 for (i=0; i<n; i++) { 516 x[r[i]] = t[i]; 517 } 518 519 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 520 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 521 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 522 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 523 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 524 PetscFunctionReturn(0); 525 } 526 527 #undef __FUNCT__ 528 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 529 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 530 { 531 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 532 IS iscol=a->col,isrow=a->row; 533 PetscErrorCode ierr; 534 const PetscInt *r,*c,*rout,*cout; 535 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 536 PetscInt *diag = a->diag,ii,ic,ir,oidx; 537 MatScalar *aa=a->a,*v; 538 PetscScalar s1,s2,x1,x2; 539 PetscScalar *x,*b,*t; 540 541 PetscFunctionBegin; 542 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 543 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 544 t = a->solve_work; 545 546 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 547 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 548 549 /* copy the b into temp work space according to permutation */ 550 ii = 0; 551 for (i=0; i<n; i++) { 552 ic = 2*c[i]; 553 t[ii] = b[ic]; 554 t[ii+1] = b[ic+1]; 555 ii += 2; 556 } 557 558 /* forward solve the U^T */ 559 idx = 0; 560 for (i=0; i<n; i++) { 561 562 v = aa + 4*diag[i]; 563 /* multiply by the inverse of the block diagonal */ 564 x1 = t[idx]; x2 = t[1+idx]; 565 s1 = v[0]*x1 + v[1]*x2; 566 s2 = v[2]*x1 + v[3]*x2; 567 v += 4; 568 569 vi = aj + diag[i] + 1; 570 nz = ai[i+1] - diag[i] - 1; 571 while (nz--) { 572 oidx = 2*(*vi++); 573 t[oidx] -= v[0]*s1 + v[1]*s2; 574 t[oidx+1] -= v[2]*s1 + v[3]*s2; 575 v += 4; 576 } 577 t[idx] = s1;t[1+idx] = s2; 578 idx += 2; 579 } 580 /* backward solve the L^T */ 581 for (i=n-1; i>=0; i--){ 582 v = aa + 4*diag[i] - 4; 583 vi = aj + diag[i] - 1; 584 nz = diag[i] - ai[i]; 585 idt = 2*i; 586 s1 = t[idt]; s2 = t[1+idt]; 587 while (nz--) { 588 idx = 2*(*vi--); 589 t[idx] -= v[0]*s1 + v[1]*s2; 590 t[idx+1] -= v[2]*s1 + v[3]*s2; 591 v -= 4; 592 } 593 } 594 595 /* copy t into x according to permutation */ 596 ii = 0; 597 for (i=0; i<n; i++) { 598 ir = 2*r[i]; 599 x[ir] = t[ii]; 600 x[ir+1] = t[ii+1]; 601 ii += 2; 602 } 603 604 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 605 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 606 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 607 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 608 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 609 PetscFunctionReturn(0); 610 } 611 612 #undef __FUNCT__ 613 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 614 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 615 { 616 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 617 IS iscol=a->col,isrow=a->row; 618 PetscErrorCode ierr; 619 const PetscInt *r,*c,*rout,*cout; 620 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 621 PetscInt *diag = a->diag,ii,ic,ir,oidx; 622 MatScalar *aa=a->a,*v; 623 PetscScalar s1,s2,s3,x1,x2,x3; 624 PetscScalar *x,*b,*t; 625 626 PetscFunctionBegin; 627 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 628 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 629 t = a->solve_work; 630 631 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 632 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 633 634 /* copy the b into temp work space according to permutation */ 635 ii = 0; 636 for (i=0; i<n; i++) { 637 ic = 3*c[i]; 638 t[ii] = b[ic]; 639 t[ii+1] = b[ic+1]; 640 t[ii+2] = b[ic+2]; 641 ii += 3; 642 } 643 644 /* forward solve the U^T */ 645 idx = 0; 646 for (i=0; i<n; i++) { 647 648 v = aa + 9*diag[i]; 649 /* multiply by the inverse of the block diagonal */ 650 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 651 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 652 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 653 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 654 v += 9; 655 656 vi = aj + diag[i] + 1; 657 nz = ai[i+1] - diag[i] - 1; 658 while (nz--) { 659 oidx = 3*(*vi++); 660 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 661 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 662 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 663 v += 9; 664 } 665 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 666 idx += 3; 667 } 668 /* backward solve the L^T */ 669 for (i=n-1; i>=0; i--){ 670 v = aa + 9*diag[i] - 9; 671 vi = aj + diag[i] - 1; 672 nz = diag[i] - ai[i]; 673 idt = 3*i; 674 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 675 while (nz--) { 676 idx = 3*(*vi--); 677 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 678 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 679 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 680 v -= 9; 681 } 682 } 683 684 /* copy t into x according to permutation */ 685 ii = 0; 686 for (i=0; i<n; i++) { 687 ir = 3*r[i]; 688 x[ir] = t[ii]; 689 x[ir+1] = t[ii+1]; 690 x[ir+2] = t[ii+2]; 691 ii += 3; 692 } 693 694 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 695 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 696 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 697 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 698 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 699 PetscFunctionReturn(0); 700 } 701 702 #undef __FUNCT__ 703 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 704 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 705 { 706 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 707 IS iscol=a->col,isrow=a->row; 708 PetscErrorCode ierr; 709 const PetscInt *r,*c,*rout,*cout; 710 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 711 PetscInt *diag = a->diag,ii,ic,ir,oidx; 712 MatScalar *aa=a->a,*v; 713 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 714 PetscScalar *x,*b,*t; 715 716 PetscFunctionBegin; 717 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 718 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 719 t = a->solve_work; 720 721 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 722 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 723 724 /* copy the b into temp work space according to permutation */ 725 ii = 0; 726 for (i=0; i<n; i++) { 727 ic = 4*c[i]; 728 t[ii] = b[ic]; 729 t[ii+1] = b[ic+1]; 730 t[ii+2] = b[ic+2]; 731 t[ii+3] = b[ic+3]; 732 ii += 4; 733 } 734 735 /* forward solve the U^T */ 736 idx = 0; 737 for (i=0; i<n; i++) { 738 739 v = aa + 16*diag[i]; 740 /* multiply by the inverse of the block diagonal */ 741 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 742 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 743 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 744 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 745 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 746 v += 16; 747 748 vi = aj + diag[i] + 1; 749 nz = ai[i+1] - diag[i] - 1; 750 while (nz--) { 751 oidx = 4*(*vi++); 752 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 753 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 754 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 755 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 756 v += 16; 757 } 758 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 759 idx += 4; 760 } 761 /* backward solve the L^T */ 762 for (i=n-1; i>=0; i--){ 763 v = aa + 16*diag[i] - 16; 764 vi = aj + diag[i] - 1; 765 nz = diag[i] - ai[i]; 766 idt = 4*i; 767 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 768 while (nz--) { 769 idx = 4*(*vi--); 770 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 771 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 772 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 773 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 774 v -= 16; 775 } 776 } 777 778 /* copy t into x according to permutation */ 779 ii = 0; 780 for (i=0; i<n; i++) { 781 ir = 4*r[i]; 782 x[ir] = t[ii]; 783 x[ir+1] = t[ii+1]; 784 x[ir+2] = t[ii+2]; 785 x[ir+3] = t[ii+3]; 786 ii += 4; 787 } 788 789 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 790 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 791 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 792 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 793 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 794 PetscFunctionReturn(0); 795 } 796 797 #undef __FUNCT__ 798 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 799 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 800 { 801 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 802 IS iscol=a->col,isrow=a->row; 803 PetscErrorCode ierr; 804 const PetscInt *r,*c,*rout,*cout; 805 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 806 PetscInt *diag = a->diag,ii,ic,ir,oidx; 807 MatScalar *aa=a->a,*v; 808 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 809 PetscScalar *x,*b,*t; 810 811 PetscFunctionBegin; 812 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 813 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 814 t = a->solve_work; 815 816 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 817 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 818 819 /* copy the b into temp work space according to permutation */ 820 ii = 0; 821 for (i=0; i<n; i++) { 822 ic = 5*c[i]; 823 t[ii] = b[ic]; 824 t[ii+1] = b[ic+1]; 825 t[ii+2] = b[ic+2]; 826 t[ii+3] = b[ic+3]; 827 t[ii+4] = b[ic+4]; 828 ii += 5; 829 } 830 831 /* forward solve the U^T */ 832 idx = 0; 833 for (i=0; i<n; i++) { 834 835 v = aa + 25*diag[i]; 836 /* multiply by the inverse of the block diagonal */ 837 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 838 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 839 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 840 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 841 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 842 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 843 v += 25; 844 845 vi = aj + diag[i] + 1; 846 nz = ai[i+1] - diag[i] - 1; 847 while (nz--) { 848 oidx = 5*(*vi++); 849 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 850 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 851 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 852 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 853 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 854 v += 25; 855 } 856 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 857 idx += 5; 858 } 859 /* backward solve the L^T */ 860 for (i=n-1; i>=0; i--){ 861 v = aa + 25*diag[i] - 25; 862 vi = aj + diag[i] - 1; 863 nz = diag[i] - ai[i]; 864 idt = 5*i; 865 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 866 while (nz--) { 867 idx = 5*(*vi--); 868 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 869 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 870 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 871 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 872 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 873 v -= 25; 874 } 875 } 876 877 /* copy t into x according to permutation */ 878 ii = 0; 879 for (i=0; i<n; i++) { 880 ir = 5*r[i]; 881 x[ir] = t[ii]; 882 x[ir+1] = t[ii+1]; 883 x[ir+2] = t[ii+2]; 884 x[ir+3] = t[ii+3]; 885 x[ir+4] = t[ii+4]; 886 ii += 5; 887 } 888 889 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 890 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 891 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 892 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 893 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 894 PetscFunctionReturn(0); 895 } 896 897 #undef __FUNCT__ 898 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 899 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 900 { 901 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 902 IS iscol=a->col,isrow=a->row; 903 PetscErrorCode ierr; 904 const PetscInt *r,*c,*rout,*cout; 905 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 906 PetscInt *diag = a->diag,ii,ic,ir,oidx; 907 MatScalar *aa=a->a,*v; 908 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 909 PetscScalar *x,*b,*t; 910 911 PetscFunctionBegin; 912 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 913 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 914 t = a->solve_work; 915 916 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 917 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 918 919 /* copy the b into temp work space according to permutation */ 920 ii = 0; 921 for (i=0; i<n; i++) { 922 ic = 6*c[i]; 923 t[ii] = b[ic]; 924 t[ii+1] = b[ic+1]; 925 t[ii+2] = b[ic+2]; 926 t[ii+3] = b[ic+3]; 927 t[ii+4] = b[ic+4]; 928 t[ii+5] = b[ic+5]; 929 ii += 6; 930 } 931 932 /* forward solve the U^T */ 933 idx = 0; 934 for (i=0; i<n; i++) { 935 936 v = aa + 36*diag[i]; 937 /* multiply by the inverse of the block diagonal */ 938 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 939 x6 = t[5+idx]; 940 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 941 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 942 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 943 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 944 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 945 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 946 v += 36; 947 948 vi = aj + diag[i] + 1; 949 nz = ai[i+1] - diag[i] - 1; 950 while (nz--) { 951 oidx = 6*(*vi++); 952 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 953 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 954 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 955 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 956 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 957 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 958 v += 36; 959 } 960 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 961 t[5+idx] = s6; 962 idx += 6; 963 } 964 /* backward solve the L^T */ 965 for (i=n-1; i>=0; i--){ 966 v = aa + 36*diag[i] - 36; 967 vi = aj + diag[i] - 1; 968 nz = diag[i] - ai[i]; 969 idt = 6*i; 970 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 971 s6 = t[5+idt]; 972 while (nz--) { 973 idx = 6*(*vi--); 974 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 975 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 976 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 977 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 978 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 979 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 980 v -= 36; 981 } 982 } 983 984 /* copy t into x according to permutation */ 985 ii = 0; 986 for (i=0; i<n; i++) { 987 ir = 6*r[i]; 988 x[ir] = t[ii]; 989 x[ir+1] = t[ii+1]; 990 x[ir+2] = t[ii+2]; 991 x[ir+3] = t[ii+3]; 992 x[ir+4] = t[ii+4]; 993 x[ir+5] = t[ii+5]; 994 ii += 6; 995 } 996 997 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 998 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 999 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1000 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1001 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1002 PetscFunctionReturn(0); 1003 } 1004 1005 #undef __FUNCT__ 1006 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1007 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1008 { 1009 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1010 IS iscol=a->col,isrow=a->row; 1011 PetscErrorCode ierr; 1012 const PetscInt *r,*c,*rout,*cout; 1013 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1014 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1015 MatScalar *aa=a->a,*v; 1016 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1017 PetscScalar *x,*b,*t; 1018 1019 PetscFunctionBegin; 1020 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1021 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1022 t = a->solve_work; 1023 1024 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1025 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1026 1027 /* copy the b into temp work space according to permutation */ 1028 ii = 0; 1029 for (i=0; i<n; i++) { 1030 ic = 7*c[i]; 1031 t[ii] = b[ic]; 1032 t[ii+1] = b[ic+1]; 1033 t[ii+2] = b[ic+2]; 1034 t[ii+3] = b[ic+3]; 1035 t[ii+4] = b[ic+4]; 1036 t[ii+5] = b[ic+5]; 1037 t[ii+6] = b[ic+6]; 1038 ii += 7; 1039 } 1040 1041 /* forward solve the U^T */ 1042 idx = 0; 1043 for (i=0; i<n; i++) { 1044 1045 v = aa + 49*diag[i]; 1046 /* multiply by the inverse of the block diagonal */ 1047 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1048 x6 = t[5+idx]; x7 = t[6+idx]; 1049 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1050 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1051 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1052 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1053 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1054 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1055 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1056 v += 49; 1057 1058 vi = aj + diag[i] + 1; 1059 nz = ai[i+1] - diag[i] - 1; 1060 while (nz--) { 1061 oidx = 7*(*vi++); 1062 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1063 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1064 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1065 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1066 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1067 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1068 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1069 v += 49; 1070 } 1071 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1072 t[5+idx] = s6;t[6+idx] = s7; 1073 idx += 7; 1074 } 1075 /* backward solve the L^T */ 1076 for (i=n-1; i>=0; i--){ 1077 v = aa + 49*diag[i] - 49; 1078 vi = aj + diag[i] - 1; 1079 nz = diag[i] - ai[i]; 1080 idt = 7*i; 1081 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1082 s6 = t[5+idt];s7 = t[6+idt]; 1083 while (nz--) { 1084 idx = 7*(*vi--); 1085 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1086 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1087 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1088 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1089 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1090 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1091 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1092 v -= 49; 1093 } 1094 } 1095 1096 /* copy t into x according to permutation */ 1097 ii = 0; 1098 for (i=0; i<n; i++) { 1099 ir = 7*r[i]; 1100 x[ir] = t[ii]; 1101 x[ir+1] = t[ii+1]; 1102 x[ir+2] = t[ii+2]; 1103 x[ir+3] = t[ii+3]; 1104 x[ir+4] = t[ii+4]; 1105 x[ir+5] = t[ii+5]; 1106 x[ir+6] = t[ii+6]; 1107 ii += 7; 1108 } 1109 1110 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1111 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1112 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1113 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1114 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1115 PetscFunctionReturn(0); 1116 } 1117 1118 /* ----------------------------------------------------------- */ 1119 #undef __FUNCT__ 1120 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1121 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1122 { 1123 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1124 IS iscol=a->col,isrow=a->row; 1125 PetscErrorCode ierr; 1126 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 1127 PetscInt i,n=a->mbs; 1128 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 1129 MatScalar *aa=a->a,*v; 1130 PetscScalar *x,*b,*s,*t,*ls; 1131 1132 PetscFunctionBegin; 1133 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1134 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1135 t = a->solve_work; 1136 1137 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1138 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1139 1140 /* forward solve the lower triangular */ 1141 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1142 for (i=1; i<n; i++) { 1143 v = aa + bs2*ai[i]; 1144 vi = aj + ai[i]; 1145 nz = a->diag[i] - ai[i]; 1146 s = t + bs*i; 1147 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1148 while (nz--) { 1149 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1150 v += bs2; 1151 } 1152 } 1153 /* backward solve the upper triangular */ 1154 ls = a->solve_work + A->cmap->n; 1155 for (i=n-1; i>=0; i--){ 1156 v = aa + bs2*(a->diag[i] + 1); 1157 vi = aj + a->diag[i] + 1; 1158 nz = ai[i+1] - a->diag[i] - 1; 1159 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1160 while (nz--) { 1161 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1162 v += bs2; 1163 } 1164 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1165 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1166 } 1167 1168 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1169 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1170 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1171 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1172 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 1173 PetscFunctionReturn(0); 1174 } 1175 1176 #undef __FUNCT__ 1177 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1178 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1179 { 1180 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1181 IS iscol=a->col,isrow=a->row; 1182 PetscErrorCode ierr; 1183 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 1184 PetscInt i,n=a->mbs,nz,idx,idt,idc; 1185 MatScalar *aa=a->a,*v; 1186 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1187 PetscScalar *x,*b,*t; 1188 1189 PetscFunctionBegin; 1190 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1191 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1192 t = a->solve_work; 1193 1194 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1195 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1196 1197 /* forward solve the lower triangular */ 1198 idx = 7*(*r++); 1199 t[0] = b[idx]; t[1] = b[1+idx]; 1200 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1201 t[5] = b[5+idx]; t[6] = b[6+idx]; 1202 1203 for (i=1; i<n; i++) { 1204 v = aa + 49*ai[i]; 1205 vi = aj + ai[i]; 1206 nz = diag[i] - ai[i]; 1207 idx = 7*(*r++); 1208 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1209 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1210 while (nz--) { 1211 idx = 7*(*vi++); 1212 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1213 x4 = t[3+idx];x5 = t[4+idx]; 1214 x6 = t[5+idx];x7 = t[6+idx]; 1215 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1216 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1217 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1218 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1219 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1220 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1221 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1222 v += 49; 1223 } 1224 idx = 7*i; 1225 t[idx] = s1;t[1+idx] = s2; 1226 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1227 t[5+idx] = s6;t[6+idx] = s7; 1228 } 1229 /* backward solve the upper triangular */ 1230 for (i=n-1; i>=0; i--){ 1231 v = aa + 49*diag[i] + 49; 1232 vi = aj + diag[i] + 1; 1233 nz = ai[i+1] - diag[i] - 1; 1234 idt = 7*i; 1235 s1 = t[idt]; s2 = t[1+idt]; 1236 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1237 s6 = t[5+idt];s7 = t[6+idt]; 1238 while (nz--) { 1239 idx = 7*(*vi++); 1240 x1 = t[idx]; x2 = t[1+idx]; 1241 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1242 x6 = t[5+idx]; x7 = t[6+idx]; 1243 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1244 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1245 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1246 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1247 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1248 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1249 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1250 v += 49; 1251 } 1252 idc = 7*(*c--); 1253 v = aa + 49*diag[i]; 1254 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1255 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1256 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1257 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1258 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1259 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1260 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1261 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1262 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1263 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1264 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1265 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1266 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1267 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1268 } 1269 1270 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1271 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1272 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1273 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1274 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1275 PetscFunctionReturn(0); 1276 } 1277 1278 #undef __FUNCT__ 1279 #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct" 1280 PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx) 1281 { 1282 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1283 IS iscol=a->col,isrow=a->row; 1284 PetscErrorCode ierr; 1285 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*vi; 1286 PetscInt i,n=a->mbs,nz,idx,idt,idc,k,m; 1287 MatScalar *aa=a->a,*v; 1288 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1289 PetscScalar *x,*b,*t; 1290 1291 PetscFunctionBegin; 1292 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1293 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1294 t = a->solve_work; 1295 1296 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1297 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1298 1299 /* forward solve the lower triangular */ 1300 idx = 7*r[0]; 1301 t[0] = b[idx]; t[1] = b[1+idx]; 1302 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1303 t[5] = b[5+idx]; t[6] = b[6+idx]; 1304 1305 for (i=1; i<n; i++) { 1306 v = aa + 49*ai[i]; 1307 vi = aj + ai[i]; 1308 nz = ai[i+1] - ai[i]; 1309 idx = 7*r[i]; 1310 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1311 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1312 for(m=0;m<nz;m++){ 1313 idx = 7*vi[m]; 1314 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1315 x4 = t[3+idx];x5 = t[4+idx]; 1316 x6 = t[5+idx];x7 = t[6+idx]; 1317 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1318 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1319 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1320 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1321 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1322 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1323 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1324 v += 49; 1325 } 1326 idx = 7*i; 1327 t[idx] = s1;t[1+idx] = s2; 1328 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1329 t[5+idx] = s6;t[6+idx] = s7; 1330 } 1331 /* backward solve the upper triangular */ 1332 for (i=n-1; i>=0; i--){ 1333 k = 2*n-i; 1334 v = aa + 49*ai[k]; 1335 vi = aj + ai[k]; 1336 nz = ai[k+1] - ai[k] - 1; 1337 idt = 7*i; 1338 s1 = t[idt]; s2 = t[1+idt]; 1339 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1340 s6 = t[5+idt];s7 = t[6+idt]; 1341 for(m=0;m<nz;m++){ 1342 idx = 7*vi[m]; 1343 x1 = t[idx]; x2 = t[1+idx]; 1344 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1345 x6 = t[5+idx]; x7 = t[6+idx]; 1346 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1347 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1348 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1349 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1350 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1351 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1352 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1353 v += 49; 1354 } 1355 idc = 7*c[i]; 1356 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1357 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1358 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1359 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1360 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1361 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1362 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1363 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1364 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1365 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1366 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1367 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1368 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1369 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1370 } 1371 1372 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1373 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1374 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1375 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1376 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1377 PetscFunctionReturn(0); 1378 } 1379 1380 #undef __FUNCT__ 1381 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1382 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1383 { 1384 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1385 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1386 PetscErrorCode ierr; 1387 PetscInt *diag = a->diag,jdx; 1388 const MatScalar *aa=a->a,*v; 1389 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1390 const PetscScalar *b; 1391 1392 PetscFunctionBegin; 1393 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1394 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1395 /* forward solve the lower triangular */ 1396 idx = 0; 1397 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1398 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1399 x[6] = b[6+idx]; 1400 for (i=1; i<n; i++) { 1401 v = aa + 49*ai[i]; 1402 vi = aj + ai[i]; 1403 nz = diag[i] - ai[i]; 1404 idx = 7*i; 1405 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1406 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1407 s7 = b[6+idx]; 1408 while (nz--) { 1409 jdx = 7*(*vi++); 1410 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1411 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1412 x7 = x[6+jdx]; 1413 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1414 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1415 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1416 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1417 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1418 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1419 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1420 v += 49; 1421 } 1422 x[idx] = s1; 1423 x[1+idx] = s2; 1424 x[2+idx] = s3; 1425 x[3+idx] = s4; 1426 x[4+idx] = s5; 1427 x[5+idx] = s6; 1428 x[6+idx] = s7; 1429 } 1430 /* backward solve the upper triangular */ 1431 for (i=n-1; i>=0; i--){ 1432 v = aa + 49*diag[i] + 49; 1433 vi = aj + diag[i] + 1; 1434 nz = ai[i+1] - diag[i] - 1; 1435 idt = 7*i; 1436 s1 = x[idt]; s2 = x[1+idt]; 1437 s3 = x[2+idt]; s4 = x[3+idt]; 1438 s5 = x[4+idt]; s6 = x[5+idt]; 1439 s7 = x[6+idt]; 1440 while (nz--) { 1441 idx = 7*(*vi++); 1442 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1443 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1444 x7 = x[6+idx]; 1445 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1446 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1447 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1448 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1449 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1450 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1451 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1452 v += 49; 1453 } 1454 v = aa + 49*diag[i]; 1455 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1456 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1457 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1458 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1459 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1460 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1461 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1462 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1463 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1464 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1465 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1466 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1467 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1468 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1469 } 1470 1471 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1472 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1473 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1474 PetscFunctionReturn(0); 1475 } 1476 1477 #undef __FUNCT__ 1478 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct" 1479 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1480 { 1481 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1482 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 1483 PetscErrorCode ierr; 1484 PetscInt idx,jdx,idt; 1485 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1486 const MatScalar *aa=a->a,*v; 1487 PetscScalar *x; 1488 const PetscScalar *b; 1489 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1490 1491 PetscFunctionBegin; 1492 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1493 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1494 /* forward solve the lower triangular */ 1495 idx = 0; 1496 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1497 x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 1498 for (i=1; i<n; i++) { 1499 v = aa + bs2*ai[i]; 1500 vi = aj + ai[i]; 1501 nz = ai[i+1] - ai[i]; 1502 idx = bs*i; 1503 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1504 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1505 for(k=0;k<nz;k++) { 1506 jdx = bs*vi[k]; 1507 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1508 x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 1509 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1510 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1511 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1512 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1513 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1514 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1515 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1516 v += bs2; 1517 } 1518 1519 x[idx] = s1; 1520 x[1+idx] = s2; 1521 x[2+idx] = s3; 1522 x[3+idx] = s4; 1523 x[4+idx] = s5; 1524 x[5+idx] = s6; 1525 x[6+idx] = s7; 1526 } 1527 1528 /* backward solve the upper triangular */ 1529 for (i=n-1; i>=0; i--){ 1530 v = aa + bs2*ai[2*n-i]; 1531 vi = aj + ai[2*n-i]; 1532 nz = ai[2*n-i +1] - ai[2*n-i]-1; 1533 idt = bs*i; 1534 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1535 s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 1536 for(k=0;k<nz;k++) { 1537 idx = bs*vi[k]; 1538 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1539 x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 1540 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1541 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1542 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1543 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1544 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1545 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1546 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1547 v += bs2; 1548 } 1549 /* x = inv_diagonal*x */ 1550 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1551 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1552 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1553 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1554 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1555 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1556 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1557 } 1558 1559 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1560 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1561 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1562 PetscFunctionReturn(0); 1563 } 1564 1565 #undef __FUNCT__ 1566 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1567 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1568 { 1569 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1570 IS iscol=a->col,isrow=a->row; 1571 PetscErrorCode ierr; 1572 const PetscInt *r,*c,*rout,*cout; 1573 PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1574 const MatScalar *aa=a->a,*v; 1575 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1576 const PetscScalar *b; 1577 PetscFunctionBegin; 1578 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1579 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1580 t = a->solve_work; 1581 1582 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1583 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1584 1585 /* forward solve the lower triangular */ 1586 idx = 6*(*r++); 1587 t[0] = b[idx]; t[1] = b[1+idx]; 1588 t[2] = b[2+idx]; t[3] = b[3+idx]; 1589 t[4] = b[4+idx]; t[5] = b[5+idx]; 1590 for (i=1; i<n; i++) { 1591 v = aa + 36*ai[i]; 1592 vi = aj + ai[i]; 1593 nz = diag[i] - ai[i]; 1594 idx = 6*(*r++); 1595 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1596 s5 = b[4+idx]; s6 = b[5+idx]; 1597 while (nz--) { 1598 idx = 6*(*vi++); 1599 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1600 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1601 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1602 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1603 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1604 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1605 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1606 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1607 v += 36; 1608 } 1609 idx = 6*i; 1610 t[idx] = s1;t[1+idx] = s2; 1611 t[2+idx] = s3;t[3+idx] = s4; 1612 t[4+idx] = s5;t[5+idx] = s6; 1613 } 1614 /* backward solve the upper triangular */ 1615 for (i=n-1; i>=0; i--){ 1616 v = aa + 36*diag[i] + 36; 1617 vi = aj + diag[i] + 1; 1618 nz = ai[i+1] - diag[i] - 1; 1619 idt = 6*i; 1620 s1 = t[idt]; s2 = t[1+idt]; 1621 s3 = t[2+idt];s4 = t[3+idt]; 1622 s5 = t[4+idt];s6 = t[5+idt]; 1623 while (nz--) { 1624 idx = 6*(*vi++); 1625 x1 = t[idx]; x2 = t[1+idx]; 1626 x3 = t[2+idx]; x4 = t[3+idx]; 1627 x5 = t[4+idx]; x6 = t[5+idx]; 1628 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1629 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1630 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1631 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1632 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1633 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1634 v += 36; 1635 } 1636 idc = 6*(*c--); 1637 v = aa + 36*diag[i]; 1638 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1639 v[18]*s4+v[24]*s5+v[30]*s6; 1640 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1641 v[19]*s4+v[25]*s5+v[31]*s6; 1642 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1643 v[20]*s4+v[26]*s5+v[32]*s6; 1644 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1645 v[21]*s4+v[27]*s5+v[33]*s6; 1646 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1647 v[22]*s4+v[28]*s5+v[34]*s6; 1648 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1649 v[23]*s4+v[29]*s5+v[35]*s6; 1650 } 1651 1652 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1653 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1654 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1655 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1656 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1657 PetscFunctionReturn(0); 1658 } 1659 1660 #undef __FUNCT__ 1661 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct" 1662 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx) 1663 { 1664 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1665 IS iscol=a->col,isrow=a->row; 1666 PetscErrorCode ierr; 1667 const PetscInt *r,*c,*rout,*cout; 1668 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 1669 const MatScalar *aa=a->a,*v; 1670 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1671 const PetscScalar *b; 1672 PetscFunctionBegin; 1673 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1674 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1675 t = a->solve_work; 1676 1677 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1678 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1679 1680 /* forward solve the lower triangular */ 1681 idx = 6*r[0]; 1682 t[0] = b[idx]; t[1] = b[1+idx]; 1683 t[2] = b[2+idx]; t[3] = b[3+idx]; 1684 t[4] = b[4+idx]; t[5] = b[5+idx]; 1685 for (i=1; i<n; i++) { 1686 v = aa + 36*ai[i]; 1687 vi = aj + ai[i]; 1688 nz = ai[i+1] - ai[i]; 1689 idx = 6*r[i]; 1690 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1691 s5 = b[4+idx]; s6 = b[5+idx]; 1692 for(m=0;m<nz;m++){ 1693 idx = 6*vi[m]; 1694 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1695 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1696 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1697 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1698 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1699 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1700 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1701 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1702 v += 36; 1703 } 1704 idx = 6*i; 1705 t[idx] = s1;t[1+idx] = s2; 1706 t[2+idx] = s3;t[3+idx] = s4; 1707 t[4+idx] = s5;t[5+idx] = s6; 1708 } 1709 /* backward solve the upper triangular */ 1710 for (i=n-1; i>=0; i--){ 1711 k = 2*n-i; 1712 v = aa + 36*ai[k]; 1713 vi = aj + ai[k]; 1714 nz = ai[k+1] - ai[k] - 1; 1715 idt = 6*i; 1716 s1 = t[idt]; s2 = t[1+idt]; 1717 s3 = t[2+idt];s4 = t[3+idt]; 1718 s5 = t[4+idt];s6 = t[5+idt]; 1719 for(m=0;m<nz;m++){ 1720 idx = 6*vi[m]; 1721 x1 = t[idx]; x2 = t[1+idx]; 1722 x3 = t[2+idx]; x4 = t[3+idx]; 1723 x5 = t[4+idx]; x6 = t[5+idx]; 1724 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1725 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1726 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1727 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1728 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1729 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1730 v += 36; 1731 } 1732 idc = 6*c[i]; 1733 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1734 v[18]*s4+v[24]*s5+v[30]*s6; 1735 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1736 v[19]*s4+v[25]*s5+v[31]*s6; 1737 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1738 v[20]*s4+v[26]*s5+v[32]*s6; 1739 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1740 v[21]*s4+v[27]*s5+v[33]*s6; 1741 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1742 v[22]*s4+v[28]*s5+v[34]*s6; 1743 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1744 v[23]*s4+v[29]*s5+v[35]*s6; 1745 } 1746 1747 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1748 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1749 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1750 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1751 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1752 PetscFunctionReturn(0); 1753 } 1754 1755 1756 #undef __FUNCT__ 1757 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1758 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1759 { 1760 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1761 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1762 PetscErrorCode ierr; 1763 PetscInt *diag = a->diag,jdx; 1764 const MatScalar *aa=a->a,*v; 1765 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1766 const PetscScalar *b; 1767 1768 PetscFunctionBegin; 1769 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1770 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1771 /* forward solve the lower triangular */ 1772 idx = 0; 1773 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1774 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1775 for (i=1; i<n; i++) { 1776 v = aa + 36*ai[i]; 1777 vi = aj + ai[i]; 1778 nz = diag[i] - ai[i]; 1779 idx = 6*i; 1780 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1781 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1782 while (nz--) { 1783 jdx = 6*(*vi++); 1784 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1785 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1786 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1787 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1788 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1789 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1790 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1791 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1792 v += 36; 1793 } 1794 x[idx] = s1; 1795 x[1+idx] = s2; 1796 x[2+idx] = s3; 1797 x[3+idx] = s4; 1798 x[4+idx] = s5; 1799 x[5+idx] = s6; 1800 } 1801 /* backward solve the upper triangular */ 1802 for (i=n-1; i>=0; i--){ 1803 v = aa + 36*diag[i] + 36; 1804 vi = aj + diag[i] + 1; 1805 nz = ai[i+1] - diag[i] - 1; 1806 idt = 6*i; 1807 s1 = x[idt]; s2 = x[1+idt]; 1808 s3 = x[2+idt]; s4 = x[3+idt]; 1809 s5 = x[4+idt]; s6 = x[5+idt]; 1810 while (nz--) { 1811 idx = 6*(*vi++); 1812 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1813 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1814 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1815 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1816 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1817 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1818 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1819 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1820 v += 36; 1821 } 1822 v = aa + 36*diag[i]; 1823 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1824 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1825 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1826 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1827 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1828 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1829 } 1830 1831 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1832 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1833 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1834 PetscFunctionReturn(0); 1835 } 1836 1837 #undef __FUNCT__ 1838 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct" 1839 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1840 { 1841 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1842 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 1843 PetscErrorCode ierr; 1844 PetscInt idx,jdx,idt; 1845 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1846 const MatScalar *aa=a->a,*v; 1847 PetscScalar *x; 1848 const PetscScalar *b; 1849 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1850 1851 PetscFunctionBegin; 1852 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1853 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1854 /* forward solve the lower triangular */ 1855 idx = 0; 1856 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1857 x[4] = b[4+idx];x[5] = b[5+idx]; 1858 for (i=1; i<n; i++) { 1859 v = aa + bs2*ai[i]; 1860 vi = aj + ai[i]; 1861 nz = ai[i+1] - ai[i]; 1862 idx = bs*i; 1863 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1864 s5 = b[4+idx];s6 = b[5+idx]; 1865 for(k=0;k<nz;k++){ 1866 jdx = bs*vi[k]; 1867 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1868 x5 = x[4+jdx]; x6 = x[5+jdx]; 1869 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1870 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 1871 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1872 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1873 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1874 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1875 v += bs2; 1876 } 1877 1878 x[idx] = s1; 1879 x[1+idx] = s2; 1880 x[2+idx] = s3; 1881 x[3+idx] = s4; 1882 x[4+idx] = s5; 1883 x[5+idx] = s6; 1884 } 1885 1886 /* backward solve the upper triangular */ 1887 for (i=n-1; i>=0; i--){ 1888 v = aa + bs2*ai[2*n-i]; 1889 vi = aj + ai[2*n-i]; 1890 nz = ai[2*n-i +1] - ai[2*n-i]-1; 1891 idt = bs*i; 1892 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1893 s5 = x[4+idt];s6 = x[5+idt]; 1894 for(k=0;k<nz;k++){ 1895 idx = bs*vi[k]; 1896 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1897 x5 = x[4+idx];x6 = x[5+idx]; 1898 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1899 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 1900 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1901 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1902 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1903 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1904 v += bs2; 1905 } 1906 /* x = inv_diagonal*x */ 1907 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1908 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1909 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1910 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1911 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1912 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1913 } 1914 1915 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1916 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1917 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1918 PetscFunctionReturn(0); 1919 } 1920 1921 #undef __FUNCT__ 1922 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1923 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1924 { 1925 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1926 IS iscol=a->col,isrow=a->row; 1927 PetscErrorCode ierr; 1928 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 1929 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1930 const MatScalar *aa=a->a,*v; 1931 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1932 const PetscScalar *b; 1933 1934 PetscFunctionBegin; 1935 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1936 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1937 t = a->solve_work; 1938 1939 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1940 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1941 1942 /* forward solve the lower triangular */ 1943 idx = 5*(*r++); 1944 t[0] = b[idx]; t[1] = b[1+idx]; 1945 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1946 for (i=1; i<n; i++) { 1947 v = aa + 25*ai[i]; 1948 vi = aj + ai[i]; 1949 nz = diag[i] - ai[i]; 1950 idx = 5*(*r++); 1951 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1952 s5 = b[4+idx]; 1953 while (nz--) { 1954 idx = 5*(*vi++); 1955 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1956 x4 = t[3+idx];x5 = t[4+idx]; 1957 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1958 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1959 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1960 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1961 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1962 v += 25; 1963 } 1964 idx = 5*i; 1965 t[idx] = s1;t[1+idx] = s2; 1966 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1967 } 1968 /* backward solve the upper triangular */ 1969 for (i=n-1; i>=0; i--){ 1970 v = aa + 25*diag[i] + 25; 1971 vi = aj + diag[i] + 1; 1972 nz = ai[i+1] - diag[i] - 1; 1973 idt = 5*i; 1974 s1 = t[idt]; s2 = t[1+idt]; 1975 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1976 while (nz--) { 1977 idx = 5*(*vi++); 1978 x1 = t[idx]; x2 = t[1+idx]; 1979 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1980 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1981 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1982 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1983 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1984 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1985 v += 25; 1986 } 1987 idc = 5*(*c--); 1988 v = aa + 25*diag[i]; 1989 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1990 v[15]*s4+v[20]*s5; 1991 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1992 v[16]*s4+v[21]*s5; 1993 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1994 v[17]*s4+v[22]*s5; 1995 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1996 v[18]*s4+v[23]*s5; 1997 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1998 v[19]*s4+v[24]*s5; 1999 } 2000 2001 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2002 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2003 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2004 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2005 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2006 PetscFunctionReturn(0); 2007 } 2008 2009 #undef __FUNCT__ 2010 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct" 2011 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx) 2012 { 2013 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2014 IS iscol=a->col,isrow=a->row; 2015 PetscErrorCode ierr; 2016 const PetscInt *r,*c,*rout,*cout; 2017 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 2018 const MatScalar *aa=a->a,*v; 2019 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2020 const PetscScalar *b; 2021 2022 PetscFunctionBegin; 2023 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2024 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2025 t = a->solve_work; 2026 2027 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2028 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2029 2030 /* forward solve the lower triangular */ 2031 idx = 5*r[0]; 2032 t[0] = b[idx]; t[1] = b[1+idx]; 2033 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2034 for (i=1; i<n; i++) { 2035 v = aa + 25*ai[i]; 2036 vi = aj + ai[i]; 2037 nz = ai[i+1] - ai[i]; 2038 idx = 5*r[i]; 2039 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2040 s5 = b[4+idx]; 2041 for(m=0;m<nz;m++){ 2042 idx = 5*vi[m]; 2043 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2044 x4 = t[3+idx];x5 = t[4+idx]; 2045 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2046 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2047 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2048 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2049 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2050 v += 25; 2051 } 2052 idx = 5*i; 2053 t[idx] = s1;t[1+idx] = s2; 2054 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2055 } 2056 /* backward solve the upper triangular */ 2057 for (i=n-1; i>=0; i--){ 2058 k = 2*n-i; 2059 v = aa + 25*ai[k]; 2060 vi = aj + ai[k]; 2061 nz = ai[k+1] - ai[k] - 1; 2062 idt = 5*i; 2063 s1 = t[idt]; s2 = t[1+idt]; 2064 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2065 for(m=0;m<nz;m++){ 2066 idx = 5*vi[m]; 2067 x1 = t[idx]; x2 = t[1+idx]; 2068 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2069 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2070 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2071 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2072 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2073 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2074 v += 25; 2075 } 2076 idc = 5*c[i]; 2077 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2078 v[15]*s4+v[20]*s5; 2079 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 2080 v[16]*s4+v[21]*s5; 2081 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2082 v[17]*s4+v[22]*s5; 2083 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 2084 v[18]*s4+v[23]*s5; 2085 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2086 v[19]*s4+v[24]*s5; 2087 } 2088 2089 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2090 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2091 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2092 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2093 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2094 PetscFunctionReturn(0); 2095 } 2096 #undef __FUNCT__ 2097 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 2098 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 2099 { 2100 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2101 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2102 PetscErrorCode ierr; 2103 PetscInt *diag = a->diag,jdx; 2104 const MatScalar *aa=a->a,*v; 2105 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2106 const PetscScalar *b; 2107 2108 PetscFunctionBegin; 2109 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2110 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2111 /* forward solve the lower triangular */ 2112 idx = 0; 2113 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2114 for (i=1; i<n; i++) { 2115 v = aa + 25*ai[i]; 2116 vi = aj + ai[i]; 2117 nz = diag[i] - ai[i]; 2118 idx = 5*i; 2119 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 2120 while (nz--) { 2121 jdx = 5*(*vi++); 2122 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2123 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2124 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2125 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2126 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2127 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2128 v += 25; 2129 } 2130 x[idx] = s1; 2131 x[1+idx] = s2; 2132 x[2+idx] = s3; 2133 x[3+idx] = s4; 2134 x[4+idx] = s5; 2135 } 2136 /* backward solve the upper triangular */ 2137 for (i=n-1; i>=0; i--){ 2138 v = aa + 25*diag[i] + 25; 2139 vi = aj + diag[i] + 1; 2140 nz = ai[i+1] - diag[i] - 1; 2141 idt = 5*i; 2142 s1 = x[idt]; s2 = x[1+idt]; 2143 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 2144 while (nz--) { 2145 idx = 5*(*vi++); 2146 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2147 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2148 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2149 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2150 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2151 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2152 v += 25; 2153 } 2154 v = aa + 25*diag[i]; 2155 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2156 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2157 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2158 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2159 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2160 } 2161 2162 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2163 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2164 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2165 PetscFunctionReturn(0); 2166 } 2167 2168 #undef __FUNCT__ 2169 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct" 2170 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2171 { 2172 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2173 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2174 PetscErrorCode ierr; 2175 PetscInt jdx; 2176 const MatScalar *aa=a->a,*v; 2177 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2178 const PetscScalar *b; 2179 2180 PetscFunctionBegin; 2181 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2182 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2183 /* forward solve the lower triangular */ 2184 idx = 0; 2185 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2186 for (i=1; i<n; i++) { 2187 v = aa + 25*ai[i]; 2188 vi = aj + ai[i]; 2189 nz = ai[i+1] - ai[i]; 2190 idx = 5*i; 2191 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 2192 for(k=0;k<nz;k++) { 2193 jdx = 5*vi[k]; 2194 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2195 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2196 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2197 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2198 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2199 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2200 v += 25; 2201 } 2202 x[idx] = s1; 2203 x[1+idx] = s2; 2204 x[2+idx] = s3; 2205 x[3+idx] = s4; 2206 x[4+idx] = s5; 2207 } 2208 2209 /* backward solve the upper triangular */ 2210 for (i=n-1; i>=0; i--){ 2211 v = aa + 25*ai[2*n-i]; 2212 vi = aj + ai[2*n-i]; 2213 nz = ai[2*n-i +1] - ai[2*n-i]-1; 2214 idt = 5*i; 2215 s1 = x[idt]; s2 = x[1+idt]; 2216 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 2217 for(k=0;k<nz;k++){ 2218 idx = 5*vi[k]; 2219 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2220 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2221 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2222 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2223 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2224 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2225 v += 25; 2226 } 2227 /* x = inv_diagonal*x */ 2228 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2229 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2230 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2231 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2232 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2233 } 2234 2235 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2236 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2237 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2238 PetscFunctionReturn(0); 2239 } 2240 2241 #undef __FUNCT__ 2242 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 2243 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 2244 { 2245 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2246 IS iscol=a->col,isrow=a->row; 2247 PetscErrorCode ierr; 2248 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2249 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2250 const MatScalar *aa=a->a,*v; 2251 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2252 const PetscScalar *b; 2253 2254 PetscFunctionBegin; 2255 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2256 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2257 t = a->solve_work; 2258 2259 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2260 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2261 2262 /* forward solve the lower triangular */ 2263 idx = 4*(*r++); 2264 t[0] = b[idx]; t[1] = b[1+idx]; 2265 t[2] = b[2+idx]; t[3] = b[3+idx]; 2266 for (i=1; i<n; i++) { 2267 v = aa + 16*ai[i]; 2268 vi = aj + ai[i]; 2269 nz = diag[i] - ai[i]; 2270 idx = 4*(*r++); 2271 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2272 while (nz--) { 2273 idx = 4*(*vi++); 2274 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2275 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2276 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2277 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2278 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2279 v += 16; 2280 } 2281 idx = 4*i; 2282 t[idx] = s1;t[1+idx] = s2; 2283 t[2+idx] = s3;t[3+idx] = s4; 2284 } 2285 /* backward solve the upper triangular */ 2286 for (i=n-1; i>=0; i--){ 2287 v = aa + 16*diag[i] + 16; 2288 vi = aj + diag[i] + 1; 2289 nz = ai[i+1] - diag[i] - 1; 2290 idt = 4*i; 2291 s1 = t[idt]; s2 = t[1+idt]; 2292 s3 = t[2+idt];s4 = t[3+idt]; 2293 while (nz--) { 2294 idx = 4*(*vi++); 2295 x1 = t[idx]; x2 = t[1+idx]; 2296 x3 = t[2+idx]; x4 = t[3+idx]; 2297 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2298 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2299 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2300 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2301 v += 16; 2302 } 2303 idc = 4*(*c--); 2304 v = aa + 16*diag[i]; 2305 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2306 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2307 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2308 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2309 } 2310 2311 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2312 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2313 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2314 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2315 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2316 PetscFunctionReturn(0); 2317 } 2318 2319 #undef __FUNCT__ 2320 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct" 2321 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx) 2322 { 2323 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2324 IS iscol=a->col,isrow=a->row; 2325 PetscErrorCode ierr; 2326 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 2327 const PetscInt *r,*c,*rout,*cout; 2328 const MatScalar *aa=a->a,*v; 2329 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2330 const PetscScalar *b; 2331 2332 PetscFunctionBegin; 2333 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2334 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2335 t = a->solve_work; 2336 2337 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2338 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2339 2340 /* forward solve the lower triangular */ 2341 idx = 4*r[0]; 2342 t[0] = b[idx]; t[1] = b[1+idx]; 2343 t[2] = b[2+idx]; t[3] = b[3+idx]; 2344 for (i=1; i<n; i++) { 2345 v = aa + 16*ai[i]; 2346 vi = aj + ai[i]; 2347 nz = ai[i+1] - ai[i]; 2348 idx = 4*r[i]; 2349 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2350 for(m=0;m<nz;m++){ 2351 idx = 4*vi[m]; 2352 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2353 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2354 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2355 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2356 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2357 v += 16; 2358 } 2359 idx = 4*i; 2360 t[idx] = s1;t[1+idx] = s2; 2361 t[2+idx] = s3;t[3+idx] = s4; 2362 } 2363 /* backward solve the upper triangular */ 2364 for (i=n-1; i>=0; i--){ 2365 k = 2*n-i; 2366 v = aa + 16*ai[k]; 2367 vi = aj + ai[k]; 2368 nz = ai[k+1] - ai[k] - 1; 2369 idt = 4*i; 2370 s1 = t[idt]; s2 = t[1+idt]; 2371 s3 = t[2+idt];s4 = t[3+idt]; 2372 for(m=0;m<nz;m++){ 2373 idx = 4*vi[m]; 2374 x1 = t[idx]; x2 = t[1+idx]; 2375 x3 = t[2+idx]; x4 = t[3+idx]; 2376 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2377 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2378 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2379 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2380 v += 16; 2381 } 2382 idc = 4*c[i]; 2383 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2384 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2385 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2386 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2387 } 2388 2389 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2390 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2391 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2392 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2393 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2394 PetscFunctionReturn(0); 2395 } 2396 2397 #undef __FUNCT__ 2398 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 2399 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 2400 { 2401 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2402 IS iscol=a->col,isrow=a->row; 2403 PetscErrorCode ierr; 2404 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2405 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2406 const MatScalar *aa=a->a,*v; 2407 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 2408 PetscScalar *x; 2409 const PetscScalar *b; 2410 2411 PetscFunctionBegin; 2412 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2413 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2414 t = (MatScalar *)a->solve_work; 2415 2416 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2417 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2418 2419 /* forward solve the lower triangular */ 2420 idx = 4*(*r++); 2421 t[0] = (MatScalar)b[idx]; 2422 t[1] = (MatScalar)b[1+idx]; 2423 t[2] = (MatScalar)b[2+idx]; 2424 t[3] = (MatScalar)b[3+idx]; 2425 for (i=1; i<n; i++) { 2426 v = aa + 16*ai[i]; 2427 vi = aj + ai[i]; 2428 nz = diag[i] - ai[i]; 2429 idx = 4*(*r++); 2430 s1 = (MatScalar)b[idx]; 2431 s2 = (MatScalar)b[1+idx]; 2432 s3 = (MatScalar)b[2+idx]; 2433 s4 = (MatScalar)b[3+idx]; 2434 while (nz--) { 2435 idx = 4*(*vi++); 2436 x1 = t[idx]; 2437 x2 = t[1+idx]; 2438 x3 = t[2+idx]; 2439 x4 = t[3+idx]; 2440 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2441 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2442 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2443 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2444 v += 16; 2445 } 2446 idx = 4*i; 2447 t[idx] = s1; 2448 t[1+idx] = s2; 2449 t[2+idx] = s3; 2450 t[3+idx] = s4; 2451 } 2452 /* backward solve the upper triangular */ 2453 for (i=n-1; i>=0; i--){ 2454 v = aa + 16*diag[i] + 16; 2455 vi = aj + diag[i] + 1; 2456 nz = ai[i+1] - diag[i] - 1; 2457 idt = 4*i; 2458 s1 = t[idt]; 2459 s2 = t[1+idt]; 2460 s3 = t[2+idt]; 2461 s4 = t[3+idt]; 2462 while (nz--) { 2463 idx = 4*(*vi++); 2464 x1 = t[idx]; 2465 x2 = t[1+idx]; 2466 x3 = t[2+idx]; 2467 x4 = t[3+idx]; 2468 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2469 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2470 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2471 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2472 v += 16; 2473 } 2474 idc = 4*(*c--); 2475 v = aa + 16*diag[i]; 2476 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2477 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2478 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2479 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2480 x[idc] = (PetscScalar)t[idt]; 2481 x[1+idc] = (PetscScalar)t[1+idt]; 2482 x[2+idc] = (PetscScalar)t[2+idt]; 2483 x[3+idc] = (PetscScalar)t[3+idt]; 2484 } 2485 2486 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2487 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2488 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2489 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2490 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2491 PetscFunctionReturn(0); 2492 } 2493 2494 #if defined (PETSC_HAVE_SSE) 2495 2496 #include PETSC_HAVE_SSE 2497 2498 #undef __FUNCT__ 2499 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 2500 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 2501 { 2502 /* 2503 Note: This code uses demotion of double 2504 to float when performing the mixed-mode computation. 2505 This may not be numerically reasonable for all applications. 2506 */ 2507 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2508 IS iscol=a->col,isrow=a->row; 2509 PetscErrorCode ierr; 2510 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 2511 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2512 MatScalar *aa=a->a,*v; 2513 PetscScalar *x,*b,*t; 2514 2515 /* Make space in temp stack for 16 Byte Aligned arrays */ 2516 float ssealignedspace[11],*tmps,*tmpx; 2517 unsigned long offset; 2518 2519 PetscFunctionBegin; 2520 SSE_SCOPE_BEGIN; 2521 2522 offset = (unsigned long)ssealignedspace % 16; 2523 if (offset) offset = (16 - offset)/4; 2524 tmps = &ssealignedspace[offset]; 2525 tmpx = &ssealignedspace[offset+4]; 2526 PREFETCH_NTA(aa+16*ai[1]); 2527 2528 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2529 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2530 t = a->solve_work; 2531 2532 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2533 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2534 2535 /* forward solve the lower triangular */ 2536 idx = 4*(*r++); 2537 t[0] = b[idx]; t[1] = b[1+idx]; 2538 t[2] = b[2+idx]; t[3] = b[3+idx]; 2539 v = aa + 16*ai[1]; 2540 2541 for (i=1; i<n;) { 2542 PREFETCH_NTA(&v[8]); 2543 vi = aj + ai[i]; 2544 nz = diag[i] - ai[i]; 2545 idx = 4*(*r++); 2546 2547 /* Demote sum from double to float */ 2548 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 2549 LOAD_PS(tmps,XMM7); 2550 2551 while (nz--) { 2552 PREFETCH_NTA(&v[16]); 2553 idx = 4*(*vi++); 2554 2555 /* Demote solution (so far) from double to float */ 2556 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 2557 2558 /* 4x4 Matrix-Vector product with negative accumulation: */ 2559 SSE_INLINE_BEGIN_2(tmpx,v) 2560 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2561 2562 /* First Column */ 2563 SSE_COPY_PS(XMM0,XMM6) 2564 SSE_SHUFFLE(XMM0,XMM0,0x00) 2565 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2566 SSE_SUB_PS(XMM7,XMM0) 2567 2568 /* Second Column */ 2569 SSE_COPY_PS(XMM1,XMM6) 2570 SSE_SHUFFLE(XMM1,XMM1,0x55) 2571 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2572 SSE_SUB_PS(XMM7,XMM1) 2573 2574 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2575 2576 /* Third Column */ 2577 SSE_COPY_PS(XMM2,XMM6) 2578 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2579 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2580 SSE_SUB_PS(XMM7,XMM2) 2581 2582 /* Fourth Column */ 2583 SSE_COPY_PS(XMM3,XMM6) 2584 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2585 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2586 SSE_SUB_PS(XMM7,XMM3) 2587 SSE_INLINE_END_2 2588 2589 v += 16; 2590 } 2591 idx = 4*i; 2592 v = aa + 16*ai[++i]; 2593 PREFETCH_NTA(v); 2594 STORE_PS(tmps,XMM7); 2595 2596 /* Promote result from float to double */ 2597 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 2598 } 2599 /* backward solve the upper triangular */ 2600 idt = 4*(n-1); 2601 ai16 = 16*diag[n-1]; 2602 v = aa + ai16 + 16; 2603 for (i=n-1; i>=0;){ 2604 PREFETCH_NTA(&v[8]); 2605 vi = aj + diag[i] + 1; 2606 nz = ai[i+1] - diag[i] - 1; 2607 2608 /* Demote accumulator from double to float */ 2609 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 2610 LOAD_PS(tmps,XMM7); 2611 2612 while (nz--) { 2613 PREFETCH_NTA(&v[16]); 2614 idx = 4*(*vi++); 2615 2616 /* Demote solution (so far) from double to float */ 2617 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 2618 2619 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2620 SSE_INLINE_BEGIN_2(tmpx,v) 2621 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2622 2623 /* First Column */ 2624 SSE_COPY_PS(XMM0,XMM6) 2625 SSE_SHUFFLE(XMM0,XMM0,0x00) 2626 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2627 SSE_SUB_PS(XMM7,XMM0) 2628 2629 /* Second Column */ 2630 SSE_COPY_PS(XMM1,XMM6) 2631 SSE_SHUFFLE(XMM1,XMM1,0x55) 2632 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2633 SSE_SUB_PS(XMM7,XMM1) 2634 2635 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2636 2637 /* Third Column */ 2638 SSE_COPY_PS(XMM2,XMM6) 2639 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2640 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2641 SSE_SUB_PS(XMM7,XMM2) 2642 2643 /* Fourth Column */ 2644 SSE_COPY_PS(XMM3,XMM6) 2645 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2646 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2647 SSE_SUB_PS(XMM7,XMM3) 2648 SSE_INLINE_END_2 2649 v += 16; 2650 } 2651 v = aa + ai16; 2652 ai16 = 16*diag[--i]; 2653 PREFETCH_NTA(aa+ai16+16); 2654 /* 2655 Scale the result by the diagonal 4x4 block, 2656 which was inverted as part of the factorization 2657 */ 2658 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 2659 /* First Column */ 2660 SSE_COPY_PS(XMM0,XMM7) 2661 SSE_SHUFFLE(XMM0,XMM0,0x00) 2662 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2663 2664 /* Second Column */ 2665 SSE_COPY_PS(XMM1,XMM7) 2666 SSE_SHUFFLE(XMM1,XMM1,0x55) 2667 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2668 SSE_ADD_PS(XMM0,XMM1) 2669 2670 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2671 2672 /* Third Column */ 2673 SSE_COPY_PS(XMM2,XMM7) 2674 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2675 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2676 SSE_ADD_PS(XMM0,XMM2) 2677 2678 /* Fourth Column */ 2679 SSE_COPY_PS(XMM3,XMM7) 2680 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2681 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2682 SSE_ADD_PS(XMM0,XMM3) 2683 2684 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2685 SSE_INLINE_END_3 2686 2687 /* Promote solution from float to double */ 2688 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 2689 2690 /* Apply reordering to t and stream into x. */ 2691 /* This way, x doesn't pollute the cache. */ 2692 /* Be careful with size: 2 doubles = 4 floats! */ 2693 idc = 4*(*c--); 2694 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 2695 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 2696 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 2697 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 2698 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 2699 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 2700 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 2701 SSE_INLINE_END_2 2702 v = aa + ai16 + 16; 2703 idt -= 4; 2704 } 2705 2706 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2707 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2708 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2709 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2710 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2711 SSE_SCOPE_END; 2712 PetscFunctionReturn(0); 2713 } 2714 2715 #endif 2716 2717 2718 /* 2719 Special case where the matrix was ILU(0) factored in the natural 2720 ordering. This eliminates the need for the column and row permutation. 2721 */ 2722 #undef __FUNCT__ 2723 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2724 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 2725 { 2726 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2727 PetscInt n=a->mbs; 2728 const PetscInt *ai=a->i,*aj=a->j; 2729 PetscErrorCode ierr; 2730 const PetscInt *diag = a->diag; 2731 const MatScalar *aa=a->a; 2732 PetscScalar *x; 2733 const PetscScalar *b; 2734 2735 PetscFunctionBegin; 2736 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2737 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2738 2739 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 2740 { 2741 static PetscScalar w[2000]; /* very BAD need to fix */ 2742 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 2743 } 2744 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 2745 { 2746 static PetscScalar w[2000]; /* very BAD need to fix */ 2747 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 2748 } 2749 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 2750 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2751 #else 2752 { 2753 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2754 const MatScalar *v; 2755 PetscInt jdx,idt,idx,nz,i,ai16; 2756 const PetscInt *vi; 2757 2758 /* forward solve the lower triangular */ 2759 idx = 0; 2760 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 2761 for (i=1; i<n; i++) { 2762 v = aa + 16*ai[i]; 2763 vi = aj + ai[i]; 2764 nz = diag[i] - ai[i]; 2765 idx += 4; 2766 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2767 while (nz--) { 2768 jdx = 4*(*vi++); 2769 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2770 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2771 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2772 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2773 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2774 v += 16; 2775 } 2776 x[idx] = s1; 2777 x[1+idx] = s2; 2778 x[2+idx] = s3; 2779 x[3+idx] = s4; 2780 } 2781 /* backward solve the upper triangular */ 2782 idt = 4*(n-1); 2783 for (i=n-1; i>=0; i--){ 2784 ai16 = 16*diag[i]; 2785 v = aa + ai16 + 16; 2786 vi = aj + diag[i] + 1; 2787 nz = ai[i+1] - diag[i] - 1; 2788 s1 = x[idt]; s2 = x[1+idt]; 2789 s3 = x[2+idt];s4 = x[3+idt]; 2790 while (nz--) { 2791 idx = 4*(*vi++); 2792 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2793 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2794 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2795 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2796 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2797 v += 16; 2798 } 2799 v = aa + ai16; 2800 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2801 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2802 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2803 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2804 idt -= 4; 2805 } 2806 } 2807 #endif 2808 2809 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2810 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2811 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2812 PetscFunctionReturn(0); 2813 } 2814 2815 #undef __FUNCT__ 2816 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct" 2817 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2818 { 2819 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2820 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 2821 PetscErrorCode ierr; 2822 PetscInt idx,jdx,idt; 2823 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2824 const MatScalar *aa=a->a,*v; 2825 PetscScalar *x; 2826 const PetscScalar *b; 2827 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2828 2829 PetscFunctionBegin; 2830 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2831 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2832 /* forward solve the lower triangular */ 2833 idx = 0; 2834 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2835 for (i=1; i<n; i++) { 2836 v = aa + bs2*ai[i]; 2837 vi = aj + ai[i]; 2838 nz = ai[i+1] - ai[i]; 2839 idx = bs*i; 2840 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2841 for(k=0;k<nz;k++) { 2842 jdx = bs*vi[k]; 2843 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2844 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2845 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2846 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2847 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2848 2849 v += bs2; 2850 } 2851 2852 x[idx] = s1; 2853 x[1+idx] = s2; 2854 x[2+idx] = s3; 2855 x[3+idx] = s4; 2856 } 2857 2858 /* backward solve the upper triangular */ 2859 for (i=n-1; i>=0; i--){ 2860 v = aa + bs2*ai[2*n-i]; 2861 vi = aj + ai[2*n-i]; 2862 nz = ai[2*n-i +1] - ai[2*n-i]-1; 2863 idt = bs*i; 2864 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2865 2866 for(k=0;k<nz;k++){ 2867 idx = bs*vi[k]; 2868 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2869 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2870 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2871 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2872 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2873 2874 v += bs2; 2875 } 2876 /* x = inv_diagonal*x */ 2877 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2878 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 2879 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2880 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2881 2882 } 2883 2884 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2885 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2886 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2887 PetscFunctionReturn(0); 2888 } 2889 2890 #undef __FUNCT__ 2891 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2" 2892 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 2893 { 2894 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2895 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 2896 PetscErrorCode ierr; 2897 PetscInt idx,jdx,idt; 2898 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2899 const MatScalar *aa=a->a,*v; 2900 PetscScalar *x; 2901 const PetscScalar *b; 2902 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2903 2904 PetscFunctionBegin; 2905 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2906 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2907 /* forward solve the lower triangular */ 2908 idx = 0; 2909 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2910 for (i=1; i<n; i++) { 2911 v = aa + bs2*ai[i]; 2912 vi = aj + ai[i]; 2913 nz = ai[i+1] - ai[i]; 2914 idx = bs*i; 2915 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2916 for(k=0;k<nz;k++) { 2917 jdx = bs*vi[k]; 2918 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2919 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2920 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2921 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2922 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2923 2924 v += bs2; 2925 } 2926 2927 x[idx] = s1; 2928 x[1+idx] = s2; 2929 x[2+idx] = s3; 2930 x[3+idx] = s4; 2931 } 2932 2933 /* backward solve the upper triangular */ 2934 for (i=n-1; i>=0; i--){ 2935 v = aa + bs2*(adiag[i+1]+1); 2936 vi = aj + adiag[i+1]+1; 2937 nz = adiag[i] - adiag[i+1]-1; 2938 idt = bs*i; 2939 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2940 2941 for(k=0;k<nz;k++){ 2942 idx = bs*vi[k]; 2943 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2944 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2945 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2946 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2947 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2948 2949 v += bs2; 2950 } 2951 /* x = inv_diagonal*x */ 2952 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2953 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 2954 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2955 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2956 2957 } 2958 2959 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2960 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2961 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2962 PetscFunctionReturn(0); 2963 } 2964 2965 #undef __FUNCT__ 2966 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2967 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2968 { 2969 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2970 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2971 PetscErrorCode ierr; 2972 PetscInt *diag = a->diag; 2973 MatScalar *aa=a->a; 2974 PetscScalar *x,*b; 2975 2976 PetscFunctionBegin; 2977 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2978 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2979 2980 { 2981 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2982 MatScalar *v,*t=(MatScalar *)x; 2983 PetscInt jdx,idt,idx,nz,*vi,i,ai16; 2984 2985 /* forward solve the lower triangular */ 2986 idx = 0; 2987 t[0] = (MatScalar)b[0]; 2988 t[1] = (MatScalar)b[1]; 2989 t[2] = (MatScalar)b[2]; 2990 t[3] = (MatScalar)b[3]; 2991 for (i=1; i<n; i++) { 2992 v = aa + 16*ai[i]; 2993 vi = aj + ai[i]; 2994 nz = diag[i] - ai[i]; 2995 idx += 4; 2996 s1 = (MatScalar)b[idx]; 2997 s2 = (MatScalar)b[1+idx]; 2998 s3 = (MatScalar)b[2+idx]; 2999 s4 = (MatScalar)b[3+idx]; 3000 while (nz--) { 3001 jdx = 4*(*vi++); 3002 x1 = t[jdx]; 3003 x2 = t[1+jdx]; 3004 x3 = t[2+jdx]; 3005 x4 = t[3+jdx]; 3006 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3007 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3008 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3009 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3010 v += 16; 3011 } 3012 t[idx] = s1; 3013 t[1+idx] = s2; 3014 t[2+idx] = s3; 3015 t[3+idx] = s4; 3016 } 3017 /* backward solve the upper triangular */ 3018 idt = 4*(n-1); 3019 for (i=n-1; i>=0; i--){ 3020 ai16 = 16*diag[i]; 3021 v = aa + ai16 + 16; 3022 vi = aj + diag[i] + 1; 3023 nz = ai[i+1] - diag[i] - 1; 3024 s1 = t[idt]; 3025 s2 = t[1+idt]; 3026 s3 = t[2+idt]; 3027 s4 = t[3+idt]; 3028 while (nz--) { 3029 idx = 4*(*vi++); 3030 x1 = (MatScalar)x[idx]; 3031 x2 = (MatScalar)x[1+idx]; 3032 x3 = (MatScalar)x[2+idx]; 3033 x4 = (MatScalar)x[3+idx]; 3034 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3035 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3036 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3037 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3038 v += 16; 3039 } 3040 v = aa + ai16; 3041 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 3042 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 3043 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 3044 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 3045 idt -= 4; 3046 } 3047 } 3048 3049 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3050 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3051 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3052 PetscFunctionReturn(0); 3053 } 3054 3055 #if defined (PETSC_HAVE_SSE) 3056 3057 #include PETSC_HAVE_SSE 3058 #undef __FUNCT__ 3059 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 3060 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 3061 { 3062 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3063 unsigned short *aj=(unsigned short *)a->j; 3064 PetscErrorCode ierr; 3065 int *ai=a->i,n=a->mbs,*diag = a->diag; 3066 MatScalar *aa=a->a; 3067 PetscScalar *x,*b; 3068 3069 PetscFunctionBegin; 3070 SSE_SCOPE_BEGIN; 3071 /* 3072 Note: This code currently uses demotion of double 3073 to float when performing the mixed-mode computation. 3074 This may not be numerically reasonable for all applications. 3075 */ 3076 PREFETCH_NTA(aa+16*ai[1]); 3077 3078 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3079 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3080 { 3081 /* x will first be computed in single precision then promoted inplace to double */ 3082 MatScalar *v,*t=(MatScalar *)x; 3083 int nz,i,idt,ai16; 3084 unsigned int jdx,idx; 3085 unsigned short *vi; 3086 /* Forward solve the lower triangular factor. */ 3087 3088 /* First block is the identity. */ 3089 idx = 0; 3090 CONVERT_DOUBLE4_FLOAT4(t,b); 3091 v = aa + 16*((unsigned int)ai[1]); 3092 3093 for (i=1; i<n;) { 3094 PREFETCH_NTA(&v[8]); 3095 vi = aj + ai[i]; 3096 nz = diag[i] - ai[i]; 3097 idx += 4; 3098 3099 /* Demote RHS from double to float. */ 3100 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 3101 LOAD_PS(&t[idx],XMM7); 3102 3103 while (nz--) { 3104 PREFETCH_NTA(&v[16]); 3105 jdx = 4*((unsigned int)(*vi++)); 3106 3107 /* 4x4 Matrix-Vector product with negative accumulation: */ 3108 SSE_INLINE_BEGIN_2(&t[jdx],v) 3109 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3110 3111 /* First Column */ 3112 SSE_COPY_PS(XMM0,XMM6) 3113 SSE_SHUFFLE(XMM0,XMM0,0x00) 3114 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3115 SSE_SUB_PS(XMM7,XMM0) 3116 3117 /* Second Column */ 3118 SSE_COPY_PS(XMM1,XMM6) 3119 SSE_SHUFFLE(XMM1,XMM1,0x55) 3120 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3121 SSE_SUB_PS(XMM7,XMM1) 3122 3123 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3124 3125 /* Third Column */ 3126 SSE_COPY_PS(XMM2,XMM6) 3127 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3128 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3129 SSE_SUB_PS(XMM7,XMM2) 3130 3131 /* Fourth Column */ 3132 SSE_COPY_PS(XMM3,XMM6) 3133 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3134 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3135 SSE_SUB_PS(XMM7,XMM3) 3136 SSE_INLINE_END_2 3137 3138 v += 16; 3139 } 3140 v = aa + 16*ai[++i]; 3141 PREFETCH_NTA(v); 3142 STORE_PS(&t[idx],XMM7); 3143 } 3144 3145 /* Backward solve the upper triangular factor.*/ 3146 3147 idt = 4*(n-1); 3148 ai16 = 16*diag[n-1]; 3149 v = aa + ai16 + 16; 3150 for (i=n-1; i>=0;){ 3151 PREFETCH_NTA(&v[8]); 3152 vi = aj + diag[i] + 1; 3153 nz = ai[i+1] - diag[i] - 1; 3154 3155 LOAD_PS(&t[idt],XMM7); 3156 3157 while (nz--) { 3158 PREFETCH_NTA(&v[16]); 3159 idx = 4*((unsigned int)(*vi++)); 3160 3161 /* 4x4 Matrix-Vector Product with negative accumulation: */ 3162 SSE_INLINE_BEGIN_2(&t[idx],v) 3163 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3164 3165 /* First Column */ 3166 SSE_COPY_PS(XMM0,XMM6) 3167 SSE_SHUFFLE(XMM0,XMM0,0x00) 3168 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3169 SSE_SUB_PS(XMM7,XMM0) 3170 3171 /* Second Column */ 3172 SSE_COPY_PS(XMM1,XMM6) 3173 SSE_SHUFFLE(XMM1,XMM1,0x55) 3174 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3175 SSE_SUB_PS(XMM7,XMM1) 3176 3177 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3178 3179 /* Third Column */ 3180 SSE_COPY_PS(XMM2,XMM6) 3181 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3182 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3183 SSE_SUB_PS(XMM7,XMM2) 3184 3185 /* Fourth Column */ 3186 SSE_COPY_PS(XMM3,XMM6) 3187 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3188 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3189 SSE_SUB_PS(XMM7,XMM3) 3190 SSE_INLINE_END_2 3191 v += 16; 3192 } 3193 v = aa + ai16; 3194 ai16 = 16*diag[--i]; 3195 PREFETCH_NTA(aa+ai16+16); 3196 /* 3197 Scale the result by the diagonal 4x4 block, 3198 which was inverted as part of the factorization 3199 */ 3200 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 3201 /* First Column */ 3202 SSE_COPY_PS(XMM0,XMM7) 3203 SSE_SHUFFLE(XMM0,XMM0,0x00) 3204 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 3205 3206 /* Second Column */ 3207 SSE_COPY_PS(XMM1,XMM7) 3208 SSE_SHUFFLE(XMM1,XMM1,0x55) 3209 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 3210 SSE_ADD_PS(XMM0,XMM1) 3211 3212 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 3213 3214 /* Third Column */ 3215 SSE_COPY_PS(XMM2,XMM7) 3216 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3217 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 3218 SSE_ADD_PS(XMM0,XMM2) 3219 3220 /* Fourth Column */ 3221 SSE_COPY_PS(XMM3,XMM7) 3222 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3223 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 3224 SSE_ADD_PS(XMM0,XMM3) 3225 3226 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 3227 SSE_INLINE_END_3 3228 3229 v = aa + ai16 + 16; 3230 idt -= 4; 3231 } 3232 3233 /* Convert t from single precision back to double precision (inplace)*/ 3234 idt = 4*(n-1); 3235 for (i=n-1;i>=0;i--) { 3236 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 3237 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 3238 PetscScalar *xtemp=&x[idt]; 3239 MatScalar *ttemp=&t[idt]; 3240 xtemp[3] = (PetscScalar)ttemp[3]; 3241 xtemp[2] = (PetscScalar)ttemp[2]; 3242 xtemp[1] = (PetscScalar)ttemp[1]; 3243 xtemp[0] = (PetscScalar)ttemp[0]; 3244 idt -= 4; 3245 } 3246 3247 } /* End of artificial scope. */ 3248 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3249 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3250 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3251 SSE_SCOPE_END; 3252 PetscFunctionReturn(0); 3253 } 3254 3255 #undef __FUNCT__ 3256 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 3257 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 3258 { 3259 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3260 int *aj=a->j; 3261 PetscErrorCode ierr; 3262 int *ai=a->i,n=a->mbs,*diag = a->diag; 3263 MatScalar *aa=a->a; 3264 PetscScalar *x,*b; 3265 3266 PetscFunctionBegin; 3267 SSE_SCOPE_BEGIN; 3268 /* 3269 Note: This code currently uses demotion of double 3270 to float when performing the mixed-mode computation. 3271 This may not be numerically reasonable for all applications. 3272 */ 3273 PREFETCH_NTA(aa+16*ai[1]); 3274 3275 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3276 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3277 { 3278 /* x will first be computed in single precision then promoted inplace to double */ 3279 MatScalar *v,*t=(MatScalar *)x; 3280 int nz,i,idt,ai16; 3281 int jdx,idx; 3282 int *vi; 3283 /* Forward solve the lower triangular factor. */ 3284 3285 /* First block is the identity. */ 3286 idx = 0; 3287 CONVERT_DOUBLE4_FLOAT4(t,b); 3288 v = aa + 16*ai[1]; 3289 3290 for (i=1; i<n;) { 3291 PREFETCH_NTA(&v[8]); 3292 vi = aj + ai[i]; 3293 nz = diag[i] - ai[i]; 3294 idx += 4; 3295 3296 /* Demote RHS from double to float. */ 3297 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 3298 LOAD_PS(&t[idx],XMM7); 3299 3300 while (nz--) { 3301 PREFETCH_NTA(&v[16]); 3302 jdx = 4*(*vi++); 3303 /* jdx = *vi++; */ 3304 3305 /* 4x4 Matrix-Vector product with negative accumulation: */ 3306 SSE_INLINE_BEGIN_2(&t[jdx],v) 3307 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3308 3309 /* First Column */ 3310 SSE_COPY_PS(XMM0,XMM6) 3311 SSE_SHUFFLE(XMM0,XMM0,0x00) 3312 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3313 SSE_SUB_PS(XMM7,XMM0) 3314 3315 /* Second Column */ 3316 SSE_COPY_PS(XMM1,XMM6) 3317 SSE_SHUFFLE(XMM1,XMM1,0x55) 3318 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3319 SSE_SUB_PS(XMM7,XMM1) 3320 3321 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3322 3323 /* Third Column */ 3324 SSE_COPY_PS(XMM2,XMM6) 3325 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3326 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3327 SSE_SUB_PS(XMM7,XMM2) 3328 3329 /* Fourth Column */ 3330 SSE_COPY_PS(XMM3,XMM6) 3331 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3332 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3333 SSE_SUB_PS(XMM7,XMM3) 3334 SSE_INLINE_END_2 3335 3336 v += 16; 3337 } 3338 v = aa + 16*ai[++i]; 3339 PREFETCH_NTA(v); 3340 STORE_PS(&t[idx],XMM7); 3341 } 3342 3343 /* Backward solve the upper triangular factor.*/ 3344 3345 idt = 4*(n-1); 3346 ai16 = 16*diag[n-1]; 3347 v = aa + ai16 + 16; 3348 for (i=n-1; i>=0;){ 3349 PREFETCH_NTA(&v[8]); 3350 vi = aj + diag[i] + 1; 3351 nz = ai[i+1] - diag[i] - 1; 3352 3353 LOAD_PS(&t[idt],XMM7); 3354 3355 while (nz--) { 3356 PREFETCH_NTA(&v[16]); 3357 idx = 4*(*vi++); 3358 /* idx = *vi++; */ 3359 3360 /* 4x4 Matrix-Vector Product with negative accumulation: */ 3361 SSE_INLINE_BEGIN_2(&t[idx],v) 3362 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3363 3364 /* First Column */ 3365 SSE_COPY_PS(XMM0,XMM6) 3366 SSE_SHUFFLE(XMM0,XMM0,0x00) 3367 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3368 SSE_SUB_PS(XMM7,XMM0) 3369 3370 /* Second Column */ 3371 SSE_COPY_PS(XMM1,XMM6) 3372 SSE_SHUFFLE(XMM1,XMM1,0x55) 3373 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3374 SSE_SUB_PS(XMM7,XMM1) 3375 3376 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3377 3378 /* Third Column */ 3379 SSE_COPY_PS(XMM2,XMM6) 3380 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3381 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3382 SSE_SUB_PS(XMM7,XMM2) 3383 3384 /* Fourth Column */ 3385 SSE_COPY_PS(XMM3,XMM6) 3386 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3387 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3388 SSE_SUB_PS(XMM7,XMM3) 3389 SSE_INLINE_END_2 3390 v += 16; 3391 } 3392 v = aa + ai16; 3393 ai16 = 16*diag[--i]; 3394 PREFETCH_NTA(aa+ai16+16); 3395 /* 3396 Scale the result by the diagonal 4x4 block, 3397 which was inverted as part of the factorization 3398 */ 3399 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 3400 /* First Column */ 3401 SSE_COPY_PS(XMM0,XMM7) 3402 SSE_SHUFFLE(XMM0,XMM0,0x00) 3403 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 3404 3405 /* Second Column */ 3406 SSE_COPY_PS(XMM1,XMM7) 3407 SSE_SHUFFLE(XMM1,XMM1,0x55) 3408 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 3409 SSE_ADD_PS(XMM0,XMM1) 3410 3411 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 3412 3413 /* Third Column */ 3414 SSE_COPY_PS(XMM2,XMM7) 3415 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3416 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 3417 SSE_ADD_PS(XMM0,XMM2) 3418 3419 /* Fourth Column */ 3420 SSE_COPY_PS(XMM3,XMM7) 3421 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3422 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 3423 SSE_ADD_PS(XMM0,XMM3) 3424 3425 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 3426 SSE_INLINE_END_3 3427 3428 v = aa + ai16 + 16; 3429 idt -= 4; 3430 } 3431 3432 /* Convert t from single precision back to double precision (inplace)*/ 3433 idt = 4*(n-1); 3434 for (i=n-1;i>=0;i--) { 3435 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 3436 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 3437 PetscScalar *xtemp=&x[idt]; 3438 MatScalar *ttemp=&t[idt]; 3439 xtemp[3] = (PetscScalar)ttemp[3]; 3440 xtemp[2] = (PetscScalar)ttemp[2]; 3441 xtemp[1] = (PetscScalar)ttemp[1]; 3442 xtemp[0] = (PetscScalar)ttemp[0]; 3443 idt -= 4; 3444 } 3445 3446 } /* End of artificial scope. */ 3447 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3448 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3449 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3450 SSE_SCOPE_END; 3451 PetscFunctionReturn(0); 3452 } 3453 3454 #endif 3455 3456 #undef __FUNCT__ 3457 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 3458 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 3459 { 3460 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3461 IS iscol=a->col,isrow=a->row; 3462 PetscErrorCode ierr; 3463 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3464 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3465 const MatScalar *aa=a->a,*v; 3466 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 3467 const PetscScalar *b; 3468 3469 PetscFunctionBegin; 3470 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3471 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3472 t = a->solve_work; 3473 3474 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3475 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3476 3477 /* forward solve the lower triangular */ 3478 idx = 3*(*r++); 3479 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 3480 for (i=1; i<n; i++) { 3481 v = aa + 9*ai[i]; 3482 vi = aj + ai[i]; 3483 nz = diag[i] - ai[i]; 3484 idx = 3*(*r++); 3485 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 3486 while (nz--) { 3487 idx = 3*(*vi++); 3488 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3489 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3490 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3491 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3492 v += 9; 3493 } 3494 idx = 3*i; 3495 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 3496 } 3497 /* backward solve the upper triangular */ 3498 for (i=n-1; i>=0; i--){ 3499 v = aa + 9*diag[i] + 9; 3500 vi = aj + diag[i] + 1; 3501 nz = ai[i+1] - diag[i] - 1; 3502 idt = 3*i; 3503 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 3504 while (nz--) { 3505 idx = 3*(*vi++); 3506 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3507 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3508 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3509 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3510 v += 9; 3511 } 3512 idc = 3*(*c--); 3513 v = aa + 9*diag[i]; 3514 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3515 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3516 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 3517 } 3518 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3519 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3520 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3521 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3522 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 3523 PetscFunctionReturn(0); 3524 } 3525 3526 #undef __FUNCT__ 3527 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct" 3528 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx) 3529 { 3530 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3531 IS iscol=a->col,isrow=a->row; 3532 PetscErrorCode ierr; 3533 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 3534 const PetscInt *r,*c,*rout,*cout; 3535 const MatScalar *aa=a->a,*v; 3536 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 3537 const PetscScalar *b; 3538 3539 PetscFunctionBegin; 3540 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3541 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3542 t = a->solve_work; 3543 3544 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3545 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3546 3547 /* forward solve the lower triangular */ 3548 idx = 3*r[0]; 3549 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 3550 for (i=1; i<n; i++) { 3551 v = aa + 9*ai[i]; 3552 vi = aj + ai[i]; 3553 nz = ai[i+1] - ai[i]; 3554 idx = 3*r[i]; 3555 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 3556 for(m=0;m<nz;m++){ 3557 idx = 3*vi[m]; 3558 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3559 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3560 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3561 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3562 v += 9; 3563 } 3564 idx = 3*i; 3565 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 3566 } 3567 /* backward solve the upper triangular */ 3568 for (i=n-1; i>=0; i--){ 3569 k = 2*n-i; 3570 v = aa + 9*ai[k]; 3571 vi = aj + ai[k]; 3572 nz = ai[k +1] - ai[k] - 1; 3573 idt = 3*i; 3574 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 3575 for(m=0;m<nz;m++){ 3576 idx = 3*vi[m]; 3577 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3578 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3579 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3580 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3581 v += 9; 3582 } 3583 idc = 3*c[i]; 3584 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3585 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3586 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 3587 } 3588 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3589 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3590 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3591 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3592 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 3593 PetscFunctionReturn(0); 3594 } 3595 3596 /* 3597 Special case where the matrix was ILU(0) factored in the natural 3598 ordering. This eliminates the need for the column and row permutation. 3599 */ 3600 #undef __FUNCT__ 3601 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 3602 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 3603 { 3604 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3605 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3606 PetscErrorCode ierr; 3607 PetscInt *diag = a->diag; 3608 const MatScalar *aa=a->a,*v; 3609 PetscScalar *x,s1,s2,s3,x1,x2,x3; 3610 const PetscScalar *b; 3611 PetscInt jdx,idt,idx,nz,*vi,i; 3612 3613 PetscFunctionBegin; 3614 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3615 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3616 3617 /* forward solve the lower triangular */ 3618 idx = 0; 3619 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 3620 for (i=1; i<n; i++) { 3621 v = aa + 9*ai[i]; 3622 vi = aj + ai[i]; 3623 nz = diag[i] - ai[i]; 3624 idx += 3; 3625 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 3626 while (nz--) { 3627 jdx = 3*(*vi++); 3628 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 3629 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3630 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3631 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3632 v += 9; 3633 } 3634 x[idx] = s1; 3635 x[1+idx] = s2; 3636 x[2+idx] = s3; 3637 } 3638 /* backward solve the upper triangular */ 3639 for (i=n-1; i>=0; i--){ 3640 v = aa + 9*diag[i] + 9; 3641 vi = aj + diag[i] + 1; 3642 nz = ai[i+1] - diag[i] - 1; 3643 idt = 3*i; 3644 s1 = x[idt]; s2 = x[1+idt]; 3645 s3 = x[2+idt]; 3646 while (nz--) { 3647 idx = 3*(*vi++); 3648 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 3649 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3650 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3651 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3652 v += 9; 3653 } 3654 v = aa + 9*diag[i]; 3655 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3656 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3657 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 3658 } 3659 3660 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3661 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3662 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 3663 PetscFunctionReturn(0); 3664 } 3665 3666 #undef __FUNCT__ 3667 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct" 3668 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3669 { 3670 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3671 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 3672 PetscErrorCode ierr; 3673 PetscInt idx,jdx,idt; 3674 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3675 const MatScalar *aa=a->a,*v; 3676 PetscScalar *x; 3677 const PetscScalar *b; 3678 PetscScalar s1,s2,s3,x1,x2,x3; 3679 3680 PetscFunctionBegin; 3681 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3682 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3683 /* forward solve the lower triangular */ 3684 idx = 0; 3685 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 3686 for (i=1; i<n; i++) { 3687 v = aa + bs2*ai[i]; 3688 vi = aj + ai[i]; 3689 nz = ai[i+1] - ai[i]; 3690 idx = bs*i; 3691 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 3692 for(k=0;k<nz;k++){ 3693 jdx = bs*vi[k]; 3694 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 3695 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3696 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3697 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3698 3699 v += bs2; 3700 } 3701 3702 x[idx] = s1; 3703 x[1+idx] = s2; 3704 x[2+idx] = s3; 3705 } 3706 3707 /* backward solve the upper triangular */ 3708 for (i=n-1; i>=0; i--){ 3709 v = aa + bs2*ai[2*n-i]; 3710 vi = aj + ai[2*n-i]; 3711 nz = ai[2*n-i +1] - ai[2*n-i]-1; 3712 idt = bs*i; 3713 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 3714 3715 for(k=0;k<nz;k++){ 3716 idx = bs*vi[k]; 3717 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 3718 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3719 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3720 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3721 3722 v += bs2; 3723 } 3724 /* x = inv_diagonal*x */ 3725 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3726 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3727 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 3728 3729 } 3730 3731 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3732 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3733 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3734 PetscFunctionReturn(0); 3735 } 3736 3737 #undef __FUNCT__ 3738 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2" 3739 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 3740 { 3741 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3742 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 3743 PetscErrorCode ierr; 3744 PetscInt idx,jdx,idt; 3745 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3746 const MatScalar *aa=a->a,*v; 3747 PetscScalar *x; 3748 const PetscScalar *b; 3749 PetscScalar s1,s2,s3,x1,x2,x3; 3750 3751 PetscFunctionBegin; 3752 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3753 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3754 /* forward solve the lower triangular */ 3755 idx = 0; 3756 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 3757 for (i=1; i<n; i++) { 3758 v = aa + bs2*ai[i]; 3759 vi = aj + ai[i]; 3760 nz = ai[i+1] - ai[i]; 3761 idx = bs*i; 3762 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 3763 for(k=0;k<nz;k++){ 3764 jdx = bs*vi[k]; 3765 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 3766 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3767 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3768 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3769 3770 v += bs2; 3771 } 3772 3773 x[idx] = s1; 3774 x[1+idx] = s2; 3775 x[2+idx] = s3; 3776 } 3777 3778 /* backward solve the upper triangular */ 3779 for (i=n-1; i>=0; i--){ 3780 v = aa + bs2*(adiag[i+1]+1); 3781 vi = aj + adiag[i+1]+1; 3782 nz = adiag[i] - adiag[i+1]-1; 3783 idt = bs*i; 3784 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 3785 3786 for(k=0;k<nz;k++){ 3787 idx = bs*vi[k]; 3788 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 3789 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3790 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3791 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3792 3793 v += bs2; 3794 } 3795 /* x = inv_diagonal*x */ 3796 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3797 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3798 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 3799 3800 } 3801 3802 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3803 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3804 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3805 PetscFunctionReturn(0); 3806 } 3807 3808 #undef __FUNCT__ 3809 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 3810 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 3811 { 3812 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3813 IS iscol=a->col,isrow=a->row; 3814 PetscErrorCode ierr; 3815 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3816 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3817 const MatScalar *aa=a->a,*v; 3818 PetscScalar *x,s1,s2,x1,x2,*t; 3819 const PetscScalar *b; 3820 3821 PetscFunctionBegin; 3822 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3823 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3824 t = a->solve_work; 3825 3826 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3827 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3828 3829 /* forward solve the lower triangular */ 3830 idx = 2*(*r++); 3831 t[0] = b[idx]; t[1] = b[1+idx]; 3832 for (i=1; i<n; i++) { 3833 v = aa + 4*ai[i]; 3834 vi = aj + ai[i]; 3835 nz = diag[i] - ai[i]; 3836 idx = 2*(*r++); 3837 s1 = b[idx]; s2 = b[1+idx]; 3838 while (nz--) { 3839 idx = 2*(*vi++); 3840 x1 = t[idx]; x2 = t[1+idx]; 3841 s1 -= v[0]*x1 + v[2]*x2; 3842 s2 -= v[1]*x1 + v[3]*x2; 3843 v += 4; 3844 } 3845 idx = 2*i; 3846 t[idx] = s1; t[1+idx] = s2; 3847 } 3848 /* backward solve the upper triangular */ 3849 for (i=n-1; i>=0; i--){ 3850 v = aa + 4*diag[i] + 4; 3851 vi = aj + diag[i] + 1; 3852 nz = ai[i+1] - diag[i] - 1; 3853 idt = 2*i; 3854 s1 = t[idt]; s2 = t[1+idt]; 3855 while (nz--) { 3856 idx = 2*(*vi++); 3857 x1 = t[idx]; x2 = t[1+idx]; 3858 s1 -= v[0]*x1 + v[2]*x2; 3859 s2 -= v[1]*x1 + v[3]*x2; 3860 v += 4; 3861 } 3862 idc = 2*(*c--); 3863 v = aa + 4*diag[i]; 3864 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 3865 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 3866 } 3867 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3868 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3869 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3870 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3871 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 3872 PetscFunctionReturn(0); 3873 } 3874 3875 #undef __FUNCT__ 3876 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct" 3877 PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx) 3878 { 3879 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3880 IS iscol=a->col,isrow=a->row; 3881 PetscErrorCode ierr; 3882 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,jdx,idt,idc,k,m; 3883 const PetscInt *r,*c,*rout,*cout; 3884 const MatScalar *aa=a->a,*v; 3885 PetscScalar *x,s1,s2,x1,x2,*t; 3886 const PetscScalar *b; 3887 3888 PetscFunctionBegin; 3889 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3890 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3891 t = a->solve_work; 3892 3893 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3894 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3895 3896 /* forward solve the lower triangular */ 3897 idx = 2*r[0]; 3898 t[0] = b[idx]; t[1] = b[1+idx]; 3899 for (i=1; i<n; i++) { 3900 v = aa + 4*ai[i]; 3901 vi = aj + ai[i]; 3902 nz = ai[i+1] - ai[i]; 3903 idx = 2*r[i]; 3904 s1 = b[idx]; s2 = b[1+idx]; 3905 for(m=0;m<nz;m++){ 3906 jdx = 2*vi[m]; 3907 x1 = t[jdx]; x2 = t[1+jdx]; 3908 s1 -= v[0]*x1 + v[2]*x2; 3909 s2 -= v[1]*x1 + v[3]*x2; 3910 v += 4; 3911 } 3912 idx = 2*i; 3913 t[idx] = s1; t[1+idx] = s2; 3914 } 3915 /* backward solve the upper triangular */ 3916 for (i=n-1; i>=0; i--){ 3917 k = 2*n-i; 3918 v = aa + 4*ai[k]; 3919 vi = aj + ai[k]; 3920 nz = ai[k +1] - ai[k] - 1; 3921 idt = 2*i; 3922 s1 = t[idt]; s2 = t[1+idt]; 3923 for(m=0;m<nz;m++){ 3924 idx = 2*vi[m]; 3925 x1 = t[idx]; x2 = t[1+idx]; 3926 s1 -= v[0]*x1 + v[2]*x2; 3927 s2 -= v[1]*x1 + v[3]*x2; 3928 v += 4; 3929 } 3930 idc = 2*c[i]; 3931 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 3932 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 3933 } 3934 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3935 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3936 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3937 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3938 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 3939 PetscFunctionReturn(0); 3940 } 3941 3942 3943 /* 3944 Special case where the matrix was ILU(0) factored in the natural 3945 ordering. This eliminates the need for the column and row permutation. 3946 */ 3947 #undef __FUNCT__ 3948 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 3949 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 3950 { 3951 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3952 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3953 PetscErrorCode ierr; 3954 PetscInt *diag = a->diag; 3955 const MatScalar *aa=a->a,*v; 3956 PetscScalar *x,s1,s2,x1,x2; 3957 const PetscScalar *b; 3958 PetscInt jdx,idt,idx,nz,*vi,i; 3959 3960 PetscFunctionBegin; 3961 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3962 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3963 3964 /* forward solve the lower triangular */ 3965 idx = 0; 3966 x[0] = b[0]; x[1] = b[1]; 3967 for (i=1; i<n; i++) { 3968 v = aa + 4*ai[i]; 3969 vi = aj + ai[i]; 3970 nz = diag[i] - ai[i]; 3971 idx += 2; 3972 s1 = b[idx];s2 = b[1+idx]; 3973 while (nz--) { 3974 jdx = 2*(*vi++); 3975 x1 = x[jdx];x2 = x[1+jdx]; 3976 s1 -= v[0]*x1 + v[2]*x2; 3977 s2 -= v[1]*x1 + v[3]*x2; 3978 v += 4; 3979 } 3980 x[idx] = s1; 3981 x[1+idx] = s2; 3982 } 3983 /* backward solve the upper triangular */ 3984 for (i=n-1; i>=0; i--){ 3985 v = aa + 4*diag[i] + 4; 3986 vi = aj + diag[i] + 1; 3987 nz = ai[i+1] - diag[i] - 1; 3988 idt = 2*i; 3989 s1 = x[idt]; s2 = x[1+idt]; 3990 while (nz--) { 3991 idx = 2*(*vi++); 3992 x1 = x[idx]; x2 = x[1+idx]; 3993 s1 -= v[0]*x1 + v[2]*x2; 3994 s2 -= v[1]*x1 + v[3]*x2; 3995 v += 4; 3996 } 3997 v = aa + 4*diag[i]; 3998 x[idt] = v[0]*s1 + v[2]*s2; 3999 x[1+idt] = v[1]*s1 + v[3]*s2; 4000 } 4001 4002 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4003 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4004 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4005 PetscFunctionReturn(0); 4006 } 4007 4008 #undef __FUNCT__ 4009 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct" 4010 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4011 { 4012 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4013 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 4014 PetscErrorCode ierr; 4015 PetscInt jdx; 4016 const MatScalar *aa=a->a,*v; 4017 PetscScalar *x,s1,s2,x1,x2; 4018 const PetscScalar *b; 4019 4020 PetscFunctionBegin; 4021 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4022 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4023 /* forward solve the lower triangular */ 4024 idx = 0; 4025 x[0] = b[idx]; x[1] = b[1+idx]; 4026 for (i=1; i<n; i++) { 4027 v = aa + 4*ai[i]; 4028 vi = aj + ai[i]; 4029 nz = ai[i+1] - ai[i]; 4030 idx = 2*i; 4031 s1 = b[idx];s2 = b[1+idx]; 4032 for(k=0;k<nz;k++){ 4033 jdx = 2*vi[k]; 4034 x1 = x[jdx];x2 = x[1+jdx]; 4035 s1 -= v[0]*x1 + v[2]*x2; 4036 s2 -= v[1]*x1 + v[3]*x2; 4037 v += 4; 4038 } 4039 x[idx] = s1; 4040 x[1+idx] = s2; 4041 } 4042 4043 /* backward solve the upper triangular */ 4044 for (i=n-1; i>=0; i--){ 4045 v = aa + 4*ai[2*n-i]; 4046 vi = aj + ai[2*n-i]; 4047 nz = ai[2*n-i +1] - ai[2*n-i]-1; 4048 idt = 2*i; 4049 s1 = x[idt]; s2 = x[1+idt]; 4050 for(k=0;k<nz;k++){ 4051 idx = 2*vi[k]; 4052 x1 = x[idx]; x2 = x[1+idx]; 4053 s1 -= v[0]*x1 + v[2]*x2; 4054 s2 -= v[1]*x1 + v[3]*x2; 4055 v += 4; 4056 } 4057 /* x = inv_diagonal*x */ 4058 x[idt] = v[0]*s1 + v[2]*s2; 4059 x[1+idt] = v[1]*s1 + v[3]*s2; 4060 } 4061 4062 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4063 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4064 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4065 PetscFunctionReturn(0); 4066 } 4067 4068 #undef __FUNCT__ 4069 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2" 4070 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4071 { 4072 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4073 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 4074 PetscErrorCode ierr; 4075 PetscInt jdx; 4076 const MatScalar *aa=a->a,*v; 4077 PetscScalar *x,s1,s2,x1,x2; 4078 const PetscScalar *b; 4079 4080 PetscFunctionBegin; 4081 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4082 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4083 /* forward solve the lower triangular */ 4084 idx = 0; 4085 x[0] = b[idx]; x[1] = b[1+idx]; 4086 for (i=1; i<n; i++) { 4087 v = aa + 4*ai[i]; 4088 vi = aj + ai[i]; 4089 nz = ai[i+1] - ai[i]; 4090 idx = 2*i; 4091 s1 = b[idx];s2 = b[1+idx]; 4092 for(k=0;k<nz;k++){ 4093 jdx = 2*vi[k]; 4094 x1 = x[jdx];x2 = x[1+jdx]; 4095 s1 -= v[0]*x1 + v[2]*x2; 4096 s2 -= v[1]*x1 + v[3]*x2; 4097 v += 4; 4098 } 4099 x[idx] = s1; 4100 x[1+idx] = s2; 4101 } 4102 4103 /* backward solve the upper triangular */ 4104 for (i=n-1; i>=0; i--){ 4105 v = aa + 4*(adiag[i+1]+1); 4106 vi = aj + adiag[i+1]+1; 4107 nz = adiag[i] - adiag[i+1]-1; 4108 idt = 2*i; 4109 s1 = x[idt]; s2 = x[1+idt]; 4110 for(k=0;k<nz;k++){ 4111 idx = 2*vi[k]; 4112 x1 = x[idx]; x2 = x[1+idx]; 4113 s1 -= v[0]*x1 + v[2]*x2; 4114 s2 -= v[1]*x1 + v[3]*x2; 4115 v += 4; 4116 } 4117 /* x = inv_diagonal*x */ 4118 x[idt] = v[0]*s1 + v[2]*s2; 4119 x[1+idt] = v[1]*s1 + v[3]*s2; 4120 } 4121 4122 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4123 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4124 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4125 PetscFunctionReturn(0); 4126 } 4127 4128 #undef __FUNCT__ 4129 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 4130 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 4131 { 4132 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4133 IS iscol=a->col,isrow=a->row; 4134 PetscErrorCode ierr; 4135 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 4136 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4137 MatScalar *aa=a->a,*v; 4138 PetscScalar *x,*b,s1,*t; 4139 4140 PetscFunctionBegin; 4141 if (!n) PetscFunctionReturn(0); 4142 4143 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4144 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4145 t = a->solve_work; 4146 4147 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4148 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4149 4150 /* forward solve the lower triangular */ 4151 t[0] = b[*r++]; 4152 for (i=1; i<n; i++) { 4153 v = aa + ai[i]; 4154 vi = aj + ai[i]; 4155 nz = diag[i] - ai[i]; 4156 s1 = b[*r++]; 4157 while (nz--) { 4158 s1 -= (*v++)*t[*vi++]; 4159 } 4160 t[i] = s1; 4161 } 4162 /* backward solve the upper triangular */ 4163 for (i=n-1; i>=0; i--){ 4164 v = aa + diag[i] + 1; 4165 vi = aj + diag[i] + 1; 4166 nz = ai[i+1] - diag[i] - 1; 4167 s1 = t[i]; 4168 while (nz--) { 4169 s1 -= (*v++)*t[*vi++]; 4170 } 4171 x[*c--] = t[i] = aa[diag[i]]*s1; 4172 } 4173 4174 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4175 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4176 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4177 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4178 ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 4179 PetscFunctionReturn(0); 4180 } 4181 /* 4182 Special case where the matrix was ILU(0) factored in the natural 4183 ordering. This eliminates the need for the column and row permutation. 4184 */ 4185 #undef __FUNCT__ 4186 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 4187 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 4188 { 4189 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4190 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4191 PetscErrorCode ierr; 4192 PetscInt *diag = a->diag; 4193 MatScalar *aa=a->a; 4194 PetscScalar *x,*b; 4195 PetscScalar s1,x1; 4196 MatScalar *v; 4197 PetscInt jdx,idt,idx,nz,*vi,i; 4198 4199 PetscFunctionBegin; 4200 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4201 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4202 4203 /* forward solve the lower triangular */ 4204 idx = 0; 4205 x[0] = b[0]; 4206 for (i=1; i<n; i++) { 4207 v = aa + ai[i]; 4208 vi = aj + ai[i]; 4209 nz = diag[i] - ai[i]; 4210 idx += 1; 4211 s1 = b[idx]; 4212 while (nz--) { 4213 jdx = *vi++; 4214 x1 = x[jdx]; 4215 s1 -= v[0]*x1; 4216 v += 1; 4217 } 4218 x[idx] = s1; 4219 } 4220 /* backward solve the upper triangular */ 4221 for (i=n-1; i>=0; i--){ 4222 v = aa + diag[i] + 1; 4223 vi = aj + diag[i] + 1; 4224 nz = ai[i+1] - diag[i] - 1; 4225 idt = i; 4226 s1 = x[idt]; 4227 while (nz--) { 4228 idx = *vi++; 4229 x1 = x[idx]; 4230 s1 -= v[0]*x1; 4231 v += 1; 4232 } 4233 v = aa + diag[i]; 4234 x[idt] = v[0]*s1; 4235 } 4236 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4237 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4238 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 4239 PetscFunctionReturn(0); 4240 } 4241 4242 /* ----------------------------------------------------------------*/ 4243 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 4244 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 4245 4246 extern PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat,Vec,Vec); 4247 extern PetscErrorCode MatSolve_SeqBAIJ_N_newdatastruct(Mat,Vec,Vec); 4248 4249 #undef __FUNCT__ 4250 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 4251 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 4252 { 4253 Mat C=B; 4254 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 4255 IS isrow = b->row,isicol = b->icol; 4256 PetscErrorCode ierr; 4257 const PetscInt *r,*ic,*ics; 4258 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 4259 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 4260 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 4261 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 4262 MatScalar *v_work; 4263 4264 PetscFunctionBegin; 4265 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 4266 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 4267 ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 4268 ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr); 4269 ics = ic; 4270 4271 /* generate work space needed by dense LU factorization */ 4272 ierr = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr); 4273 mwork = v_work + bs; 4274 v_pivots = (PetscInt*)(mwork + bs2); 4275 4276 for (i=0; i<n; i++){ 4277 /* zero rtmp */ 4278 /* L part */ 4279 nz = bi[i+1] - bi[i]; 4280 bjtmp = bj + bi[i]; 4281 for (j=0; j<nz; j++){ 4282 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4283 } 4284 4285 /* U part */ 4286 nz = bi[2*n-i+1] - bi[2*n-i]; 4287 bjtmp = bj + bi[2*n-i]; 4288 for (j=0; j<nz; j++){ 4289 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4290 } 4291 4292 /* load in initial (unfactored row) */ 4293 nz = ai[r[i]+1] - ai[r[i]]; 4294 ajtmp = aj + ai[r[i]]; 4295 v = aa + bs2*ai[r[i]]; 4296 for (j=0; j<nz; j++) { 4297 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 4298 } 4299 4300 /* elimination */ 4301 bjtmp = bj + bi[i]; 4302 nzL = bi[i+1] - bi[i]; 4303 for(k=0;k < nzL;k++) { 4304 row = bjtmp[k]; 4305 pc = rtmp + bs2*row; 4306 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 4307 if (flg) { 4308 pv = b->a + bs2*bdiag[row]; 4309 Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 4310 pj = b->j + bi[2*n-row]; /* begining of U(row,:) */ 4311 pv = b->a + bs2*bi[2*n-row]; 4312 nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */ 4313 for (j=0; j<nz; j++) { 4314 Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 4315 } 4316 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 4317 } 4318 } 4319 4320 /* finished row so stick it into b->a */ 4321 /* L part */ 4322 pv = b->a + bs2*bi[i] ; 4323 pj = b->j + bi[i] ; 4324 nz = bi[i+1] - bi[i]; 4325 for (j=0; j<nz; j++) { 4326 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4327 } 4328 4329 /* Mark diagonal and invert diagonal for simplier triangular solves */ 4330 pv = b->a + bs2*bdiag[i]; 4331 pj = b->j + bdiag[i]; 4332 /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 4333 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4334 ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 4335 4336 /* U part */ 4337 pv = b->a + bs2*bi[2*n-i]; 4338 pj = b->j + bi[2*n-i]; 4339 nz = bi[2*n-i+1] - bi[2*n-i] - 1; 4340 for (j=0; j<nz; j++){ 4341 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4342 } 4343 } 4344 4345 ierr = PetscFree(rtmp);CHKERRQ(ierr); 4346 ierr = PetscFree(v_work);CHKERRQ(ierr); 4347 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 4348 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 4349 4350 C->assembled = PETSC_TRUE; 4351 ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 4352 PetscFunctionReturn(0); 4353 } 4354 4355 /* 4356 ilu(0) with natural ordering under new data structure. 4357 See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description 4358 because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct(). 4359 */ 4360 #undef __FUNCT__ 4361 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 4362 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 4363 { 4364 4365 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 4366 PetscErrorCode ierr; 4367 PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 4368 PetscInt i,j,nz,*bi,*bj,*bdiag; 4369 4370 PetscFunctionBegin; 4371 /* printf("MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct...\n"); */ 4372 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 4373 b = (Mat_SeqBAIJ*)(fact)->data; 4374 4375 /* allocate matrix arrays for new data structure */ 4376 ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,2*n+2,PetscInt,&b->i);CHKERRQ(ierr); 4377 ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(2*n+2)*sizeof(PetscInt));CHKERRQ(ierr); 4378 b->singlemalloc = PETSC_TRUE; 4379 if (!b->diag){ 4380 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 4381 } 4382 bdiag = b->diag; 4383 4384 if (n > 0) { 4385 ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 4386 } 4387 4388 /* set bi and bj with new data structure */ 4389 bi = b->i; 4390 bj = b->j; 4391 4392 /* L part */ 4393 bi[0] = 0; 4394 for (i=0; i<n; i++){ 4395 nz = adiag[i] - ai[i]; 4396 bi[i+1] = bi[i] + nz; 4397 aj = a->j + ai[i]; 4398 for (j=0; j<nz; j++){ 4399 *bj = aj[j]; bj++; 4400 } 4401 } 4402 4403 /* U part */ 4404 bi[n+1] = bi[n]; 4405 for (i=n-1; i>=0; i--){ 4406 nz = ai[i+1] - adiag[i] - 1; 4407 bi[2*n-i+1] = bi[2*n-i] + nz + 1; 4408 aj = a->j + adiag[i] + 1; 4409 for (j=0; j<nz; j++){ 4410 *bj = aj[j]; bj++; 4411 } 4412 /* diag[i] */ 4413 *bj = i; bj++; 4414 bdiag[i] = bi[2*n-i+1]-1; 4415 } 4416 PetscFunctionReturn(0); 4417 } 4418 4419 #undef __FUNCT__ 4420 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct" 4421 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 4422 { 4423 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 4424 IS isicol; 4425 PetscErrorCode ierr; 4426 const PetscInt *r,*ic; 4427 PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 4428 PetscInt *bi,*cols,nnz,*cols_lvl; 4429 PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 4430 PetscInt i,levels,diagonal_fill; 4431 PetscTruth col_identity,row_identity,both_identity; 4432 PetscReal f; 4433 PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 4434 PetscBT lnkbt; 4435 PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 4436 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 4437 PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 4438 PetscTruth missing; 4439 PetscInt bs=A->rmap->bs,bs2=a->bs2; 4440 4441 PetscFunctionBegin; 4442 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 4443 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 4444 if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 4445 4446 f = info->fill; 4447 levels = (PetscInt)info->levels; 4448 diagonal_fill = (PetscInt)info->diagonal_fill; 4449 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 4450 4451 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 4452 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 4453 both_identity = (PetscTruth) (row_identity && col_identity); 4454 4455 if (!levels && both_identity) { 4456 /* special case: ilu(0) with natural ordering */ 4457 ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 4458 (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 4459 /* set MatSolve routines */ 4460 switch (bs){ 4461 case 2: 4462 fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct; 4463 break; 4464 case 3: 4465 fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct; 4466 break; 4467 case 4: 4468 fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct; 4469 break; 4470 case 5: 4471 fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct; 4472 break; 4473 case 6: 4474 fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct; 4475 break; 4476 case 7: 4477 fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct; 4478 break; 4479 default: 4480 fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 4481 break; 4482 } 4483 4484 fact->factor = MAT_FACTOR_ILU; 4485 (fact)->info.factor_mallocs = 0; 4486 (fact)->info.fill_ratio_given = info->fill; 4487 (fact)->info.fill_ratio_needed = 1.0; 4488 b = (Mat_SeqBAIJ*)(fact)->data; 4489 b->row = isrow; 4490 b->col = iscol; 4491 b->icol = isicol; 4492 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 4493 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 4494 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 4495 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 4496 PetscFunctionReturn(0); 4497 } 4498 4499 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 4500 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 4501 4502 /* get new row pointers */ 4503 ierr = PetscMalloc((2*n+2)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 4504 bi[0] = 0; 4505 /* bdiag is location of diagonal in factor */ 4506 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 4507 bdiag[0] = 0; 4508 4509 ierr = PetscMalloc((2*n+1)*sizeof(PetscInt**),&bj_ptr);CHKERRQ(ierr); 4510 bjlvl_ptr = (PetscInt**)(bj_ptr + n); 4511 4512 /* create a linked list for storing column indices of the active row */ 4513 nlnk = n + 1; 4514 ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 4515 4516 /* initial FreeSpace size is f*(ai[n]+1) */ 4517 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 4518 current_space = free_space; 4519 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 4520 current_space_lvl = free_space_lvl; 4521 4522 for (i=0; i<n; i++) { 4523 nzi = 0; 4524 /* copy current row into linked list */ 4525 nnz = ai[r[i]+1] - ai[r[i]]; 4526 if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 4527 cols = aj + ai[r[i]]; 4528 lnk[i] = -1; /* marker to indicate if diagonal exists */ 4529 ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 4530 nzi += nlnk; 4531 4532 /* make sure diagonal entry is included */ 4533 if (diagonal_fill && lnk[i] == -1) { 4534 fm = n; 4535 while (lnk[fm] < i) fm = lnk[fm]; 4536 lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 4537 lnk[fm] = i; 4538 lnk_lvl[i] = 0; 4539 nzi++; dcount++; 4540 } 4541 4542 /* add pivot rows into the active row */ 4543 nzbd = 0; 4544 prow = lnk[n]; 4545 while (prow < i) { 4546 nnz = bdiag[prow]; 4547 cols = bj_ptr[prow] + nnz + 1; 4548 cols_lvl = bjlvl_ptr[prow] + nnz + 1; 4549 nnz = bi[prow+1] - bi[prow] - nnz - 1; 4550 ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 4551 nzi += nlnk; 4552 prow = lnk[prow]; 4553 nzbd++; 4554 } 4555 bdiag[i] = nzbd; 4556 bi[i+1] = bi[i] + nzi; 4557 4558 /* if free space is not available, make more free space */ 4559 if (current_space->local_remaining<nzi) { 4560 nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 4561 ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 4562 ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 4563 reallocs++; 4564 } 4565 4566 /* copy data into free_space and free_space_lvl, then initialize lnk */ 4567 ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 4568 bj_ptr[i] = current_space->array; 4569 bjlvl_ptr[i] = current_space_lvl->array; 4570 4571 /* make sure the active row i has diagonal entry */ 4572 if (*(bj_ptr[i]+bdiag[i]) != i) { 4573 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 4574 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 4575 } 4576 4577 current_space->array += nzi; 4578 current_space->local_used += nzi; 4579 current_space->local_remaining -= nzi; 4580 current_space_lvl->array += nzi; 4581 current_space_lvl->local_used += nzi; 4582 current_space_lvl->local_remaining -= nzi; 4583 } 4584 4585 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 4586 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 4587 4588 /* destroy list of free space and other temporary arrays */ 4589 ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 4590 4591 /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 4592 ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 4593 4594 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 4595 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 4596 ierr = PetscFree(bj_ptr);CHKERRQ(ierr); 4597 4598 #if defined(PETSC_USE_INFO) 4599 { 4600 PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 4601 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 4602 ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 4603 ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 4604 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 4605 if (diagonal_fill) { 4606 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 4607 } 4608 } 4609 #endif 4610 4611 /* put together the new matrix */ 4612 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 4613 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 4614 b = (Mat_SeqBAIJ*)(fact)->data; 4615 b->free_a = PETSC_TRUE; 4616 b->free_ij = PETSC_TRUE; 4617 b->singlemalloc = PETSC_FALSE; 4618 ierr = PetscMalloc( (bs2*bi[2*n+1] )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 4619 b->j = bj; 4620 b->i = bi; 4621 b->diag = bdiag; 4622 b->free_diag = PETSC_TRUE; 4623 b->ilen = 0; 4624 b->imax = 0; 4625 b->row = isrow; 4626 b->col = iscol; 4627 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 4628 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 4629 b->icol = isicol; 4630 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 4631 /* In b structure: Free imax, ilen, old a, old j. 4632 Allocate bdiag, solve_work, new a, new j */ 4633 ierr = PetscLogObjectMemory(fact,bi[2*n+1] * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 4634 b->maxnz = b->nz = bi[2*n+1] ; 4635 (fact)->info.factor_mallocs = reallocs; 4636 (fact)->info.fill_ratio_given = f; 4637 (fact)->info.fill_ratio_needed = ((PetscReal)bi[2*n+1])/((PetscReal)ai[n]); 4638 (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 4639 /* set MatSolve routines */ 4640 if (both_identity){ 4641 switch (bs){ 4642 case 2: 4643 fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct; 4644 break; 4645 case 3: 4646 fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct; 4647 break; 4648 case 4: 4649 fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct; 4650 break; 4651 case 5: 4652 fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct; 4653 break; 4654 case 6: 4655 fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct; 4656 break; 4657 case 7: 4658 fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct; 4659 break; 4660 default: 4661 fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 4662 break; 4663 } 4664 } else { 4665 switch (bs){ 4666 case 2: 4667 fact->ops->solve = MatSolve_SeqBAIJ_2_newdatastruct; 4668 break; 4669 case 3: 4670 fact->ops->solve = MatSolve_SeqBAIJ_3_newdatastruct; 4671 break; 4672 case 4: 4673 fact->ops->solve = MatSolve_SeqBAIJ_4_newdatastruct; 4674 break; 4675 case 5: 4676 fact->ops->solve = MatSolve_SeqBAIJ_5_newdatastruct; 4677 break; 4678 case 6: 4679 fact->ops->solve = MatSolve_SeqBAIJ_6_newdatastruct; 4680 break; 4681 case 7: 4682 fact->ops->solve = MatSolve_SeqBAIJ_7_newdatastruct; 4683 break; 4684 default: 4685 fact->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct; 4686 break; 4687 } 4688 } 4689 PetscFunctionReturn(0); 4690 } 4691 4692 /* 4693 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 4694 except that the data structure of Mat_SeqAIJ is slightly different. 4695 Not a good example of code reuse. 4696 */ 4697 #undef __FUNCT__ 4698 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 4699 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 4700 { 4701 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 4702 IS isicol; 4703 PetscErrorCode ierr; 4704 const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 4705 PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 4706 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 4707 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 4708 PetscTruth col_identity,row_identity,both_identity,flg; 4709 PetscReal f; 4710 PetscTruth newdatastruct=PETSC_FALSE; 4711 4712 PetscFunctionBegin; 4713 ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 4714 if (newdatastruct){ 4715 ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 4716 PetscFunctionReturn(0); 4717 } 4718 4719 ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 4720 if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 4721 4722 f = info->fill; 4723 levels = (PetscInt)info->levels; 4724 diagonal_fill = (PetscInt)info->diagonal_fill; 4725 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 4726 4727 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 4728 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 4729 both_identity = (PetscTruth) (row_identity && col_identity); 4730 4731 if (!levels && both_identity) { /* special case copy the nonzero structure */ 4732 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 4733 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 4734 4735 fact->factor = MAT_FACTOR_ILU; 4736 b = (Mat_SeqBAIJ*)(fact)->data; 4737 b->row = isrow; 4738 b->col = iscol; 4739 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 4740 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 4741 b->icol = isicol; 4742 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 4743 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 4744 PetscFunctionReturn(0); 4745 } 4746 4747 /* general case perform the symbolic factorization */ 4748 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 4749 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 4750 4751 /* get new row pointers */ 4752 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 4753 ainew[0] = 0; 4754 /* don't know how many column pointers are needed so estimate */ 4755 jmax = (PetscInt)(f*ai[n] + 1); 4756 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 4757 /* ajfill is level of fill for each fill entry */ 4758 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 4759 /* fill is a linked list of nonzeros in active row */ 4760 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 4761 /* im is level for each filled value */ 4762 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 4763 /* dloc is location of diagonal in factor */ 4764 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 4765 dloc[0] = 0; 4766 for (prow=0; prow<n; prow++) { 4767 4768 /* copy prow into linked list */ 4769 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 4770 if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 4771 xi = aj + ai[r[prow]]; 4772 fill[n] = n; 4773 fill[prow] = -1; /* marker for diagonal entry */ 4774 while (nz--) { 4775 fm = n; 4776 idx = ic[*xi++]; 4777 do { 4778 m = fm; 4779 fm = fill[m]; 4780 } while (fm < idx); 4781 fill[m] = idx; 4782 fill[idx] = fm; 4783 im[idx] = 0; 4784 } 4785 4786 /* make sure diagonal entry is included */ 4787 if (diagonal_fill && fill[prow] == -1) { 4788 fm = n; 4789 while (fill[fm] < prow) fm = fill[fm]; 4790 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 4791 fill[fm] = prow; 4792 im[prow] = 0; 4793 nzf++; 4794 dcount++; 4795 } 4796 4797 nzi = 0; 4798 row = fill[n]; 4799 while (row < prow) { 4800 incrlev = im[row] + 1; 4801 nz = dloc[row]; 4802 xi = ajnew + ainew[row] + nz + 1; 4803 flev = ajfill + ainew[row] + nz + 1; 4804 nnz = ainew[row+1] - ainew[row] - nz - 1; 4805 fm = row; 4806 while (nnz-- > 0) { 4807 idx = *xi++; 4808 if (*flev + incrlev > levels) { 4809 flev++; 4810 continue; 4811 } 4812 do { 4813 m = fm; 4814 fm = fill[m]; 4815 } while (fm < idx); 4816 if (fm != idx) { 4817 im[idx] = *flev + incrlev; 4818 fill[m] = idx; 4819 fill[idx] = fm; 4820 fm = idx; 4821 nzf++; 4822 } else { 4823 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 4824 } 4825 flev++; 4826 } 4827 row = fill[row]; 4828 nzi++; 4829 } 4830 /* copy new filled row into permanent storage */ 4831 ainew[prow+1] = ainew[prow] + nzf; 4832 if (ainew[prow+1] > jmax) { 4833 4834 /* estimate how much additional space we will need */ 4835 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 4836 /* just double the memory each time */ 4837 PetscInt maxadd = jmax; 4838 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 4839 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 4840 jmax += maxadd; 4841 4842 /* allocate a longer ajnew and ajfill */ 4843 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 4844 ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 4845 ierr = PetscFree(ajnew);CHKERRQ(ierr); 4846 ajnew = xitmp; 4847 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 4848 ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 4849 ierr = PetscFree(ajfill);CHKERRQ(ierr); 4850 ajfill = xitmp; 4851 reallocate++; /* count how many reallocations are needed */ 4852 } 4853 xitmp = ajnew + ainew[prow]; 4854 flev = ajfill + ainew[prow]; 4855 dloc[prow] = nzi; 4856 fm = fill[n]; 4857 while (nzf--) { 4858 *xitmp++ = fm; 4859 *flev++ = im[fm]; 4860 fm = fill[fm]; 4861 } 4862 /* make sure row has diagonal entry */ 4863 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 4864 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 4865 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 4866 } 4867 } 4868 ierr = PetscFree(ajfill);CHKERRQ(ierr); 4869 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 4870 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 4871 ierr = PetscFree(fill);CHKERRQ(ierr); 4872 ierr = PetscFree(im);CHKERRQ(ierr); 4873 4874 #if defined(PETSC_USE_INFO) 4875 { 4876 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 4877 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 4878 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 4879 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 4880 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 4881 if (diagonal_fill) { 4882 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 4883 } 4884 } 4885 #endif 4886 4887 /* put together the new matrix */ 4888 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 4889 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 4890 b = (Mat_SeqBAIJ*)(fact)->data; 4891 b->free_a = PETSC_TRUE; 4892 b->free_ij = PETSC_TRUE; 4893 b->singlemalloc = PETSC_FALSE; 4894 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 4895 b->j = ajnew; 4896 b->i = ainew; 4897 for (i=0; i<n; i++) dloc[i] += ainew[i]; 4898 b->diag = dloc; 4899 b->free_diag = PETSC_TRUE; 4900 b->ilen = 0; 4901 b->imax = 0; 4902 b->row = isrow; 4903 b->col = iscol; 4904 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 4905 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 4906 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 4907 b->icol = isicol; 4908 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 4909 /* In b structure: Free imax, ilen, old a, old j. 4910 Allocate dloc, solve_work, new a, new j */ 4911 ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 4912 b->maxnz = b->nz = ainew[n]; 4913 4914 (fact)->info.factor_mallocs = reallocate; 4915 (fact)->info.fill_ratio_given = f; 4916 (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 4917 4918 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 4919 PetscFunctionReturn(0); 4920 } 4921 4922 #undef __FUNCT__ 4923 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 4924 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 4925 { 4926 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 4927 /* int i,*AJ=a->j,nz=a->nz; */ 4928 PetscFunctionBegin; 4929 /* Undo Column scaling */ 4930 /* while (nz--) { */ 4931 /* AJ[i] = AJ[i]/4; */ 4932 /* } */ 4933 /* This should really invoke a push/pop logic, but we don't have that yet. */ 4934 A->ops->setunfactored = PETSC_NULL; 4935 PetscFunctionReturn(0); 4936 } 4937 4938 #undef __FUNCT__ 4939 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 4940 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 4941 { 4942 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4943 PetscInt *AJ=a->j,nz=a->nz; 4944 unsigned short *aj=(unsigned short *)AJ; 4945 PetscFunctionBegin; 4946 /* Is this really necessary? */ 4947 while (nz--) { 4948 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 4949 } 4950 A->ops->setunfactored = PETSC_NULL; 4951 PetscFunctionReturn(0); 4952 } 4953 4954 4955