#define PETSCMAT_DLL


/*
    Factorization code for BAIJ format.

    The routines below are the transpose triangular solves for SeqBAIJ
    matrices, written out by hand for each block size bs = 1..7:

      MatSolveTranspose_SeqBAIJ_<bs>_NaturalOrdering
          copies bb into xx and runs both sweeps in place on xx;
      MatSolveTranspose_SeqBAIJ_<bs>
          gathers bb into a->solve_work through the a->col index set, runs the
          same sweeps there, and scatters into xx through the a->row index set.

    Common structure of every routine:
      - a->i / a->j / a->a are walked as row-start offsets, block column
        indices and block values (bs*bs scalars per block);
      - a->diag[i] is the position, inside a->j / a->a, of the diagonal block
        of block row i;
      - the first sweep ("U^T") visits rows 0..n-1 and uses the blocks stored
        after the diagonal, the second sweep ("L^T") visits rows n-1..0 and
        uses the blocks stored before the diagonal, walking them backwards.

    In every block product the component k of the result is
        sum_j v[bs*k + j] * s_j
    NOTE(review): presumably blocks are stored column-major, which is what
    makes this indexing a product with the transposed block -- confirm
    against baij.h.
*/

#include "../src/mat/impls/baij/seq/baij.h"
#include "../src/mat/blockinvert.h"
#include "petscbt.h"
#include "../src/mat/utils/freespace.h"

/*
   MatSolveTranspose_SeqBAIJ_1_NaturalOrdering - transpose solve, block size 1,
   no row/column permutation.

   Input:
     A  - matrix whose Mat_SeqBAIJ data (i, j, a, diag, mbs) is read
     bb - right-hand side
   Output:
     xx - result; bb is copied into xx first and both sweeps update xx in place

   Returns 0 on success; an error code from a PETSc call is passed up by CHKERRQ.

   Note: the array of bb is acquired and restored (b) but never read here.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  PetscErrorCode ierr;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
  PetscInt       *diag = a->diag;
  MatScalar      *aa=a->a,*v;
  PetscScalar    s1,*x,*b;

  PetscFunctionBegin;
  ierr = VecCopy(bb,xx);CHKERRQ(ierr);
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the U^T */
  for (i=0; i<n; i++) {

    v     = aa + diag[i];              /* value stored at the diagonal position of row i */
    /* multiply by the inverse of the block diagonal */
    s1    = (*v++)*x[i];               /* v now points at the first entry after the diagonal */
    vi    = aj + diag[i] + 1;          /* column indices of the entries after the diagonal */
    nz    = ai[i+1] - diag[i] - 1;     /* how many entries follow the diagonal in row i */
    while (nz--) {
      x[*vi++] -= (*v++)*s1;           /* push the finished value into the later unknowns */
    }
    x[i]  = s1;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    v    = aa + diag[i] - 1;           /* last entry before the diagonal; walked backwards */
    vi   = aj + diag[i] - 1;
    nz   = diag[i] - ai[i];            /* how many entries precede the diagonal in row i */
    s1   = x[i];
    while (nz--) {
      x[*vi--] -= (*v--)*s1;
    }
  }
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqBAIJ_2_NaturalOrdering - transpose solve, block size 2
   (4 scalars per block), no row/column permutation.

   Input:
     A  - matrix whose Mat_SeqBAIJ data is read
     bb - right-hand side
   Output:
     xx - result; bb is copied into xx first and both sweeps update xx in place

   Returns 0 on success; an error code from a PETSc call is passed up by CHKERRQ.

   Note: the array of bb is acquired and restored (b) but never read here.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  PetscErrorCode ierr;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  PetscInt       *diag = a->diag,oidx;
  MatScalar      *aa=a->a,*v;
  PetscScalar    s1,s2,x1,x2;
  PetscScalar    *x,*b;

  PetscFunctionBegin;
  ierr = VecCopy(bb,xx);CHKERRQ(ierr);
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the U^T */
  idx = 0;                             /* scalar offset of block row i, i.e. 2*i */
  for (i=0; i<n; i++) {

    v     = aa + 4*diag[i];
    /* multiply by the inverse of the block diagonal */
    x1    = x[idx];   x2 = x[1+idx];
    s1 = v[0]*x1  +  v[1]*x2;
    s2 = v[2]*x1  +  v[3]*x2;
    v += 4;                            /* step to the first block after the diagonal */

    vi    = aj + diag[i] + 1;
    nz    = ai[i+1] - diag[i] - 1;
    while (nz--) {
      oidx = 2*(*vi++);                /* scalar offset of the target block row */
      x[oidx]   -= v[0]*s1  +  v[1]*s2;
      x[oidx+1] -= v[2]*s1  +  v[3]*s2;
      v  += 4;
    }
    x[idx]   = s1;x[1+idx] = s2;
    idx += 2;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    v    = aa + 4*diag[i] - 4;         /* last block before the diagonal; walked backwards */
    vi   = aj + diag[i] - 1;
    nz   = diag[i] - ai[i];
    idt  = 2*i;
    s1 = x[idt];  s2 = x[1+idt];
    while (nz--) {
      idx   = 2*(*vi--);
      x[idx]   -=  v[0]*s1 +  v[1]*s2;
      x[idx+1] -=  v[2]*s1 +  v[3]*s2;
      v -= 4;
    }
  }
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqBAIJ_3_NaturalOrdering - transpose solve, block size 3
   (9 scalars per block), no row/column permutation.

   Input:
     A  - matrix whose Mat_SeqBAIJ data is read
     bb - right-hand side
   Output:
     xx - result; bb is copied into xx first and both sweeps update xx in place

   Returns 0 on success; an error code from a PETSc call is passed up by CHKERRQ.

   Note: the array of bb is acquired and restored (b) but never read here.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  PetscErrorCode ierr;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  PetscInt       *diag = a->diag,oidx;
  MatScalar      *aa=a->a,*v;
  PetscScalar    s1,s2,s3,x1,x2,x3;
  PetscScalar    *x,*b;

  PetscFunctionBegin;
  ierr = VecCopy(bb,xx);CHKERRQ(ierr);
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the U^T */
  idx = 0;                             /* scalar offset of block row i, i.e. 3*i */
  for (i=0; i<n; i++) {

    v     = aa + 9*diag[i];
    /* multiply by the inverse of the block diagonal */
    x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
    s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
    s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
    s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
    v += 9;

    vi    = aj + diag[i] + 1;
    nz    = ai[i+1] - diag[i] - 1;
    while (nz--) {
      oidx = 3*(*vi++);                /* scalar offset of the target block row */
      x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
      x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
      x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
      v  += 9;
    }
    x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
    idx += 3;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    v    = aa + 9*diag[i] - 9;         /* last block before the diagonal; walked backwards */
    vi   = aj + diag[i] - 1;
    nz   = diag[i] - ai[i];
    idt  = 3*i;
    s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
    while (nz--) {
      idx   = 3*(*vi--);
      x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
      x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
      x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
      v -= 9;
    }
  }
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqBAIJ_4_NaturalOrdering - transpose solve, block size 4
   (16 scalars per block), no row/column permutation.

   Input:
     A  - matrix whose Mat_SeqBAIJ data is read
     bb - right-hand side
   Output:
     xx - result; bb is copied into xx first and both sweeps update xx in place

   Returns 0 on success; an error code from a PETSc call is passed up by CHKERRQ.

   Note: the array of bb is acquired and restored (b) but never read here.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  PetscErrorCode ierr;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  PetscInt       *diag = a->diag,oidx;
  MatScalar      *aa=a->a,*v;
  PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
  PetscScalar    *x,*b;

  PetscFunctionBegin;
  ierr = VecCopy(bb,xx);CHKERRQ(ierr);
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the U^T */
  idx = 0;                             /* scalar offset of block row i, i.e. 4*i */
  for (i=0; i<n; i++) {

    v     = aa + 16*diag[i];
    /* multiply by the inverse of the block diagonal */
    x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
    s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
    s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
    s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
    s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
    v += 16;

    vi    = aj + diag[i] + 1;
    nz    = ai[i+1] - diag[i] - 1;
    while (nz--) {
      oidx = 4*(*vi++);                /* scalar offset of the target block row */
      x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
      x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
      x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
      x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
      v  += 16;
    }
    x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
    idx += 4;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    v    = aa + 16*diag[i] - 16;       /* last block before the diagonal; walked backwards */
    vi   = aj + diag[i] - 1;
    nz   = diag[i] - ai[i];
    idt  = 4*i;
    s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
    while (nz--) {
      idx   = 4*(*vi--);
      x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
      x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
      x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
      x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
      v -= 16;
    }
  }
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqBAIJ_5_NaturalOrdering - transpose solve, block size 5
   (25 scalars per block), no row/column permutation.

   Input:
     A  - matrix whose Mat_SeqBAIJ data is read
     bb - right-hand side
   Output:
     xx - result; bb is copied into xx first and both sweeps update xx in place

   Returns 0 on success; an error code from a PETSc call is passed up by CHKERRQ.

   Note: the array of bb is acquired and restored (b) but never read here.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  PetscErrorCode ierr;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  PetscInt       *diag = a->diag,oidx;
  MatScalar      *aa=a->a,*v;
  PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
  PetscScalar    *x,*b;

  PetscFunctionBegin;
  ierr = VecCopy(bb,xx);CHKERRQ(ierr);
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the U^T */
  idx = 0;                             /* scalar offset of block row i, i.e. 5*i */
  for (i=0; i<n; i++) {

    v     = aa + 25*diag[i];
    /* multiply by the inverse of the block diagonal */
    x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
    s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
    s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
    s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
    s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
    s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
    v += 25;

    vi    = aj + diag[i] + 1;
    nz    = ai[i+1] - diag[i] - 1;
    while (nz--) {
      oidx = 5*(*vi++);                /* scalar offset of the target block row */
      x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
      x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
      x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
      x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
      x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
      v  += 25;
    }
    x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
    idx += 5;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    v    = aa + 25*diag[i] - 25;       /* last block before the diagonal; walked backwards */
    vi   = aj + diag[i] - 1;
    nz   = diag[i] - ai[i];
    idt  = 5*i;
    s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
    while (nz--) {
      idx   = 5*(*vi--);
      x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
      x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
      x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
      x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
      x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
      v -= 25;
    }
  }
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqBAIJ_6_NaturalOrdering - transpose solve, block size 6
   (36 scalars per block), no row/column permutation.

   Input:
     A  - matrix whose Mat_SeqBAIJ data is read
     bb - right-hand side
   Output:
     xx - result; bb is copied into xx first and both sweeps update xx in place

   Returns 0 on success; an error code from a PETSc call is passed up by CHKERRQ.

   Note: the array of bb is acquired and restored (b) but never read here.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  PetscErrorCode ierr;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  PetscInt       *diag = a->diag,oidx;
  MatScalar      *aa=a->a,*v;
  PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
  PetscScalar    *x,*b;

  PetscFunctionBegin;
  ierr = VecCopy(bb,xx);CHKERRQ(ierr);
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the U^T */
  idx = 0;                             /* scalar offset of block row i, i.e. 6*i */
  for (i=0; i<n; i++) {

    v     = aa + 36*diag[i];
    /* multiply by the inverse of the block diagonal */
    x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
    x6    = x[5+idx];
    s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
    s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
    s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
    s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
    s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
    s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
    v += 36;

    vi    = aj + diag[i] + 1;
    nz    = ai[i+1] - diag[i] - 1;
    while (nz--) {
      oidx = 6*(*vi++);                /* scalar offset of the target block row */
      x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
      x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
      x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
      x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
      x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
      x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
      v  += 36;
    }
    x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
    x[5+idx] = s6;
    idx += 6;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    v    = aa + 36*diag[i] - 36;       /* last block before the diagonal; walked backwards */
    vi   = aj + diag[i] - 1;
    nz   = diag[i] - ai[i];
    idt  = 6*i;
    s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
    s6 = x[5+idt];
    while (nz--) {
      idx   = 6*(*vi--);
      x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
      x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
      x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
      x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
      x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
      x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
      v -= 36;
    }
  }
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqBAIJ_7_NaturalOrdering - transpose solve, block size 7
   (49 scalars per block), no row/column permutation.

   Input:
     A  - matrix whose Mat_SeqBAIJ data is read
     bb - right-hand side
   Output:
     xx - result; bb is copied into xx first and both sweeps update xx in place

   Returns 0 on success; an error code from a PETSc call is passed up by CHKERRQ.

   Note: the array of bb is acquired and restored (b) but never read here.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  PetscErrorCode ierr;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  PetscInt       *diag = a->diag,oidx;
  MatScalar      *aa=a->a,*v;
  PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
  PetscScalar    *x,*b;

  PetscFunctionBegin;
  ierr = VecCopy(bb,xx);CHKERRQ(ierr);
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the U^T */
  idx = 0;                             /* scalar offset of block row i, i.e. 7*i */
  for (i=0; i<n; i++) {

    v     = aa + 49*diag[i];
    /* multiply by the inverse of the block diagonal */
    x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
    x6    = x[5+idx]; x7 = x[6+idx];
    s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
    s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
    s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
    s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
    s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
    s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
    s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
    v += 49;

    vi    = aj + diag[i] + 1;
    nz    = ai[i+1] - diag[i] - 1;
    while (nz--) {
      oidx = 7*(*vi++);                /* scalar offset of the target block row */
      x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
      x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
      x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
      x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
      x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
      x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
      x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
      v  += 49;
    }
    x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
    x[5+idx] = s6;x[6+idx] = s7;
    idx += 7;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    v    = aa + 49*diag[i] - 49;       /* last block before the diagonal; walked backwards */
    vi   = aj + diag[i] - 1;
    nz   = diag[i] - ai[i];
    idt  = 7*i;
    s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
    s6 = x[5+idt];s7 = x[6+idt];
    while (nz--) {
      idx   = 7*(*vi--);
      x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
      x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
      x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
      x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
      x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
      x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
      x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
      v -= 49;
    }
  }
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*---------------------------------------------------------------------------------------------*/
/*
   MatSolveTranspose_SeqBAIJ_1 - transpose solve, block size 1, using the
   index sets stored in the matrix (a->col, a->row).

   Input:
     A  - matrix whose Mat_SeqBAIJ data (i, j, a, diag, row, col, solve_work) is used
     bb - right-hand side; only read
   Output:
     xx - result, written as x[r[i]] = t[i] after the sweeps

   The sweeps run in the scratch array a->solve_work (t), which is indexed up
   to n-1 here, so its contents are overwritten by every call.

   Returns 0 on success; an error code from a PETSc call is passed up by CHKERRQ.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  IS             iscol=a->col,isrow=a->row;
  PetscErrorCode ierr;
  const PetscInt *r,*c,*rout,*cout;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
  PetscInt       *diag = a->diag;
  MatScalar      *aa=a->a,*v;
  PetscScalar    s1,*x,*b,*t;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t  = a->solve_work;

  /* r and c are working aliases; rout/cout are kept untouched for the restore calls */
  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* copy the b into temp work space according to permutation */
  for (i=0; i<n; i++) {
    t[i] = b[c[i]];
  }

  /* forward solve the U^T */
  for (i=0; i<n; i++) {

    v     = aa + diag[i];              /* value stored at the diagonal position of row i */
    /* multiply by the inverse of the block diagonal */
    s1    = (*v++)*t[i];
    vi    = aj + diag[i] + 1;          /* column indices of the entries after the diagonal */
    nz    = ai[i+1] - diag[i] - 1;
    while (nz--) {
      t[*vi++]  -= (*v++)*s1;
    }
    t[i]   = s1;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    v    = aa + diag[i] - 1;           /* last entry before the diagonal; walked backwards */
    vi   = aj + diag[i] - 1;
    nz   = diag[i] - ai[i];
    s1   = t[i];
    while (nz--) {
      t[*vi--]   -=  (*v--)*s1;
    }
  }

  /* copy t into x according to permutation */
  for (i=0; i<n; i++) {
    x[r[i]]   = t[i];
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqBAIJ_2 - transpose solve, block size 2 (4 scalars per
   block), using the index sets stored in the matrix (a->col, a->row).

   Input:
     A  - matrix whose Mat_SeqBAIJ data is used
     bb - right-hand side; only read
   Output:
     xx - result, scattered block by block from the scratch array

   The sweeps run in the scratch array a->solve_work (t), which is overwritten.

   Returns 0 on success; an error code from a PETSc call is passed up by CHKERRQ.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  IS             iscol=a->col,isrow=a->row;
  PetscErrorCode ierr;
  const PetscInt *r,*c,*rout,*cout;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  PetscInt       *diag = a->diag,ii,ic,ir,oidx;
  MatScalar      *aa=a->a,*v;
  PetscScalar    s1,s2,x1,x2;
  PetscScalar    *x,*b,*t;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t  = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* copy the b into temp work space according to permutation */
  ii = 0;                              /* scalar offset of block i in t */
  for (i=0; i<n; i++) {
    ic      = 2*c[i];                  /* scalar offset of the source block in b */
    t[ii]   = b[ic];
    t[ii+1] = b[ic+1];
    ii += 2;
  }

  /* forward solve the U^T */
  idx = 0;
  for (i=0; i<n; i++) {

    v     = aa + 4*diag[i];
    /* multiply by the inverse of the block diagonal */
    x1    = t[idx];   x2 = t[1+idx];
    s1 = v[0]*x1  +  v[1]*x2;
    s2 = v[2]*x1  +  v[3]*x2;
    v += 4;

    vi    = aj + diag[i] + 1;
    nz    = ai[i+1] - diag[i] - 1;
    while (nz--) {
      oidx = 2*(*vi++);
      t[oidx]   -= v[0]*s1  +  v[1]*s2;
      t[oidx+1] -= v[2]*s1  +  v[3]*s2;
      v  += 4;
    }
    t[idx]   = s1;t[1+idx] = s2;
    idx += 2;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    v    = aa + 4*diag[i] - 4;
    vi   = aj + diag[i] - 1;
    nz   = diag[i] - ai[i];
    idt  = 2*i;
    s1 = t[idt];  s2 = t[1+idt];
    while (nz--) {
      idx   = 2*(*vi--);
      t[idx]   -=  v[0]*s1 +  v[1]*s2;
      t[idx+1] -=  v[2]*s1 +  v[3]*s2;
      v -= 4;
    }
  }

  /* copy t into x according to permutation */
  ii = 0;
  for (i=0; i<n; i++) {
    ir      = 2*r[i];                  /* scalar offset of the destination block in x */
    x[ir]   = t[ii];
    x[ir+1] = t[ii+1];
    ii += 2;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqBAIJ_3 - transpose solve, block size 3 (9 scalars per
   block), using the index sets stored in the matrix (a->col, a->row).

   Input:
     A  - matrix whose Mat_SeqBAIJ data is used
     bb - right-hand side; only read
   Output:
     xx - result, scattered block by block from the scratch array

   The sweeps run in the scratch array a->solve_work (t), which is overwritten.

   Returns 0 on success; an error code from a PETSc call is passed up by CHKERRQ.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  IS             iscol=a->col,isrow=a->row;
  PetscErrorCode ierr;
  const PetscInt *r,*c,*rout,*cout;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  PetscInt       *diag = a->diag,ii,ic,ir,oidx;
  MatScalar      *aa=a->a,*v;
  PetscScalar    s1,s2,s3,x1,x2,x3;
  PetscScalar    *x,*b,*t;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t  = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* copy the b into temp work space according to permutation */
  ii = 0;                              /* scalar offset of block i in t */
  for (i=0; i<n; i++) {
    ic      = 3*c[i];                  /* scalar offset of the source block in b */
    t[ii]   = b[ic];
    t[ii+1] = b[ic+1];
    t[ii+2] = b[ic+2];
    ii += 3;
  }

  /* forward solve the U^T */
  idx = 0;
  for (i=0; i<n; i++) {

    v     = aa + 9*diag[i];
    /* multiply by the inverse of the block diagonal */
    x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
    s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
    s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
    s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
    v += 9;

    vi    = aj + diag[i] + 1;
    nz    = ai[i+1] - diag[i] - 1;
    while (nz--) {
      oidx = 3*(*vi++);
      t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
      t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
      t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
      v  += 9;
    }
    t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
    idx += 3;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    v    = aa + 9*diag[i] - 9;
    vi   = aj + diag[i] - 1;
    nz   = diag[i] - ai[i];
    idt  = 3*i;
    s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
    while (nz--) {
      idx   = 3*(*vi--);
      t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
      t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
      t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
      v -= 9;
    }
  }

  /* copy t into x according to permutation */
  ii = 0;
  for (i=0; i<n; i++) {
    ir      = 3*r[i];                  /* scalar offset of the destination block in x */
    x[ir]   = t[ii];
    x[ir+1] = t[ii+1];
    x[ir+2] = t[ii+2];
    ii += 3;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqBAIJ_4 - transpose solve, block size 4 (16 scalars per
   block), using the index sets stored in the matrix (a->col, a->row).

   Input:
     A  - matrix whose Mat_SeqBAIJ data is used
     bb - right-hand side; only read
   Output:
     xx - result, scattered block by block from the scratch array

   The sweeps run in the scratch array a->solve_work (t), which is overwritten.

   Returns 0 on success; an error code from a PETSc call is passed up by CHKERRQ.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  IS             iscol=a->col,isrow=a->row;
  PetscErrorCode ierr;
  const PetscInt *r,*c,*rout,*cout;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  PetscInt       *diag = a->diag,ii,ic,ir,oidx;
  MatScalar      *aa=a->a,*v;
  PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
  PetscScalar    *x,*b,*t;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t  = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* copy the b into temp work space according to permutation */
  ii = 0;                              /* scalar offset of block i in t */
  for (i=0; i<n; i++) {
    ic      = 4*c[i];                  /* scalar offset of the source block in b */
    t[ii]   = b[ic];
    t[ii+1] = b[ic+1];
    t[ii+2] = b[ic+2];
    t[ii+3] = b[ic+3];
    ii += 4;
  }

  /* forward solve the U^T */
  idx = 0;
  for (i=0; i<n; i++) {

    v     = aa + 16*diag[i];
    /* multiply by the inverse of the block diagonal */
    x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
    s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
    s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
    s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
    s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
    v += 16;

    vi    = aj + diag[i] + 1;
    nz    = ai[i+1] - diag[i] - 1;
    while (nz--) {
      oidx = 4*(*vi++);
      t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
      t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
      t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
      t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
      v  += 16;
    }
    t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
    idx += 4;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    v    = aa + 16*diag[i] - 16;
    vi   = aj + diag[i] - 1;
    nz   = diag[i] - ai[i];
    idt  = 4*i;
    s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
    while (nz--) {
      idx   = 4*(*vi--);
      t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
      t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
      t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
      t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
      v -= 16;
    }
  }

  /* copy t into x according to permutation */
  ii = 0;
  for (i=0; i<n; i++) {
    ir      = 4*r[i];                  /* scalar offset of the destination block in x */
    x[ir]   = t[ii];
    x[ir+1] = t[ii+1];
    x[ir+2] = t[ii+2];
    x[ir+3] = t[ii+3];
    ii += 4;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqBAIJ_5 - transpose solve, block size 5 (25 scalars per
   block), using the index sets stored in the matrix (a->col, a->row).

   Input:
     A  - matrix whose Mat_SeqBAIJ data is used
     bb - right-hand side; only read
   Output:
     xx - result, scattered block by block from the scratch array

   The sweeps run in the scratch array a->solve_work (t), which is overwritten.

   Returns 0 on success; an error code from a PETSc call is passed up by CHKERRQ.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  IS             iscol=a->col,isrow=a->row;
  PetscErrorCode ierr;
  const PetscInt *r,*c,*rout,*cout;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  PetscInt       *diag = a->diag,ii,ic,ir,oidx;
  MatScalar      *aa=a->a,*v;
  PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
  PetscScalar    *x,*b,*t;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t  = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* copy the b into temp work space according to permutation */
  ii = 0;                              /* scalar offset of block i in t */
  for (i=0; i<n; i++) {
    ic      = 5*c[i];                  /* scalar offset of the source block in b */
    t[ii]   = b[ic];
    t[ii+1] = b[ic+1];
    t[ii+2] = b[ic+2];
    t[ii+3] = b[ic+3];
    t[ii+4] = b[ic+4];
    ii += 5;
  }

  /* forward solve the U^T */
  idx = 0;
  for (i=0; i<n; i++) {

    v     = aa + 25*diag[i];
    /* multiply by the inverse of the block diagonal */
    x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
    s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
    s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
    s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
    s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
    s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
    v += 25;

    vi    = aj + diag[i] + 1;
    nz    = ai[i+1] - diag[i] - 1;
    while (nz--) {
      oidx = 5*(*vi++);
      t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
      t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
      t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
      t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
      t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
      v  += 25;
    }
    t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
    idx += 5;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    v    = aa + 25*diag[i] - 25;
    vi   = aj + diag[i] - 1;
    nz   = diag[i] - ai[i];
    idt  = 5*i;
    s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
    while (nz--) {
      idx   = 5*(*vi--);
      t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
      t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
      t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
      t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
      t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
      v -= 25;
    }
  }

  /* copy t into x according to permutation */
  ii = 0;
  for (i=0; i<n; i++) {
    ir      = 5*r[i];                  /* scalar offset of the destination block in x */
    x[ir]   = t[ii];
    x[ir+1] = t[ii+1];
    x[ir+2] = t[ii+2];
    x[ir+3] = t[ii+3];
    x[ir+4] = t[ii+4];
    ii += 5;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqBAIJ_6 - transpose solve, block size 6 (36 scalars per
   block), using the index sets stored in the matrix (a->col, a->row).

   Input:
     A  - matrix whose Mat_SeqBAIJ data is used
     bb - right-hand side; only read
   Output:
     xx - result, scattered block by block from the scratch array

   The sweeps run in the scratch array a->solve_work (t), which is overwritten.

   Returns 0 on success; an error code from a PETSc call is passed up by CHKERRQ.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  IS             iscol=a->col,isrow=a->row;
  PetscErrorCode ierr;
  const PetscInt *r,*c,*rout,*cout;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  PetscInt       *diag = a->diag,ii,ic,ir,oidx;
  MatScalar      *aa=a->a,*v;
  PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
  PetscScalar    *x,*b,*t;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t  = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* copy the b into temp work space according to permutation */
  ii = 0;                              /* scalar offset of block i in t */
  for (i=0; i<n; i++) {
    ic      = 6*c[i];                  /* scalar offset of the source block in b */
    t[ii]   = b[ic];
    t[ii+1] = b[ic+1];
    t[ii+2] = b[ic+2];
    t[ii+3] = b[ic+3];
    t[ii+4] = b[ic+4];
    t[ii+5] = b[ic+5];
    ii += 6;
  }

  /* forward solve the U^T */
  idx = 0;
  for (i=0; i<n; i++) {

    v     = aa + 36*diag[i];
    /* multiply by the inverse of the block diagonal */
    x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
    x6    = t[5+idx];
    s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
    s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
    s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
    s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
    s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
    s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
    v += 36;

    vi    = aj + diag[i] + 1;
    nz    = ai[i+1] - diag[i] - 1;
    while (nz--) {
      oidx = 6*(*vi++);
      t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
      t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
      t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
      t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
      t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
      t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
      v  += 36;
    }
    t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
    t[5+idx] = s6;
    idx += 6;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    v    = aa + 36*diag[i] - 36;
    vi   = aj + diag[i] - 1;
    nz   = diag[i] - ai[i];
    idt  = 6*i;
    s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
    s6 = t[5+idt];
    while (nz--) {
      idx   = 6*(*vi--);
      t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
      t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
      t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
      t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
      t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
      t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
      v -= 36;
    }
  }

  /* copy t into x according to permutation */
  ii = 0;
  for (i=0; i<n; i++) {
    ir      = 6*r[i];                  /* scalar offset of the destination block in x */
    x[ir]   = t[ii];
    x[ir+1] = t[ii+1];
    x[ir+2] = t[ii+2];
    x[ir+3] = t[ii+3];
    x[ir+4] = t[ii+4];
    x[ir+5] = t[ii+5];
    ii += 6;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqBAIJ_7 - transpose solve, block size 7 (49 scalars per
   block), using the index sets stored in the matrix (a->col, a->row).

   Input:
     A  - matrix whose Mat_SeqBAIJ data is used
     bb - right-hand side; gathered into the scratch array through a->col
     xx - vector whose array is acquired for the result

   The sweeps run in the scratch array a->solve_work (t), which is overwritten.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  IS             iscol=a->col,isrow=a->row;
  PetscErrorCode ierr;
  const PetscInt *r,*c,*rout,*cout;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  PetscInt       *diag = a->diag,ii,ic,ir,oidx;
  MatScalar      *aa=a->a,*v;
  PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
  PetscScalar    *x,*b,*t;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t  = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* copy the b into temp work space according to permutation */
  ii = 0;                              /* scalar offset of block i in t */
  for (i=0; i<n; i++) {
    ic      = 7*c[i];                  /* scalar offset of the source block in b */
    t[ii]   = b[ic];
    t[ii+1] = b[ic+1];
    t[ii+2] = b[ic+2];
    t[ii+3] = b[ic+3];
    t[ii+4] = b[ic+4];
    t[ii+5] = b[ic+5];
    t[ii+6] = b[ic+6];
    ii += 7;
  }

  /* forward solve the U^T */
  idx = 0;
  for (i=0; i<n; i++) {

    v     = aa + 49*diag[i];
    /* multiply by the inverse of the block diagonal */
    x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
    x6    = t[5+idx]; x7 = t[6+idx];
    s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
    s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
    s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
    s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
    s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
    s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
    s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
    v += 49;

    vi    = aj + diag[i] + 1;
    nz    = ai[i+1] - diag[i] - 1;
    while (nz--) {
      oidx = 7*(*vi++);
      t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
      t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
      t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
      t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
      t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
      t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
      t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
      v  += 49;
    }
    t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
    t[5+idx] = s6;t[6+idx] = s7;
    idx += 7;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    v    = aa + 49*diag[i] - 49;
    vi   = aj + diag[i] - 1;
    nz   = diag[i] - ai[i];
    idt  = 7*i;
1081 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1082 s6 = t[5+idt];s7 = t[6+idt]; 1083 while (nz--) { 1084 idx = 7*(*vi--); 1085 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1086 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1087 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1088 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1089 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1090 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1091 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1092 v -= 49; 1093 } 1094 } 1095 1096 /* copy t into x according to permutation */ 1097 ii = 0; 1098 for (i=0; i<n; i++) { 1099 ir = 7*r[i]; 1100 x[ir] = t[ii]; 1101 x[ir+1] = t[ii+1]; 1102 x[ir+2] = t[ii+2]; 1103 x[ir+3] = t[ii+3]; 1104 x[ir+4] = t[ii+4]; 1105 x[ir+5] = t[ii+5]; 1106 x[ir+6] = t[ii+6]; 1107 ii += 7; 1108 } 1109 1110 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1111 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1112 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1113 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1114 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1115 PetscFunctionReturn(0); 1116 } 1117 1118 /* ----------------------------------------------------------- */ 1119 #undef __FUNCT__ 1120 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1121 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1122 { 1123 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1124 IS iscol=a->col,isrow=a->row; 1125 PetscErrorCode ierr; 1126 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 1127 PetscInt i,n=a->mbs; 1128 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 1129 MatScalar *aa=a->a,*v; 1130 PetscScalar *x,*b,*s,*t,*ls; 1131 1132 PetscFunctionBegin; 1133 ierr = 
VecGetArray(bb,&b);CHKERRQ(ierr); 1134 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1135 t = a->solve_work; 1136 1137 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1138 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1139 1140 /* forward solve the lower triangular */ 1141 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1142 for (i=1; i<n; i++) { 1143 v = aa + bs2*ai[i]; 1144 vi = aj + ai[i]; 1145 nz = a->diag[i] - ai[i]; 1146 s = t + bs*i; 1147 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1148 while (nz--) { 1149 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1150 v += bs2; 1151 } 1152 } 1153 /* backward solve the upper triangular */ 1154 ls = a->solve_work + A->cmap->n; 1155 for (i=n-1; i>=0; i--){ 1156 v = aa + bs2*(a->diag[i] + 1); 1157 vi = aj + a->diag[i] + 1; 1158 nz = ai[i+1] - a->diag[i] - 1; 1159 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1160 while (nz--) { 1161 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1162 v += bs2; 1163 } 1164 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1165 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1166 } 1167 1168 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1169 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1170 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1171 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1172 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 1173 PetscFunctionReturn(0); 1174 } 1175 1176 #undef __FUNCT__ 1177 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1178 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1179 { 1180 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1181 IS iscol=a->col,isrow=a->row; 1182 PetscErrorCode ierr; 1183 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 1184 PetscInt i,n=a->mbs,nz,idx,idt,idc; 1185 MatScalar *aa=a->a,*v; 1186 PetscScalar 
s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1187 PetscScalar *x,*b,*t; 1188 1189 PetscFunctionBegin; 1190 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1191 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1192 t = a->solve_work; 1193 1194 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1195 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1196 1197 /* forward solve the lower triangular */ 1198 idx = 7*(*r++); 1199 t[0] = b[idx]; t[1] = b[1+idx]; 1200 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1201 t[5] = b[5+idx]; t[6] = b[6+idx]; 1202 1203 for (i=1; i<n; i++) { 1204 v = aa + 49*ai[i]; 1205 vi = aj + ai[i]; 1206 nz = diag[i] - ai[i]; 1207 idx = 7*(*r++); 1208 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1209 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1210 while (nz--) { 1211 idx = 7*(*vi++); 1212 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1213 x4 = t[3+idx];x5 = t[4+idx]; 1214 x6 = t[5+idx];x7 = t[6+idx]; 1215 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1216 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1217 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1218 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1219 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1220 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1221 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1222 v += 49; 1223 } 1224 idx = 7*i; 1225 t[idx] = s1;t[1+idx] = s2; 1226 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1227 t[5+idx] = s6;t[6+idx] = s7; 1228 } 1229 /* backward solve the upper triangular */ 1230 for (i=n-1; i>=0; i--){ 1231 v = aa + 49*diag[i] + 49; 1232 vi = aj + diag[i] + 1; 1233 nz = ai[i+1] - diag[i] - 1; 1234 idt = 7*i; 1235 s1 = t[idt]; s2 = t[1+idt]; 1236 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1237 s6 = t[5+idt];s7 = 
t[6+idt]; 1238 while (nz--) { 1239 idx = 7*(*vi++); 1240 x1 = t[idx]; x2 = t[1+idx]; 1241 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1242 x6 = t[5+idx]; x7 = t[6+idx]; 1243 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1244 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1245 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1246 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1247 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1248 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1249 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1250 v += 49; 1251 } 1252 idc = 7*(*c--); 1253 v = aa + 49*diag[i]; 1254 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1255 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1256 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1257 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1258 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1259 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1260 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1261 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1262 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1263 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1264 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1265 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1266 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1267 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1268 } 1269 1270 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1271 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1272 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1273 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1274 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1275 PetscFunctionReturn(0); 1276 } 1277 1278 #undef __FUNCT__ 1279 #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct" 1280 PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx) 
1281 { 1282 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1283 IS iscol=a->col,isrow=a->row; 1284 PetscErrorCode ierr; 1285 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*vi; 1286 PetscInt i,n=a->mbs,nz,idx,idt,idc,k,m; 1287 MatScalar *aa=a->a,*v; 1288 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1289 PetscScalar *x,*b,*t; 1290 1291 PetscFunctionBegin; 1292 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1293 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1294 t = a->solve_work; 1295 1296 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1297 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1298 1299 /* forward solve the lower triangular */ 1300 idx = 7*r[0]; 1301 t[0] = b[idx]; t[1] = b[1+idx]; 1302 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1303 t[5] = b[5+idx]; t[6] = b[6+idx]; 1304 1305 for (i=1; i<n; i++) { 1306 v = aa + 49*ai[i]; 1307 vi = aj + ai[i]; 1308 nz = ai[i+1] - ai[i]; 1309 idx = 7*r[i]; 1310 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1311 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1312 for(m=0;m<nz;m++){ 1313 idx = 7*vi[m]; 1314 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1315 x4 = t[3+idx];x5 = t[4+idx]; 1316 x6 = t[5+idx];x7 = t[6+idx]; 1317 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1318 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1319 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1320 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1321 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1322 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1323 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1324 v += 49; 1325 } 1326 idx = 7*i; 1327 t[idx] = s1;t[1+idx] = s2; 1328 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1329 t[5+idx] = s6;t[6+idx] = s7; 1330 } 1331 /* backward solve the upper 
triangular */ 1332 for (i=n-1; i>=0; i--){ 1333 k = 2*n-i; 1334 v = aa + 49*ai[k]; 1335 vi = aj + ai[k]; 1336 nz = ai[k+1] - ai[k] - 1; 1337 idt = 7*i; 1338 s1 = t[idt]; s2 = t[1+idt]; 1339 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1340 s6 = t[5+idt];s7 = t[6+idt]; 1341 for(m=0;m<nz;m++){ 1342 idx = 7*vi[m]; 1343 x1 = t[idx]; x2 = t[1+idx]; 1344 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1345 x6 = t[5+idx]; x7 = t[6+idx]; 1346 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1347 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1348 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1349 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1350 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1351 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1352 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1353 v += 49; 1354 } 1355 idc = 7*c[i]; 1356 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1357 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1358 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1359 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1360 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1361 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1362 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1363 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1364 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1365 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1366 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1367 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1368 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1369 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1370 } 1371 1372 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1373 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1374 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1375 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1376 ierr = 
PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1377 PetscFunctionReturn(0); 1378 } 1379 1380 #undef __FUNCT__ 1381 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1382 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1383 { 1384 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1385 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1386 PetscErrorCode ierr; 1387 PetscInt *diag = a->diag,jdx; 1388 const MatScalar *aa=a->a,*v; 1389 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1390 const PetscScalar *b; 1391 1392 PetscFunctionBegin; 1393 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1394 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1395 /* forward solve the lower triangular */ 1396 idx = 0; 1397 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1398 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1399 x[6] = b[6+idx]; 1400 for (i=1; i<n; i++) { 1401 v = aa + 49*ai[i]; 1402 vi = aj + ai[i]; 1403 nz = diag[i] - ai[i]; 1404 idx = 7*i; 1405 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1406 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1407 s7 = b[6+idx]; 1408 while (nz--) { 1409 jdx = 7*(*vi++); 1410 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1411 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1412 x7 = x[6+jdx]; 1413 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1414 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1415 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1416 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1417 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1418 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1419 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1420 v += 49; 1421 } 1422 x[idx] = s1; 1423 x[1+idx] = s2; 1424 x[2+idx] = s3; 1425 x[3+idx] = s4; 1426 x[4+idx] = s5; 
1427 x[5+idx] = s6; 1428 x[6+idx] = s7; 1429 } 1430 /* backward solve the upper triangular */ 1431 for (i=n-1; i>=0; i--){ 1432 v = aa + 49*diag[i] + 49; 1433 vi = aj + diag[i] + 1; 1434 nz = ai[i+1] - diag[i] - 1; 1435 idt = 7*i; 1436 s1 = x[idt]; s2 = x[1+idt]; 1437 s3 = x[2+idt]; s4 = x[3+idt]; 1438 s5 = x[4+idt]; s6 = x[5+idt]; 1439 s7 = x[6+idt]; 1440 while (nz--) { 1441 idx = 7*(*vi++); 1442 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1443 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1444 x7 = x[6+idx]; 1445 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1446 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1447 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1448 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1449 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1450 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1451 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1452 v += 49; 1453 } 1454 v = aa + 49*diag[i]; 1455 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1456 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1457 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1458 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1459 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1460 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1461 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1462 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1463 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1464 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1465 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1466 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1467 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1468 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1469 } 1470 1471 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1472 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1473 ierr = PetscLogFlops(2.0*36*(a->nz) - 
6.0*A->cmap->n);CHKERRQ(ierr); 1474 PetscFunctionReturn(0); 1475 } 1476 1477 #undef __FUNCT__ 1478 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct" 1479 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1480 { 1481 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1482 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 1483 PetscErrorCode ierr; 1484 PetscInt idx,jdx,idt; 1485 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1486 const MatScalar *aa=a->a,*v; 1487 PetscScalar *x; 1488 const PetscScalar *b; 1489 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1490 1491 PetscFunctionBegin; 1492 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1493 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1494 /* forward solve the lower triangular */ 1495 idx = 0; 1496 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1497 x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 1498 for (i=1; i<n; i++) { 1499 v = aa + bs2*ai[i]; 1500 vi = aj + ai[i]; 1501 nz = ai[i+1] - ai[i]; 1502 idx = bs*i; 1503 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1504 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1505 for(k=0;k<nz;k++) { 1506 jdx = bs*vi[k]; 1507 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1508 x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 1509 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1510 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1511 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1512 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1513 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1514 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1515 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1516 v += bs2; 1517 } 1518 1519 x[idx] = s1; 1520 x[1+idx] = s2; 1521 x[2+idx] = s3; 1522 
x[3+idx] = s4; 1523 x[4+idx] = s5; 1524 x[5+idx] = s6; 1525 x[6+idx] = s7; 1526 } 1527 1528 /* backward solve the upper triangular */ 1529 for (i=n-1; i>=0; i--){ 1530 v = aa + bs2*ai[2*n-i]; 1531 vi = aj + ai[2*n-i]; 1532 nz = ai[2*n-i +1] - ai[2*n-i]-1; 1533 idt = bs*i; 1534 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1535 s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 1536 for(k=0;k<nz;k++) { 1537 idx = bs*vi[k]; 1538 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1539 x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 1540 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1541 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1542 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1543 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1544 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1545 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1546 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1547 v += bs2; 1548 } 1549 /* x = inv_diagonal*x */ 1550 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1551 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1552 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1553 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1554 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1555 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1556 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1557 } 1558 1559 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1560 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1561 ierr = PetscLogFlops(2.0*bs2*(a->nz) - 
bs*A->cmap->n);CHKERRQ(ierr); 1562 PetscFunctionReturn(0); 1563 } 1564 1565 #undef __FUNCT__ 1566 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2" 1567 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 1568 { 1569 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1570 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 1571 PetscErrorCode ierr; 1572 PetscInt idx,jdx,idt; 1573 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1574 const MatScalar *aa=a->a,*v; 1575 PetscScalar *x; 1576 const PetscScalar *b; 1577 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1578 1579 PetscFunctionBegin; 1580 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1581 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1582 /* forward solve the lower triangular */ 1583 idx = 0; 1584 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1585 x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 1586 for (i=1; i<n; i++) { 1587 v = aa + bs2*ai[i]; 1588 vi = aj + ai[i]; 1589 nz = ai[i+1] - ai[i]; 1590 idx = bs*i; 1591 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1592 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1593 for(k=0;k<nz;k++) { 1594 jdx = bs*vi[k]; 1595 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1596 x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 1597 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1598 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1599 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1600 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1601 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1602 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1603 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1604 v += bs2; 1605 } 1606 1607 x[idx] = s1; 1608 x[1+idx] = s2; 1609 
x[2+idx] = s3; 1610 x[3+idx] = s4; 1611 x[4+idx] = s5; 1612 x[5+idx] = s6; 1613 x[6+idx] = s7; 1614 } 1615 1616 /* backward solve the upper triangular */ 1617 for (i=n-1; i>=0; i--){ 1618 v = aa + bs2*(adiag[i+1]+1); 1619 vi = aj + adiag[i+1]+1; 1620 nz = adiag[i] - adiag[i+1]-1; 1621 idt = bs*i; 1622 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1623 s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 1624 for(k=0;k<nz;k++) { 1625 idx = bs*vi[k]; 1626 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1627 x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 1628 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1629 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1630 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1631 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1632 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1633 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1634 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1635 v += bs2; 1636 } 1637 /* x = inv_diagonal*x */ 1638 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1639 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1640 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1641 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1642 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1643 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1644 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1645 } 1646 1647 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1648 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1649 ierr = 
PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1650 PetscFunctionReturn(0); 1651 } 1652 1653 #undef __FUNCT__ 1654 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1655 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1656 { 1657 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1658 IS iscol=a->col,isrow=a->row; 1659 PetscErrorCode ierr; 1660 const PetscInt *r,*c,*rout,*cout; 1661 PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1662 const MatScalar *aa=a->a,*v; 1663 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1664 const PetscScalar *b; 1665 PetscFunctionBegin; 1666 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1667 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1668 t = a->solve_work; 1669 1670 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1671 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1672 1673 /* forward solve the lower triangular */ 1674 idx = 6*(*r++); 1675 t[0] = b[idx]; t[1] = b[1+idx]; 1676 t[2] = b[2+idx]; t[3] = b[3+idx]; 1677 t[4] = b[4+idx]; t[5] = b[5+idx]; 1678 for (i=1; i<n; i++) { 1679 v = aa + 36*ai[i]; 1680 vi = aj + ai[i]; 1681 nz = diag[i] - ai[i]; 1682 idx = 6*(*r++); 1683 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1684 s5 = b[4+idx]; s6 = b[5+idx]; 1685 while (nz--) { 1686 idx = 6*(*vi++); 1687 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1688 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1689 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1690 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1691 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1692 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1693 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1694 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1695 v += 36; 1696 } 1697 idx = 6*i; 1698 t[idx] = s1;t[1+idx] = s2; 1699 t[2+idx] = s3;t[3+idx] = s4; 1700 t[4+idx] = s5;t[5+idx] = 
s6; 1701 } 1702 /* backward solve the upper triangular */ 1703 for (i=n-1; i>=0; i--){ 1704 v = aa + 36*diag[i] + 36; 1705 vi = aj + diag[i] + 1; 1706 nz = ai[i+1] - diag[i] - 1; 1707 idt = 6*i; 1708 s1 = t[idt]; s2 = t[1+idt]; 1709 s3 = t[2+idt];s4 = t[3+idt]; 1710 s5 = t[4+idt];s6 = t[5+idt]; 1711 while (nz--) { 1712 idx = 6*(*vi++); 1713 x1 = t[idx]; x2 = t[1+idx]; 1714 x3 = t[2+idx]; x4 = t[3+idx]; 1715 x5 = t[4+idx]; x6 = t[5+idx]; 1716 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1717 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1718 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1719 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1720 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1721 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1722 v += 36; 1723 } 1724 idc = 6*(*c--); 1725 v = aa + 36*diag[i]; 1726 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1727 v[18]*s4+v[24]*s5+v[30]*s6; 1728 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1729 v[19]*s4+v[25]*s5+v[31]*s6; 1730 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1731 v[20]*s4+v[26]*s5+v[32]*s6; 1732 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1733 v[21]*s4+v[27]*s5+v[33]*s6; 1734 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1735 v[22]*s4+v[28]*s5+v[34]*s6; 1736 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1737 v[23]*s4+v[29]*s5+v[35]*s6; 1738 } 1739 1740 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1741 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1742 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1743 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1744 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1745 PetscFunctionReturn(0); 1746 } 1747 1748 #undef __FUNCT__ 1749 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct" 1750 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx) 1751 { 1752 
Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1753 IS iscol=a->col,isrow=a->row; 1754 PetscErrorCode ierr; 1755 const PetscInt *r,*c,*rout,*cout; 1756 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 1757 const MatScalar *aa=a->a,*v; 1758 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1759 const PetscScalar *b; 1760 PetscFunctionBegin; 1761 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1762 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1763 t = a->solve_work; 1764 1765 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1766 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1767 1768 /* forward solve the lower triangular */ 1769 idx = 6*r[0]; 1770 t[0] = b[idx]; t[1] = b[1+idx]; 1771 t[2] = b[2+idx]; t[3] = b[3+idx]; 1772 t[4] = b[4+idx]; t[5] = b[5+idx]; 1773 for (i=1; i<n; i++) { 1774 v = aa + 36*ai[i]; 1775 vi = aj + ai[i]; 1776 nz = ai[i+1] - ai[i]; 1777 idx = 6*r[i]; 1778 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1779 s5 = b[4+idx]; s6 = b[5+idx]; 1780 for(m=0;m<nz;m++){ 1781 idx = 6*vi[m]; 1782 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1783 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1784 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1785 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1786 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1787 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1788 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1789 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1790 v += 36; 1791 } 1792 idx = 6*i; 1793 t[idx] = s1;t[1+idx] = s2; 1794 t[2+idx] = s3;t[3+idx] = s4; 1795 t[4+idx] = s5;t[5+idx] = s6; 1796 } 1797 /* backward solve the upper triangular */ 1798 for (i=n-1; i>=0; i--){ 1799 k = 2*n-i; 1800 v = aa + 36*ai[k]; 1801 vi = aj + ai[k]; 1802 nz = ai[k+1] - ai[k] - 1; 1803 idt = 6*i; 1804 s1 = t[idt]; s2 = t[1+idt]; 1805 s3 = t[2+idt];s4 = t[3+idt]; 
1806 s5 = t[4+idt];s6 = t[5+idt]; 1807 for(m=0;m<nz;m++){ 1808 idx = 6*vi[m]; 1809 x1 = t[idx]; x2 = t[1+idx]; 1810 x3 = t[2+idx]; x4 = t[3+idx]; 1811 x5 = t[4+idx]; x6 = t[5+idx]; 1812 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1813 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1814 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1815 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1816 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1817 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1818 v += 36; 1819 } 1820 idc = 6*c[i]; 1821 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1822 v[18]*s4+v[24]*s5+v[30]*s6; 1823 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1824 v[19]*s4+v[25]*s5+v[31]*s6; 1825 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1826 v[20]*s4+v[26]*s5+v[32]*s6; 1827 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1828 v[21]*s4+v[27]*s5+v[33]*s6; 1829 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1830 v[22]*s4+v[28]*s5+v[34]*s6; 1831 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1832 v[23]*s4+v[29]*s5+v[35]*s6; 1833 } 1834 1835 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1836 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1837 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1838 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1839 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1840 PetscFunctionReturn(0); 1841 } 1842 1843 #undef __FUNCT__ 1844 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct_v2" 1845 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct_v2(Mat A,Vec bb,Vec xx) 1846 { 1847 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1848 IS iscol=a->col,isrow=a->row; 1849 PetscErrorCode ierr; 1850 const PetscInt *r,*c,*rout,*cout; 1851 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 1852 const MatScalar *aa=a->a,*v; 1853 PetscScalar 
*x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1854 const PetscScalar *b; 1855 PetscFunctionBegin; 1856 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1857 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1858 t = a->solve_work; 1859 1860 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1861 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1862 1863 /* forward solve the lower triangular */ 1864 idx = 6*r[0]; 1865 t[0] = b[idx]; t[1] = b[1+idx]; 1866 t[2] = b[2+idx]; t[3] = b[3+idx]; 1867 t[4] = b[4+idx]; t[5] = b[5+idx]; 1868 for (i=1; i<n; i++) { 1869 v = aa + 36*ai[i]; 1870 vi = aj + ai[i]; 1871 nz = ai[i+1] - ai[i]; 1872 idx = 6*r[i]; 1873 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1874 s5 = b[4+idx]; s6 = b[5+idx]; 1875 for(m=0;m<nz;m++){ 1876 idx = 6*vi[m]; 1877 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1878 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1879 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1880 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1881 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1882 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1883 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1884 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1885 v += 36; 1886 } 1887 idx = 6*i; 1888 t[idx] = s1;t[1+idx] = s2; 1889 t[2+idx] = s3;t[3+idx] = s4; 1890 t[4+idx] = s5;t[5+idx] = s6; 1891 } 1892 /* backward solve the upper triangular */ 1893 for (i=n-1; i>=0; i--){ 1894 v = aa + 36*(adiag[i+1]+1); 1895 vi = aj + adiag[i+1]+1; 1896 nz = adiag[i] - adiag[i+1] - 1; 1897 idt = 6*i; 1898 s1 = t[idt]; s2 = t[1+idt]; 1899 s3 = t[2+idt];s4 = t[3+idt]; 1900 s5 = t[4+idt];s6 = t[5+idt]; 1901 for(m=0;m<nz;m++){ 1902 idx = 6*vi[m]; 1903 x1 = t[idx]; x2 = t[1+idx]; 1904 x3 = t[2+idx]; x4 = t[3+idx]; 1905 x5 = t[4+idx]; x6 = t[5+idx]; 1906 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + 
v[30]*x6; 1907 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1908 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1909 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1910 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1911 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1912 v += 36; 1913 } 1914 idc = 6*c[i]; 1915 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1916 v[18]*s4+v[24]*s5+v[30]*s6; 1917 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1918 v[19]*s4+v[25]*s5+v[31]*s6; 1919 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1920 v[20]*s4+v[26]*s5+v[32]*s6; 1921 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1922 v[21]*s4+v[27]*s5+v[33]*s6; 1923 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1924 v[22]*s4+v[28]*s5+v[34]*s6; 1925 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1926 v[23]*s4+v[29]*s5+v[35]*s6; 1927 } 1928 1929 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1930 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1931 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1932 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1933 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1934 PetscFunctionReturn(0); 1935 } 1936 1937 #undef __FUNCT__ 1938 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1939 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1940 { 1941 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1942 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1943 PetscErrorCode ierr; 1944 PetscInt *diag = a->diag,jdx; 1945 const MatScalar *aa=a->a,*v; 1946 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1947 const PetscScalar *b; 1948 1949 PetscFunctionBegin; 1950 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1951 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1952 /* forward solve the lower triangular */ 1953 idx = 0; 1954 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1955 
x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1956 for (i=1; i<n; i++) { 1957 v = aa + 36*ai[i]; 1958 vi = aj + ai[i]; 1959 nz = diag[i] - ai[i]; 1960 idx = 6*i; 1961 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1962 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1963 while (nz--) { 1964 jdx = 6*(*vi++); 1965 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1966 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1967 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1968 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1969 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1970 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1971 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1972 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1973 v += 36; 1974 } 1975 x[idx] = s1; 1976 x[1+idx] = s2; 1977 x[2+idx] = s3; 1978 x[3+idx] = s4; 1979 x[4+idx] = s5; 1980 x[5+idx] = s6; 1981 } 1982 /* backward solve the upper triangular */ 1983 for (i=n-1; i>=0; i--){ 1984 v = aa + 36*diag[i] + 36; 1985 vi = aj + diag[i] + 1; 1986 nz = ai[i+1] - diag[i] - 1; 1987 idt = 6*i; 1988 s1 = x[idt]; s2 = x[1+idt]; 1989 s3 = x[2+idt]; s4 = x[3+idt]; 1990 s5 = x[4+idt]; s6 = x[5+idt]; 1991 while (nz--) { 1992 idx = 6*(*vi++); 1993 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1994 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1995 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1996 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1997 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1998 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1999 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2000 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2001 v += 36; 2002 } 2003 v = aa + 36*diag[i]; 2004 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2005 
x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2006 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2007 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2008 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2009 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 2010 } 2011 2012 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2013 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2014 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2015 PetscFunctionReturn(0); 2016 } 2017 2018 #undef __FUNCT__ 2019 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct" 2020 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2021 { 2022 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2023 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 2024 PetscErrorCode ierr; 2025 PetscInt idx,jdx,idt; 2026 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2027 const MatScalar *aa=a->a,*v; 2028 PetscScalar *x; 2029 const PetscScalar *b; 2030 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2031 2032 PetscFunctionBegin; 2033 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2034 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2035 /* forward solve the lower triangular */ 2036 idx = 0; 2037 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2038 x[4] = b[4+idx];x[5] = b[5+idx]; 2039 for (i=1; i<n; i++) { 2040 v = aa + bs2*ai[i]; 2041 vi = aj + ai[i]; 2042 nz = ai[i+1] - ai[i]; 2043 idx = bs*i; 2044 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2045 s5 = b[4+idx];s6 = b[5+idx]; 2046 for(k=0;k<nz;k++){ 2047 jdx = bs*vi[k]; 2048 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2049 x5 = x[4+jdx]; x6 = x[5+jdx]; 2050 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2051 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2052 s3 -= v[2]*x1 + v[8]*x2 + 
v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2053 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2054 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2055 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2056 v += bs2; 2057 } 2058 2059 x[idx] = s1; 2060 x[1+idx] = s2; 2061 x[2+idx] = s3; 2062 x[3+idx] = s4; 2063 x[4+idx] = s5; 2064 x[5+idx] = s6; 2065 } 2066 2067 /* backward solve the upper triangular */ 2068 for (i=n-1; i>=0; i--){ 2069 v = aa + bs2*ai[2*n-i]; 2070 vi = aj + ai[2*n-i]; 2071 nz = ai[2*n-i +1] - ai[2*n-i]-1; 2072 idt = bs*i; 2073 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2074 s5 = x[4+idt];s6 = x[5+idt]; 2075 for(k=0;k<nz;k++){ 2076 idx = bs*vi[k]; 2077 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2078 x5 = x[4+idx];x6 = x[5+idx]; 2079 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2080 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2081 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2082 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2083 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2084 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2085 v += bs2; 2086 } 2087 /* x = inv_diagonal*x */ 2088 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2089 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2090 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2091 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2092 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2093 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 2094 } 2095 2096 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2097 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2098 ierr = PetscLogFlops(2.0*bs2*(a->nz) - 
bs*A->cmap->n);CHKERRQ(ierr); 2099 PetscFunctionReturn(0); 2100 } 2101 2102 #undef __FUNCT__ 2103 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2" 2104 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 2105 { 2106 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2107 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 2108 PetscErrorCode ierr; 2109 PetscInt idx,jdx,idt; 2110 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2111 const MatScalar *aa=a->a,*v; 2112 PetscScalar *x; 2113 const PetscScalar *b; 2114 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2115 2116 PetscFunctionBegin; 2117 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2118 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2119 /* forward solve the lower triangular */ 2120 idx = 0; 2121 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2122 x[4] = b[4+idx];x[5] = b[5+idx]; 2123 for (i=1; i<n; i++) { 2124 v = aa + bs2*ai[i]; 2125 vi = aj + ai[i]; 2126 nz = ai[i+1] - ai[i]; 2127 idx = bs*i; 2128 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2129 s5 = b[4+idx];s6 = b[5+idx]; 2130 for(k=0;k<nz;k++){ 2131 jdx = bs*vi[k]; 2132 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2133 x5 = x[4+jdx]; x6 = x[5+jdx]; 2134 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2135 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2136 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2137 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2138 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2139 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2140 v += bs2; 2141 } 2142 2143 x[idx] = s1; 2144 x[1+idx] = s2; 2145 x[2+idx] = s3; 2146 x[3+idx] = s4; 2147 x[4+idx] = s5; 2148 x[5+idx] = s6; 2149 } 2150 2151 /* backward solve the upper triangular */ 2152 for (i=n-1; i>=0; i--){ 2153 v = aa + bs2*(adiag[i+1]+1); 2154 vi = aj 
+ adiag[i+1]+1; 2155 nz = adiag[i] - adiag[i+1]-1; 2156 idt = bs*i; 2157 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2158 s5 = x[4+idt];s6 = x[5+idt]; 2159 for(k=0;k<nz;k++){ 2160 idx = bs*vi[k]; 2161 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2162 x5 = x[4+idx];x6 = x[5+idx]; 2163 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2164 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2165 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2166 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2167 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2168 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2169 v += bs2; 2170 } 2171 /* x = inv_diagonal*x */ 2172 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2173 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2174 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2175 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2176 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2177 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 2178 } 2179 2180 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2181 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2182 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2183 PetscFunctionReturn(0); 2184 } 2185 2186 #undef __FUNCT__ 2187 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 2188 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 2189 { 2190 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2191 IS iscol=a->col,isrow=a->row; 2192 PetscErrorCode ierr; 2193 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 2194 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2195 const MatScalar *aa=a->a,*v; 2196 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2197 const PetscScalar *b; 2198 2199 
PetscFunctionBegin; 2200 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2201 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2202 t = a->solve_work; 2203 2204 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2205 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2206 2207 /* forward solve the lower triangular */ 2208 idx = 5*(*r++); 2209 t[0] = b[idx]; t[1] = b[1+idx]; 2210 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2211 for (i=1; i<n; i++) { 2212 v = aa + 25*ai[i]; 2213 vi = aj + ai[i]; 2214 nz = diag[i] - ai[i]; 2215 idx = 5*(*r++); 2216 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2217 s5 = b[4+idx]; 2218 while (nz--) { 2219 idx = 5*(*vi++); 2220 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2221 x4 = t[3+idx];x5 = t[4+idx]; 2222 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2223 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2224 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2225 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2226 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2227 v += 25; 2228 } 2229 idx = 5*i; 2230 t[idx] = s1;t[1+idx] = s2; 2231 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2232 } 2233 /* backward solve the upper triangular */ 2234 for (i=n-1; i>=0; i--){ 2235 v = aa + 25*diag[i] + 25; 2236 vi = aj + diag[i] + 1; 2237 nz = ai[i+1] - diag[i] - 1; 2238 idt = 5*i; 2239 s1 = t[idt]; s2 = t[1+idt]; 2240 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2241 while (nz--) { 2242 idx = 5*(*vi++); 2243 x1 = t[idx]; x2 = t[1+idx]; 2244 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2245 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2246 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2247 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2248 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2249 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2250 v += 25; 2251 } 2252 idc = 5*(*c--); 2253 v = aa + 25*diag[i]; 2254 
x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2255 v[15]*s4+v[20]*s5; 2256 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 2257 v[16]*s4+v[21]*s5; 2258 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2259 v[17]*s4+v[22]*s5; 2260 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 2261 v[18]*s4+v[23]*s5; 2262 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2263 v[19]*s4+v[24]*s5; 2264 } 2265 2266 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2267 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2268 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2269 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2270 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2271 PetscFunctionReturn(0); 2272 } 2273 2274 #undef __FUNCT__ 2275 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct" 2276 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx) 2277 { 2278 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2279 IS iscol=a->col,isrow=a->row; 2280 PetscErrorCode ierr; 2281 const PetscInt *r,*c,*rout,*cout; 2282 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 2283 const MatScalar *aa=a->a,*v; 2284 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2285 const PetscScalar *b; 2286 2287 PetscFunctionBegin; 2288 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2289 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2290 t = a->solve_work; 2291 2292 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2293 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2294 2295 /* forward solve the lower triangular */ 2296 idx = 5*r[0]; 2297 t[0] = b[idx]; t[1] = b[1+idx]; 2298 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2299 for (i=1; i<n; i++) { 2300 v = aa + 25*ai[i]; 2301 vi = aj + ai[i]; 2302 nz = ai[i+1] - ai[i]; 2303 idx = 5*r[i]; 2304 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2305 s5 = b[4+idx]; 2306 for(m=0;m<nz;m++){ 2307 idx = 5*vi[m]; 2308 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2309 x4 = t[3+idx];x5 = t[4+idx]; 
2310 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2311 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2312 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2313 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2314 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2315 v += 25; 2316 } 2317 idx = 5*i; 2318 t[idx] = s1;t[1+idx] = s2; 2319 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2320 } 2321 /* backward solve the upper triangular */ 2322 for (i=n-1; i>=0; i--){ 2323 k = 2*n-i; 2324 v = aa + 25*ai[k]; 2325 vi = aj + ai[k]; 2326 nz = ai[k+1] - ai[k] - 1; 2327 idt = 5*i; 2328 s1 = t[idt]; s2 = t[1+idt]; 2329 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2330 for(m=0;m<nz;m++){ 2331 idx = 5*vi[m]; 2332 x1 = t[idx]; x2 = t[1+idx]; 2333 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2334 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2335 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2336 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2337 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2338 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2339 v += 25; 2340 } 2341 idc = 5*c[i]; 2342 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2343 v[15]*s4+v[20]*s5; 2344 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 2345 v[16]*s4+v[21]*s5; 2346 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2347 v[17]*s4+v[22]*s5; 2348 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 2349 v[18]*s4+v[23]*s5; 2350 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2351 v[19]*s4+v[24]*s5; 2352 } 2353 2354 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2355 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2356 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2357 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2358 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2359 PetscFunctionReturn(0); 2360 } 2361 2362 #undef __FUNCT__ 2363 #define __FUNCT__ 
"MatSolve_SeqBAIJ_5_newdatastruct_v2" 2364 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct_v2(Mat A,Vec bb,Vec xx) 2365 { 2366 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2367 IS iscol=a->col,isrow=a->row; 2368 PetscErrorCode ierr; 2369 const PetscInt *r,*c,*rout,*cout; 2370 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 2371 const MatScalar *aa=a->a,*v; 2372 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2373 const PetscScalar *b; 2374 2375 PetscFunctionBegin; 2376 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2377 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2378 t = a->solve_work; 2379 2380 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2381 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2382 2383 /* forward solve the lower triangular */ 2384 idx = 5*r[0]; 2385 t[0] = b[idx]; t[1] = b[1+idx]; 2386 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2387 for (i=1; i<n; i++) { 2388 v = aa + 25*ai[i]; 2389 vi = aj + ai[i]; 2390 nz = ai[i+1] - ai[i]; 2391 idx = 5*r[i]; 2392 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2393 s5 = b[4+idx]; 2394 for(m=0;m<nz;m++){ 2395 idx = 5*vi[m]; 2396 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2397 x4 = t[3+idx];x5 = t[4+idx]; 2398 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2399 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2400 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2401 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2402 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2403 v += 25; 2404 } 2405 idx = 5*i; 2406 t[idx] = s1;t[1+idx] = s2; 2407 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2408 } 2409 /* backward solve the upper triangular */ 2410 for (i=n-1; i>=0; i--){ 2411 v = aa + 25*(adiag[i+1]+1); 2412 vi = aj + adiag[i+1]+1; 2413 nz = adiag[i] - adiag[i+1] - 1; 2414 idt = 5*i; 2415 s1 = t[idt]; s2 = t[1+idt]; 2416 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2417 for(m=0;m<nz;m++){ 2418 idx = 5*vi[m]; 
2419 x1 = t[idx]; x2 = t[1+idx]; 2420 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2421 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2422 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2423 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2424 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2425 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2426 v += 25; 2427 } 2428 idc = 5*c[i]; 2429 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2430 v[15]*s4+v[20]*s5; 2431 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 2432 v[16]*s4+v[21]*s5; 2433 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2434 v[17]*s4+v[22]*s5; 2435 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 2436 v[18]*s4+v[23]*s5; 2437 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2438 v[19]*s4+v[24]*s5; 2439 } 2440 2441 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2442 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2443 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2444 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2445 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2446 PetscFunctionReturn(0); 2447 } 2448 2449 #undef __FUNCT__ 2450 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 2451 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 2452 { 2453 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2454 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2455 PetscErrorCode ierr; 2456 PetscInt *diag = a->diag,jdx; 2457 const MatScalar *aa=a->a,*v; 2458 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2459 const PetscScalar *b; 2460 2461 PetscFunctionBegin; 2462 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2463 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2464 /* forward solve the lower triangular */ 2465 idx = 0; 2466 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2467 for (i=1; i<n; i++) { 2468 v = aa + 25*ai[i]; 2469 vi = aj + ai[i]; 2470 nz = diag[i] - 
ai[i]; 2471 idx = 5*i; 2472 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 2473 while (nz--) { 2474 jdx = 5*(*vi++); 2475 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2476 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2477 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2478 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2479 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2480 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2481 v += 25; 2482 } 2483 x[idx] = s1; 2484 x[1+idx] = s2; 2485 x[2+idx] = s3; 2486 x[3+idx] = s4; 2487 x[4+idx] = s5; 2488 } 2489 /* backward solve the upper triangular */ 2490 for (i=n-1; i>=0; i--){ 2491 v = aa + 25*diag[i] + 25; 2492 vi = aj + diag[i] + 1; 2493 nz = ai[i+1] - diag[i] - 1; 2494 idt = 5*i; 2495 s1 = x[idt]; s2 = x[1+idt]; 2496 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 2497 while (nz--) { 2498 idx = 5*(*vi++); 2499 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2500 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2501 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2502 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2503 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2504 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2505 v += 25; 2506 } 2507 v = aa + 25*diag[i]; 2508 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2509 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2510 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2511 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2512 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2513 } 2514 2515 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2516 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2517 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2518 PetscFunctionReturn(0); 2519 } 2520 2521 #undef 
__FUNCT__ 2522 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct" 2523 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2524 { 2525 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2526 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2527 PetscErrorCode ierr; 2528 PetscInt jdx; 2529 const MatScalar *aa=a->a,*v; 2530 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2531 const PetscScalar *b; 2532 2533 PetscFunctionBegin; 2534 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2535 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2536 /* forward solve the lower triangular */ 2537 idx = 0; 2538 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2539 for (i=1; i<n; i++) { 2540 v = aa + 25*ai[i]; 2541 vi = aj + ai[i]; 2542 nz = ai[i+1] - ai[i]; 2543 idx = 5*i; 2544 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 2545 for(k=0;k<nz;k++) { 2546 jdx = 5*vi[k]; 2547 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2548 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2549 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2550 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2551 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2552 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2553 v += 25; 2554 } 2555 x[idx] = s1; 2556 x[1+idx] = s2; 2557 x[2+idx] = s3; 2558 x[3+idx] = s4; 2559 x[4+idx] = s5; 2560 } 2561 2562 /* backward solve the upper triangular */ 2563 for (i=n-1; i>=0; i--){ 2564 v = aa + 25*ai[2*n-i]; 2565 vi = aj + ai[2*n-i]; 2566 nz = ai[2*n-i +1] - ai[2*n-i]-1; 2567 idt = 5*i; 2568 s1 = x[idt]; s2 = x[1+idt]; 2569 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 2570 for(k=0;k<nz;k++){ 2571 idx = 5*vi[k]; 2572 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2573 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2574 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 
2575 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2576 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2577 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2578 v += 25; 2579 } 2580 /* x = inv_diagonal*x */ 2581 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2582 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2583 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2584 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2585 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2586 } 2587 2588 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2589 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2590 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2591 PetscFunctionReturn(0); 2592 } 2593 2594 #undef __FUNCT__ 2595 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2" 2596 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 2597 { 2598 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2599 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 2600 PetscErrorCode ierr; 2601 PetscInt jdx; 2602 const MatScalar *aa=a->a,*v; 2603 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2604 const PetscScalar *b; 2605 2606 PetscFunctionBegin; 2607 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2608 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2609 /* forward solve the lower triangular */ 2610 idx = 0; 2611 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2612 for (i=1; i<n; i++) { 2613 v = aa + 25*ai[i]; 2614 vi = aj + ai[i]; 2615 nz = ai[i+1] - ai[i]; 2616 idx = 5*i; 2617 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 2618 for(k=0;k<nz;k++) { 2619 jdx = 5*vi[k]; 2620 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2621 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2622 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + 
v[16]*x4 + v[21]*x5; 2623 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2624 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2625 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2626 v += 25; 2627 } 2628 x[idx] = s1; 2629 x[1+idx] = s2; 2630 x[2+idx] = s3; 2631 x[3+idx] = s4; 2632 x[4+idx] = s5; 2633 } 2634 2635 /* backward solve the upper triangular */ 2636 for (i=n-1; i>=0; i--){ 2637 v = aa + 25*(adiag[i+1]+1); 2638 vi = aj + adiag[i+1]+1; 2639 nz = adiag[i] - adiag[i+1]-1; 2640 idt = 5*i; 2641 s1 = x[idt]; s2 = x[1+idt]; 2642 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 2643 for(k=0;k<nz;k++){ 2644 idx = 5*vi[k]; 2645 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2646 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2647 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2648 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2649 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2650 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2651 v += 25; 2652 } 2653 /* x = inv_diagonal*x */ 2654 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2655 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2656 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2657 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2658 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2659 } 2660 2661 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2662 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2663 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2664 PetscFunctionReturn(0); 2665 } 2666 2667 #undef __FUNCT__ 2668 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 2669 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 2670 { 2671 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2672 IS iscol=a->col,isrow=a->row; 2673 PetscErrorCode ierr; 2674 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2675 const 
PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2676 const MatScalar *aa=a->a,*v; 2677 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2678 const PetscScalar *b; 2679 2680 PetscFunctionBegin; 2681 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2682 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2683 t = a->solve_work; 2684 2685 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2686 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2687 2688 /* forward solve the lower triangular */ 2689 idx = 4*(*r++); 2690 t[0] = b[idx]; t[1] = b[1+idx]; 2691 t[2] = b[2+idx]; t[3] = b[3+idx]; 2692 for (i=1; i<n; i++) { 2693 v = aa + 16*ai[i]; 2694 vi = aj + ai[i]; 2695 nz = diag[i] - ai[i]; 2696 idx = 4*(*r++); 2697 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2698 while (nz--) { 2699 idx = 4*(*vi++); 2700 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2701 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2702 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2703 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2704 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2705 v += 16; 2706 } 2707 idx = 4*i; 2708 t[idx] = s1;t[1+idx] = s2; 2709 t[2+idx] = s3;t[3+idx] = s4; 2710 } 2711 /* backward solve the upper triangular */ 2712 for (i=n-1; i>=0; i--){ 2713 v = aa + 16*diag[i] + 16; 2714 vi = aj + diag[i] + 1; 2715 nz = ai[i+1] - diag[i] - 1; 2716 idt = 4*i; 2717 s1 = t[idt]; s2 = t[1+idt]; 2718 s3 = t[2+idt];s4 = t[3+idt]; 2719 while (nz--) { 2720 idx = 4*(*vi++); 2721 x1 = t[idx]; x2 = t[1+idx]; 2722 x3 = t[2+idx]; x4 = t[3+idx]; 2723 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2724 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2725 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2726 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2727 v += 16; 2728 } 2729 idc = 4*(*c--); 2730 v = aa + 16*diag[i]; 2731 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2732 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2733 x[2+idc] = t[2+idt] = 
v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2734 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2735 } 2736 2737 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2738 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2739 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2740 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2741 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2742 PetscFunctionReturn(0); 2743 } 2744 2745 #undef __FUNCT__ 2746 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct" 2747 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx) 2748 { 2749 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2750 IS iscol=a->col,isrow=a->row; 2751 PetscErrorCode ierr; 2752 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 2753 const PetscInt *r,*c,*rout,*cout; 2754 const MatScalar *aa=a->a,*v; 2755 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2756 const PetscScalar *b; 2757 2758 PetscFunctionBegin; 2759 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2760 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2761 t = a->solve_work; 2762 2763 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2764 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2765 2766 /* forward solve the lower triangular */ 2767 idx = 4*r[0]; 2768 t[0] = b[idx]; t[1] = b[1+idx]; 2769 t[2] = b[2+idx]; t[3] = b[3+idx]; 2770 for (i=1; i<n; i++) { 2771 v = aa + 16*ai[i]; 2772 vi = aj + ai[i]; 2773 nz = ai[i+1] - ai[i]; 2774 idx = 4*r[i]; 2775 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2776 for(m=0;m<nz;m++){ 2777 idx = 4*vi[m]; 2778 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2779 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2780 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2781 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2782 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2783 v += 16; 2784 } 2785 idx = 4*i; 2786 t[idx] = s1;t[1+idx] = s2; 2787 t[2+idx] = s3;t[3+idx] = s4; 2788 } 2789 /* backward solve the 
upper triangular */ 2790 for (i=n-1; i>=0; i--){ 2791 k = 2*n-i; 2792 v = aa + 16*ai[k]; 2793 vi = aj + ai[k]; 2794 nz = ai[k+1] - ai[k] - 1; 2795 idt = 4*i; 2796 s1 = t[idt]; s2 = t[1+idt]; 2797 s3 = t[2+idt];s4 = t[3+idt]; 2798 for(m=0;m<nz;m++){ 2799 idx = 4*vi[m]; 2800 x1 = t[idx]; x2 = t[1+idx]; 2801 x3 = t[2+idx]; x4 = t[3+idx]; 2802 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2803 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2804 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2805 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2806 v += 16; 2807 } 2808 idc = 4*c[i]; 2809 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2810 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2811 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2812 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2813 } 2814 2815 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2816 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2817 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2818 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2819 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2820 PetscFunctionReturn(0); 2821 } 2822 2823 #undef __FUNCT__ 2824 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct_v2" 2825 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct_v2(Mat A,Vec bb,Vec xx) 2826 { 2827 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2828 IS iscol=a->col,isrow=a->row; 2829 PetscErrorCode ierr; 2830 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 2831 const PetscInt *r,*c,*rout,*cout; 2832 const MatScalar *aa=a->a,*v; 2833 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2834 const PetscScalar *b; 2835 2836 PetscFunctionBegin; 2837 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2838 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2839 t = a->solve_work; 2840 2841 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2842 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2843 2844 /* forward 
solve the lower triangular */ 2845 idx = 4*r[0]; 2846 t[0] = b[idx]; t[1] = b[1+idx]; 2847 t[2] = b[2+idx]; t[3] = b[3+idx]; 2848 for (i=1; i<n; i++) { 2849 v = aa + 16*ai[i]; 2850 vi = aj + ai[i]; 2851 nz = ai[i+1] - ai[i]; 2852 idx = 4*r[i]; 2853 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2854 for(m=0;m<nz;m++){ 2855 idx = 4*vi[m]; 2856 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2857 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2858 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2859 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2860 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2861 v += 16; 2862 } 2863 idx = 4*i; 2864 t[idx] = s1;t[1+idx] = s2; 2865 t[2+idx] = s3;t[3+idx] = s4; 2866 } 2867 /* backward solve the upper triangular */ 2868 for (i=n-1; i>=0; i--){ 2869 v = aa + 16*(adiag[i+1]+1); 2870 vi = aj + adiag[i+1]+1; 2871 nz = adiag[i] - adiag[i+1] - 1; 2872 idt = 4*i; 2873 s1 = t[idt]; s2 = t[1+idt]; 2874 s3 = t[2+idt];s4 = t[3+idt]; 2875 for(m=0;m<nz;m++){ 2876 idx = 4*vi[m]; 2877 x1 = t[idx]; x2 = t[1+idx]; 2878 x3 = t[2+idx]; x4 = t[3+idx]; 2879 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2880 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2881 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2882 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2883 v += 16; 2884 } 2885 idc = 4*c[i]; 2886 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2887 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2888 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2889 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2890 } 2891 2892 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2893 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2894 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2895 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2896 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2897 PetscFunctionReturn(0); 2898 } 2899 2900 #undef __FUNCT__ 2901 #define __FUNCT__ 
"MatSolve_SeqBAIJ_4_Demotion" 2902 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 2903 { 2904 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2905 IS iscol=a->col,isrow=a->row; 2906 PetscErrorCode ierr; 2907 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2908 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2909 const MatScalar *aa=a->a,*v; 2910 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 2911 PetscScalar *x; 2912 const PetscScalar *b; 2913 2914 PetscFunctionBegin; 2915 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2916 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2917 t = (MatScalar *)a->solve_work; 2918 2919 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2920 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2921 2922 /* forward solve the lower triangular */ 2923 idx = 4*(*r++); 2924 t[0] = (MatScalar)b[idx]; 2925 t[1] = (MatScalar)b[1+idx]; 2926 t[2] = (MatScalar)b[2+idx]; 2927 t[3] = (MatScalar)b[3+idx]; 2928 for (i=1; i<n; i++) { 2929 v = aa + 16*ai[i]; 2930 vi = aj + ai[i]; 2931 nz = diag[i] - ai[i]; 2932 idx = 4*(*r++); 2933 s1 = (MatScalar)b[idx]; 2934 s2 = (MatScalar)b[1+idx]; 2935 s3 = (MatScalar)b[2+idx]; 2936 s4 = (MatScalar)b[3+idx]; 2937 while (nz--) { 2938 idx = 4*(*vi++); 2939 x1 = t[idx]; 2940 x2 = t[1+idx]; 2941 x3 = t[2+idx]; 2942 x4 = t[3+idx]; 2943 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2944 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2945 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2946 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2947 v += 16; 2948 } 2949 idx = 4*i; 2950 t[idx] = s1; 2951 t[1+idx] = s2; 2952 t[2+idx] = s3; 2953 t[3+idx] = s4; 2954 } 2955 /* backward solve the upper triangular */ 2956 for (i=n-1; i>=0; i--){ 2957 v = aa + 16*diag[i] + 16; 2958 vi = aj + diag[i] + 1; 2959 nz = ai[i+1] - diag[i] - 1; 2960 idt = 4*i; 2961 s1 = t[idt]; 2962 s2 = t[1+idt]; 2963 s3 = t[2+idt]; 2964 s4 = t[3+idt]; 2965 while (nz--) { 2966 idx = 4*(*vi++); 2967 x1 = t[idx]; 2968 x2 = 
t[1+idx]; 2969 x3 = t[2+idx]; 2970 x4 = t[3+idx]; 2971 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2972 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2973 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2974 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2975 v += 16; 2976 } 2977 idc = 4*(*c--); 2978 v = aa + 16*diag[i]; 2979 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2980 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2981 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2982 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2983 x[idc] = (PetscScalar)t[idt]; 2984 x[1+idc] = (PetscScalar)t[1+idt]; 2985 x[2+idc] = (PetscScalar)t[2+idt]; 2986 x[3+idc] = (PetscScalar)t[3+idt]; 2987 } 2988 2989 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2990 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2991 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2992 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2993 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2994 PetscFunctionReturn(0); 2995 } 2996 2997 #if defined (PETSC_HAVE_SSE) 2998 2999 #include PETSC_HAVE_SSE 3000 3001 #undef __FUNCT__ 3002 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 3003 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 3004 { 3005 /* 3006 Note: This code uses demotion of double 3007 to float when performing the mixed-mode computation. 3008 This may not be numerically reasonable for all applications. 
3009 */ 3010 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3011 IS iscol=a->col,isrow=a->row; 3012 PetscErrorCode ierr; 3013 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 3014 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3015 MatScalar *aa=a->a,*v; 3016 PetscScalar *x,*b,*t; 3017 3018 /* Make space in temp stack for 16 Byte Aligned arrays */ 3019 float ssealignedspace[11],*tmps,*tmpx; 3020 unsigned long offset; 3021 3022 PetscFunctionBegin; 3023 SSE_SCOPE_BEGIN; 3024 3025 offset = (unsigned long)ssealignedspace % 16; 3026 if (offset) offset = (16 - offset)/4; 3027 tmps = &ssealignedspace[offset]; 3028 tmpx = &ssealignedspace[offset+4]; 3029 PREFETCH_NTA(aa+16*ai[1]); 3030 3031 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3032 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3033 t = a->solve_work; 3034 3035 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3036 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3037 3038 /* forward solve the lower triangular */ 3039 idx = 4*(*r++); 3040 t[0] = b[idx]; t[1] = b[1+idx]; 3041 t[2] = b[2+idx]; t[3] = b[3+idx]; 3042 v = aa + 16*ai[1]; 3043 3044 for (i=1; i<n;) { 3045 PREFETCH_NTA(&v[8]); 3046 vi = aj + ai[i]; 3047 nz = diag[i] - ai[i]; 3048 idx = 4*(*r++); 3049 3050 /* Demote sum from double to float */ 3051 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 3052 LOAD_PS(tmps,XMM7); 3053 3054 while (nz--) { 3055 PREFETCH_NTA(&v[16]); 3056 idx = 4*(*vi++); 3057 3058 /* Demote solution (so far) from double to float */ 3059 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 3060 3061 /* 4x4 Matrix-Vector product with negative accumulation: */ 3062 SSE_INLINE_BEGIN_2(tmpx,v) 3063 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3064 3065 /* First Column */ 3066 SSE_COPY_PS(XMM0,XMM6) 3067 SSE_SHUFFLE(XMM0,XMM0,0x00) 3068 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3069 SSE_SUB_PS(XMM7,XMM0) 3070 3071 /* Second Column */ 3072 SSE_COPY_PS(XMM1,XMM6) 3073 SSE_SHUFFLE(XMM1,XMM1,0x55) 3074 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3075 
SSE_SUB_PS(XMM7,XMM1) 3076 3077 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3078 3079 /* Third Column */ 3080 SSE_COPY_PS(XMM2,XMM6) 3081 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3082 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3083 SSE_SUB_PS(XMM7,XMM2) 3084 3085 /* Fourth Column */ 3086 SSE_COPY_PS(XMM3,XMM6) 3087 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3088 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3089 SSE_SUB_PS(XMM7,XMM3) 3090 SSE_INLINE_END_2 3091 3092 v += 16; 3093 } 3094 idx = 4*i; 3095 v = aa + 16*ai[++i]; 3096 PREFETCH_NTA(v); 3097 STORE_PS(tmps,XMM7); 3098 3099 /* Promote result from float to double */ 3100 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 3101 } 3102 /* backward solve the upper triangular */ 3103 idt = 4*(n-1); 3104 ai16 = 16*diag[n-1]; 3105 v = aa + ai16 + 16; 3106 for (i=n-1; i>=0;){ 3107 PREFETCH_NTA(&v[8]); 3108 vi = aj + diag[i] + 1; 3109 nz = ai[i+1] - diag[i] - 1; 3110 3111 /* Demote accumulator from double to float */ 3112 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 3113 LOAD_PS(tmps,XMM7); 3114 3115 while (nz--) { 3116 PREFETCH_NTA(&v[16]); 3117 idx = 4*(*vi++); 3118 3119 /* Demote solution (so far) from double to float */ 3120 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 3121 3122 /* 4x4 Matrix-Vector Product with negative accumulation: */ 3123 SSE_INLINE_BEGIN_2(tmpx,v) 3124 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3125 3126 /* First Column */ 3127 SSE_COPY_PS(XMM0,XMM6) 3128 SSE_SHUFFLE(XMM0,XMM0,0x00) 3129 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3130 SSE_SUB_PS(XMM7,XMM0) 3131 3132 /* Second Column */ 3133 SSE_COPY_PS(XMM1,XMM6) 3134 SSE_SHUFFLE(XMM1,XMM1,0x55) 3135 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3136 SSE_SUB_PS(XMM7,XMM1) 3137 3138 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3139 3140 /* Third Column */ 3141 SSE_COPY_PS(XMM2,XMM6) 3142 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3143 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3144 SSE_SUB_PS(XMM7,XMM2) 3145 3146 /* Fourth Column */ 3147 SSE_COPY_PS(XMM3,XMM6) 3148 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3149 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3150 SSE_SUB_PS(XMM7,XMM3) 
3151 SSE_INLINE_END_2 3152 v += 16; 3153 } 3154 v = aa + ai16; 3155 ai16 = 16*diag[--i]; 3156 PREFETCH_NTA(aa+ai16+16); 3157 /* 3158 Scale the result by the diagonal 4x4 block, 3159 which was inverted as part of the factorization 3160 */ 3161 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 3162 /* First Column */ 3163 SSE_COPY_PS(XMM0,XMM7) 3164 SSE_SHUFFLE(XMM0,XMM0,0x00) 3165 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 3166 3167 /* Second Column */ 3168 SSE_COPY_PS(XMM1,XMM7) 3169 SSE_SHUFFLE(XMM1,XMM1,0x55) 3170 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 3171 SSE_ADD_PS(XMM0,XMM1) 3172 3173 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 3174 3175 /* Third Column */ 3176 SSE_COPY_PS(XMM2,XMM7) 3177 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3178 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 3179 SSE_ADD_PS(XMM0,XMM2) 3180 3181 /* Fourth Column */ 3182 SSE_COPY_PS(XMM3,XMM7) 3183 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3184 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 3185 SSE_ADD_PS(XMM0,XMM3) 3186 3187 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 3188 SSE_INLINE_END_3 3189 3190 /* Promote solution from float to double */ 3191 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 3192 3193 /* Apply reordering to t and stream into x. */ 3194 /* This way, x doesn't pollute the cache. */ 3195 /* Be careful with size: 2 doubles = 4 floats! 
*/ 3196 idc = 4*(*c--); 3197 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 3198 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 3199 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 3200 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 3201 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 3202 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 3203 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 3204 SSE_INLINE_END_2 3205 v = aa + ai16 + 16; 3206 idt -= 4; 3207 } 3208 3209 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3210 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3211 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3212 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3213 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3214 SSE_SCOPE_END; 3215 PetscFunctionReturn(0); 3216 } 3217 3218 #endif 3219 3220 3221 /* 3222 Special case where the matrix was ILU(0) factored in the natural 3223 ordering. This eliminates the need for the column and row permutation. 3224 */ 3225 #undef __FUNCT__ 3226 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 3227 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 3228 { 3229 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3230 PetscInt n=a->mbs; 3231 const PetscInt *ai=a->i,*aj=a->j; 3232 PetscErrorCode ierr; 3233 const PetscInt *diag = a->diag; 3234 const MatScalar *aa=a->a; 3235 PetscScalar *x; 3236 const PetscScalar *b; 3237 3238 PetscFunctionBegin; 3239 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3240 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3241 3242 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 3243 { 3244 static PetscScalar w[2000]; /* very BAD need to fix */ 3245 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 3246 } 3247 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 3248 { 3249 static PetscScalar w[2000]; /* very BAD need to fix */ 3250 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 3251 } 3252 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 3253 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 3254 
#else 3255 { 3256 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3257 const MatScalar *v; 3258 PetscInt jdx,idt,idx,nz,i,ai16; 3259 const PetscInt *vi; 3260 3261 /* forward solve the lower triangular */ 3262 idx = 0; 3263 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 3264 for (i=1; i<n; i++) { 3265 v = aa + 16*ai[i]; 3266 vi = aj + ai[i]; 3267 nz = diag[i] - ai[i]; 3268 idx += 4; 3269 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3270 while (nz--) { 3271 jdx = 4*(*vi++); 3272 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 3273 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3274 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3275 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3276 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3277 v += 16; 3278 } 3279 x[idx] = s1; 3280 x[1+idx] = s2; 3281 x[2+idx] = s3; 3282 x[3+idx] = s4; 3283 } 3284 /* backward solve the upper triangular */ 3285 idt = 4*(n-1); 3286 for (i=n-1; i>=0; i--){ 3287 ai16 = 16*diag[i]; 3288 v = aa + ai16 + 16; 3289 vi = aj + diag[i] + 1; 3290 nz = ai[i+1] - diag[i] - 1; 3291 s1 = x[idt]; s2 = x[1+idt]; 3292 s3 = x[2+idt];s4 = x[3+idt]; 3293 while (nz--) { 3294 idx = 4*(*vi++); 3295 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 3296 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3297 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3298 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3299 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3300 v += 16; 3301 } 3302 v = aa + ai16; 3303 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3304 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 3305 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3306 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3307 idt -= 4; 3308 } 3309 } 3310 #endif 3311 3312 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3313 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3314 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3315 PetscFunctionReturn(0); 3316 } 3317 3318 #undef __FUNCT__ 
3319 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct" 3320 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3321 { 3322 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3323 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 3324 PetscErrorCode ierr; 3325 PetscInt idx,jdx,idt; 3326 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3327 const MatScalar *aa=a->a,*v; 3328 PetscScalar *x; 3329 const PetscScalar *b; 3330 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3331 3332 PetscFunctionBegin; 3333 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3334 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3335 /* forward solve the lower triangular */ 3336 idx = 0; 3337 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3338 for (i=1; i<n; i++) { 3339 v = aa + bs2*ai[i]; 3340 vi = aj + ai[i]; 3341 nz = ai[i+1] - ai[i]; 3342 idx = bs*i; 3343 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3344 for(k=0;k<nz;k++) { 3345 jdx = bs*vi[k]; 3346 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3347 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3348 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3349 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3350 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3351 3352 v += bs2; 3353 } 3354 3355 x[idx] = s1; 3356 x[1+idx] = s2; 3357 x[2+idx] = s3; 3358 x[3+idx] = s4; 3359 } 3360 3361 /* backward solve the upper triangular */ 3362 for (i=n-1; i>=0; i--){ 3363 v = aa + bs2*ai[2*n-i]; 3364 vi = aj + ai[2*n-i]; 3365 nz = ai[2*n-i +1] - ai[2*n-i]-1; 3366 idt = bs*i; 3367 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3368 3369 for(k=0;k<nz;k++){ 3370 idx = bs*vi[k]; 3371 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3372 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3373 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3374 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3375 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3376 3377 v += bs2; 3378 } 3379 /* x = inv_diagonal*x */ 3380 
x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3381 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 3382 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3383 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3384 3385 } 3386 3387 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3388 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3389 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3390 PetscFunctionReturn(0); 3391 } 3392 3393 #undef __FUNCT__ 3394 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2" 3395 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 3396 { 3397 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3398 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 3399 PetscErrorCode ierr; 3400 PetscInt idx,jdx,idt; 3401 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3402 const MatScalar *aa=a->a,*v; 3403 PetscScalar *x; 3404 const PetscScalar *b; 3405 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3406 3407 PetscFunctionBegin; 3408 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3409 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3410 /* forward solve the lower triangular */ 3411 idx = 0; 3412 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3413 for (i=1; i<n; i++) { 3414 v = aa + bs2*ai[i]; 3415 vi = aj + ai[i]; 3416 nz = ai[i+1] - ai[i]; 3417 idx = bs*i; 3418 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3419 for(k=0;k<nz;k++) { 3420 jdx = bs*vi[k]; 3421 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3422 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3423 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3424 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3425 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3426 3427 v += bs2; 3428 } 3429 3430 x[idx] = s1; 3431 x[1+idx] = s2; 3432 x[2+idx] = s3; 3433 x[3+idx] = s4; 3434 } 3435 3436 /* backward solve the upper triangular */ 3437 for (i=n-1; i>=0; i--){ 3438 v = aa + bs2*(adiag[i+1]+1); 3439 vi = 
aj + adiag[i+1]+1; 3440 nz = adiag[i] - adiag[i+1]-1; 3441 idt = bs*i; 3442 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3443 3444 for(k=0;k<nz;k++){ 3445 idx = bs*vi[k]; 3446 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3447 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3448 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3449 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3450 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3451 3452 v += bs2; 3453 } 3454 /* x = inv_diagonal*x */ 3455 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3456 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 3457 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3458 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3459 3460 } 3461 3462 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3463 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3464 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3465 PetscFunctionReturn(0); 3466 } 3467 3468 #undef __FUNCT__ 3469 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 3470 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 3471 { 3472 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3473 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3474 PetscErrorCode ierr; 3475 PetscInt *diag = a->diag; 3476 MatScalar *aa=a->a; 3477 PetscScalar *x,*b; 3478 3479 PetscFunctionBegin; 3480 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3481 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3482 3483 { 3484 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 3485 MatScalar *v,*t=(MatScalar *)x; 3486 PetscInt jdx,idt,idx,nz,*vi,i,ai16; 3487 3488 /* forward solve the lower triangular */ 3489 idx = 0; 3490 t[0] = (MatScalar)b[0]; 3491 t[1] = (MatScalar)b[1]; 3492 t[2] = (MatScalar)b[2]; 3493 t[3] = (MatScalar)b[3]; 3494 for (i=1; i<n; i++) { 3495 v = aa + 16*ai[i]; 3496 vi = aj + ai[i]; 3497 nz = diag[i] - ai[i]; 3498 idx += 4; 3499 s1 = (MatScalar)b[idx]; 3500 s2 = (MatScalar)b[1+idx]; 3501 s3 = (MatScalar)b[2+idx]; 3502 
s4 = (MatScalar)b[3+idx]; 3503 while (nz--) { 3504 jdx = 4*(*vi++); 3505 x1 = t[jdx]; 3506 x2 = t[1+jdx]; 3507 x3 = t[2+jdx]; 3508 x4 = t[3+jdx]; 3509 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3510 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3511 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3512 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3513 v += 16; 3514 } 3515 t[idx] = s1; 3516 t[1+idx] = s2; 3517 t[2+idx] = s3; 3518 t[3+idx] = s4; 3519 } 3520 /* backward solve the upper triangular */ 3521 idt = 4*(n-1); 3522 for (i=n-1; i>=0; i--){ 3523 ai16 = 16*diag[i]; 3524 v = aa + ai16 + 16; 3525 vi = aj + diag[i] + 1; 3526 nz = ai[i+1] - diag[i] - 1; 3527 s1 = t[idt]; 3528 s2 = t[1+idt]; 3529 s3 = t[2+idt]; 3530 s4 = t[3+idt]; 3531 while (nz--) { 3532 idx = 4*(*vi++); 3533 x1 = (MatScalar)x[idx]; 3534 x2 = (MatScalar)x[1+idx]; 3535 x3 = (MatScalar)x[2+idx]; 3536 x4 = (MatScalar)x[3+idx]; 3537 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3538 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3539 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3540 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3541 v += 16; 3542 } 3543 v = aa + ai16; 3544 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 3545 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 3546 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 3547 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 3548 idt -= 4; 3549 } 3550 } 3551 3552 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3553 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3554 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3555 PetscFunctionReturn(0); 3556 } 3557 3558 #if defined (PETSC_HAVE_SSE) 3559 3560 #include PETSC_HAVE_SSE 3561 #undef __FUNCT__ 3562 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 3563 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 3564 { 3565 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 
3566 unsigned short *aj=(unsigned short *)a->j; 3567 PetscErrorCode ierr; 3568 int *ai=a->i,n=a->mbs,*diag = a->diag; 3569 MatScalar *aa=a->a; 3570 PetscScalar *x,*b; 3571 3572 PetscFunctionBegin; 3573 SSE_SCOPE_BEGIN; 3574 /* 3575 Note: This code currently uses demotion of double 3576 to float when performing the mixed-mode computation. 3577 This may not be numerically reasonable for all applications. 3578 */ 3579 PREFETCH_NTA(aa+16*ai[1]); 3580 3581 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3582 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3583 { 3584 /* x will first be computed in single precision then promoted inplace to double */ 3585 MatScalar *v,*t=(MatScalar *)x; 3586 int nz,i,idt,ai16; 3587 unsigned int jdx,idx; 3588 unsigned short *vi; 3589 /* Forward solve the lower triangular factor. */ 3590 3591 /* First block is the identity. */ 3592 idx = 0; 3593 CONVERT_DOUBLE4_FLOAT4(t,b); 3594 v = aa + 16*((unsigned int)ai[1]); 3595 3596 for (i=1; i<n;) { 3597 PREFETCH_NTA(&v[8]); 3598 vi = aj + ai[i]; 3599 nz = diag[i] - ai[i]; 3600 idx += 4; 3601 3602 /* Demote RHS from double to float. 
*/ 3603 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 3604 LOAD_PS(&t[idx],XMM7); 3605 3606 while (nz--) { 3607 PREFETCH_NTA(&v[16]); 3608 jdx = 4*((unsigned int)(*vi++)); 3609 3610 /* 4x4 Matrix-Vector product with negative accumulation: */ 3611 SSE_INLINE_BEGIN_2(&t[jdx],v) 3612 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3613 3614 /* First Column */ 3615 SSE_COPY_PS(XMM0,XMM6) 3616 SSE_SHUFFLE(XMM0,XMM0,0x00) 3617 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3618 SSE_SUB_PS(XMM7,XMM0) 3619 3620 /* Second Column */ 3621 SSE_COPY_PS(XMM1,XMM6) 3622 SSE_SHUFFLE(XMM1,XMM1,0x55) 3623 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3624 SSE_SUB_PS(XMM7,XMM1) 3625 3626 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3627 3628 /* Third Column */ 3629 SSE_COPY_PS(XMM2,XMM6) 3630 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3631 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3632 SSE_SUB_PS(XMM7,XMM2) 3633 3634 /* Fourth Column */ 3635 SSE_COPY_PS(XMM3,XMM6) 3636 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3637 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3638 SSE_SUB_PS(XMM7,XMM3) 3639 SSE_INLINE_END_2 3640 3641 v += 16; 3642 } 3643 v = aa + 16*ai[++i]; 3644 PREFETCH_NTA(v); 3645 STORE_PS(&t[idx],XMM7); 3646 } 3647 3648 /* Backward solve the upper triangular factor.*/ 3649 3650 idt = 4*(n-1); 3651 ai16 = 16*diag[n-1]; 3652 v = aa + ai16 + 16; 3653 for (i=n-1; i>=0;){ 3654 PREFETCH_NTA(&v[8]); 3655 vi = aj + diag[i] + 1; 3656 nz = ai[i+1] - diag[i] - 1; 3657 3658 LOAD_PS(&t[idt],XMM7); 3659 3660 while (nz--) { 3661 PREFETCH_NTA(&v[16]); 3662 idx = 4*((unsigned int)(*vi++)); 3663 3664 /* 4x4 Matrix-Vector Product with negative accumulation: */ 3665 SSE_INLINE_BEGIN_2(&t[idx],v) 3666 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3667 3668 /* First Column */ 3669 SSE_COPY_PS(XMM0,XMM6) 3670 SSE_SHUFFLE(XMM0,XMM0,0x00) 3671 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3672 SSE_SUB_PS(XMM7,XMM0) 3673 3674 /* Second Column */ 3675 SSE_COPY_PS(XMM1,XMM6) 3676 SSE_SHUFFLE(XMM1,XMM1,0x55) 3677 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3678 SSE_SUB_PS(XMM7,XMM1) 3679 3680 
SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3681 3682 /* Third Column */ 3683 SSE_COPY_PS(XMM2,XMM6) 3684 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3685 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3686 SSE_SUB_PS(XMM7,XMM2) 3687 3688 /* Fourth Column */ 3689 SSE_COPY_PS(XMM3,XMM6) 3690 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3691 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3692 SSE_SUB_PS(XMM7,XMM3) 3693 SSE_INLINE_END_2 3694 v += 16; 3695 } 3696 v = aa + ai16; 3697 ai16 = 16*diag[--i]; 3698 PREFETCH_NTA(aa+ai16+16); 3699 /* 3700 Scale the result by the diagonal 4x4 block, 3701 which was inverted as part of the factorization 3702 */ 3703 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 3704 /* First Column */ 3705 SSE_COPY_PS(XMM0,XMM7) 3706 SSE_SHUFFLE(XMM0,XMM0,0x00) 3707 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 3708 3709 /* Second Column */ 3710 SSE_COPY_PS(XMM1,XMM7) 3711 SSE_SHUFFLE(XMM1,XMM1,0x55) 3712 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 3713 SSE_ADD_PS(XMM0,XMM1) 3714 3715 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 3716 3717 /* Third Column */ 3718 SSE_COPY_PS(XMM2,XMM7) 3719 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3720 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 3721 SSE_ADD_PS(XMM0,XMM2) 3722 3723 /* Fourth Column */ 3724 SSE_COPY_PS(XMM3,XMM7) 3725 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3726 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 3727 SSE_ADD_PS(XMM0,XMM3) 3728 3729 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 3730 SSE_INLINE_END_3 3731 3732 v = aa + ai16 + 16; 3733 idt -= 4; 3734 } 3735 3736 /* Convert t from single precision back to double precision (inplace)*/ 3737 idt = 4*(n-1); 3738 for (i=n-1;i>=0;i--) { 3739 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 3740 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 3741 PetscScalar *xtemp=&x[idt]; 3742 MatScalar *ttemp=&t[idt]; 3743 xtemp[3] = (PetscScalar)ttemp[3]; 3744 xtemp[2] = (PetscScalar)ttemp[2]; 3745 xtemp[1] = (PetscScalar)ttemp[1]; 3746 xtemp[0] = (PetscScalar)ttemp[0]; 3747 idt -= 4; 3748 } 3749 3750 } /* End of artificial scope. 
*/ 3751 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3752 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3753 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3754 SSE_SCOPE_END; 3755 PetscFunctionReturn(0); 3756 } 3757 3758 #undef __FUNCT__ 3759 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 3760 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 3761 { 3762 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3763 int *aj=a->j; 3764 PetscErrorCode ierr; 3765 int *ai=a->i,n=a->mbs,*diag = a->diag; 3766 MatScalar *aa=a->a; 3767 PetscScalar *x,*b; 3768 3769 PetscFunctionBegin; 3770 SSE_SCOPE_BEGIN; 3771 /* 3772 Note: This code currently uses demotion of double 3773 to float when performing the mixed-mode computation. 3774 This may not be numerically reasonable for all applications. 3775 */ 3776 PREFETCH_NTA(aa+16*ai[1]); 3777 3778 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3779 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3780 { 3781 /* x will first be computed in single precision then promoted inplace to double */ 3782 MatScalar *v,*t=(MatScalar *)x; 3783 int nz,i,idt,ai16; 3784 int jdx,idx; 3785 int *vi; 3786 /* Forward solve the lower triangular factor. */ 3787 3788 /* First block is the identity. */ 3789 idx = 0; 3790 CONVERT_DOUBLE4_FLOAT4(t,b); 3791 v = aa + 16*ai[1]; 3792 3793 for (i=1; i<n;) { 3794 PREFETCH_NTA(&v[8]); 3795 vi = aj + ai[i]; 3796 nz = diag[i] - ai[i]; 3797 idx += 4; 3798 3799 /* Demote RHS from double to float. 
*/ 3800 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 3801 LOAD_PS(&t[idx],XMM7); 3802 3803 while (nz--) { 3804 PREFETCH_NTA(&v[16]); 3805 jdx = 4*(*vi++); 3806 /* jdx = *vi++; */ 3807 3808 /* 4x4 Matrix-Vector product with negative accumulation: */ 3809 SSE_INLINE_BEGIN_2(&t[jdx],v) 3810 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3811 3812 /* First Column */ 3813 SSE_COPY_PS(XMM0,XMM6) 3814 SSE_SHUFFLE(XMM0,XMM0,0x00) 3815 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3816 SSE_SUB_PS(XMM7,XMM0) 3817 3818 /* Second Column */ 3819 SSE_COPY_PS(XMM1,XMM6) 3820 SSE_SHUFFLE(XMM1,XMM1,0x55) 3821 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3822 SSE_SUB_PS(XMM7,XMM1) 3823 3824 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3825 3826 /* Third Column */ 3827 SSE_COPY_PS(XMM2,XMM6) 3828 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3829 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3830 SSE_SUB_PS(XMM7,XMM2) 3831 3832 /* Fourth Column */ 3833 SSE_COPY_PS(XMM3,XMM6) 3834 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3835 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3836 SSE_SUB_PS(XMM7,XMM3) 3837 SSE_INLINE_END_2 3838 3839 v += 16; 3840 } 3841 v = aa + 16*ai[++i]; 3842 PREFETCH_NTA(v); 3843 STORE_PS(&t[idx],XMM7); 3844 } 3845 3846 /* Backward solve the upper triangular factor.*/ 3847 3848 idt = 4*(n-1); 3849 ai16 = 16*diag[n-1]; 3850 v = aa + ai16 + 16; 3851 for (i=n-1; i>=0;){ 3852 PREFETCH_NTA(&v[8]); 3853 vi = aj + diag[i] + 1; 3854 nz = ai[i+1] - diag[i] - 1; 3855 3856 LOAD_PS(&t[idt],XMM7); 3857 3858 while (nz--) { 3859 PREFETCH_NTA(&v[16]); 3860 idx = 4*(*vi++); 3861 /* idx = *vi++; */ 3862 3863 /* 4x4 Matrix-Vector Product with negative accumulation: */ 3864 SSE_INLINE_BEGIN_2(&t[idx],v) 3865 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3866 3867 /* First Column */ 3868 SSE_COPY_PS(XMM0,XMM6) 3869 SSE_SHUFFLE(XMM0,XMM0,0x00) 3870 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3871 SSE_SUB_PS(XMM7,XMM0) 3872 3873 /* Second Column */ 3874 SSE_COPY_PS(XMM1,XMM6) 3875 SSE_SHUFFLE(XMM1,XMM1,0x55) 3876 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3877 SSE_SUB_PS(XMM7,XMM1) 3878 3879 
SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3880 3881 /* Third Column */ 3882 SSE_COPY_PS(XMM2,XMM6) 3883 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3884 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3885 SSE_SUB_PS(XMM7,XMM2) 3886 3887 /* Fourth Column */ 3888 SSE_COPY_PS(XMM3,XMM6) 3889 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3890 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3891 SSE_SUB_PS(XMM7,XMM3) 3892 SSE_INLINE_END_2 3893 v += 16; 3894 } 3895 v = aa + ai16; 3896 ai16 = 16*diag[--i]; 3897 PREFETCH_NTA(aa+ai16+16); 3898 /* 3899 Scale the result by the diagonal 4x4 block, 3900 which was inverted as part of the factorization 3901 */ 3902 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 3903 /* First Column */ 3904 SSE_COPY_PS(XMM0,XMM7) 3905 SSE_SHUFFLE(XMM0,XMM0,0x00) 3906 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 3907 3908 /* Second Column */ 3909 SSE_COPY_PS(XMM1,XMM7) 3910 SSE_SHUFFLE(XMM1,XMM1,0x55) 3911 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 3912 SSE_ADD_PS(XMM0,XMM1) 3913 3914 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 3915 3916 /* Third Column */ 3917 SSE_COPY_PS(XMM2,XMM7) 3918 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3919 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 3920 SSE_ADD_PS(XMM0,XMM2) 3921 3922 /* Fourth Column */ 3923 SSE_COPY_PS(XMM3,XMM7) 3924 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3925 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 3926 SSE_ADD_PS(XMM0,XMM3) 3927 3928 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 3929 SSE_INLINE_END_3 3930 3931 v = aa + ai16 + 16; 3932 idt -= 4; 3933 } 3934 3935 /* Convert t from single precision back to double precision (inplace)*/ 3936 idt = 4*(n-1); 3937 for (i=n-1;i>=0;i--) { 3938 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 3939 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 3940 PetscScalar *xtemp=&x[idt]; 3941 MatScalar *ttemp=&t[idt]; 3942 xtemp[3] = (PetscScalar)ttemp[3]; 3943 xtemp[2] = (PetscScalar)ttemp[2]; 3944 xtemp[1] = (PetscScalar)ttemp[1]; 3945 xtemp[0] = (PetscScalar)ttemp[0]; 3946 idt -= 4; 3947 } 3948 3949 } /* End of artificial scope. 
*/ 3950 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3951 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3952 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3953 SSE_SCOPE_END; 3954 PetscFunctionReturn(0); 3955 } 3956 3957 #endif 3958 3959 #undef __FUNCT__ 3960 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 3961 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 3962 { 3963 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3964 IS iscol=a->col,isrow=a->row; 3965 PetscErrorCode ierr; 3966 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3967 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3968 const MatScalar *aa=a->a,*v; 3969 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 3970 const PetscScalar *b; 3971 3972 PetscFunctionBegin; 3973 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3974 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3975 t = a->solve_work; 3976 3977 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3978 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3979 3980 /* forward solve the lower triangular */ 3981 idx = 3*(*r++); 3982 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 3983 for (i=1; i<n; i++) { 3984 v = aa + 9*ai[i]; 3985 vi = aj + ai[i]; 3986 nz = diag[i] - ai[i]; 3987 idx = 3*(*r++); 3988 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 3989 while (nz--) { 3990 idx = 3*(*vi++); 3991 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3992 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3993 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3994 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3995 v += 9; 3996 } 3997 idx = 3*i; 3998 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 3999 } 4000 /* backward solve the upper triangular */ 4001 for (i=n-1; i>=0; i--){ 4002 v = aa + 9*diag[i] + 9; 4003 vi = aj + diag[i] + 1; 4004 nz = ai[i+1] - diag[i] - 1; 4005 idt = 3*i; 4006 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4007 while (nz--) { 4008 idx = 3*(*vi++); 4009 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4010 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4011 s2 -= v[1]*x1 + v[4]*x2 + 
v[7]*x3; 4012 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4013 v += 9; 4014 } 4015 idc = 3*(*c--); 4016 v = aa + 9*diag[i]; 4017 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4018 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4019 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4020 } 4021 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4022 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4023 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4024 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4025 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4026 PetscFunctionReturn(0); 4027 } 4028 4029 #undef __FUNCT__ 4030 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct" 4031 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx) 4032 { 4033 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4034 IS iscol=a->col,isrow=a->row; 4035 PetscErrorCode ierr; 4036 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 4037 const PetscInt *r,*c,*rout,*cout; 4038 const MatScalar *aa=a->a,*v; 4039 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4040 const PetscScalar *b; 4041 4042 PetscFunctionBegin; 4043 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4044 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4045 t = a->solve_work; 4046 4047 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4048 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4049 4050 /* forward solve the lower triangular */ 4051 idx = 3*r[0]; 4052 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4053 for (i=1; i<n; i++) { 4054 v = aa + 9*ai[i]; 4055 vi = aj + ai[i]; 4056 nz = ai[i+1] - ai[i]; 4057 idx = 3*r[i]; 4058 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4059 for(m=0;m<nz;m++){ 4060 idx = 3*vi[m]; 4061 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4062 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4063 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4064 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4065 v += 9; 4066 } 4067 idx = 3*i; 4068 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4069 } 4070 /* 
backward solve the upper triangular */ 4071 for (i=n-1; i>=0; i--){ 4072 k = 2*n-i; 4073 v = aa + 9*ai[k]; 4074 vi = aj + ai[k]; 4075 nz = ai[k +1] - ai[k] - 1; 4076 idt = 3*i; 4077 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4078 for(m=0;m<nz;m++){ 4079 idx = 3*vi[m]; 4080 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4081 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4082 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4083 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4084 v += 9; 4085 } 4086 idc = 3*c[i]; 4087 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4088 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4089 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4090 } 4091 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4092 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4093 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4094 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4095 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4096 PetscFunctionReturn(0); 4097 } 4098 4099 #undef __FUNCT__ 4100 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct_v2" 4101 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4102 { 4103 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4104 IS iscol=a->col,isrow=a->row; 4105 PetscErrorCode ierr; 4106 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 4107 const PetscInt *r,*c,*rout,*cout; 4108 const MatScalar *aa=a->a,*v; 4109 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4110 const PetscScalar *b; 4111 4112 PetscFunctionBegin; 4113 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4114 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4115 t = a->solve_work; 4116 4117 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4118 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4119 4120 /* forward solve the lower triangular */ 4121 idx = 3*r[0]; 4122 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4123 for (i=1; i<n; i++) { 4124 v = aa + 9*ai[i]; 4125 vi = aj + ai[i]; 4126 nz = ai[i+1] - ai[i]; 
4127 idx = 3*r[i]; 4128 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4129 for(m=0;m<nz;m++){ 4130 idx = 3*vi[m]; 4131 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4132 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4133 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4134 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4135 v += 9; 4136 } 4137 idx = 3*i; 4138 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4139 } 4140 /* backward solve the upper triangular */ 4141 for (i=n-1; i>=0; i--){ 4142 v = aa + 9*(adiag[i+1]+1); 4143 vi = aj + adiag[i+1]+1; 4144 nz = adiag[i] - adiag[i+1] - 1; 4145 idt = 3*i; 4146 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4147 for(m=0;m<nz;m++){ 4148 idx = 3*vi[m]; 4149 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4150 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4151 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4152 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4153 v += 9; 4154 } 4155 idc = 3*c[i]; 4156 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4157 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4158 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4159 } 4160 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4161 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4162 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4163 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4164 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4165 PetscFunctionReturn(0); 4166 } 4167 4168 /* 4169 Special case where the matrix was ILU(0) factored in the natural 4170 ordering. This eliminates the need for the column and row permutation. 
4171 */ 4172 #undef __FUNCT__ 4173 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 4174 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 4175 { 4176 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4177 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4178 PetscErrorCode ierr; 4179 PetscInt *diag = a->diag; 4180 const MatScalar *aa=a->a,*v; 4181 PetscScalar *x,s1,s2,s3,x1,x2,x3; 4182 const PetscScalar *b; 4183 PetscInt jdx,idt,idx,nz,*vi,i; 4184 4185 PetscFunctionBegin; 4186 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4187 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4188 4189 /* forward solve the lower triangular */ 4190 idx = 0; 4191 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 4192 for (i=1; i<n; i++) { 4193 v = aa + 9*ai[i]; 4194 vi = aj + ai[i]; 4195 nz = diag[i] - ai[i]; 4196 idx += 3; 4197 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4198 while (nz--) { 4199 jdx = 3*(*vi++); 4200 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4201 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4202 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4203 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4204 v += 9; 4205 } 4206 x[idx] = s1; 4207 x[1+idx] = s2; 4208 x[2+idx] = s3; 4209 } 4210 /* backward solve the upper triangular */ 4211 for (i=n-1; i>=0; i--){ 4212 v = aa + 9*diag[i] + 9; 4213 vi = aj + diag[i] + 1; 4214 nz = ai[i+1] - diag[i] - 1; 4215 idt = 3*i; 4216 s1 = x[idt]; s2 = x[1+idt]; 4217 s3 = x[2+idt]; 4218 while (nz--) { 4219 idx = 3*(*vi++); 4220 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 4221 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4222 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4223 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4224 v += 9; 4225 } 4226 v = aa + 9*diag[i]; 4227 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4228 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4229 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4230 } 4231 4232 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4233 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4234 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4235 
PetscFunctionReturn(0); 4236 } 4237 4238 #undef __FUNCT__ 4239 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct" 4240 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4241 { 4242 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4243 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 4244 PetscErrorCode ierr; 4245 PetscInt idx,jdx,idt; 4246 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4247 const MatScalar *aa=a->a,*v; 4248 PetscScalar *x; 4249 const PetscScalar *b; 4250 PetscScalar s1,s2,s3,x1,x2,x3; 4251 4252 PetscFunctionBegin; 4253 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4254 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4255 /* forward solve the lower triangular */ 4256 idx = 0; 4257 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4258 for (i=1; i<n; i++) { 4259 v = aa + bs2*ai[i]; 4260 vi = aj + ai[i]; 4261 nz = ai[i+1] - ai[i]; 4262 idx = bs*i; 4263 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4264 for(k=0;k<nz;k++){ 4265 jdx = bs*vi[k]; 4266 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4267 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4268 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4269 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4270 4271 v += bs2; 4272 } 4273 4274 x[idx] = s1; 4275 x[1+idx] = s2; 4276 x[2+idx] = s3; 4277 } 4278 4279 /* backward solve the upper triangular */ 4280 for (i=n-1; i>=0; i--){ 4281 v = aa + bs2*ai[2*n-i]; 4282 vi = aj + ai[2*n-i]; 4283 nz = ai[2*n-i +1] - ai[2*n-i]-1; 4284 idt = bs*i; 4285 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4286 4287 for(k=0;k<nz;k++){ 4288 idx = bs*vi[k]; 4289 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4290 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4291 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4292 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4293 4294 v += bs2; 4295 } 4296 /* x = inv_diagonal*x */ 4297 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4298 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4299 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4300 4301 } 4302 4303 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4304 
ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4305 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4306 PetscFunctionReturn(0); 4307 } 4308 4309 #undef __FUNCT__ 4310 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2" 4311 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4312 { 4313 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4314 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 4315 PetscErrorCode ierr; 4316 PetscInt idx,jdx,idt; 4317 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4318 const MatScalar *aa=a->a,*v; 4319 PetscScalar *x; 4320 const PetscScalar *b; 4321 PetscScalar s1,s2,s3,x1,x2,x3; 4322 4323 PetscFunctionBegin; 4324 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4325 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4326 /* forward solve the lower triangular */ 4327 idx = 0; 4328 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4329 for (i=1; i<n; i++) { 4330 v = aa + bs2*ai[i]; 4331 vi = aj + ai[i]; 4332 nz = ai[i+1] - ai[i]; 4333 idx = bs*i; 4334 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4335 for(k=0;k<nz;k++){ 4336 jdx = bs*vi[k]; 4337 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4338 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4339 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4340 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4341 4342 v += bs2; 4343 } 4344 4345 x[idx] = s1; 4346 x[1+idx] = s2; 4347 x[2+idx] = s3; 4348 } 4349 4350 /* backward solve the upper triangular */ 4351 for (i=n-1; i>=0; i--){ 4352 v = aa + bs2*(adiag[i+1]+1); 4353 vi = aj + adiag[i+1]+1; 4354 nz = adiag[i] - adiag[i+1]-1; 4355 idt = bs*i; 4356 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4357 4358 for(k=0;k<nz;k++){ 4359 idx = bs*vi[k]; 4360 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4361 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4362 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4363 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4364 4365 v += bs2; 4366 } 4367 /* x = inv_diagonal*x */ 4368 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4369 x[1+idt] = v[1]*s1 + 
v[4]*s2 + v[7]*s3; 4370 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4371 4372 } 4373 4374 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4375 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4376 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4377 PetscFunctionReturn(0); 4378 } 4379 4380 #undef __FUNCT__ 4381 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 4382 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 4383 { 4384 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4385 IS iscol=a->col,isrow=a->row; 4386 PetscErrorCode ierr; 4387 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 4388 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4389 const MatScalar *aa=a->a,*v; 4390 PetscScalar *x,s1,s2,x1,x2,*t; 4391 const PetscScalar *b; 4392 4393 PetscFunctionBegin; 4394 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4395 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4396 t = a->solve_work; 4397 4398 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4399 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4400 4401 /* forward solve the lower triangular */ 4402 idx = 2*(*r++); 4403 t[0] = b[idx]; t[1] = b[1+idx]; 4404 for (i=1; i<n; i++) { 4405 v = aa + 4*ai[i]; 4406 vi = aj + ai[i]; 4407 nz = diag[i] - ai[i]; 4408 idx = 2*(*r++); 4409 s1 = b[idx]; s2 = b[1+idx]; 4410 while (nz--) { 4411 idx = 2*(*vi++); 4412 x1 = t[idx]; x2 = t[1+idx]; 4413 s1 -= v[0]*x1 + v[2]*x2; 4414 s2 -= v[1]*x1 + v[3]*x2; 4415 v += 4; 4416 } 4417 idx = 2*i; 4418 t[idx] = s1; t[1+idx] = s2; 4419 } 4420 /* backward solve the upper triangular */ 4421 for (i=n-1; i>=0; i--){ 4422 v = aa + 4*diag[i] + 4; 4423 vi = aj + diag[i] + 1; 4424 nz = ai[i+1] - diag[i] - 1; 4425 idt = 2*i; 4426 s1 = t[idt]; s2 = t[1+idt]; 4427 while (nz--) { 4428 idx = 2*(*vi++); 4429 x1 = t[idx]; x2 = t[1+idx]; 4430 s1 -= v[0]*x1 + v[2]*x2; 4431 s2 -= v[1]*x1 + v[3]*x2; 4432 v += 4; 4433 } 4434 idc = 2*(*c--); 4435 v = aa + 4*diag[i]; 4436 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4437 
x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 4438 } 4439 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4440 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4441 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4442 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4443 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4444 PetscFunctionReturn(0); 4445 } 4446 4447 #undef __FUNCT__ 4448 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct" 4449 PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx) 4450 { 4451 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4452 IS iscol=a->col,isrow=a->row; 4453 PetscErrorCode ierr; 4454 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,jdx,idt,idc,k,m; 4455 const PetscInt *r,*c,*rout,*cout; 4456 const MatScalar *aa=a->a,*v; 4457 PetscScalar *x,s1,s2,x1,x2,*t; 4458 const PetscScalar *b; 4459 4460 PetscFunctionBegin; 4461 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4462 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4463 t = a->solve_work; 4464 4465 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4466 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4467 4468 /* forward solve the lower triangular */ 4469 idx = 2*r[0]; 4470 t[0] = b[idx]; t[1] = b[1+idx]; 4471 for (i=1; i<n; i++) { 4472 v = aa + 4*ai[i]; 4473 vi = aj + ai[i]; 4474 nz = ai[i+1] - ai[i]; 4475 idx = 2*r[i]; 4476 s1 = b[idx]; s2 = b[1+idx]; 4477 for(m=0;m<nz;m++){ 4478 jdx = 2*vi[m]; 4479 x1 = t[jdx]; x2 = t[1+jdx]; 4480 s1 -= v[0]*x1 + v[2]*x2; 4481 s2 -= v[1]*x1 + v[3]*x2; 4482 v += 4; 4483 } 4484 idx = 2*i; 4485 t[idx] = s1; t[1+idx] = s2; 4486 } 4487 /* backward solve the upper triangular */ 4488 for (i=n-1; i>=0; i--){ 4489 k = 2*n-i; 4490 v = aa + 4*ai[k]; 4491 vi = aj + ai[k]; 4492 nz = ai[k +1] - ai[k] - 1; 4493 idt = 2*i; 4494 s1 = t[idt]; s2 = t[1+idt]; 4495 for(m=0;m<nz;m++){ 4496 idx = 2*vi[m]; 4497 x1 = t[idx]; x2 = t[1+idx]; 4498 s1 -= v[0]*x1 + v[2]*x2; 4499 s2 -= v[1]*x1 + v[3]*x2; 4500 v += 4; 4501 
} 4502 idc = 2*c[i]; 4503 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4504 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 4505 } 4506 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4507 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4508 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4509 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4510 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4511 PetscFunctionReturn(0); 4512 } 4513 4514 #undef __FUNCT__ 4515 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct_v2" 4516 PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4517 { 4518 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4519 IS iscol=a->col,isrow=a->row; 4520 PetscErrorCode ierr; 4521 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m; 4522 const PetscInt *r,*c,*rout,*cout; 4523 const MatScalar *aa=a->a,*v; 4524 PetscScalar *x,s1,s2,x1,x2,*t; 4525 const PetscScalar *b; 4526 4527 PetscFunctionBegin; 4528 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4529 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4530 t = a->solve_work; 4531 4532 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4533 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4534 4535 /* forward solve the lower triangular */ 4536 idx = 2*r[0]; 4537 t[0] = b[idx]; t[1] = b[1+idx]; 4538 for (i=1; i<n; i++) { 4539 v = aa + 4*ai[i]; 4540 vi = aj + ai[i]; 4541 nz = ai[i+1] - ai[i]; 4542 idx = 2*r[i]; 4543 s1 = b[idx]; s2 = b[1+idx]; 4544 for(m=0;m<nz;m++){ 4545 jdx = 2*vi[m]; 4546 x1 = t[jdx]; x2 = t[1+jdx]; 4547 s1 -= v[0]*x1 + v[2]*x2; 4548 s2 -= v[1]*x1 + v[3]*x2; 4549 v += 4; 4550 } 4551 idx = 2*i; 4552 t[idx] = s1; t[1+idx] = s2; 4553 } 4554 /* backward solve the upper triangular */ 4555 for (i=n-1; i>=0; i--){ 4556 v = aa + 4*(adiag[i+1]+1); 4557 vi = aj + adiag[i+1]+1; 4558 nz = adiag[i] - adiag[i+1] - 1; 4559 idt = 2*i; 4560 s1 = t[idt]; s2 = t[1+idt]; 4561 for(m=0;m<nz;m++){ 4562 idx = 2*vi[m]; 4563 x1 = t[idx]; 
x2 = t[1+idx]; 4564 s1 -= v[0]*x1 + v[2]*x2; 4565 s2 -= v[1]*x1 + v[3]*x2; 4566 v += 4; 4567 } 4568 idc = 2*c[i]; 4569 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4570 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 4571 } 4572 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4573 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4574 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4575 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4576 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4577 PetscFunctionReturn(0); 4578 } 4579 4580 /* 4581 Special case where the matrix was ILU(0) factored in the natural 4582 ordering. This eliminates the need for the column and row permutation. 4583 */ 4584 #undef __FUNCT__ 4585 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 4586 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 4587 { 4588 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4589 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4590 PetscErrorCode ierr; 4591 PetscInt *diag = a->diag; 4592 const MatScalar *aa=a->a,*v; 4593 PetscScalar *x,s1,s2,x1,x2; 4594 const PetscScalar *b; 4595 PetscInt jdx,idt,idx,nz,*vi,i; 4596 4597 PetscFunctionBegin; 4598 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4599 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4600 4601 /* forward solve the lower triangular */ 4602 idx = 0; 4603 x[0] = b[0]; x[1] = b[1]; 4604 for (i=1; i<n; i++) { 4605 v = aa + 4*ai[i]; 4606 vi = aj + ai[i]; 4607 nz = diag[i] - ai[i]; 4608 idx += 2; 4609 s1 = b[idx];s2 = b[1+idx]; 4610 while (nz--) { 4611 jdx = 2*(*vi++); 4612 x1 = x[jdx];x2 = x[1+jdx]; 4613 s1 -= v[0]*x1 + v[2]*x2; 4614 s2 -= v[1]*x1 + v[3]*x2; 4615 v += 4; 4616 } 4617 x[idx] = s1; 4618 x[1+idx] = s2; 4619 } 4620 /* backward solve the upper triangular */ 4621 for (i=n-1; i>=0; i--){ 4622 v = aa + 4*diag[i] + 4; 4623 vi = aj + diag[i] + 1; 4624 nz = ai[i+1] - diag[i] - 1; 4625 idt = 2*i; 4626 s1 = x[idt]; s2 = x[1+idt]; 4627 while (nz--) { 4628 idx = 2*(*vi++); 4629 x1 = 
x[idx]; x2 = x[1+idx]; 4630 s1 -= v[0]*x1 + v[2]*x2; 4631 s2 -= v[1]*x1 + v[3]*x2; 4632 v += 4; 4633 } 4634 v = aa + 4*diag[i]; 4635 x[idt] = v[0]*s1 + v[2]*s2; 4636 x[1+idt] = v[1]*s1 + v[3]*s2; 4637 } 4638 4639 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4640 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4641 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4642 PetscFunctionReturn(0); 4643 } 4644 4645 #undef __FUNCT__ 4646 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct" 4647 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4648 { 4649 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4650 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 4651 PetscErrorCode ierr; 4652 PetscInt jdx; 4653 const MatScalar *aa=a->a,*v; 4654 PetscScalar *x,s1,s2,x1,x2; 4655 const PetscScalar *b; 4656 4657 PetscFunctionBegin; 4658 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4659 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4660 /* forward solve the lower triangular */ 4661 idx = 0; 4662 x[0] = b[idx]; x[1] = b[1+idx]; 4663 for (i=1; i<n; i++) { 4664 v = aa + 4*ai[i]; 4665 vi = aj + ai[i]; 4666 nz = ai[i+1] - ai[i]; 4667 idx = 2*i; 4668 s1 = b[idx];s2 = b[1+idx]; 4669 for(k=0;k<nz;k++){ 4670 jdx = 2*vi[k]; 4671 x1 = x[jdx];x2 = x[1+jdx]; 4672 s1 -= v[0]*x1 + v[2]*x2; 4673 s2 -= v[1]*x1 + v[3]*x2; 4674 v += 4; 4675 } 4676 x[idx] = s1; 4677 x[1+idx] = s2; 4678 } 4679 4680 /* backward solve the upper triangular */ 4681 for (i=n-1; i>=0; i--){ 4682 v = aa + 4*ai[2*n-i]; 4683 vi = aj + ai[2*n-i]; 4684 nz = ai[2*n-i +1] - ai[2*n-i]-1; 4685 idt = 2*i; 4686 s1 = x[idt]; s2 = x[1+idt]; 4687 for(k=0;k<nz;k++){ 4688 idx = 2*vi[k]; 4689 x1 = x[idx]; x2 = x[1+idx]; 4690 s1 -= v[0]*x1 + v[2]*x2; 4691 s2 -= v[1]*x1 + v[3]*x2; 4692 v += 4; 4693 } 4694 /* x = inv_diagonal*x */ 4695 x[idt] = v[0]*s1 + v[2]*s2; 4696 x[1+idt] = v[1]*s1 + v[3]*s2; 4697 } 4698 4699 ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4700 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4701 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4702 PetscFunctionReturn(0); 4703 } 4704 4705 #undef __FUNCT__ 4706 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2" 4707 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4708 { 4709 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4710 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 4711 PetscErrorCode ierr; 4712 PetscInt jdx; 4713 const MatScalar *aa=a->a,*v; 4714 PetscScalar *x,s1,s2,x1,x2; 4715 const PetscScalar *b; 4716 4717 PetscFunctionBegin; 4718 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4719 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4720 /* forward solve the lower triangular */ 4721 idx = 0; 4722 x[0] = b[idx]; x[1] = b[1+idx]; 4723 for (i=1; i<n; i++) { 4724 v = aa + 4*ai[i]; 4725 vi = aj + ai[i]; 4726 nz = ai[i+1] - ai[i]; 4727 idx = 2*i; 4728 s1 = b[idx];s2 = b[1+idx]; 4729 for(k=0;k<nz;k++){ 4730 jdx = 2*vi[k]; 4731 x1 = x[jdx];x2 = x[1+jdx]; 4732 s1 -= v[0]*x1 + v[2]*x2; 4733 s2 -= v[1]*x1 + v[3]*x2; 4734 v += 4; 4735 } 4736 x[idx] = s1; 4737 x[1+idx] = s2; 4738 } 4739 4740 /* backward solve the upper triangular */ 4741 for (i=n-1; i>=0; i--){ 4742 v = aa + 4*(adiag[i+1]+1); 4743 vi = aj + adiag[i+1]+1; 4744 nz = adiag[i] - adiag[i+1]-1; 4745 idt = 2*i; 4746 s1 = x[idt]; s2 = x[1+idt]; 4747 for(k=0;k<nz;k++){ 4748 idx = 2*vi[k]; 4749 x1 = x[idx]; x2 = x[1+idx]; 4750 s1 -= v[0]*x1 + v[2]*x2; 4751 s2 -= v[1]*x1 + v[3]*x2; 4752 v += 4; 4753 } 4754 /* x = inv_diagonal*x */ 4755 x[idt] = v[0]*s1 + v[2]*s2; 4756 x[1+idt] = v[1]*s1 + v[3]*s2; 4757 } 4758 4759 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4760 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4761 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4762 PetscFunctionReturn(0); 4763 } 4764 4765 #undef 
__FUNCT__ 4766 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 4767 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 4768 { 4769 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4770 IS iscol=a->col,isrow=a->row; 4771 PetscErrorCode ierr; 4772 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 4773 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4774 MatScalar *aa=a->a,*v; 4775 PetscScalar *x,*b,s1,*t; 4776 4777 PetscFunctionBegin; 4778 if (!n) PetscFunctionReturn(0); 4779 4780 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4781 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4782 t = a->solve_work; 4783 4784 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4785 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4786 4787 /* forward solve the lower triangular */ 4788 t[0] = b[*r++]; 4789 for (i=1; i<n; i++) { 4790 v = aa + ai[i]; 4791 vi = aj + ai[i]; 4792 nz = diag[i] - ai[i]; 4793 s1 = b[*r++]; 4794 while (nz--) { 4795 s1 -= (*v++)*t[*vi++]; 4796 } 4797 t[i] = s1; 4798 } 4799 /* backward solve the upper triangular */ 4800 for (i=n-1; i>=0; i--){ 4801 v = aa + diag[i] + 1; 4802 vi = aj + diag[i] + 1; 4803 nz = ai[i+1] - diag[i] - 1; 4804 s1 = t[i]; 4805 while (nz--) { 4806 s1 -= (*v++)*t[*vi++]; 4807 } 4808 x[*c--] = t[i] = aa[diag[i]]*s1; 4809 } 4810 4811 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4812 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4813 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4814 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4815 ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 4816 PetscFunctionReturn(0); 4817 } 4818 /* 4819 Special case where the matrix was ILU(0) factored in the natural 4820 ordering. This eliminates the need for the column and row permutation. 
4821 */ 4822 #undef __FUNCT__ 4823 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 4824 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 4825 { 4826 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4827 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4828 PetscErrorCode ierr; 4829 PetscInt *diag = a->diag; 4830 MatScalar *aa=a->a; 4831 PetscScalar *x,*b; 4832 PetscScalar s1,x1; 4833 MatScalar *v; 4834 PetscInt jdx,idt,idx,nz,*vi,i; 4835 4836 PetscFunctionBegin; 4837 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4838 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4839 4840 /* forward solve the lower triangular */ 4841 idx = 0; 4842 x[0] = b[0]; 4843 for (i=1; i<n; i++) { 4844 v = aa + ai[i]; 4845 vi = aj + ai[i]; 4846 nz = diag[i] - ai[i]; 4847 idx += 1; 4848 s1 = b[idx]; 4849 while (nz--) { 4850 jdx = *vi++; 4851 x1 = x[jdx]; 4852 s1 -= v[0]*x1; 4853 v += 1; 4854 } 4855 x[idx] = s1; 4856 } 4857 /* backward solve the upper triangular */ 4858 for (i=n-1; i>=0; i--){ 4859 v = aa + diag[i] + 1; 4860 vi = aj + diag[i] + 1; 4861 nz = ai[i+1] - diag[i] - 1; 4862 idt = i; 4863 s1 = x[idt]; 4864 while (nz--) { 4865 idx = *vi++; 4866 x1 = x[idx]; 4867 s1 -= v[0]*x1; 4868 v += 1; 4869 } 4870 v = aa + diag[i]; 4871 x[idt] = v[0]*s1; 4872 } 4873 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4874 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4875 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 4876 PetscFunctionReturn(0); 4877 } 4878 4879 /* ----------------------------------------------------------------*/ 4880 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 4881 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 4882 4883 #undef __FUNCT__ 4884 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 4885 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 4886 { 4887 Mat C=B; 4888 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 4889 IS 
isrow = b->row,isicol = b->icol; 4890 PetscErrorCode ierr; 4891 const PetscInt *r,*ic,*ics; 4892 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 4893 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 4894 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 4895 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 4896 MatScalar *v_work; 4897 4898 PetscFunctionBegin; 4899 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 4900 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 4901 ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 4902 ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr); 4903 ics = ic; 4904 4905 /* generate work space needed by dense LU factorization */ 4906 ierr = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr); 4907 mwork = v_work + bs; 4908 v_pivots = (PetscInt*)(mwork + bs2); 4909 4910 for (i=0; i<n; i++){ 4911 /* zero rtmp */ 4912 /* L part */ 4913 nz = bi[i+1] - bi[i]; 4914 bjtmp = bj + bi[i]; 4915 for (j=0; j<nz; j++){ 4916 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4917 } 4918 4919 /* U part */ 4920 nz = bi[2*n-i+1] - bi[2*n-i]; 4921 bjtmp = bj + bi[2*n-i]; 4922 for (j=0; j<nz; j++){ 4923 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4924 } 4925 4926 /* load in initial (unfactored row) */ 4927 nz = ai[r[i]+1] - ai[r[i]]; 4928 ajtmp = aj + ai[r[i]]; 4929 v = aa + bs2*ai[r[i]]; 4930 for (j=0; j<nz; j++) { 4931 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 4932 } 4933 4934 /* elimination */ 4935 bjtmp = bj + bi[i]; 4936 nzL = bi[i+1] - bi[i]; 4937 for(k=0;k < nzL;k++) { 4938 row = bjtmp[k]; 4939 pc = rtmp + bs2*row; 4940 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 4941 if (flg) { 4942 pv = b->a + bs2*bdiag[row]; 4943 Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 4944 pj = b->j + bi[2*n-row]; /* begining of U(row,:) */ 4945 pv = b->a 
+ bs2*bi[2*n-row]; 4946 nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */ 4947 for (j=0; j<nz; j++) { 4948 Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 4949 } 4950 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 4951 } 4952 } 4953 4954 /* finished row so stick it into b->a */ 4955 /* L part */ 4956 pv = b->a + bs2*bi[i] ; 4957 pj = b->j + bi[i] ; 4958 nz = bi[i+1] - bi[i]; 4959 for (j=0; j<nz; j++) { 4960 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4961 } 4962 4963 /* Mark diagonal and invert diagonal for simplier triangular solves */ 4964 pv = b->a + bs2*bdiag[i]; 4965 pj = b->j + bdiag[i]; 4966 /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 4967 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4968 ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 4969 4970 /* U part */ 4971 pv = b->a + bs2*bi[2*n-i]; 4972 pj = b->j + bi[2*n-i]; 4973 nz = bi[2*n-i+1] - bi[2*n-i] - 1; 4974 for (j=0; j<nz; j++){ 4975 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4976 } 4977 } 4978 4979 ierr = PetscFree(rtmp);CHKERRQ(ierr); 4980 ierr = PetscFree(v_work);CHKERRQ(ierr); 4981 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 4982 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 4983 4984 C->assembled = PETSC_TRUE; 4985 ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 4986 PetscFunctionReturn(0); 4987 } 4988 4989 #undef __FUNCT__ 4990 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct_v2" 4991 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info) 4992 { 4993 Mat C=B; 4994 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 4995 IS isrow = b->row,isicol = b->icol; 4996 PetscErrorCode ierr; 4997 const PetscInt *r,*ic,*ics; 4998 PetscInt 
i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 4999 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 5000 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 5001 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 5002 MatScalar *v_work; 5003 5004 PetscFunctionBegin; 5005 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5006 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5007 ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 5008 ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr); 5009 ics = ic; 5010 5011 /* generate work space needed by dense LU factorization */ 5012 ierr = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr); 5013 mwork = v_work + bs; 5014 v_pivots = (PetscInt*)(mwork + bs2); 5015 5016 for (i=0; i<n; i++){ 5017 /* zero rtmp */ 5018 /* L part */ 5019 nz = bi[i+1] - bi[i]; 5020 bjtmp = bj + bi[i]; 5021 for (j=0; j<nz; j++){ 5022 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5023 } 5024 5025 /* U part */ 5026 nz = bdiag[i] - bdiag[i+1]; 5027 bjtmp = bj + bdiag[i+1]+1; 5028 for (j=0; j<nz; j++){ 5029 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5030 } 5031 5032 /* load in initial (unfactored row) */ 5033 nz = ai[r[i]+1] - ai[r[i]]; 5034 ajtmp = aj + ai[r[i]]; 5035 v = aa + bs2*ai[r[i]]; 5036 for (j=0; j<nz; j++) { 5037 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 5038 } 5039 5040 /* elimination */ 5041 bjtmp = bj + bi[i]; 5042 nzL = bi[i+1] - bi[i]; 5043 for(k=0;k < nzL;k++) { 5044 row = bjtmp[k]; 5045 pc = rtmp + bs2*row; 5046 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 5047 if (flg) { 5048 pv = b->a + bs2*bdiag[row]; 5049 Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 5050 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 5051 pv = b->a + bs2*(bdiag[row+1]+1); 5052 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), 
excluding diag */ 5053 for (j=0; j<nz; j++) { 5054 Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 5055 } 5056 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 5057 } 5058 } 5059 5060 /* finished row so stick it into b->a */ 5061 /* L part */ 5062 pv = b->a + bs2*bi[i] ; 5063 pj = b->j + bi[i] ; 5064 nz = bi[i+1] - bi[i]; 5065 for (j=0; j<nz; j++) { 5066 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5067 } 5068 5069 /* Mark diagonal and invert diagonal for simplier triangular solves */ 5070 pv = b->a + bs2*bdiag[i]; 5071 pj = b->j + bdiag[i]; 5072 /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 5073 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5074 ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 5075 5076 /* U part */ 5077 pv = b->a + bs2*(bdiag[i+1]+1); 5078 pj = b->j + bdiag[i+1]+1; 5079 nz = bdiag[i] - bdiag[i+1] - 1; 5080 for (j=0; j<nz; j++){ 5081 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5082 } 5083 } 5084 5085 ierr = PetscFree(rtmp);CHKERRQ(ierr); 5086 ierr = PetscFree(v_work);CHKERRQ(ierr); 5087 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5088 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5089 5090 C->assembled = PETSC_TRUE; 5091 ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 5092 PetscFunctionReturn(0); 5093 } 5094 5095 /* 5096 ilu(0) with natural ordering under new data structure. 5097 See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description 5098 because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct(). 
5099 */ 5100 #undef __FUNCT__ 5101 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 5102 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5103 { 5104 5105 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5106 PetscErrorCode ierr; 5107 PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 5108 PetscInt i,j,nz,*bi,*bj,*bdiag; 5109 5110 PetscFunctionBegin; 5111 /* printf("MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct...\n"); */ 5112 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 5113 b = (Mat_SeqBAIJ*)(fact)->data; 5114 5115 /* allocate matrix arrays for new data structure */ 5116 ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,2*n+2,PetscInt,&b->i);CHKERRQ(ierr); 5117 ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(2*n+2)*sizeof(PetscInt));CHKERRQ(ierr); 5118 b->singlemalloc = PETSC_TRUE; 5119 if (!b->diag){ 5120 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 5121 } 5122 bdiag = b->diag; 5123 5124 if (n > 0) { 5125 ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 5126 } 5127 5128 /* set bi and bj with new data structure */ 5129 bi = b->i; 5130 bj = b->j; 5131 5132 /* L part */ 5133 bi[0] = 0; 5134 for (i=0; i<n; i++){ 5135 nz = adiag[i] - ai[i]; 5136 bi[i+1] = bi[i] + nz; 5137 aj = a->j + ai[i]; 5138 for (j=0; j<nz; j++){ 5139 *bj = aj[j]; bj++; 5140 } 5141 } 5142 5143 /* U part */ 5144 bi[n+1] = bi[n]; 5145 for (i=n-1; i>=0; i--){ 5146 nz = ai[i+1] - adiag[i] - 1; 5147 bi[2*n-i+1] = bi[2*n-i] + nz + 1; 5148 aj = a->j + adiag[i] + 1; 5149 for (j=0; j<nz; j++){ 5150 *bj = aj[j]; bj++; 5151 } 5152 /* diag[i] */ 5153 *bj = i; bj++; 5154 bdiag[i] = bi[2*n-i+1]-1; 5155 } 5156 PetscFunctionReturn(0); 5157 } 5158 5159 #undef __FUNCT__ 5160 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct" 5161 PetscErrorCode 
MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5162 { 5163 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5164 IS isicol; 5165 PetscErrorCode ierr; 5166 const PetscInt *r,*ic; 5167 PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 5168 PetscInt *bi,*cols,nnz,*cols_lvl; 5169 PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 5170 PetscInt i,levels,diagonal_fill; 5171 PetscTruth col_identity,row_identity,both_identity; 5172 PetscReal f; 5173 PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 5174 PetscBT lnkbt; 5175 PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 5176 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 5177 PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 5178 PetscTruth missing; 5179 PetscInt bs=A->rmap->bs,bs2=a->bs2; 5180 5181 PetscFunctionBegin; 5182 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 5183 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 5184 if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 5185 5186 f = info->fill; 5187 levels = (PetscInt)info->levels; 5188 diagonal_fill = (PetscInt)info->diagonal_fill; 5189 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 5190 5191 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5192 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5193 both_identity = (PetscTruth) (row_identity && col_identity); 5194 5195 if (!levels && both_identity) { 5196 /* special case: ilu(0) with natural ordering */ 5197 ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5198 (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 5199 /* set MatSolve routines */ 5200 switch (bs){ 5201 case 2: 5202 fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct; 5203 break; 5204 case 3: 5205 fact->ops->solve = 
MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct; 5206 break; 5207 case 4: 5208 fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct; 5209 break; 5210 case 5: 5211 fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct; 5212 break; 5213 case 6: 5214 fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct; 5215 break; 5216 case 7: 5217 fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct; 5218 break; 5219 default: 5220 fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 5221 break; 5222 } 5223 5224 fact->factor = MAT_FACTOR_ILU; 5225 (fact)->info.factor_mallocs = 0; 5226 (fact)->info.fill_ratio_given = info->fill; 5227 (fact)->info.fill_ratio_needed = 1.0; 5228 b = (Mat_SeqBAIJ*)(fact)->data; 5229 b->row = isrow; 5230 b->col = iscol; 5231 b->icol = isicol; 5232 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5233 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5234 b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 5235 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5236 PetscFunctionReturn(0); 5237 } 5238 5239 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5240 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5241 5242 /* get new row pointers */ 5243 ierr = PetscMalloc((2*n+2)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 5244 bi[0] = 0; 5245 /* bdiag is location of diagonal in factor */ 5246 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 5247 bdiag[0] = 0; 5248 5249 ierr = PetscMalloc((2*n+1)*sizeof(PetscInt**),&bj_ptr);CHKERRQ(ierr); 5250 bjlvl_ptr = (PetscInt**)(bj_ptr + n); 5251 5252 /* create a linked list for storing column indices of the active row */ 5253 nlnk = n + 1; 5254 ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5255 5256 /* initial FreeSpace size is f*(ai[n]+1) */ 5257 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 5258 current_space = free_space; 5259 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 5260 current_space_lvl = free_space_lvl; 5261 5262 for (i=0; i<n; i++) { 5263 nzi = 0; 5264 /* copy current row into linked list */ 5265 nnz = ai[r[i]+1] - ai[r[i]]; 5266 if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 5267 cols = aj + ai[r[i]]; 5268 lnk[i] = -1; /* marker to indicate if diagonal exists */ 5269 ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5270 nzi += nlnk; 5271 5272 /* make sure diagonal entry is included */ 5273 if (diagonal_fill && lnk[i] == -1) { 5274 fm = n; 5275 while (lnk[fm] < i) fm = lnk[fm]; 5276 lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 5277 lnk[fm] = i; 5278 lnk_lvl[i] = 0; 5279 nzi++; dcount++; 5280 } 5281 5282 /* add pivot rows into the active row */ 5283 nzbd = 0; 5284 prow = lnk[n]; 5285 while (prow < i) { 5286 nnz = bdiag[prow]; 5287 cols = bj_ptr[prow] 
+ nnz + 1; 5288 cols_lvl = bjlvl_ptr[prow] + nnz + 1; 5289 nnz = bi[prow+1] - bi[prow] - nnz - 1; 5290 ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 5291 nzi += nlnk; 5292 prow = lnk[prow]; 5293 nzbd++; 5294 } 5295 bdiag[i] = nzbd; 5296 bi[i+1] = bi[i] + nzi; 5297 5298 /* if free space is not available, make more free space */ 5299 if (current_space->local_remaining<nzi) { 5300 nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 5301 ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 5302 ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 5303 reallocs++; 5304 } 5305 5306 /* copy data into free_space and free_space_lvl, then initialize lnk */ 5307 ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 5308 bj_ptr[i] = current_space->array; 5309 bjlvl_ptr[i] = current_space_lvl->array; 5310 5311 /* make sure the active row i has diagonal entry */ 5312 if (*(bj_ptr[i]+bdiag[i]) != i) { 5313 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 5314 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 5315 } 5316 5317 current_space->array += nzi; 5318 current_space->local_used += nzi; 5319 current_space->local_remaining -= nzi; 5320 current_space_lvl->array += nzi; 5321 current_space_lvl->local_used += nzi; 5322 current_space_lvl->local_remaining -= nzi; 5323 } 5324 5325 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5326 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5327 5328 /* destroy list of free space and other temporary arrays */ 5329 ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 5330 5331 /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 5332 ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 5333 5334 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 5335 
ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 5336 ierr = PetscFree(bj_ptr);CHKERRQ(ierr); 5337 5338 #if defined(PETSC_USE_INFO) 5339 { 5340 PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 5341 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 5342 ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5343 ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 5344 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5345 if (diagonal_fill) { 5346 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 5347 } 5348 } 5349 #endif 5350 5351 /* put together the new matrix */ 5352 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5353 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5354 b = (Mat_SeqBAIJ*)(fact)->data; 5355 b->free_a = PETSC_TRUE; 5356 b->free_ij = PETSC_TRUE; 5357 b->singlemalloc = PETSC_FALSE; 5358 ierr = PetscMalloc( (bs2*bi[2*n+1] )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 5359 b->j = bj; 5360 b->i = bi; 5361 b->diag = bdiag; 5362 b->free_diag = PETSC_TRUE; 5363 b->ilen = 0; 5364 b->imax = 0; 5365 b->row = isrow; 5366 b->col = iscol; 5367 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5368 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5369 b->icol = isicol; 5370 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5371 /* In b structure: Free imax, ilen, old a, old j. 
5372 Allocate bdiag, solve_work, new a, new j */ 5373 ierr = PetscLogObjectMemory(fact,bi[2*n+1] * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 5374 b->maxnz = b->nz = bi[2*n+1] ; 5375 (fact)->info.factor_mallocs = reallocs; 5376 (fact)->info.fill_ratio_given = f; 5377 (fact)->info.fill_ratio_needed = ((PetscReal)bi[2*n+1])/((PetscReal)ai[n]); 5378 (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 5379 /* set MatSolve routines */ 5380 if (both_identity){ 5381 switch (bs){ 5382 case 2: 5383 fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct; 5384 break; 5385 case 3: 5386 fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct; 5387 break; 5388 case 4: 5389 fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct; 5390 break; 5391 case 5: 5392 fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct; 5393 break; 5394 case 6: 5395 fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct; 5396 break; 5397 case 7: 5398 fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct; 5399 break; 5400 default: 5401 fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 5402 break; 5403 } 5404 } else { 5405 switch (bs){ 5406 case 2: 5407 fact->ops->solve = MatSolve_SeqBAIJ_2_newdatastruct; 5408 break; 5409 case 3: 5410 fact->ops->solve = MatSolve_SeqBAIJ_3_newdatastruct; 5411 break; 5412 case 4: 5413 fact->ops->solve = MatSolve_SeqBAIJ_4_newdatastruct; 5414 break; 5415 case 5: 5416 fact->ops->solve = MatSolve_SeqBAIJ_5_newdatastruct; 5417 break; 5418 case 6: 5419 fact->ops->solve = MatSolve_SeqBAIJ_6_newdatastruct; 5420 break; 5421 case 7: 5422 fact->ops->solve = MatSolve_SeqBAIJ_7_newdatastruct; 5423 break; 5424 default: 5425 fact->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct; 5426 break; 5427 } 5428 } 5429 PetscFunctionReturn(0); 5430 } 5431 5432 /* 5433 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 5434 except that the data structure 
of Mat_SeqAIJ is slightly different. 5435 Not a good example of code reuse. 5436 */ 5437 #undef __FUNCT__ 5438 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 5439 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5440 { 5441 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5442 IS isicol; 5443 PetscErrorCode ierr; 5444 const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 5445 PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 5446 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 5447 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 5448 PetscTruth col_identity,row_identity,both_identity,flg; 5449 PetscReal f; 5450 PetscTruth newdatastruct=PETSC_FALSE; 5451 5452 PetscFunctionBegin; 5453 ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 5454 if (newdatastruct){ 5455 ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5456 PetscFunctionReturn(0); 5457 } 5458 5459 ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 5460 if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 5461 5462 f = info->fill; 5463 levels = (PetscInt)info->levels; 5464 diagonal_fill = (PetscInt)info->diagonal_fill; 5465 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 5466 5467 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5468 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5469 both_identity = (PetscTruth) (row_identity && col_identity); 5470 5471 if (!levels && both_identity) { /* special case copy the nonzero structure */ 5472 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 5473 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5474 5475 fact->factor = MAT_FACTOR_ILU; 5476 b = (Mat_SeqBAIJ*)(fact)->data; 5477 b->row = isrow; 5478 b->col = iscol; 
5479 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5480 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5481 b->icol = isicol; 5482 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5483 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5484 PetscFunctionReturn(0); 5485 } 5486 5487 /* general case perform the symbolic factorization */ 5488 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5489 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5490 5491 /* get new row pointers */ 5492 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 5493 ainew[0] = 0; 5494 /* don't know how many column pointers are needed so estimate */ 5495 jmax = (PetscInt)(f*ai[n] + 1); 5496 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 5497 /* ajfill is level of fill for each fill entry */ 5498 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 5499 /* fill is a linked list of nonzeros in active row */ 5500 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 5501 /* im is level for each filled value */ 5502 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 5503 /* dloc is location of diagonal in factor */ 5504 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 5505 dloc[0] = 0; 5506 for (prow=0; prow<n; prow++) { 5507 5508 /* copy prow into linked list */ 5509 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 5510 if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 5511 xi = aj + ai[r[prow]]; 5512 fill[n] = n; 5513 fill[prow] = -1; /* marker for diagonal entry */ 5514 while (nz--) { 5515 fm = n; 5516 idx = ic[*xi++]; 5517 do { 5518 m = fm; 5519 fm = fill[m]; 5520 } while (fm < idx); 5521 fill[m] = idx; 5522 fill[idx] = fm; 5523 im[idx] = 0; 5524 } 5525 5526 /* make sure diagonal entry is included */ 5527 if (diagonal_fill && fill[prow] == -1) { 5528 fm = n; 5529 while 
(fill[fm] < prow) fm = fill[fm]; 5530 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 5531 fill[fm] = prow; 5532 im[prow] = 0; 5533 nzf++; 5534 dcount++; 5535 } 5536 5537 nzi = 0; 5538 row = fill[n]; 5539 while (row < prow) { 5540 incrlev = im[row] + 1; 5541 nz = dloc[row]; 5542 xi = ajnew + ainew[row] + nz + 1; 5543 flev = ajfill + ainew[row] + nz + 1; 5544 nnz = ainew[row+1] - ainew[row] - nz - 1; 5545 fm = row; 5546 while (nnz-- > 0) { 5547 idx = *xi++; 5548 if (*flev + incrlev > levels) { 5549 flev++; 5550 continue; 5551 } 5552 do { 5553 m = fm; 5554 fm = fill[m]; 5555 } while (fm < idx); 5556 if (fm != idx) { 5557 im[idx] = *flev + incrlev; 5558 fill[m] = idx; 5559 fill[idx] = fm; 5560 fm = idx; 5561 nzf++; 5562 } else { 5563 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 5564 } 5565 flev++; 5566 } 5567 row = fill[row]; 5568 nzi++; 5569 } 5570 /* copy new filled row into permanent storage */ 5571 ainew[prow+1] = ainew[prow] + nzf; 5572 if (ainew[prow+1] > jmax) { 5573 5574 /* estimate how much additional space we will need */ 5575 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 5576 /* just double the memory each time */ 5577 PetscInt maxadd = jmax; 5578 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 5579 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 5580 jmax += maxadd; 5581 5582 /* allocate a longer ajnew and ajfill */ 5583 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 5584 ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5585 ierr = PetscFree(ajnew);CHKERRQ(ierr); 5586 ajnew = xitmp; 5587 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 5588 ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5589 ierr = PetscFree(ajfill);CHKERRQ(ierr); 5590 ajfill = xitmp; 5591 reallocate++; /* count how many reallocations are needed */ 5592 } 5593 xitmp = ajnew + ainew[prow]; 5594 flev = ajfill + ainew[prow]; 5595 dloc[prow] = nzi; 5596 
fm = fill[n]; 5597 while (nzf--) { 5598 *xitmp++ = fm; 5599 *flev++ = im[fm]; 5600 fm = fill[fm]; 5601 } 5602 /* make sure row has diagonal entry */ 5603 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 5604 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 5605 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 5606 } 5607 } 5608 ierr = PetscFree(ajfill);CHKERRQ(ierr); 5609 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5610 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5611 ierr = PetscFree(fill);CHKERRQ(ierr); 5612 ierr = PetscFree(im);CHKERRQ(ierr); 5613 5614 #if defined(PETSC_USE_INFO) 5615 { 5616 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 5617 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 5618 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5619 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 5620 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5621 if (diagonal_fill) { 5622 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 5623 } 5624 } 5625 #endif 5626 5627 /* put together the new matrix */ 5628 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5629 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5630 b = (Mat_SeqBAIJ*)(fact)->data; 5631 b->free_a = PETSC_TRUE; 5632 b->free_ij = PETSC_TRUE; 5633 b->singlemalloc = PETSC_FALSE; 5634 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 5635 b->j = ajnew; 5636 b->i = ainew; 5637 for (i=0; i<n; i++) dloc[i] += ainew[i]; 5638 b->diag = dloc; 5639 b->free_diag = PETSC_TRUE; 5640 b->ilen = 0; 5641 b->imax = 0; 5642 b->row = isrow; 5643 b->col = iscol; 5644 b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 5645 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5646 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5647 b->icol = isicol; 5648 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5649 /* In b structure: Free imax, ilen, old a, old j. 5650 Allocate dloc, solve_work, new a, new j */ 5651 ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 5652 b->maxnz = b->nz = ainew[n]; 5653 5654 (fact)->info.factor_mallocs = reallocate; 5655 (fact)->info.fill_ratio_given = f; 5656 (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 5657 5658 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5659 PetscFunctionReturn(0); 5660 } 5661 5662 #undef __FUNCT__ 5663 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 5664 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 5665 { 5666 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 5667 /* int i,*AJ=a->j,nz=a->nz; */ 5668 PetscFunctionBegin; 5669 /* Undo Column scaling */ 5670 /* while (nz--) { */ 5671 /* AJ[i] = AJ[i]/4; */ 5672 /* } */ 5673 /* This should really invoke a push/pop logic, but we don't have that yet. */ 5674 A->ops->setunfactored = PETSC_NULL; 5675 PetscFunctionReturn(0); 5676 } 5677 5678 #undef __FUNCT__ 5679 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 5680 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 5681 { 5682 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5683 PetscInt *AJ=a->j,nz=a->nz; 5684 unsigned short *aj=(unsigned short *)AJ; 5685 PetscFunctionBegin; 5686 /* Is this really necessary? */ 5687 while (nz--) { 5688 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 5689 } 5690 A->ops->setunfactored = PETSC_NULL; 5691 PetscFunctionReturn(0); 5692 } 5693 5694 5695