1 #define PETSCMAT_DLL 2 3 4 /* 5 Factorization code for BAIJ format. 6 */ 7 8 #include "../src/mat/impls/baij/seq/baij.h" 9 #include "../src/mat/blockinvert.h" 10 #include "petscbt.h" 11 #include "../src/mat/utils/freespace.h" 12 13 #undef __FUNCT__ 14 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 15 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 16 { 17 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 18 PetscErrorCode ierr; 19 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 20 PetscInt *diag = a->diag; 21 MatScalar *aa=a->a,*v; 22 PetscScalar s1,*x,*b; 23 24 PetscFunctionBegin; 25 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 26 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 27 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28 29 /* forward solve the U^T */ 30 for (i=0; i<n; i++) { 31 32 v = aa + diag[i]; 33 /* multiply by the inverse of the block diagonal */ 34 s1 = (*v++)*x[i]; 35 vi = aj + diag[i] + 1; 36 nz = ai[i+1] - diag[i] - 1; 37 while (nz--) { 38 x[*vi++] -= (*v++)*s1; 39 } 40 x[i] = s1; 41 } 42 /* backward solve the L^T */ 43 for (i=n-1; i>=0; i--){ 44 v = aa + diag[i] - 1; 45 vi = aj + diag[i] - 1; 46 nz = diag[i] - ai[i]; 47 s1 = x[i]; 48 while (nz--) { 49 x[*vi--] -= (*v--)*s1; 50 } 51 } 52 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 53 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 54 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 55 PetscFunctionReturn(0); 56 } 57 58 #undef __FUNCT__ 59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 61 { 62 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 63 PetscErrorCode ierr; 64 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 65 PetscInt *diag = a->diag,oidx; 66 MatScalar *aa=a->a,*v; 67 PetscScalar s1,s2,x1,x2; 68 PetscScalar *x,*b; 69 70 PetscFunctionBegin; 71 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 72 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 73 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 
74 75 /* forward solve the U^T */ 76 idx = 0; 77 for (i=0; i<n; i++) { 78 79 v = aa + 4*diag[i]; 80 /* multiply by the inverse of the block diagonal */ 81 x1 = x[idx]; x2 = x[1+idx]; 82 s1 = v[0]*x1 + v[1]*x2; 83 s2 = v[2]*x1 + v[3]*x2; 84 v += 4; 85 86 vi = aj + diag[i] + 1; 87 nz = ai[i+1] - diag[i] - 1; 88 while (nz--) { 89 oidx = 2*(*vi++); 90 x[oidx] -= v[0]*s1 + v[1]*s2; 91 x[oidx+1] -= v[2]*s1 + v[3]*s2; 92 v += 4; 93 } 94 x[idx] = s1;x[1+idx] = s2; 95 idx += 2; 96 } 97 /* backward solve the L^T */ 98 for (i=n-1; i>=0; i--){ 99 v = aa + 4*diag[i] - 4; 100 vi = aj + diag[i] - 1; 101 nz = diag[i] - ai[i]; 102 idt = 2*i; 103 s1 = x[idt]; s2 = x[1+idt]; 104 while (nz--) { 105 idx = 2*(*vi--); 106 x[idx] -= v[0]*s1 + v[1]*s2; 107 x[idx+1] -= v[2]*s1 + v[3]*s2; 108 v -= 4; 109 } 110 } 111 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 112 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 113 ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 114 PetscFunctionReturn(0); 115 } 116 117 #undef __FUNCT__ 118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 119 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 120 { 121 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 122 PetscErrorCode ierr; 123 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 124 PetscInt *diag = a->diag,oidx; 125 MatScalar *aa=a->a,*v; 126 PetscScalar s1,s2,s3,x1,x2,x3; 127 PetscScalar *x,*b; 128 129 PetscFunctionBegin; 130 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 131 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 132 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 133 134 /* forward solve the U^T */ 135 idx = 0; 136 for (i=0; i<n; i++) { 137 138 v = aa + 9*diag[i]; 139 /* multiply by the inverse of the block diagonal */ 140 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 141 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 142 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 143 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 144 v += 9; 145 146 vi = aj + diag[i] + 1; 147 nz = ai[i+1] - diag[i] - 1; 148 while 
(nz--) { 149 oidx = 3*(*vi++); 150 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 151 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 152 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 153 v += 9; 154 } 155 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 156 idx += 3; 157 } 158 /* backward solve the L^T */ 159 for (i=n-1; i>=0; i--){ 160 v = aa + 9*diag[i] - 9; 161 vi = aj + diag[i] - 1; 162 nz = diag[i] - ai[i]; 163 idt = 3*i; 164 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 165 while (nz--) { 166 idx = 3*(*vi--); 167 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 168 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 169 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 170 v -= 9; 171 } 172 } 173 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 174 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 175 ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 176 PetscFunctionReturn(0); 177 } 178 179 #undef __FUNCT__ 180 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 181 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 182 { 183 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 184 PetscErrorCode ierr; 185 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 186 PetscInt *diag = a->diag,oidx; 187 MatScalar *aa=a->a,*v; 188 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 189 PetscScalar *x,*b; 190 191 PetscFunctionBegin; 192 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 193 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 194 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 195 196 /* forward solve the U^T */ 197 idx = 0; 198 for (i=0; i<n; i++) { 199 200 v = aa + 16*diag[i]; 201 /* multiply by the inverse of the block diagonal */ 202 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 203 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 204 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 205 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 206 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 207 v += 16; 208 209 vi = aj + diag[i] + 1; 210 nz = ai[i+1] - diag[i] - 1; 211 while (nz--) { 212 oidx = 4*(*vi++); 213 
x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 214 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 215 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 216 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 217 v += 16; 218 } 219 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 220 idx += 4; 221 } 222 /* backward solve the L^T */ 223 for (i=n-1; i>=0; i--){ 224 v = aa + 16*diag[i] - 16; 225 vi = aj + diag[i] - 1; 226 nz = diag[i] - ai[i]; 227 idt = 4*i; 228 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 229 while (nz--) { 230 idx = 4*(*vi--); 231 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 232 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 233 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 234 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 235 v -= 16; 236 } 237 } 238 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 239 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 240 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 241 PetscFunctionReturn(0); 242 } 243 244 #undef __FUNCT__ 245 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 246 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 247 { 248 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 249 PetscErrorCode ierr; 250 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 251 PetscInt *diag = a->diag,oidx; 252 MatScalar *aa=a->a,*v; 253 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 254 PetscScalar *x,*b; 255 256 PetscFunctionBegin; 257 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 258 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 259 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 260 261 /* forward solve the U^T */ 262 idx = 0; 263 for (i=0; i<n; i++) { 264 265 v = aa + 25*diag[i]; 266 /* multiply by the inverse of the block diagonal */ 267 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 268 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 269 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 270 s3 = 
v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 271 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 272 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 273 v += 25; 274 275 vi = aj + diag[i] + 1; 276 nz = ai[i+1] - diag[i] - 1; 277 while (nz--) { 278 oidx = 5*(*vi++); 279 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 280 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 281 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 282 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 283 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 284 v += 25; 285 } 286 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 287 idx += 5; 288 } 289 /* backward solve the L^T */ 290 for (i=n-1; i>=0; i--){ 291 v = aa + 25*diag[i] - 25; 292 vi = aj + diag[i] - 1; 293 nz = diag[i] - ai[i]; 294 idt = 5*i; 295 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 296 while (nz--) { 297 idx = 5*(*vi--); 298 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 299 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 300 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 301 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 302 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 303 v -= 25; 304 } 305 } 306 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 307 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 308 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 309 PetscFunctionReturn(0); 310 } 311 312 #undef __FUNCT__ 313 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 314 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 315 { 316 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 317 PetscErrorCode ierr; 318 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 319 PetscInt *diag = a->diag,oidx; 320 MatScalar *aa=a->a,*v; 321 PetscScalar 
s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 322 PetscScalar *x,*b; 323 324 PetscFunctionBegin; 325 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 326 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 327 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 328 329 /* forward solve the U^T */ 330 idx = 0; 331 for (i=0; i<n; i++) { 332 333 v = aa + 36*diag[i]; 334 /* multiply by the inverse of the block diagonal */ 335 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 336 x6 = x[5+idx]; 337 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 338 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 339 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 340 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 341 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 342 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 343 v += 36; 344 345 vi = aj + diag[i] + 1; 346 nz = ai[i+1] - diag[i] - 1; 347 while (nz--) { 348 oidx = 6*(*vi++); 349 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 350 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 351 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 352 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 353 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 354 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 355 v += 36; 356 } 357 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 358 x[5+idx] = s6; 359 idx += 6; 360 } 361 /* backward solve the L^T */ 362 for (i=n-1; i>=0; i--){ 363 v = aa + 36*diag[i] - 36; 364 vi = aj + diag[i] - 1; 365 nz = diag[i] - ai[i]; 366 idt = 6*i; 367 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 368 s6 = x[5+idt]; 369 while (nz--) { 370 idx = 6*(*vi--); 371 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + 
v[5]*s6; 372 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 373 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 374 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 375 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 376 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 377 v -= 36; 378 } 379 } 380 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 381 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 382 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 383 PetscFunctionReturn(0); 384 } 385 386 #undef __FUNCT__ 387 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 388 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 389 { 390 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 391 PetscErrorCode ierr; 392 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 393 PetscInt *diag = a->diag,oidx; 394 MatScalar *aa=a->a,*v; 395 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 396 PetscScalar *x,*b; 397 398 PetscFunctionBegin; 399 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 400 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 401 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 402 403 /* forward solve the U^T */ 404 idx = 0; 405 for (i=0; i<n; i++) { 406 407 v = aa + 49*diag[i]; 408 /* multiply by the inverse of the block diagonal */ 409 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 410 x6 = x[5+idx]; x7 = x[6+idx]; 411 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 412 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 413 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 414 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 415 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 416 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 
+ v[40]*x6 + v[41]*x7; 417 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 418 v += 49; 419 420 vi = aj + diag[i] + 1; 421 nz = ai[i+1] - diag[i] - 1; 422 while (nz--) { 423 oidx = 7*(*vi++); 424 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 425 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 426 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 427 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 428 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 429 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 430 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 431 v += 49; 432 } 433 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 434 x[5+idx] = s6;x[6+idx] = s7; 435 idx += 7; 436 } 437 /* backward solve the L^T */ 438 for (i=n-1; i>=0; i--){ 439 v = aa + 49*diag[i] - 49; 440 vi = aj + diag[i] - 1; 441 nz = diag[i] - ai[i]; 442 idt = 7*i; 443 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 444 s6 = x[5+idt];s7 = x[6+idt]; 445 while (nz--) { 446 idx = 7*(*vi--); 447 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 448 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 449 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 450 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 451 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 452 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 453 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 454 v -= 49; 455 } 456 } 457 ierr = 
VecRestoreArray(bb,&b);CHKERRQ(ierr); 458 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 459 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 460 PetscFunctionReturn(0); 461 } 462 463 /*---------------------------------------------------------------------------------------------*/ 464 #undef __FUNCT__ 465 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 466 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 467 { 468 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 469 IS iscol=a->col,isrow=a->row; 470 PetscErrorCode ierr; 471 const PetscInt *r,*c,*rout,*cout; 472 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 473 PetscInt *diag = a->diag; 474 MatScalar *aa=a->a,*v; 475 PetscScalar s1,*x,*b,*t; 476 477 PetscFunctionBegin; 478 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 479 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 480 t = a->solve_work; 481 482 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 483 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 484 485 /* copy the b into temp work space according to permutation */ 486 for (i=0; i<n; i++) { 487 t[i] = b[c[i]]; 488 } 489 490 /* forward solve the U^T */ 491 for (i=0; i<n; i++) { 492 493 v = aa + diag[i]; 494 /* multiply by the inverse of the block diagonal */ 495 s1 = (*v++)*t[i]; 496 vi = aj + diag[i] + 1; 497 nz = ai[i+1] - diag[i] - 1; 498 while (nz--) { 499 t[*vi++] -= (*v++)*s1; 500 } 501 t[i] = s1; 502 } 503 /* backward solve the L^T */ 504 for (i=n-1; i>=0; i--){ 505 v = aa + diag[i] - 1; 506 vi = aj + diag[i] - 1; 507 nz = diag[i] - ai[i]; 508 s1 = t[i]; 509 while (nz--) { 510 t[*vi--] -= (*v--)*s1; 511 } 512 } 513 514 /* copy t into x according to permutation */ 515 for (i=0; i<n; i++) { 516 x[r[i]] = t[i]; 517 } 518 519 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 520 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 521 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 522 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 523 ierr = PetscLogFlops(2.0*(a->nz) - 
A->cmap->n);CHKERRQ(ierr); 524 PetscFunctionReturn(0); 525 } 526 527 #undef __FUNCT__ 528 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 529 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 530 { 531 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 532 IS iscol=a->col,isrow=a->row; 533 PetscErrorCode ierr; 534 const PetscInt *r,*c,*rout,*cout; 535 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 536 PetscInt *diag = a->diag,ii,ic,ir,oidx; 537 MatScalar *aa=a->a,*v; 538 PetscScalar s1,s2,x1,x2; 539 PetscScalar *x,*b,*t; 540 541 PetscFunctionBegin; 542 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 543 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 544 t = a->solve_work; 545 546 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 547 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 548 549 /* copy the b into temp work space according to permutation */ 550 ii = 0; 551 for (i=0; i<n; i++) { 552 ic = 2*c[i]; 553 t[ii] = b[ic]; 554 t[ii+1] = b[ic+1]; 555 ii += 2; 556 } 557 558 /* forward solve the U^T */ 559 idx = 0; 560 for (i=0; i<n; i++) { 561 562 v = aa + 4*diag[i]; 563 /* multiply by the inverse of the block diagonal */ 564 x1 = t[idx]; x2 = t[1+idx]; 565 s1 = v[0]*x1 + v[1]*x2; 566 s2 = v[2]*x1 + v[3]*x2; 567 v += 4; 568 569 vi = aj + diag[i] + 1; 570 nz = ai[i+1] - diag[i] - 1; 571 while (nz--) { 572 oidx = 2*(*vi++); 573 t[oidx] -= v[0]*s1 + v[1]*s2; 574 t[oidx+1] -= v[2]*s1 + v[3]*s2; 575 v += 4; 576 } 577 t[idx] = s1;t[1+idx] = s2; 578 idx += 2; 579 } 580 /* backward solve the L^T */ 581 for (i=n-1; i>=0; i--){ 582 v = aa + 4*diag[i] - 4; 583 vi = aj + diag[i] - 1; 584 nz = diag[i] - ai[i]; 585 idt = 2*i; 586 s1 = t[idt]; s2 = t[1+idt]; 587 while (nz--) { 588 idx = 2*(*vi--); 589 t[idx] -= v[0]*s1 + v[1]*s2; 590 t[idx+1] -= v[2]*s1 + v[3]*s2; 591 v -= 4; 592 } 593 } 594 595 /* copy t into x according to permutation */ 596 ii = 0; 597 for (i=0; i<n; i++) { 598 ir = 2*r[i]; 599 x[ir] = t[ii]; 600 x[ir+1] = t[ii+1]; 601 ii += 2; 602 } 603 604 ierr = 
ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 605 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 606 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 607 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 608 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 609 PetscFunctionReturn(0); 610 } 611 612 #undef __FUNCT__ 613 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 614 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 615 { 616 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 617 IS iscol=a->col,isrow=a->row; 618 PetscErrorCode ierr; 619 const PetscInt *r,*c,*rout,*cout; 620 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 621 PetscInt *diag = a->diag,ii,ic,ir,oidx; 622 MatScalar *aa=a->a,*v; 623 PetscScalar s1,s2,s3,x1,x2,x3; 624 PetscScalar *x,*b,*t; 625 626 PetscFunctionBegin; 627 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 628 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 629 t = a->solve_work; 630 631 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 632 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 633 634 /* copy the b into temp work space according to permutation */ 635 ii = 0; 636 for (i=0; i<n; i++) { 637 ic = 3*c[i]; 638 t[ii] = b[ic]; 639 t[ii+1] = b[ic+1]; 640 t[ii+2] = b[ic+2]; 641 ii += 3; 642 } 643 644 /* forward solve the U^T */ 645 idx = 0; 646 for (i=0; i<n; i++) { 647 648 v = aa + 9*diag[i]; 649 /* multiply by the inverse of the block diagonal */ 650 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 651 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 652 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 653 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 654 v += 9; 655 656 vi = aj + diag[i] + 1; 657 nz = ai[i+1] - diag[i] - 1; 658 while (nz--) { 659 oidx = 3*(*vi++); 660 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 661 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 662 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 663 v += 9; 664 } 665 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 666 idx += 3; 667 } 668 /* backward solve the L^T */ 669 for (i=n-1; i>=0; i--){ 670 v = aa + 
9*diag[i] - 9; 671 vi = aj + diag[i] - 1; 672 nz = diag[i] - ai[i]; 673 idt = 3*i; 674 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 675 while (nz--) { 676 idx = 3*(*vi--); 677 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 678 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 679 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 680 v -= 9; 681 } 682 } 683 684 /* copy t into x according to permutation */ 685 ii = 0; 686 for (i=0; i<n; i++) { 687 ir = 3*r[i]; 688 x[ir] = t[ii]; 689 x[ir+1] = t[ii+1]; 690 x[ir+2] = t[ii+2]; 691 ii += 3; 692 } 693 694 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 695 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 696 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 697 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 698 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 699 PetscFunctionReturn(0); 700 } 701 702 #undef __FUNCT__ 703 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 704 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 705 { 706 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 707 IS iscol=a->col,isrow=a->row; 708 PetscErrorCode ierr; 709 const PetscInt *r,*c,*rout,*cout; 710 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 711 PetscInt *diag = a->diag,ii,ic,ir,oidx; 712 MatScalar *aa=a->a,*v; 713 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 714 PetscScalar *x,*b,*t; 715 716 PetscFunctionBegin; 717 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 718 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 719 t = a->solve_work; 720 721 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 722 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 723 724 /* copy the b into temp work space according to permutation */ 725 ii = 0; 726 for (i=0; i<n; i++) { 727 ic = 4*c[i]; 728 t[ii] = b[ic]; 729 t[ii+1] = b[ic+1]; 730 t[ii+2] = b[ic+2]; 731 t[ii+3] = b[ic+3]; 732 ii += 4; 733 } 734 735 /* forward solve the U^T */ 736 idx = 0; 737 for (i=0; i<n; i++) { 738 739 v = aa + 16*diag[i]; 740 /* multiply by the inverse of the block diagonal */ 741 x1 = 
t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 742 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 743 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 744 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 745 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 746 v += 16; 747 748 vi = aj + diag[i] + 1; 749 nz = ai[i+1] - diag[i] - 1; 750 while (nz--) { 751 oidx = 4*(*vi++); 752 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 753 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 754 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 755 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 756 v += 16; 757 } 758 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 759 idx += 4; 760 } 761 /* backward solve the L^T */ 762 for (i=n-1; i>=0; i--){ 763 v = aa + 16*diag[i] - 16; 764 vi = aj + diag[i] - 1; 765 nz = diag[i] - ai[i]; 766 idt = 4*i; 767 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 768 while (nz--) { 769 idx = 4*(*vi--); 770 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 771 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 772 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 773 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 774 v -= 16; 775 } 776 } 777 778 /* copy t into x according to permutation */ 779 ii = 0; 780 for (i=0; i<n; i++) { 781 ir = 4*r[i]; 782 x[ir] = t[ii]; 783 x[ir+1] = t[ii+1]; 784 x[ir+2] = t[ii+2]; 785 x[ir+3] = t[ii+3]; 786 ii += 4; 787 } 788 789 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 790 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 791 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 792 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 793 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 794 PetscFunctionReturn(0); 795 } 796 797 #undef __FUNCT__ 798 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 799 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 800 { 801 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 802 IS iscol=a->col,isrow=a->row; 803 PetscErrorCode ierr; 804 
const PetscInt *r,*c,*rout,*cout; 805 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 806 PetscInt *diag = a->diag,ii,ic,ir,oidx; 807 MatScalar *aa=a->a,*v; 808 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 809 PetscScalar *x,*b,*t; 810 811 PetscFunctionBegin; 812 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 813 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 814 t = a->solve_work; 815 816 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 817 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 818 819 /* copy the b into temp work space according to permutation */ 820 ii = 0; 821 for (i=0; i<n; i++) { 822 ic = 5*c[i]; 823 t[ii] = b[ic]; 824 t[ii+1] = b[ic+1]; 825 t[ii+2] = b[ic+2]; 826 t[ii+3] = b[ic+3]; 827 t[ii+4] = b[ic+4]; 828 ii += 5; 829 } 830 831 /* forward solve the U^T */ 832 idx = 0; 833 for (i=0; i<n; i++) { 834 835 v = aa + 25*diag[i]; 836 /* multiply by the inverse of the block diagonal */ 837 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 838 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 839 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 840 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 841 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 842 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 843 v += 25; 844 845 vi = aj + diag[i] + 1; 846 nz = ai[i+1] - diag[i] - 1; 847 while (nz--) { 848 oidx = 5*(*vi++); 849 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 850 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 851 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 852 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 853 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 854 v += 25; 855 } 856 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 857 idx += 5; 858 } 859 /* backward solve the L^T */ 860 for (i=n-1; i>=0; i--){ 861 v = aa + 25*diag[i] - 25; 862 vi = aj + diag[i] - 1; 863 
nz = diag[i] - ai[i]; 864 idt = 5*i; 865 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 866 while (nz--) { 867 idx = 5*(*vi--); 868 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 869 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 870 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 871 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 872 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 873 v -= 25; 874 } 875 } 876 877 /* copy t into x according to permutation */ 878 ii = 0; 879 for (i=0; i<n; i++) { 880 ir = 5*r[i]; 881 x[ir] = t[ii]; 882 x[ir+1] = t[ii+1]; 883 x[ir+2] = t[ii+2]; 884 x[ir+3] = t[ii+3]; 885 x[ir+4] = t[ii+4]; 886 ii += 5; 887 } 888 889 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 890 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 891 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 892 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 893 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 894 PetscFunctionReturn(0); 895 } 896 897 #undef __FUNCT__ 898 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 899 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 900 { 901 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 902 IS iscol=a->col,isrow=a->row; 903 PetscErrorCode ierr; 904 const PetscInt *r,*c,*rout,*cout; 905 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 906 PetscInt *diag = a->diag,ii,ic,ir,oidx; 907 MatScalar *aa=a->a,*v; 908 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 909 PetscScalar *x,*b,*t; 910 911 PetscFunctionBegin; 912 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 913 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 914 t = a->solve_work; 915 916 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 917 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 918 919 /* copy the b into temp work space according to permutation */ 920 ii = 0; 921 for (i=0; i<n; i++) { 922 ic = 6*c[i]; 923 t[ii] = b[ic]; 924 t[ii+1] 
= b[ic+1]; 925 t[ii+2] = b[ic+2]; 926 t[ii+3] = b[ic+3]; 927 t[ii+4] = b[ic+4]; 928 t[ii+5] = b[ic+5]; 929 ii += 6; 930 } 931 932 /* forward solve the U^T */ 933 idx = 0; 934 for (i=0; i<n; i++) { 935 936 v = aa + 36*diag[i]; 937 /* multiply by the inverse of the block diagonal */ 938 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 939 x6 = t[5+idx]; 940 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 941 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 942 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 943 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 944 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 945 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 946 v += 36; 947 948 vi = aj + diag[i] + 1; 949 nz = ai[i+1] - diag[i] - 1; 950 while (nz--) { 951 oidx = 6*(*vi++); 952 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 953 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 954 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 955 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 956 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 957 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 958 v += 36; 959 } 960 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 961 t[5+idx] = s6; 962 idx += 6; 963 } 964 /* backward solve the L^T */ 965 for (i=n-1; i>=0; i--){ 966 v = aa + 36*diag[i] - 36; 967 vi = aj + diag[i] - 1; 968 nz = diag[i] - ai[i]; 969 idt = 6*i; 970 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 971 s6 = t[5+idt]; 972 while (nz--) { 973 idx = 6*(*vi--); 974 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 975 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 976 t[idx+2] 
-= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 977 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 978 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 979 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 980 v -= 36; 981 } 982 } 983 984 /* copy t into x according to permutation */ 985 ii = 0; 986 for (i=0; i<n; i++) { 987 ir = 6*r[i]; 988 x[ir] = t[ii]; 989 x[ir+1] = t[ii+1]; 990 x[ir+2] = t[ii+2]; 991 x[ir+3] = t[ii+3]; 992 x[ir+4] = t[ii+4]; 993 x[ir+5] = t[ii+5]; 994 ii += 6; 995 } 996 997 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 998 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 999 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1000 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1001 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1002 PetscFunctionReturn(0); 1003 } 1004 1005 #undef __FUNCT__ 1006 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1007 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1008 { 1009 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1010 IS iscol=a->col,isrow=a->row; 1011 PetscErrorCode ierr; 1012 const PetscInt *r,*c,*rout,*cout; 1013 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1014 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1015 MatScalar *aa=a->a,*v; 1016 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1017 PetscScalar *x,*b,*t; 1018 1019 PetscFunctionBegin; 1020 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1021 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1022 t = a->solve_work; 1023 1024 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1025 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1026 1027 /* copy the b into temp work space according to permutation */ 1028 ii = 0; 1029 for (i=0; i<n; i++) { 1030 ic = 7*c[i]; 1031 t[ii] = b[ic]; 1032 t[ii+1] = b[ic+1]; 1033 t[ii+2] = b[ic+2]; 1034 t[ii+3] = b[ic+3]; 1035 t[ii+4] = b[ic+4]; 1036 t[ii+5] = b[ic+5]; 1037 
t[ii+6] = b[ic+6]; 1038 ii += 7; 1039 } 1040 1041 /* forward solve the U^T */ 1042 idx = 0; 1043 for (i=0; i<n; i++) { 1044 1045 v = aa + 49*diag[i]; 1046 /* multiply by the inverse of the block diagonal */ 1047 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1048 x6 = t[5+idx]; x7 = t[6+idx]; 1049 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1050 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1051 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1052 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1053 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1054 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1055 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1056 v += 49; 1057 1058 vi = aj + diag[i] + 1; 1059 nz = ai[i+1] - diag[i] - 1; 1060 while (nz--) { 1061 oidx = 7*(*vi++); 1062 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1063 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1064 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1065 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1066 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1067 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1068 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1069 v += 49; 1070 } 1071 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1072 t[5+idx] = s6;t[6+idx] = s7; 1073 idx += 7; 1074 } 1075 /* backward solve the L^T */ 1076 for (i=n-1; i>=0; i--){ 1077 v = aa + 49*diag[i] - 49; 1078 vi = aj + diag[i] - 1; 1079 nz = diag[i] - ai[i]; 1080 idt = 7*i; 
1081 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1082 s6 = t[5+idt];s7 = t[6+idt]; 1083 while (nz--) { 1084 idx = 7*(*vi--); 1085 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1086 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1087 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1088 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1089 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1090 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1091 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1092 v -= 49; 1093 } 1094 } 1095 1096 /* copy t into x according to permutation */ 1097 ii = 0; 1098 for (i=0; i<n; i++) { 1099 ir = 7*r[i]; 1100 x[ir] = t[ii]; 1101 x[ir+1] = t[ii+1]; 1102 x[ir+2] = t[ii+2]; 1103 x[ir+3] = t[ii+3]; 1104 x[ir+4] = t[ii+4]; 1105 x[ir+5] = t[ii+5]; 1106 x[ir+6] = t[ii+6]; 1107 ii += 7; 1108 } 1109 1110 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1111 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1112 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1113 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1114 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1115 PetscFunctionReturn(0); 1116 } 1117 1118 /* ----------------------------------------------------------- */ 1119 #undef __FUNCT__ 1120 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1121 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1122 { 1123 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1124 IS iscol=a->col,isrow=a->row; 1125 PetscErrorCode ierr; 1126 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 1127 PetscInt i,n=a->mbs; 1128 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 1129 MatScalar *aa=a->a,*v; 1130 PetscScalar *x,*b,*s,*t,*ls; 1131 1132 PetscFunctionBegin; 1133 ierr = 
VecGetArray(bb,&b);CHKERRQ(ierr); 1134 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1135 t = a->solve_work; 1136 1137 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1138 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1139 1140 /* forward solve the lower triangular */ 1141 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1142 for (i=1; i<n; i++) { 1143 v = aa + bs2*ai[i]; 1144 vi = aj + ai[i]; 1145 nz = a->diag[i] - ai[i]; 1146 s = t + bs*i; 1147 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1148 while (nz--) { 1149 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1150 v += bs2; 1151 } 1152 } 1153 /* backward solve the upper triangular */ 1154 ls = a->solve_work + A->cmap->n; 1155 for (i=n-1; i>=0; i--){ 1156 v = aa + bs2*(a->diag[i] + 1); 1157 vi = aj + a->diag[i] + 1; 1158 nz = ai[i+1] - a->diag[i] - 1; 1159 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1160 while (nz--) { 1161 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1162 v += bs2; 1163 } 1164 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1165 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1166 } 1167 1168 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1169 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1170 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1171 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1172 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 1173 PetscFunctionReturn(0); 1174 } 1175 1176 #undef __FUNCT__ 1177 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1178 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1179 { 1180 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1181 IS iscol=a->col,isrow=a->row; 1182 PetscErrorCode ierr; 1183 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 1184 PetscInt i,n=a->mbs,nz,idx,idt,idc; 1185 MatScalar *aa=a->a,*v; 1186 PetscScalar 
s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1187 PetscScalar *x,*b,*t; 1188 1189 PetscFunctionBegin; 1190 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1191 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1192 t = a->solve_work; 1193 1194 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1195 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1196 1197 /* forward solve the lower triangular */ 1198 idx = 7*(*r++); 1199 t[0] = b[idx]; t[1] = b[1+idx]; 1200 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1201 t[5] = b[5+idx]; t[6] = b[6+idx]; 1202 1203 for (i=1; i<n; i++) { 1204 v = aa + 49*ai[i]; 1205 vi = aj + ai[i]; 1206 nz = diag[i] - ai[i]; 1207 idx = 7*(*r++); 1208 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1209 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1210 while (nz--) { 1211 idx = 7*(*vi++); 1212 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1213 x4 = t[3+idx];x5 = t[4+idx]; 1214 x6 = t[5+idx];x7 = t[6+idx]; 1215 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1216 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1217 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1218 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1219 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1220 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1221 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1222 v += 49; 1223 } 1224 idx = 7*i; 1225 t[idx] = s1;t[1+idx] = s2; 1226 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1227 t[5+idx] = s6;t[6+idx] = s7; 1228 } 1229 /* backward solve the upper triangular */ 1230 for (i=n-1; i>=0; i--){ 1231 v = aa + 49*diag[i] + 49; 1232 vi = aj + diag[i] + 1; 1233 nz = ai[i+1] - diag[i] - 1; 1234 idt = 7*i; 1235 s1 = t[idt]; s2 = t[1+idt]; 1236 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1237 s6 = t[5+idt];s7 = 
t[6+idt]; 1238 while (nz--) { 1239 idx = 7*(*vi++); 1240 x1 = t[idx]; x2 = t[1+idx]; 1241 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1242 x6 = t[5+idx]; x7 = t[6+idx]; 1243 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1244 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1245 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1246 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1247 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1248 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1249 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1250 v += 49; 1251 } 1252 idc = 7*(*c--); 1253 v = aa + 49*diag[i]; 1254 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1255 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1256 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1257 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1258 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1259 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1260 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1261 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1262 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1263 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1264 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1265 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1266 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1267 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1268 } 1269 1270 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1271 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1272 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1273 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1274 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1275 PetscFunctionReturn(0); 1276 } 1277 1278 #undef __FUNCT__ 1279 #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct" 1280 PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx) 
1281 { 1282 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1283 IS iscol=a->col,isrow=a->row; 1284 PetscErrorCode ierr; 1285 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*vi; 1286 PetscInt i,n=a->mbs,nz,idx,idt,idc,k,m; 1287 MatScalar *aa=a->a,*v; 1288 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1289 PetscScalar *x,*b,*t; 1290 1291 PetscFunctionBegin; 1292 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1293 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1294 t = a->solve_work; 1295 1296 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1297 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1298 1299 /* forward solve the lower triangular */ 1300 idx = 7*r[0]; 1301 t[0] = b[idx]; t[1] = b[1+idx]; 1302 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1303 t[5] = b[5+idx]; t[6] = b[6+idx]; 1304 1305 for (i=1; i<n; i++) { 1306 v = aa + 49*ai[i]; 1307 vi = aj + ai[i]; 1308 nz = ai[i+1] - ai[i]; 1309 idx = 7*r[i]; 1310 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1311 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1312 for(m=0;m<nz;m++){ 1313 idx = 7*vi[m]; 1314 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1315 x4 = t[3+idx];x5 = t[4+idx]; 1316 x6 = t[5+idx];x7 = t[6+idx]; 1317 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1318 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1319 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1320 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1321 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1322 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1323 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1324 v += 49; 1325 } 1326 idx = 7*i; 1327 t[idx] = s1;t[1+idx] = s2; 1328 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1329 t[5+idx] = s6;t[6+idx] = s7; 1330 } 1331 /* backward solve the upper 
triangular */ 1332 for (i=n-1; i>=0; i--){ 1333 k = 2*n-i; 1334 v = aa + 49*ai[k]; 1335 vi = aj + ai[k]; 1336 nz = ai[k+1] - ai[k] - 1; 1337 idt = 7*i; 1338 s1 = t[idt]; s2 = t[1+idt]; 1339 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1340 s6 = t[5+idt];s7 = t[6+idt]; 1341 for(m=0;m<nz;m++){ 1342 idx = 7*vi[m]; 1343 x1 = t[idx]; x2 = t[1+idx]; 1344 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1345 x6 = t[5+idx]; x7 = t[6+idx]; 1346 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1347 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1348 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1349 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1350 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1351 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1352 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1353 v += 49; 1354 } 1355 idc = 7*c[i]; 1356 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1357 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1358 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1359 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1360 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1361 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1362 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1363 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1364 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1365 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1366 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1367 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1368 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1369 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1370 } 1371 1372 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1373 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1374 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1375 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1376 ierr = 
PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1377 PetscFunctionReturn(0); 1378 } 1379 1380 #undef __FUNCT__ 1381 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1382 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1383 { 1384 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1385 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1386 PetscErrorCode ierr; 1387 PetscInt *diag = a->diag,jdx; 1388 const MatScalar *aa=a->a,*v; 1389 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1390 const PetscScalar *b; 1391 1392 PetscFunctionBegin; 1393 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1394 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1395 /* forward solve the lower triangular */ 1396 idx = 0; 1397 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1398 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1399 x[6] = b[6+idx]; 1400 for (i=1; i<n; i++) { 1401 v = aa + 49*ai[i]; 1402 vi = aj + ai[i]; 1403 nz = diag[i] - ai[i]; 1404 idx = 7*i; 1405 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1406 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1407 s7 = b[6+idx]; 1408 while (nz--) { 1409 jdx = 7*(*vi++); 1410 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1411 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1412 x7 = x[6+jdx]; 1413 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1414 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1415 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1416 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1417 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1418 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1419 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1420 v += 49; 1421 } 1422 x[idx] = s1; 1423 x[1+idx] = s2; 1424 x[2+idx] = s3; 1425 x[3+idx] = s4; 1426 x[4+idx] = s5; 
1427 x[5+idx] = s6; 1428 x[6+idx] = s7; 1429 } 1430 /* backward solve the upper triangular */ 1431 for (i=n-1; i>=0; i--){ 1432 v = aa + 49*diag[i] + 49; 1433 vi = aj + diag[i] + 1; 1434 nz = ai[i+1] - diag[i] - 1; 1435 idt = 7*i; 1436 s1 = x[idt]; s2 = x[1+idt]; 1437 s3 = x[2+idt]; s4 = x[3+idt]; 1438 s5 = x[4+idt]; s6 = x[5+idt]; 1439 s7 = x[6+idt]; 1440 while (nz--) { 1441 idx = 7*(*vi++); 1442 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1443 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1444 x7 = x[6+idx]; 1445 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1446 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1447 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1448 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1449 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1450 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1451 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1452 v += 49; 1453 } 1454 v = aa + 49*diag[i]; 1455 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1456 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1457 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1458 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1459 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1460 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1461 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1462 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1463 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1464 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1465 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1466 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1467 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1468 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1469 } 1470 1471 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1472 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1473 ierr = PetscLogFlops(2.0*36*(a->nz) - 
6.0*A->cmap->n);CHKERRQ(ierr); 1474 PetscFunctionReturn(0); 1475 } 1476 1477 #undef __FUNCT__ 1478 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct" 1479 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1480 { 1481 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1482 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 1483 PetscErrorCode ierr; 1484 PetscInt idx,jdx,idt; 1485 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1486 const MatScalar *aa=a->a,*v; 1487 PetscScalar *x; 1488 const PetscScalar *b; 1489 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1490 1491 PetscFunctionBegin; 1492 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1493 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1494 /* forward solve the lower triangular */ 1495 idx = 0; 1496 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1497 x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 1498 for (i=1; i<n; i++) { 1499 v = aa + bs2*ai[i]; 1500 vi = aj + ai[i]; 1501 nz = ai[i+1] - ai[i]; 1502 idx = bs*i; 1503 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1504 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1505 for(k=0;k<nz;k++) { 1506 jdx = bs*vi[k]; 1507 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1508 x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 1509 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1510 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1511 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1512 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1513 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1514 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1515 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1516 v += bs2; 1517 } 1518 1519 x[idx] = s1; 1520 x[1+idx] = s2; 1521 x[2+idx] = s3; 1522 
x[3+idx] = s4; 1523 x[4+idx] = s5; 1524 x[5+idx] = s6; 1525 x[6+idx] = s7; 1526 } 1527 1528 /* backward solve the upper triangular */ 1529 for (i=n-1; i>=0; i--){ 1530 v = aa + bs2*ai[2*n-i]; 1531 vi = aj + ai[2*n-i]; 1532 nz = ai[2*n-i +1] - ai[2*n-i]-1; 1533 idt = bs*i; 1534 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1535 s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 1536 for(k=0;k<nz;k++) { 1537 idx = bs*vi[k]; 1538 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1539 x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 1540 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1541 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1542 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1543 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1544 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1545 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1546 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1547 v += bs2; 1548 } 1549 /* x = inv_diagonal*x */ 1550 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1551 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1552 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1553 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1554 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1555 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1556 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1557 } 1558 1559 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1560 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1561 ierr = PetscLogFlops(2.0*bs2*(a->nz) - 
bs*A->cmap->n);CHKERRQ(ierr); 1562 PetscFunctionReturn(0); 1563 } 1564 1565 #undef __FUNCT__ 1566 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2" 1567 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 1568 { 1569 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1570 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 1571 PetscErrorCode ierr; 1572 PetscInt idx,jdx,idt; 1573 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1574 const MatScalar *aa=a->a,*v; 1575 PetscScalar *x; 1576 const PetscScalar *b; 1577 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1578 1579 PetscFunctionBegin; 1580 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1581 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1582 /* forward solve the lower triangular */ 1583 idx = 0; 1584 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1585 x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 1586 for (i=1; i<n; i++) { 1587 v = aa + bs2*ai[i]; 1588 vi = aj + ai[i]; 1589 nz = ai[i+1] - ai[i]; 1590 idx = bs*i; 1591 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1592 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1593 for(k=0;k<nz;k++) { 1594 jdx = bs*vi[k]; 1595 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1596 x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 1597 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1598 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1599 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1600 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1601 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1602 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1603 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1604 v += bs2; 1605 } 1606 1607 x[idx] = s1; 1608 x[1+idx] = s2; 1609 
x[2+idx] = s3; 1610 x[3+idx] = s4; 1611 x[4+idx] = s5; 1612 x[5+idx] = s6; 1613 x[6+idx] = s7; 1614 } 1615 1616 /* backward solve the upper triangular */ 1617 for (i=n-1; i>=0; i--){ 1618 v = aa + bs2*(adiag[i+1]+1); 1619 vi = aj + adiag[i+1]+1; 1620 nz = adiag[i] - adiag[i+1]-1; 1621 idt = bs*i; 1622 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1623 s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 1624 for(k=0;k<nz;k++) { 1625 idx = bs*vi[k]; 1626 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1627 x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 1628 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1629 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1630 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1631 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1632 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1633 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1634 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1635 v += bs2; 1636 } 1637 /* x = inv_diagonal*x */ 1638 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1639 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1640 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1641 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1642 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1643 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1644 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1645 } 1646 1647 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1648 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1649 ierr = 
PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1650 PetscFunctionReturn(0); 1651 } 1652 1653 #undef __FUNCT__ 1654 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1655 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1656 { 1657 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1658 IS iscol=a->col,isrow=a->row; 1659 PetscErrorCode ierr; 1660 const PetscInt *r,*c,*rout,*cout; 1661 PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1662 const MatScalar *aa=a->a,*v; 1663 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1664 const PetscScalar *b; 1665 PetscFunctionBegin; 1666 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1667 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1668 t = a->solve_work; 1669 1670 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1671 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1672 1673 /* forward solve the lower triangular */ 1674 idx = 6*(*r++); 1675 t[0] = b[idx]; t[1] = b[1+idx]; 1676 t[2] = b[2+idx]; t[3] = b[3+idx]; 1677 t[4] = b[4+idx]; t[5] = b[5+idx]; 1678 for (i=1; i<n; i++) { 1679 v = aa + 36*ai[i]; 1680 vi = aj + ai[i]; 1681 nz = diag[i] - ai[i]; 1682 idx = 6*(*r++); 1683 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1684 s5 = b[4+idx]; s6 = b[5+idx]; 1685 while (nz--) { 1686 idx = 6*(*vi++); 1687 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1688 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1689 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1690 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1691 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1692 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1693 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1694 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1695 v += 36; 1696 } 1697 idx = 6*i; 1698 t[idx] = s1;t[1+idx] = s2; 1699 t[2+idx] = s3;t[3+idx] = s4; 1700 t[4+idx] = s5;t[5+idx] = 
s6; 1701 } 1702 /* backward solve the upper triangular */ 1703 for (i=n-1; i>=0; i--){ 1704 v = aa + 36*diag[i] + 36; 1705 vi = aj + diag[i] + 1; 1706 nz = ai[i+1] - diag[i] - 1; 1707 idt = 6*i; 1708 s1 = t[idt]; s2 = t[1+idt]; 1709 s3 = t[2+idt];s4 = t[3+idt]; 1710 s5 = t[4+idt];s6 = t[5+idt]; 1711 while (nz--) { 1712 idx = 6*(*vi++); 1713 x1 = t[idx]; x2 = t[1+idx]; 1714 x3 = t[2+idx]; x4 = t[3+idx]; 1715 x5 = t[4+idx]; x6 = t[5+idx]; 1716 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1717 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1718 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1719 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1720 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1721 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1722 v += 36; 1723 } 1724 idc = 6*(*c--); 1725 v = aa + 36*diag[i]; 1726 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1727 v[18]*s4+v[24]*s5+v[30]*s6; 1728 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1729 v[19]*s4+v[25]*s5+v[31]*s6; 1730 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1731 v[20]*s4+v[26]*s5+v[32]*s6; 1732 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1733 v[21]*s4+v[27]*s5+v[33]*s6; 1734 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1735 v[22]*s4+v[28]*s5+v[34]*s6; 1736 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1737 v[23]*s4+v[29]*s5+v[35]*s6; 1738 } 1739 1740 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1741 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1742 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1743 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1744 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1745 PetscFunctionReturn(0); 1746 } 1747 1748 #undef __FUNCT__ 1749 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct" 1750 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx) 1751 { 1752 
Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1753 IS iscol=a->col,isrow=a->row; 1754 PetscErrorCode ierr; 1755 const PetscInt *r,*c,*rout,*cout; 1756 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 1757 const MatScalar *aa=a->a,*v; 1758 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1759 const PetscScalar *b; 1760 PetscFunctionBegin; 1761 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1762 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1763 t = a->solve_work; 1764 1765 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1766 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1767 1768 /* forward solve the lower triangular */ 1769 idx = 6*r[0]; 1770 t[0] = b[idx]; t[1] = b[1+idx]; 1771 t[2] = b[2+idx]; t[3] = b[3+idx]; 1772 t[4] = b[4+idx]; t[5] = b[5+idx]; 1773 for (i=1; i<n; i++) { 1774 v = aa + 36*ai[i]; 1775 vi = aj + ai[i]; 1776 nz = ai[i+1] - ai[i]; 1777 idx = 6*r[i]; 1778 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1779 s5 = b[4+idx]; s6 = b[5+idx]; 1780 for(m=0;m<nz;m++){ 1781 idx = 6*vi[m]; 1782 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1783 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1784 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1785 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1786 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1787 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1788 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1789 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1790 v += 36; 1791 } 1792 idx = 6*i; 1793 t[idx] = s1;t[1+idx] = s2; 1794 t[2+idx] = s3;t[3+idx] = s4; 1795 t[4+idx] = s5;t[5+idx] = s6; 1796 } 1797 /* backward solve the upper triangular */ 1798 for (i=n-1; i>=0; i--){ 1799 k = 2*n-i; 1800 v = aa + 36*ai[k]; 1801 vi = aj + ai[k]; 1802 nz = ai[k+1] - ai[k] - 1; 1803 idt = 6*i; 1804 s1 = t[idt]; s2 = t[1+idt]; 1805 s3 = t[2+idt];s4 = t[3+idt]; 
1806 s5 = t[4+idt];s6 = t[5+idt]; 1807 for(m=0;m<nz;m++){ 1808 idx = 6*vi[m]; 1809 x1 = t[idx]; x2 = t[1+idx]; 1810 x3 = t[2+idx]; x4 = t[3+idx]; 1811 x5 = t[4+idx]; x6 = t[5+idx]; 1812 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1813 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1814 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1815 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1816 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1817 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1818 v += 36; 1819 } 1820 idc = 6*c[i]; 1821 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1822 v[18]*s4+v[24]*s5+v[30]*s6; 1823 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1824 v[19]*s4+v[25]*s5+v[31]*s6; 1825 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1826 v[20]*s4+v[26]*s5+v[32]*s6; 1827 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1828 v[21]*s4+v[27]*s5+v[33]*s6; 1829 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1830 v[22]*s4+v[28]*s5+v[34]*s6; 1831 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1832 v[23]*s4+v[29]*s5+v[35]*s6; 1833 } 1834 1835 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1836 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1837 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1838 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1839 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1840 PetscFunctionReturn(0); 1841 } 1842 1843 /* MatSolve_SeqBAIJ_6_NaturalOrdering - forward/backward block solve for 6 unknowns per block, with no row or column index sets; the sweeps work directly in the array of xx. Input: A (its Mat_SeqBAIJ data supplies the i, j, diag index arrays and the block values a), bb (right-hand side, only read) and xx (receives the result). Each 6x6 block is applied with entry (r,c) taken from v[r+6*c]. Returns through PetscFunctionReturn(0); failures of the Vec and logging calls are returned by CHKERRQ. NOTE(review): block row 0 is read and written before either loop runs, so n >= 1 appears to be assumed - confirm with the callers. */ 1844 #undef __FUNCT__ 1845 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1846 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1847 { 1848 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1849 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1850 PetscErrorCode ierr; 1851 PetscInt *diag = a->diag,jdx; 1852 const MatScalar *aa=a->a,*v; 1853 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1854 const PetscScalar *b; 1855 
1856 PetscFunctionBegin; 1857 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1858 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1859 /* forward solve the lower triangular: block row 0 of x is copied from b, every later row subtracts the blocks stored before diag[i] */ 1860 idx = 0; 1861 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1862 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1863 for (i=1; i<n; i++) { 1864 v = aa + 36*ai[i]; 1865 vi = aj + ai[i]; 1866 nz = diag[i] - ai[i]; 1867 idx = 6*i; 1868 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1869 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1870 while (nz--) { 1871 jdx = 6*(*vi++); 1872 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1873 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1874 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1875 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1876 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1877 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1878 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1879 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1880 v += 36; 1881 } 1882 x[idx] = s1; 1883 x[1+idx] = s2; 1884 x[2+idx] = s3; 1885 x[3+idx] = s4; 1886 x[4+idx] = s5; 1887 x[5+idx] = s6; 1888 } 1889 /* backward solve the upper triangular: start one block past diag[i] and run to the end of block row i */ 1890 for (i=n-1; i>=0; i--){ 1891 v = aa + 36*diag[i] + 36; 1892 vi = aj + diag[i] + 1; 1893 nz = ai[i+1] - diag[i] - 1; 1894 idt = 6*i; 1895 s1 = x[idt]; s2 = x[1+idt]; 1896 s3 = x[2+idt]; s4 = x[3+idt]; 1897 s5 = x[4+idt]; s6 = x[5+idt]; 1898 while (nz--) { 1899 idx = 6*(*vi++); 1900 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1901 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1902 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1903 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1904 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1905 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1906 s5 -= v[4]*x1 + 
v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1907 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1908 v += 36; 1909 } /* apply the 6x6 block stored at diag[i] to (s1..s6); presumably the pre-inverted diagonal block - confirm against the factorization */ 1910 v = aa + 36*diag[i]; 1911 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1912 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1913 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1914 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1915 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1916 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1917 } 1918 1919 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1920 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1921 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1922 PetscFunctionReturn(0); 1923 } 1924 /* MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct - forward/backward block solve for 6 unknowns per block without index sets, working in the array of xx (bb is only read). The forward sweep for block row i uses every entry in ai[i]..ai[i+1]-1; the backward sweep for block row i reads slot 2*n-i of ai, subtracts all but the last block of that slot and applies the last block as the final 6x6 multiply. bs and bs2 are taken from the matrix (A->rmap->bs, a->bs2) while the arithmetic is unrolled for 6 components and offsets up to v[35] - presumably bs==6 and bs2==36 whenever this routine is selected; confirm with the caller. NOTE(review): block row 0 is read before the loops, so n >= 1 appears to be assumed. */ 1925 #undef __FUNCT__ 1926 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct" 1927 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1928 { 1929 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1930 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 1931 PetscErrorCode ierr; 1932 PetscInt idx,jdx,idt; 1933 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1934 const MatScalar *aa=a->a,*v; 1935 PetscScalar *x; 1936 const PetscScalar *b; 1937 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1938 1939 PetscFunctionBegin; 1940 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1941 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1942 /* forward solve the lower triangular: block row 0 is copied from b, rows 1..n-1 subtract every block listed in ai[i]..ai[i+1]-1 */ 1943 idx = 0; 1944 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1945 x[4] = b[4+idx];x[5] = b[5+idx]; 1946 for (i=1; i<n; i++) { 1947 v = aa + bs2*ai[i]; 1948 vi = aj + ai[i]; 1949 nz = ai[i+1] - ai[i]; 1950 idx = bs*i; 1951 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1952 s5 = b[4+idx];s6 = b[5+idx]; 1953 for(k=0;k<nz;k++){ 1954 jdx = bs*vi[k]; 1955 x1 = x[jdx];x2 
= x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1956 x5 = x[4+jdx]; x6 = x[5+jdx]; 1957 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1958 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 1959 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1960 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1961 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1962 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1963 v += bs2; 1964 } 1965 1966 x[idx] = s1; 1967 x[1+idx] = s2; 1968 x[2+idx] = s3; 1969 x[3+idx] = s4; 1970 x[4+idx] = s5; 1971 x[5+idx] = s6; 1972 } 1973 1974 /* backward solve the upper triangular: block row i is addressed through slot 2*n-i of ai; nz leaves out the last block of the slot, so v rests on it when the inner loop ends */ 1975 for (i=n-1; i>=0; i--){ 1976 v = aa + bs2*ai[2*n-i]; 1977 vi = aj + ai[2*n-i]; 1978 nz = ai[2*n-i +1] - ai[2*n-i]-1; 1979 idt = bs*i; 1980 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1981 s5 = x[4+idt];s6 = x[5+idt]; 1982 for(k=0;k<nz;k++){ 1983 idx = bs*vi[k]; 1984 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1985 x5 = x[4+idx];x6 = x[5+idx]; 1986 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1987 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 1988 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1989 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1990 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1991 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1992 v += bs2; 1993 } 1994 /* x = inv_diagonal*x */ 1995 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1996 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1997 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1998 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1999 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2000 
x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 2001 } 2002 2003 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2004 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2005 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2006 PetscFunctionReturn(0); 2007 } 2008 /* MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2 - forward/backward block solve for 6 unknowns per block without index sets, working in the array of xx (bb is only read). The forward sweep for block row i uses every entry in ai[i]..ai[i+1]-1. The backward sweep locates block row i through adiag (a->diag): the off-diagonal blocks are entries adiag[i+1]+1 .. adiag[i]-1 and the block at adiag[i] is applied last as a 6x6 multiply. Because i starts at n-1 the routine reads adiag[n], so that array must provide n+1 entries. bs/bs2 come from the matrix while the arithmetic is unrolled for 6 components - presumably bs==6, bs2==36; confirm with the caller. */ 2009 #undef __FUNCT__ 2010 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2" 2011 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 2012 { 2013 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2014 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 2015 PetscErrorCode ierr; 2016 PetscInt idx,jdx,idt; 2017 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2018 const MatScalar *aa=a->a,*v; 2019 PetscScalar *x; 2020 const PetscScalar *b; 2021 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2022 2023 PetscFunctionBegin; 2024 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2025 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2026 /* forward solve the lower triangular: block row 0 is copied from b, rows 1..n-1 subtract every block listed in ai[i]..ai[i+1]-1 */ 2027 idx = 0; 2028 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2029 x[4] = b[4+idx];x[5] = b[5+idx]; 2030 for (i=1; i<n; i++) { 2031 v = aa + bs2*ai[i]; 2032 vi = aj + ai[i]; 2033 nz = ai[i+1] - ai[i]; 2034 idx = bs*i; 2035 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2036 s5 = b[4+idx];s6 = b[5+idx]; 2037 for(k=0;k<nz;k++){ 2038 jdx = bs*vi[k]; 2039 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2040 x5 = x[4+jdx]; x6 = x[5+jdx]; 2041 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2042 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2043 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2044 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2045 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2046 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2047 v += bs2; 2048 } 2049 2050 
x[idx] = s1; 2051 x[1+idx] = s2; 2052 x[2+idx] = s3; 2053 x[3+idx] = s4; 2054 x[4+idx] = s5; 2055 x[5+idx] = s6; 2056 } 2057 2058 /* backward solve the upper triangular: blocks adiag[i+1]+1 .. adiag[i]-1 are subtracted, which leaves v on block adiag[i] for the final multiply */ 2059 for (i=n-1; i>=0; i--){ 2060 v = aa + bs2*(adiag[i+1]+1); 2061 vi = aj + adiag[i+1]+1; 2062 nz = adiag[i] - adiag[i+1]-1; 2063 idt = bs*i; 2064 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2065 s5 = x[4+idt];s6 = x[5+idt]; 2066 for(k=0;k<nz;k++){ 2067 idx = bs*vi[k]; 2068 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2069 x5 = x[4+idx];x6 = x[5+idx]; 2070 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2071 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2072 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2073 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2074 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2075 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2076 v += bs2; 2077 } 2078 /* x = inv_diagonal*x */ 2079 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2080 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2081 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2082 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2083 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2084 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 2085 } 2086 2087 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2088 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2089 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2090 PetscFunctionReturn(0); 2091 } 2092 /* MatSolve_SeqBAIJ_5 - forward/backward block solve for 5 unknowns per block using the row and column index sets stored in the matrix (a->row, a->col). Forward sweep: the right-hand side is read from b at 5*r[i], the blocks stored before diag[i] are subtracted using values already in the work array t (a->solve_work), and the result is stored in t at 5*i. Backward sweep: blocks after diag[i] are subtracted, the 5x5 block at diag[i] is applied (presumably the pre-inverted diagonal block - confirm against the factorization), and the result is written both to t at 5*i and to x at 5*c[i], with c walking the column indices from the last entry to the first. Each block is applied with entry (r,c) taken from v[r+5*c]. bb is only read; xx receives the result. NOTE(review): r[0] and b are read before the loops, so n >= 1 appears to be assumed. */ 2093 #undef __FUNCT__ 2094 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 2095 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 2096 { 2097 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2098 IS iscol=a->col,isrow=a->row; 2099 PetscErrorCode 
ierr; 2100 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 2101 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2102 const MatScalar *aa=a->a,*v; 2103 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2104 const PetscScalar *b; 2105 2106 PetscFunctionBegin; 2107 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2108 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2109 t = a->solve_work; 2110 2111 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2112 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2113 2114 /* forward solve the lower triangular: b is gathered through r, intermediate results live in t */ 2115 idx = 5*(*r++); 2116 t[0] = b[idx]; t[1] = b[1+idx]; 2117 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2118 for (i=1; i<n; i++) { 2119 v = aa + 25*ai[i]; 2120 vi = aj + ai[i]; 2121 nz = diag[i] - ai[i]; 2122 idx = 5*(*r++); 2123 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2124 s5 = b[4+idx]; 2125 while (nz--) { 2126 idx = 5*(*vi++); 2127 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2128 x4 = t[3+idx];x5 = t[4+idx]; 2129 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2130 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2131 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2132 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2133 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2134 v += 25; 2135 } 2136 idx = 5*i; 2137 t[idx] = s1;t[1+idx] = s2; 2138 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2139 } 2140 /* backward solve the upper triangular: start one block past diag[i] and run to the end of block row i */ 2141 for (i=n-1; i>=0; i--){ 2142 v = aa + 25*diag[i] + 25; 2143 vi = aj + diag[i] + 1; 2144 nz = ai[i+1] - diag[i] - 1; 2145 idt = 5*i; 2146 s1 = t[idt]; s2 = t[1+idt]; 2147 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2148 while (nz--) { 2149 idx = 5*(*vi++); 2150 x1 = t[idx]; x2 = t[1+idx]; 2151 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2152 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2153 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2154 s3 -= 
v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2155 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2156 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2157 v += 25; 2158 } 2159 idc = 5*(*c--); 2160 v = aa + 25*diag[i]; 2161 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2162 v[15]*s4+v[20]*s5; 2163 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 2164 v[16]*s4+v[21]*s5; 2165 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2166 v[17]*s4+v[22]*s5; 2167 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 2168 v[18]*s4+v[23]*s5; 2169 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2170 v[19]*s4+v[24]*s5; 2171 } 2172 2173 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2174 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2175 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2176 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2177 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2178 PetscFunctionReturn(0); 2179 } 2180 /* MatSolve_SeqBAIJ_5_newdatastruct - forward/backward block solve for 5 unknowns per block using the index sets a->row and a->col. Forward sweep: b is read at 5*r[i], every block in ai[i]..ai[i+1]-1 is subtracted using values already in the work array t (a->solve_work), and the result goes to t at 5*i. Backward sweep: block row i is addressed through slot k = 2*n-i of ai; all but the last block of the slot are subtracted and the last block is applied as the final 5x5 multiply (presumably the pre-inverted diagonal block - confirm against the factorization); the result is stored in t at 5*i and in x at 5*c[i]. bb is only read; xx receives the result. NOTE(review): r[0] is read before the loops, so n >= 1 appears to be assumed. */ 2181 #undef __FUNCT__ 2182 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct" 2183 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx) 2184 { 2185 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2186 IS iscol=a->col,isrow=a->row; 2187 PetscErrorCode ierr; 2188 const PetscInt *r,*c,*rout,*cout; 2189 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 2190 const MatScalar *aa=a->a,*v; 2191 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2192 const PetscScalar *b; 2193 2194 PetscFunctionBegin; 2195 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2196 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2197 t = a->solve_work; 2198 2199 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2200 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2201 2202 /* forward solve the lower triangular: b is gathered through r, intermediate results live in t */ 2203 idx = 5*r[0]; 2204 t[0] = b[idx]; t[1] = b[1+idx]; 2205 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2206 for (i=1; i<n; i++) { 2207 v = aa + 25*ai[i]; 2208 vi = aj + ai[i]; 
2209 nz = ai[i+1] - ai[i]; 2210 idx = 5*r[i]; 2211 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2212 s5 = b[4+idx]; 2213 for(m=0;m<nz;m++){ 2214 idx = 5*vi[m]; 2215 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2216 x4 = t[3+idx];x5 = t[4+idx]; 2217 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2218 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2219 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2220 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2221 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2222 v += 25; 2223 } 2224 idx = 5*i; 2225 t[idx] = s1;t[1+idx] = s2; 2226 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2227 } 2228 /* backward solve the upper triangular: nz leaves out the last block of slot k, so v rests on it when the inner loop ends */ 2229 for (i=n-1; i>=0; i--){ 2230 k = 2*n-i; 2231 v = aa + 25*ai[k]; 2232 vi = aj + ai[k]; 2233 nz = ai[k+1] - ai[k] - 1; 2234 idt = 5*i; 2235 s1 = t[idt]; s2 = t[1+idt]; 2236 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2237 for(m=0;m<nz;m++){ 2238 idx = 5*vi[m]; 2239 x1 = t[idx]; x2 = t[1+idx]; 2240 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2241 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2242 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2243 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2244 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2245 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2246 v += 25; 2247 } 2248 idc = 5*c[i]; 2249 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2250 v[15]*s4+v[20]*s5; 2251 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 2252 v[16]*s4+v[21]*s5; 2253 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2254 v[17]*s4+v[22]*s5; 2255 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 2256 v[18]*s4+v[23]*s5; 2257 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2258 v[19]*s4+v[24]*s5; 2259 } 2260 2261 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2262 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2263 ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2264 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2265 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2266 PetscFunctionReturn(0); 2267 } 2268 /* MatSolve_SeqBAIJ_5_newdatastruct_v2 - forward/backward block solve for 5 unknowns per block using the index sets a->row and a->col. Forward sweep: b is read at 5*r[i], every block in ai[i]..ai[i+1]-1 is subtracted using values already in the work array t (a->solve_work), and the result goes to t at 5*i. Backward sweep: block row i is located through adiag (a->diag); blocks adiag[i+1]+1 .. adiag[i]-1 are subtracted and the block at adiag[i] is applied last as a 5x5 multiply (presumably the pre-inverted diagonal block - confirm against the factorization); the result is stored in t at 5*i and in x at 5*c[i]. Because i starts at n-1 the routine reads adiag[n], so that array must provide n+1 entries. bb is only read; xx receives the result. */ 2269 #undef __FUNCT__ 2270 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct_v2" 2271 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct_v2(Mat A,Vec bb,Vec xx) 2272 { 2273 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2274 IS iscol=a->col,isrow=a->row; 2275 PetscErrorCode ierr; 2276 const PetscInt *r,*c,*rout,*cout; 2277 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 2278 const MatScalar *aa=a->a,*v; 2279 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2280 const PetscScalar *b; 2281 2282 PetscFunctionBegin; 2283 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2284 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2285 t = a->solve_work; 2286 2287 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2288 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2289 2290 /* forward solve the lower triangular: b is gathered through r, intermediate results live in t */ 2291 idx = 5*r[0]; 2292 t[0] = b[idx]; t[1] = b[1+idx]; 2293 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2294 for (i=1; i<n; i++) { 2295 v = aa + 25*ai[i]; 2296 vi = aj + ai[i]; 2297 nz = ai[i+1] - ai[i]; 2298 idx = 5*r[i]; 2299 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2300 s5 = b[4+idx]; 2301 for(m=0;m<nz;m++){ 2302 idx = 5*vi[m]; 2303 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2304 x4 = t[3+idx];x5 = t[4+idx]; 2305 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2306 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2307 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2308 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2309 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2310 v += 25; 2311 } 2312 idx = 5*i; 2313 t[idx] = s1;t[1+idx] = s2; 2314 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2315 } 2316 /* backward solve the upper triangular */ 2317 for 
(i=n-1; i>=0; i--){ 2318 v = aa + 25*(adiag[i+1]+1); 2319 vi = aj + adiag[i+1]+1; 2320 nz = adiag[i] - adiag[i+1] - 1; 2321 idt = 5*i; 2322 s1 = t[idt]; s2 = t[1+idt]; 2323 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2324 for(m=0;m<nz;m++){ 2325 idx = 5*vi[m]; 2326 x1 = t[idx]; x2 = t[1+idx]; 2327 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2328 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2329 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2330 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2331 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2332 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2333 v += 25; 2334 } 2335 idc = 5*c[i]; 2336 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2337 v[15]*s4+v[20]*s5; 2338 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 2339 v[16]*s4+v[21]*s5; 2340 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2341 v[17]*s4+v[22]*s5; 2342 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 2343 v[18]*s4+v[23]*s5; 2344 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2345 v[19]*s4+v[24]*s5; 2346 } 2347 2348 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2349 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2350 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2351 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2352 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2353 PetscFunctionReturn(0); 2354 } 2355 /* MatSolve_SeqBAIJ_5_NaturalOrdering - forward/backward block solve for 5 unknowns per block with no row or column index sets; the sweeps work directly in the array of xx. Input: A (its Mat_SeqBAIJ data supplies the i, j, diag index arrays and the block values a), bb (right-hand side, only read) and xx (receives the result). Each 5x5 block is applied with entry (r,c) taken from v[r+5*c]. Returns through PetscFunctionReturn(0); failures of the Vec and logging calls are returned by CHKERRQ. NOTE(review): block row 0 is read and written before either loop runs, so n >= 1 appears to be assumed - confirm with the callers. */ 2356 #undef __FUNCT__ 2357 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 2358 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 2359 { 2360 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2361 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2362 PetscErrorCode ierr; 2363 PetscInt *diag = a->diag,jdx; 2364 const MatScalar *aa=a->a,*v; 2365 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2366 const PetscScalar *b; 2367 2368 PetscFunctionBegin; 2369 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2370 ierr = 
VecGetArray(xx,&x);CHKERRQ(ierr); 2371 /* forward solve the lower triangular: block row 0 of x is copied from b, every later row subtracts the blocks stored before diag[i] */ 2372 idx = 0; 2373 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2374 for (i=1; i<n; i++) { 2375 v = aa + 25*ai[i]; 2376 vi = aj + ai[i]; 2377 nz = diag[i] - ai[i]; 2378 idx = 5*i; 2379 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 2380 while (nz--) { 2381 jdx = 5*(*vi++); 2382 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2383 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2384 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2385 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2386 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2387 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2388 v += 25; 2389 } 2390 x[idx] = s1; 2391 x[1+idx] = s2; 2392 x[2+idx] = s3; 2393 x[3+idx] = s4; 2394 x[4+idx] = s5; 2395 } 2396 /* backward solve the upper triangular: start one block past diag[i], run to the end of block row i, then apply the block stored at diag[i] (presumably the pre-inverted diagonal block - confirm) */ 2397 for (i=n-1; i>=0; i--){ 2398 v = aa + 25*diag[i] + 25; 2399 vi = aj + diag[i] + 1; 2400 nz = ai[i+1] - diag[i] - 1; 2401 idt = 5*i; 2402 s1 = x[idt]; s2 = x[1+idt]; 2403 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 2404 while (nz--) { 2405 idx = 5*(*vi++); 2406 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2407 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2408 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2409 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2410 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2411 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2412 v += 25; 2413 } 2414 v = aa + 25*diag[i]; 2415 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2416 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2417 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2418 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2419 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + 
v[19]*s4 + v[24]*s5; 2420 } 2421 2422 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2423 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2424 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2425 PetscFunctionReturn(0); 2426 } 2427 /* MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct - forward/backward block solve for 5 unknowns per block without index sets, working in the array of xx (bb is only read). The forward sweep for block row i uses every entry in ai[i]..ai[i+1]-1; the backward sweep for block row i reads slot 2*n-i of ai, subtracts all but the last block of that slot and applies the last block as the final 5x5 multiply. Each block is applied with entry (r,c) taken from v[r+5*c]. NOTE(review): block row 0 is read before the loops, so n >= 1 appears to be assumed. */ 2428 #undef __FUNCT__ 2429 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct" 2430 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2431 { 2432 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2433 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2434 PetscErrorCode ierr; 2435 PetscInt jdx; 2436 const MatScalar *aa=a->a,*v; 2437 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2438 const PetscScalar *b; 2439 2440 PetscFunctionBegin; 2441 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2442 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2443 /* forward solve the lower triangular: block row 0 is copied from b, rows 1..n-1 subtract every block listed in ai[i]..ai[i+1]-1 */ 2444 idx = 0; 2445 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2446 for (i=1; i<n; i++) { 2447 v = aa + 25*ai[i]; 2448 vi = aj + ai[i]; 2449 nz = ai[i+1] - ai[i]; 2450 idx = 5*i; 2451 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 2452 for(k=0;k<nz;k++) { 2453 jdx = 5*vi[k]; 2454 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2455 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2456 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2457 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2458 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2459 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2460 v += 25; 2461 } 2462 x[idx] = s1; 2463 x[1+idx] = s2; 2464 x[2+idx] = s3; 2465 x[3+idx] = s4; 2466 x[4+idx] = s5; 2467 } 2468 2469 /* backward solve the upper triangular: nz leaves out the last block of slot 2*n-i, so v rests on it when the inner loop ends */ 2470 for (i=n-1; i>=0; i--){ 2471 v = aa + 25*ai[2*n-i]; 2472 vi = aj + ai[2*n-i]; 2473 nz = ai[2*n-i +1] - ai[2*n-i]-1; 2474 idt = 5*i; 2475 s1 = x[idt]; s2 = x[1+idt]; 2476 s3 = x[2+idt];s4 = 
x[3+idt]; s5 = x[4+idt]; 2477 for(k=0;k<nz;k++){ 2478 idx = 5*vi[k]; 2479 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2480 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2481 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2482 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2483 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2484 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2485 v += 25; 2486 } 2487 /* x = inv_diagonal*x */ 2488 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2489 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2490 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2491 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2492 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2493 } 2494 2495 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2496 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2497 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2498 PetscFunctionReturn(0); 2499 } 2500 /* MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2 - forward/backward block solve for 5 unknowns per block without index sets, working in the array of xx (bb is only read). The forward sweep for block row i uses every entry in ai[i]..ai[i+1]-1. The backward sweep locates block row i through adiag (a->diag): blocks adiag[i+1]+1 .. adiag[i]-1 are subtracted and the block at adiag[i] is applied last as a 5x5 multiply. Because i starts at n-1 the routine reads adiag[n], so that array must provide n+1 entries. NOTE(review): block row 0 is read before the loops, so n >= 1 appears to be assumed. */ 2501 #undef __FUNCT__ 2502 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2" 2503 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 2504 { 2505 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2506 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 2507 PetscErrorCode ierr; 2508 PetscInt jdx; 2509 const MatScalar *aa=a->a,*v; 2510 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2511 const PetscScalar *b; 2512 2513 PetscFunctionBegin; 2514 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2515 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2516 /* forward solve the lower triangular: block row 0 is copied from b, rows 1..n-1 subtract every block listed in ai[i]..ai[i+1]-1 */ 2517 idx = 0; 2518 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2519 for (i=1; i<n; i++) { 2520 v = aa + 25*ai[i]; 2521 vi = aj + ai[i]; 2522 nz = ai[i+1] - ai[i]; 2523 idx = 5*i; 2524 s1 = b[idx];s2 = 
b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 2525 for(k=0;k<nz;k++) { 2526 jdx = 5*vi[k]; 2527 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2528 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2529 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2530 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2531 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2532 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2533 v += 25; 2534 } 2535 x[idx] = s1; 2536 x[1+idx] = s2; 2537 x[2+idx] = s3; 2538 x[3+idx] = s4; 2539 x[4+idx] = s5; 2540 } 2541 2542 /* backward solve the upper triangular: blocks adiag[i+1]+1 .. adiag[i]-1 are subtracted, which leaves v on block adiag[i] for the final multiply */ 2543 for (i=n-1; i>=0; i--){ 2544 v = aa + 25*(adiag[i+1]+1); 2545 vi = aj + adiag[i+1]+1; 2546 nz = adiag[i] - adiag[i+1]-1; 2547 idt = 5*i; 2548 s1 = x[idt]; s2 = x[1+idt]; 2549 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 2550 for(k=0;k<nz;k++){ 2551 idx = 5*vi[k]; 2552 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2553 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2554 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2555 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2556 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2557 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2558 v += 25; 2559 } 2560 /* x = inv_diagonal*x */ 2561 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2562 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2563 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2564 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2565 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2566 } 2567 2568 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2569 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2570 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2571 PetscFunctionReturn(0); 2572 } 2573 /* MatSolve_SeqBAIJ_4 - forward/backward block solve for 4 unknowns per block using the row and column index sets stored in the matrix (a->row, a->col). Forward sweep: the right-hand side is read from b at 4*r[i], the blocks stored before diag[i] are subtracted using values already in the work array t (a->solve_work), and the result is stored in t at 4*i. Backward sweep: blocks after diag[i] are subtracted, the 4x4 block at diag[i] is applied (presumably the pre-inverted diagonal block - confirm against the factorization), and the result is written both to t at 4*i and to x at 4*c[i], with c walking the column indices from the last entry to the first. Each block is applied with entry (r,c) taken from v[r+4*c]. bb is only read; xx receives the result. NOTE(review): r[0] and b are read before the loops, so n >= 1 appears to be assumed. */ 2574 #undef __FUNCT__ 2575 #define __FUNCT__ 
"MatSolve_SeqBAIJ_4" 2576 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 2577 { 2578 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2579 IS iscol=a->col,isrow=a->row; 2580 PetscErrorCode ierr; 2581 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2582 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2583 const MatScalar *aa=a->a,*v; 2584 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2585 const PetscScalar *b; 2586 2587 PetscFunctionBegin; 2588 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2589 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2590 t = a->solve_work; 2591 2592 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2593 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2594 2595 /* forward solve the lower triangular: b is gathered through r, intermediate results live in t */ 2596 idx = 4*(*r++); 2597 t[0] = b[idx]; t[1] = b[1+idx]; 2598 t[2] = b[2+idx]; t[3] = b[3+idx]; 2599 for (i=1; i<n; i++) { 2600 v = aa + 16*ai[i]; 2601 vi = aj + ai[i]; 2602 nz = diag[i] - ai[i]; 2603 idx = 4*(*r++); 2604 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2605 while (nz--) { 2606 idx = 4*(*vi++); 2607 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2608 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2609 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2610 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2611 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2612 v += 16; 2613 } 2614 idx = 4*i; 2615 t[idx] = s1;t[1+idx] = s2; 2616 t[2+idx] = s3;t[3+idx] = s4; 2617 } 2618 /* backward solve the upper triangular: start one block past diag[i] and run to the end of block row i */ 2619 for (i=n-1; i>=0; i--){ 2620 v = aa + 16*diag[i] + 16; 2621 vi = aj + diag[i] + 1; 2622 nz = ai[i+1] - diag[i] - 1; 2623 idt = 4*i; 2624 s1 = t[idt]; s2 = t[1+idt]; 2625 s3 = t[2+idt];s4 = t[3+idt]; 2626 while (nz--) { 2627 idx = 4*(*vi++); 2628 x1 = t[idx]; x2 = t[1+idx]; 2629 x3 = t[2+idx]; x4 = t[3+idx]; 2630 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2631 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2632 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2633 s4 -= 
v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2634 v += 16; 2635 } 2636 idc = 4*(*c--); 2637 v = aa + 16*diag[i]; 2638 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2639 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2640 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2641 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2642 } 2643 2644 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2645 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2646 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2647 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2648 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2649 PetscFunctionReturn(0); 2650 } 2651 /* MatSolve_SeqBAIJ_4_newdatastruct - forward/backward block solve for 4 unknowns per block using the index sets a->row and a->col. Forward sweep: b is read at 4*r[i], every block in ai[i]..ai[i+1]-1 is subtracted using values already in the work array t (a->solve_work), and the result goes to t at 4*i. Backward sweep: block row i is addressed through slot k = 2*n-i of ai; all but the last block of the slot are subtracted and the last block is applied as the final 4x4 multiply (presumably the pre-inverted diagonal block - confirm against the factorization); the result is stored in t at 4*i and in x at 4*c[i]. bb is only read; xx receives the result. NOTE(review): r[0] is read before the loops, so n >= 1 appears to be assumed. */ 2652 #undef __FUNCT__ 2653 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct" 2654 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx) 2655 { 2656 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2657 IS iscol=a->col,isrow=a->row; 2658 PetscErrorCode ierr; 2659 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 2660 const PetscInt *r,*c,*rout,*cout; 2661 const MatScalar *aa=a->a,*v; 2662 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2663 const PetscScalar *b; 2664 2665 PetscFunctionBegin; 2666 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2667 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2668 t = a->solve_work; 2669 2670 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2671 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2672 2673 /* forward solve the lower triangular: b is gathered through r, intermediate results live in t */ 2674 idx = 4*r[0]; 2675 t[0] = b[idx]; t[1] = b[1+idx]; 2676 t[2] = b[2+idx]; t[3] = b[3+idx]; 2677 for (i=1; i<n; i++) { 2678 v = aa + 16*ai[i]; 2679 vi = aj + ai[i]; 2680 nz = ai[i+1] - ai[i]; 2681 idx = 4*r[i]; 2682 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2683 for(m=0;m<nz;m++){ 2684 idx = 4*vi[m]; 2685 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2686 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2687 s2 -= v[1]*x1 + v[5]*x2 + 
v[9]*x3 + v[13]*x4; 2688 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2689 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2690 v += 16; 2691 } 2692 idx = 4*i; 2693 t[idx] = s1;t[1+idx] = s2; 2694 t[2+idx] = s3;t[3+idx] = s4; 2695 } 2696 /* backward solve the upper triangular: nz leaves out the last block of slot k, so v rests on it when the inner loop ends */ 2697 for (i=n-1; i>=0; i--){ 2698 k = 2*n-i; 2699 v = aa + 16*ai[k]; 2700 vi = aj + ai[k]; 2701 nz = ai[k+1] - ai[k] - 1; 2702 idt = 4*i; 2703 s1 = t[idt]; s2 = t[1+idt]; 2704 s3 = t[2+idt];s4 = t[3+idt]; 2705 for(m=0;m<nz;m++){ 2706 idx = 4*vi[m]; 2707 x1 = t[idx]; x2 = t[1+idx]; 2708 x3 = t[2+idx]; x4 = t[3+idx]; 2709 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2710 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2711 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2712 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2713 v += 16; 2714 } 2715 idc = 4*c[i]; 2716 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2717 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2718 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2719 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2720 } 2721 2722 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2723 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2724 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2725 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2726 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2727 PetscFunctionReturn(0); 2728 } 2729 /* MatSolve_SeqBAIJ_4_newdatastruct_v2 - forward/backward block solve for 4 unknowns per block using the index sets a->row and a->col. Forward sweep: b is read at 4*r[i], every block in ai[i]..ai[i+1]-1 is subtracted using values already in the work array t (a->solve_work), and the result goes to t at 4*i. Backward sweep: block row i is located through adiag (a->diag); blocks adiag[i+1]+1 .. adiag[i]-1 are subtracted and the block at adiag[i] is applied last as a 4x4 multiply (presumably the pre-inverted diagonal block - confirm against the factorization); the result is stored in t at 4*i and in x at 4*c[i]. Because i starts at n-1 the routine reads adiag[n], so that array must provide n+1 entries. bb is only read; xx receives the result. */ 2730 #undef __FUNCT__ 2731 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct_v2" 2732 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct_v2(Mat A,Vec bb,Vec xx) 2733 { 2734 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2735 IS iscol=a->col,isrow=a->row; 2736 PetscErrorCode ierr; 2737 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 2738 const PetscInt *r,*c,*rout,*cout; 2739 const MatScalar *aa=a->a,*v; 2740 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2741 const PetscScalar *b; 2742 2743 PetscFunctionBegin; 2744 ierr = 
VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2745 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2746 t = a->solve_work; 2747 2748 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2749 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2750 2751 /* forward solve the lower triangular: b is gathered through r, intermediate results live in t */ 2752 idx = 4*r[0]; 2753 t[0] = b[idx]; t[1] = b[1+idx]; 2754 t[2] = b[2+idx]; t[3] = b[3+idx]; 2755 for (i=1; i<n; i++) { 2756 v = aa + 16*ai[i]; 2757 vi = aj + ai[i]; 2758 nz = ai[i+1] - ai[i]; 2759 idx = 4*r[i]; 2760 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2761 for(m=0;m<nz;m++){ 2762 idx = 4*vi[m]; 2763 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2764 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2765 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2766 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2767 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2768 v += 16; 2769 } 2770 idx = 4*i; 2771 t[idx] = s1;t[1+idx] = s2; 2772 t[2+idx] = s3;t[3+idx] = s4; 2773 } 2774 /* backward solve the upper triangular: blocks adiag[i+1]+1 .. adiag[i]-1 are subtracted, which leaves v on block adiag[i] for the final multiply */ 2775 for (i=n-1; i>=0; i--){ 2776 v = aa + 16*(adiag[i+1]+1); 2777 vi = aj + adiag[i+1]+1; 2778 nz = adiag[i] - adiag[i+1] - 1; 2779 idt = 4*i; 2780 s1 = t[idt]; s2 = t[1+idt]; 2781 s3 = t[2+idt];s4 = t[3+idt]; 2782 for(m=0;m<nz;m++){ 2783 idx = 4*vi[m]; 2784 x1 = t[idx]; x2 = t[1+idx]; 2785 x3 = t[2+idx]; x4 = t[3+idx]; 2786 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2787 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2788 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2789 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2790 v += 16; 2791 } 2792 idc = 4*c[i]; 2793 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2794 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2795 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2796 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2797 } 2798 2799 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2800 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2801 ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2802 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2803 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2804 PetscFunctionReturn(0); 2805 } 2806 2807 #undef __FUNCT__ 2808 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 2809 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 2810 { 2811 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2812 IS iscol=a->col,isrow=a->row; 2813 PetscErrorCode ierr; 2814 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2815 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2816 const MatScalar *aa=a->a,*v; 2817 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 2818 PetscScalar *x; 2819 const PetscScalar *b; 2820 2821 PetscFunctionBegin; 2822 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2823 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2824 t = (MatScalar *)a->solve_work; 2825 2826 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2827 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2828 2829 /* forward solve the lower triangular */ 2830 idx = 4*(*r++); 2831 t[0] = (MatScalar)b[idx]; 2832 t[1] = (MatScalar)b[1+idx]; 2833 t[2] = (MatScalar)b[2+idx]; 2834 t[3] = (MatScalar)b[3+idx]; 2835 for (i=1; i<n; i++) { 2836 v = aa + 16*ai[i]; 2837 vi = aj + ai[i]; 2838 nz = diag[i] - ai[i]; 2839 idx = 4*(*r++); 2840 s1 = (MatScalar)b[idx]; 2841 s2 = (MatScalar)b[1+idx]; 2842 s3 = (MatScalar)b[2+idx]; 2843 s4 = (MatScalar)b[3+idx]; 2844 while (nz--) { 2845 idx = 4*(*vi++); 2846 x1 = t[idx]; 2847 x2 = t[1+idx]; 2848 x3 = t[2+idx]; 2849 x4 = t[3+idx]; 2850 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2851 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2852 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2853 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2854 v += 16; 2855 } 2856 idx = 4*i; 2857 t[idx] = s1; 2858 t[1+idx] = s2; 2859 t[2+idx] = s3; 2860 t[3+idx] = s4; 2861 } 2862 /* backward solve the upper triangular */ 2863 for (i=n-1; i>=0; 
i--){ 2864 v = aa + 16*diag[i] + 16; 2865 vi = aj + diag[i] + 1; 2866 nz = ai[i+1] - diag[i] - 1; 2867 idt = 4*i; 2868 s1 = t[idt]; 2869 s2 = t[1+idt]; 2870 s3 = t[2+idt]; 2871 s4 = t[3+idt]; 2872 while (nz--) { 2873 idx = 4*(*vi++); 2874 x1 = t[idx]; 2875 x2 = t[1+idx]; 2876 x3 = t[2+idx]; 2877 x4 = t[3+idx]; 2878 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2879 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2880 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2881 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2882 v += 16; 2883 } 2884 idc = 4*(*c--); 2885 v = aa + 16*diag[i]; 2886 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2887 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2888 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2889 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2890 x[idc] = (PetscScalar)t[idt]; 2891 x[1+idc] = (PetscScalar)t[1+idt]; 2892 x[2+idc] = (PetscScalar)t[2+idt]; 2893 x[3+idc] = (PetscScalar)t[3+idt]; 2894 } 2895 2896 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2897 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2898 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2899 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2900 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2901 PetscFunctionReturn(0); 2902 } 2903 2904 #if defined (PETSC_HAVE_SSE) 2905 2906 #include PETSC_HAVE_SSE 2907 2908 #undef __FUNCT__ 2909 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 2910 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 2911 { 2912 /* 2913 Note: This code uses demotion of double 2914 to float when performing the mixed-mode computation. 2915 This may not be numerically reasonable for all applications. 
2916 */ 2917 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2918 IS iscol=a->col,isrow=a->row; 2919 PetscErrorCode ierr; 2920 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 2921 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2922 MatScalar *aa=a->a,*v; 2923 PetscScalar *x,*b,*t; 2924 2925 /* Make space in temp stack for 16 Byte Aligned arrays */ 2926 float ssealignedspace[11],*tmps,*tmpx; 2927 unsigned long offset; 2928 2929 PetscFunctionBegin; 2930 SSE_SCOPE_BEGIN; 2931 2932 offset = (unsigned long)ssealignedspace % 16; 2933 if (offset) offset = (16 - offset)/4; 2934 tmps = &ssealignedspace[offset]; 2935 tmpx = &ssealignedspace[offset+4]; 2936 PREFETCH_NTA(aa+16*ai[1]); 2937 2938 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2939 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2940 t = a->solve_work; 2941 2942 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2943 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2944 2945 /* forward solve the lower triangular */ 2946 idx = 4*(*r++); 2947 t[0] = b[idx]; t[1] = b[1+idx]; 2948 t[2] = b[2+idx]; t[3] = b[3+idx]; 2949 v = aa + 16*ai[1]; 2950 2951 for (i=1; i<n;) { 2952 PREFETCH_NTA(&v[8]); 2953 vi = aj + ai[i]; 2954 nz = diag[i] - ai[i]; 2955 idx = 4*(*r++); 2956 2957 /* Demote sum from double to float */ 2958 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 2959 LOAD_PS(tmps,XMM7); 2960 2961 while (nz--) { 2962 PREFETCH_NTA(&v[16]); 2963 idx = 4*(*vi++); 2964 2965 /* Demote solution (so far) from double to float */ 2966 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 2967 2968 /* 4x4 Matrix-Vector product with negative accumulation: */ 2969 SSE_INLINE_BEGIN_2(tmpx,v) 2970 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2971 2972 /* First Column */ 2973 SSE_COPY_PS(XMM0,XMM6) 2974 SSE_SHUFFLE(XMM0,XMM0,0x00) 2975 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2976 SSE_SUB_PS(XMM7,XMM0) 2977 2978 /* Second Column */ 2979 SSE_COPY_PS(XMM1,XMM6) 2980 SSE_SHUFFLE(XMM1,XMM1,0x55) 2981 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2982 
SSE_SUB_PS(XMM7,XMM1) 2983 2984 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2985 2986 /* Third Column */ 2987 SSE_COPY_PS(XMM2,XMM6) 2988 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2989 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2990 SSE_SUB_PS(XMM7,XMM2) 2991 2992 /* Fourth Column */ 2993 SSE_COPY_PS(XMM3,XMM6) 2994 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2995 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2996 SSE_SUB_PS(XMM7,XMM3) 2997 SSE_INLINE_END_2 2998 2999 v += 16; 3000 } 3001 idx = 4*i; 3002 v = aa + 16*ai[++i]; 3003 PREFETCH_NTA(v); 3004 STORE_PS(tmps,XMM7); 3005 3006 /* Promote result from float to double */ 3007 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 3008 } 3009 /* backward solve the upper triangular */ 3010 idt = 4*(n-1); 3011 ai16 = 16*diag[n-1]; 3012 v = aa + ai16 + 16; 3013 for (i=n-1; i>=0;){ 3014 PREFETCH_NTA(&v[8]); 3015 vi = aj + diag[i] + 1; 3016 nz = ai[i+1] - diag[i] - 1; 3017 3018 /* Demote accumulator from double to float */ 3019 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 3020 LOAD_PS(tmps,XMM7); 3021 3022 while (nz--) { 3023 PREFETCH_NTA(&v[16]); 3024 idx = 4*(*vi++); 3025 3026 /* Demote solution (so far) from double to float */ 3027 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 3028 3029 /* 4x4 Matrix-Vector Product with negative accumulation: */ 3030 SSE_INLINE_BEGIN_2(tmpx,v) 3031 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3032 3033 /* First Column */ 3034 SSE_COPY_PS(XMM0,XMM6) 3035 SSE_SHUFFLE(XMM0,XMM0,0x00) 3036 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3037 SSE_SUB_PS(XMM7,XMM0) 3038 3039 /* Second Column */ 3040 SSE_COPY_PS(XMM1,XMM6) 3041 SSE_SHUFFLE(XMM1,XMM1,0x55) 3042 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3043 SSE_SUB_PS(XMM7,XMM1) 3044 3045 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3046 3047 /* Third Column */ 3048 SSE_COPY_PS(XMM2,XMM6) 3049 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3050 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3051 SSE_SUB_PS(XMM7,XMM2) 3052 3053 /* Fourth Column */ 3054 SSE_COPY_PS(XMM3,XMM6) 3055 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3056 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3057 SSE_SUB_PS(XMM7,XMM3) 
3058 SSE_INLINE_END_2 3059 v += 16; 3060 } 3061 v = aa + ai16; 3062 ai16 = 16*diag[--i]; 3063 PREFETCH_NTA(aa+ai16+16); 3064 /* 3065 Scale the result by the diagonal 4x4 block, 3066 which was inverted as part of the factorization 3067 */ 3068 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 3069 /* First Column */ 3070 SSE_COPY_PS(XMM0,XMM7) 3071 SSE_SHUFFLE(XMM0,XMM0,0x00) 3072 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 3073 3074 /* Second Column */ 3075 SSE_COPY_PS(XMM1,XMM7) 3076 SSE_SHUFFLE(XMM1,XMM1,0x55) 3077 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 3078 SSE_ADD_PS(XMM0,XMM1) 3079 3080 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 3081 3082 /* Third Column */ 3083 SSE_COPY_PS(XMM2,XMM7) 3084 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3085 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 3086 SSE_ADD_PS(XMM0,XMM2) 3087 3088 /* Fourth Column */ 3089 SSE_COPY_PS(XMM3,XMM7) 3090 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3091 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 3092 SSE_ADD_PS(XMM0,XMM3) 3093 3094 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 3095 SSE_INLINE_END_3 3096 3097 /* Promote solution from float to double */ 3098 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 3099 3100 /* Apply reordering to t and stream into x. */ 3101 /* This way, x doesn't pollute the cache. */ 3102 /* Be careful with size: 2 doubles = 4 floats! 
*/ 3103 idc = 4*(*c--); 3104 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 3105 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 3106 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 3107 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 3108 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 3109 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 3110 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 3111 SSE_INLINE_END_2 3112 v = aa + ai16 + 16; 3113 idt -= 4; 3114 } 3115 3116 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3117 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3118 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3119 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3120 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3121 SSE_SCOPE_END; 3122 PetscFunctionReturn(0); 3123 } 3124 3125 #endif 3126 3127 3128 /* 3129 Special case where the matrix was ILU(0) factored in the natural 3130 ordering. This eliminates the need for the column and row permutation. 3131 */ 3132 #undef __FUNCT__ 3133 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 3134 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 3135 { 3136 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3137 PetscInt n=a->mbs; 3138 const PetscInt *ai=a->i,*aj=a->j; 3139 PetscErrorCode ierr; 3140 const PetscInt *diag = a->diag; 3141 const MatScalar *aa=a->a; 3142 PetscScalar *x; 3143 const PetscScalar *b; 3144 3145 PetscFunctionBegin; 3146 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3147 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3148 3149 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 3150 { 3151 static PetscScalar w[2000]; /* very BAD need to fix */ 3152 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 3153 } 3154 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 3155 { 3156 static PetscScalar w[2000]; /* very BAD need to fix */ 3157 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 3158 } 3159 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 3160 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 3161 
#else 3162 { 3163 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3164 const MatScalar *v; 3165 PetscInt jdx,idt,idx,nz,i,ai16; 3166 const PetscInt *vi; 3167 3168 /* forward solve the lower triangular */ 3169 idx = 0; 3170 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 3171 for (i=1; i<n; i++) { 3172 v = aa + 16*ai[i]; 3173 vi = aj + ai[i]; 3174 nz = diag[i] - ai[i]; 3175 idx += 4; 3176 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3177 while (nz--) { 3178 jdx = 4*(*vi++); 3179 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 3180 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3181 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3182 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3183 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3184 v += 16; 3185 } 3186 x[idx] = s1; 3187 x[1+idx] = s2; 3188 x[2+idx] = s3; 3189 x[3+idx] = s4; 3190 } 3191 /* backward solve the upper triangular */ 3192 idt = 4*(n-1); 3193 for (i=n-1; i>=0; i--){ 3194 ai16 = 16*diag[i]; 3195 v = aa + ai16 + 16; 3196 vi = aj + diag[i] + 1; 3197 nz = ai[i+1] - diag[i] - 1; 3198 s1 = x[idt]; s2 = x[1+idt]; 3199 s3 = x[2+idt];s4 = x[3+idt]; 3200 while (nz--) { 3201 idx = 4*(*vi++); 3202 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 3203 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3204 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3205 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3206 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3207 v += 16; 3208 } 3209 v = aa + ai16; 3210 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3211 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 3212 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3213 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3214 idt -= 4; 3215 } 3216 } 3217 #endif 3218 3219 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3220 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3221 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3222 PetscFunctionReturn(0); 3223 } 3224 3225 #undef __FUNCT__ 
3226 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct" 3227 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3228 { 3229 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3230 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 3231 PetscErrorCode ierr; 3232 PetscInt idx,jdx,idt; 3233 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3234 const MatScalar *aa=a->a,*v; 3235 PetscScalar *x; 3236 const PetscScalar *b; 3237 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3238 3239 PetscFunctionBegin; 3240 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3241 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3242 /* forward solve the lower triangular */ 3243 idx = 0; 3244 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3245 for (i=1; i<n; i++) { 3246 v = aa + bs2*ai[i]; 3247 vi = aj + ai[i]; 3248 nz = ai[i+1] - ai[i]; 3249 idx = bs*i; 3250 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3251 for(k=0;k<nz;k++) { 3252 jdx = bs*vi[k]; 3253 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3254 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3255 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3256 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3257 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3258 3259 v += bs2; 3260 } 3261 3262 x[idx] = s1; 3263 x[1+idx] = s2; 3264 x[2+idx] = s3; 3265 x[3+idx] = s4; 3266 } 3267 3268 /* backward solve the upper triangular */ 3269 for (i=n-1; i>=0; i--){ 3270 v = aa + bs2*ai[2*n-i]; 3271 vi = aj + ai[2*n-i]; 3272 nz = ai[2*n-i +1] - ai[2*n-i]-1; 3273 idt = bs*i; 3274 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3275 3276 for(k=0;k<nz;k++){ 3277 idx = bs*vi[k]; 3278 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3279 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3280 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3281 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3282 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3283 3284 v += bs2; 3285 } 3286 /* x = inv_diagonal*x */ 3287 
x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3288 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 3289 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3290 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3291 3292 } 3293 3294 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3295 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3296 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3297 PetscFunctionReturn(0); 3298 } 3299 3300 #undef __FUNCT__ 3301 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2" 3302 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 3303 { 3304 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3305 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 3306 PetscErrorCode ierr; 3307 PetscInt idx,jdx,idt; 3308 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3309 const MatScalar *aa=a->a,*v; 3310 PetscScalar *x; 3311 const PetscScalar *b; 3312 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3313 3314 PetscFunctionBegin; 3315 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3316 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3317 /* forward solve the lower triangular */ 3318 idx = 0; 3319 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3320 for (i=1; i<n; i++) { 3321 v = aa + bs2*ai[i]; 3322 vi = aj + ai[i]; 3323 nz = ai[i+1] - ai[i]; 3324 idx = bs*i; 3325 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3326 for(k=0;k<nz;k++) { 3327 jdx = bs*vi[k]; 3328 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3329 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3330 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3331 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3332 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3333 3334 v += bs2; 3335 } 3336 3337 x[idx] = s1; 3338 x[1+idx] = s2; 3339 x[2+idx] = s3; 3340 x[3+idx] = s4; 3341 } 3342 3343 /* backward solve the upper triangular */ 3344 for (i=n-1; i>=0; i--){ 3345 v = aa + bs2*(adiag[i+1]+1); 3346 vi = 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
/*
   Solves with a factored SeqBAIJ matrix with 4x4 blocks and no row/column
   permutation, carrying the intermediate values in MatScalar precision.

   A  - the factored matrix
   bb - right-hand side
   xx - solution (output)

   The forward solve writes its MatScalar results through t, which is the
   storage of xx reinterpreted as MatScalar; the backward solve then reads
   those values through t and writes the final PetscScalar solution through
   x into the same memory.  The order of the reads and writes below matters
   for that reason.
   NOTE(review): presumably meant for builds where MatScalar is narrower than
   PetscScalar, so that the promoted blocks never overwrite t values that are
   still needed -- confirm before reordering anything.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
  PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
  PetscErrorCode ierr;
  PetscInt       *diag = a->diag;
  MatScalar      *aa=a->a;
  PetscScalar    *x,*b;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  {
    MatScalar s1,s2,s3,s4,x1,x2,x3,x4;
    /* t views the memory of x as MatScalar entries */
    MatScalar *v,*t=(MatScalar *)x;
    PetscInt  jdx,idt,idx,nz,*vi,i,ai16;

    /* forward solve the lower triangular */
    idx  = 0;
    /* first block row: demote the right-hand side and store it unchanged */
    t[0] = (MatScalar)b[0];
    t[1] = (MatScalar)b[1];
    t[2] = (MatScalar)b[2];
    t[3] = (MatScalar)b[3];
    for (i=1; i<n; i++) {
      /* blocks ai[i] .. diag[i]-1 of block row i are the strictly lower part */
      v   = aa + 16*ai[i];
      vi  = aj + ai[i];
      nz  = diag[i] - ai[i];
      idx += 4;
      s1  = (MatScalar)b[idx];
      s2  = (MatScalar)b[1+idx];
      s3  = (MatScalar)b[2+idx];
      s4  = (MatScalar)b[3+idx];
      while (nz--) {
        jdx = 4*(*vi++);
        /* earlier forward-solve results are read back through t */
        x1  = t[jdx];
        x2  = t[1+jdx];
        x3  = t[2+jdx];
        x4  = t[3+jdx];
        /* s -= block * x, the block being addressed column by column */
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      t[idx]   = s1;
      t[1+idx] = s2;
      t[2+idx] = s3;
      t[3+idx] = s4;
    }
    /* backward solve the upper triangular */
    idt = 4*(n-1);
    for (i=n-1; i>=0; i--){
      /* blocks diag[i]+1 .. ai[i+1]-1 of block row i are the strictly upper part */
      ai16 = 16*diag[i];
      v    = aa + ai16 + 16;
      vi   = aj + diag[i] + 1;
      nz   = ai[i+1] - diag[i] - 1;
      /* the forward-solve result of this row is still held in t */
      s1   = t[idt];
      s2   = t[1+idt];
      s3   = t[2+idt];
      s4   = t[3+idt];
      while (nz--) {
        idx = 4*(*vi++);
        /* rows already finished by the backward solve are read through x */
        x1  = (MatScalar)x[idx];
        x2  = (MatScalar)x[1+idx];
        x3  = (MatScalar)x[2+idx];
        x4  = (MatScalar)x[3+idx];
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      /* multiply by the block stored at diag[i] and write the final values
         of this block row through x */
      v        = aa + ai16;
      x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
      x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
      x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
      x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
      idt     -= 4;
    }
  }

  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3473 unsigned short *aj=(unsigned short *)a->j; 3474 PetscErrorCode ierr; 3475 int *ai=a->i,n=a->mbs,*diag = a->diag; 3476 MatScalar *aa=a->a; 3477 PetscScalar *x,*b; 3478 3479 PetscFunctionBegin; 3480 SSE_SCOPE_BEGIN; 3481 /* 3482 Note: This code currently uses demotion of double 3483 to float when performing the mixed-mode computation. 3484 This may not be numerically reasonable for all applications. 3485 */ 3486 PREFETCH_NTA(aa+16*ai[1]); 3487 3488 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3489 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3490 { 3491 /* x will first be computed in single precision then promoted inplace to double */ 3492 MatScalar *v,*t=(MatScalar *)x; 3493 int nz,i,idt,ai16; 3494 unsigned int jdx,idx; 3495 unsigned short *vi; 3496 /* Forward solve the lower triangular factor. */ 3497 3498 /* First block is the identity. */ 3499 idx = 0; 3500 CONVERT_DOUBLE4_FLOAT4(t,b); 3501 v = aa + 16*((unsigned int)ai[1]); 3502 3503 for (i=1; i<n;) { 3504 PREFETCH_NTA(&v[8]); 3505 vi = aj + ai[i]; 3506 nz = diag[i] - ai[i]; 3507 idx += 4; 3508 3509 /* Demote RHS from double to float. 
*/ 3510 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 3511 LOAD_PS(&t[idx],XMM7); 3512 3513 while (nz--) { 3514 PREFETCH_NTA(&v[16]); 3515 jdx = 4*((unsigned int)(*vi++)); 3516 3517 /* 4x4 Matrix-Vector product with negative accumulation: */ 3518 SSE_INLINE_BEGIN_2(&t[jdx],v) 3519 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3520 3521 /* First Column */ 3522 SSE_COPY_PS(XMM0,XMM6) 3523 SSE_SHUFFLE(XMM0,XMM0,0x00) 3524 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3525 SSE_SUB_PS(XMM7,XMM0) 3526 3527 /* Second Column */ 3528 SSE_COPY_PS(XMM1,XMM6) 3529 SSE_SHUFFLE(XMM1,XMM1,0x55) 3530 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3531 SSE_SUB_PS(XMM7,XMM1) 3532 3533 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3534 3535 /* Third Column */ 3536 SSE_COPY_PS(XMM2,XMM6) 3537 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3538 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3539 SSE_SUB_PS(XMM7,XMM2) 3540 3541 /* Fourth Column */ 3542 SSE_COPY_PS(XMM3,XMM6) 3543 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3544 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3545 SSE_SUB_PS(XMM7,XMM3) 3546 SSE_INLINE_END_2 3547 3548 v += 16; 3549 } 3550 v = aa + 16*ai[++i]; 3551 PREFETCH_NTA(v); 3552 STORE_PS(&t[idx],XMM7); 3553 } 3554 3555 /* Backward solve the upper triangular factor.*/ 3556 3557 idt = 4*(n-1); 3558 ai16 = 16*diag[n-1]; 3559 v = aa + ai16 + 16; 3560 for (i=n-1; i>=0;){ 3561 PREFETCH_NTA(&v[8]); 3562 vi = aj + diag[i] + 1; 3563 nz = ai[i+1] - diag[i] - 1; 3564 3565 LOAD_PS(&t[idt],XMM7); 3566 3567 while (nz--) { 3568 PREFETCH_NTA(&v[16]); 3569 idx = 4*((unsigned int)(*vi++)); 3570 3571 /* 4x4 Matrix-Vector Product with negative accumulation: */ 3572 SSE_INLINE_BEGIN_2(&t[idx],v) 3573 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3574 3575 /* First Column */ 3576 SSE_COPY_PS(XMM0,XMM6) 3577 SSE_SHUFFLE(XMM0,XMM0,0x00) 3578 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3579 SSE_SUB_PS(XMM7,XMM0) 3580 3581 /* Second Column */ 3582 SSE_COPY_PS(XMM1,XMM6) 3583 SSE_SHUFFLE(XMM1,XMM1,0x55) 3584 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3585 SSE_SUB_PS(XMM7,XMM1) 3586 3587 
SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3588 3589 /* Third Column */ 3590 SSE_COPY_PS(XMM2,XMM6) 3591 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3592 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3593 SSE_SUB_PS(XMM7,XMM2) 3594 3595 /* Fourth Column */ 3596 SSE_COPY_PS(XMM3,XMM6) 3597 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3598 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3599 SSE_SUB_PS(XMM7,XMM3) 3600 SSE_INLINE_END_2 3601 v += 16; 3602 } 3603 v = aa + ai16; 3604 ai16 = 16*diag[--i]; 3605 PREFETCH_NTA(aa+ai16+16); 3606 /* 3607 Scale the result by the diagonal 4x4 block, 3608 which was inverted as part of the factorization 3609 */ 3610 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 3611 /* First Column */ 3612 SSE_COPY_PS(XMM0,XMM7) 3613 SSE_SHUFFLE(XMM0,XMM0,0x00) 3614 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 3615 3616 /* Second Column */ 3617 SSE_COPY_PS(XMM1,XMM7) 3618 SSE_SHUFFLE(XMM1,XMM1,0x55) 3619 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 3620 SSE_ADD_PS(XMM0,XMM1) 3621 3622 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 3623 3624 /* Third Column */ 3625 SSE_COPY_PS(XMM2,XMM7) 3626 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3627 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 3628 SSE_ADD_PS(XMM0,XMM2) 3629 3630 /* Fourth Column */ 3631 SSE_COPY_PS(XMM3,XMM7) 3632 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3633 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 3634 SSE_ADD_PS(XMM0,XMM3) 3635 3636 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 3637 SSE_INLINE_END_3 3638 3639 v = aa + ai16 + 16; 3640 idt -= 4; 3641 } 3642 3643 /* Convert t from single precision back to double precision (inplace)*/ 3644 idt = 4*(n-1); 3645 for (i=n-1;i>=0;i--) { 3646 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 3647 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 3648 PetscScalar *xtemp=&x[idt]; 3649 MatScalar *ttemp=&t[idt]; 3650 xtemp[3] = (PetscScalar)ttemp[3]; 3651 xtemp[2] = (PetscScalar)ttemp[2]; 3652 xtemp[1] = (PetscScalar)ttemp[1]; 3653 xtemp[0] = (PetscScalar)ttemp[0]; 3654 idt -= 4; 3655 } 3656 3657 } /* End of artificial scope. 
*/ 3658 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3659 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3660 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3661 SSE_SCOPE_END; 3662 PetscFunctionReturn(0); 3663 } 3664 3665 #undef __FUNCT__ 3666 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 3667 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 3668 { 3669 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3670 int *aj=a->j; 3671 PetscErrorCode ierr; 3672 int *ai=a->i,n=a->mbs,*diag = a->diag; 3673 MatScalar *aa=a->a; 3674 PetscScalar *x,*b; 3675 3676 PetscFunctionBegin; 3677 SSE_SCOPE_BEGIN; 3678 /* 3679 Note: This code currently uses demotion of double 3680 to float when performing the mixed-mode computation. 3681 This may not be numerically reasonable for all applications. 3682 */ 3683 PREFETCH_NTA(aa+16*ai[1]); 3684 3685 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3686 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3687 { 3688 /* x will first be computed in single precision then promoted inplace to double */ 3689 MatScalar *v,*t=(MatScalar *)x; 3690 int nz,i,idt,ai16; 3691 int jdx,idx; 3692 int *vi; 3693 /* Forward solve the lower triangular factor. */ 3694 3695 /* First block is the identity. */ 3696 idx = 0; 3697 CONVERT_DOUBLE4_FLOAT4(t,b); 3698 v = aa + 16*ai[1]; 3699 3700 for (i=1; i<n;) { 3701 PREFETCH_NTA(&v[8]); 3702 vi = aj + ai[i]; 3703 nz = diag[i] - ai[i]; 3704 idx += 4; 3705 3706 /* Demote RHS from double to float. 
*/ 3707 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 3708 LOAD_PS(&t[idx],XMM7); 3709 3710 while (nz--) { 3711 PREFETCH_NTA(&v[16]); 3712 jdx = 4*(*vi++); 3713 /* jdx = *vi++; */ 3714 3715 /* 4x4 Matrix-Vector product with negative accumulation: */ 3716 SSE_INLINE_BEGIN_2(&t[jdx],v) 3717 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3718 3719 /* First Column */ 3720 SSE_COPY_PS(XMM0,XMM6) 3721 SSE_SHUFFLE(XMM0,XMM0,0x00) 3722 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3723 SSE_SUB_PS(XMM7,XMM0) 3724 3725 /* Second Column */ 3726 SSE_COPY_PS(XMM1,XMM6) 3727 SSE_SHUFFLE(XMM1,XMM1,0x55) 3728 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3729 SSE_SUB_PS(XMM7,XMM1) 3730 3731 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3732 3733 /* Third Column */ 3734 SSE_COPY_PS(XMM2,XMM6) 3735 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3736 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3737 SSE_SUB_PS(XMM7,XMM2) 3738 3739 /* Fourth Column */ 3740 SSE_COPY_PS(XMM3,XMM6) 3741 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3742 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3743 SSE_SUB_PS(XMM7,XMM3) 3744 SSE_INLINE_END_2 3745 3746 v += 16; 3747 } 3748 v = aa + 16*ai[++i]; 3749 PREFETCH_NTA(v); 3750 STORE_PS(&t[idx],XMM7); 3751 } 3752 3753 /* Backward solve the upper triangular factor.*/ 3754 3755 idt = 4*(n-1); 3756 ai16 = 16*diag[n-1]; 3757 v = aa + ai16 + 16; 3758 for (i=n-1; i>=0;){ 3759 PREFETCH_NTA(&v[8]); 3760 vi = aj + diag[i] + 1; 3761 nz = ai[i+1] - diag[i] - 1; 3762 3763 LOAD_PS(&t[idt],XMM7); 3764 3765 while (nz--) { 3766 PREFETCH_NTA(&v[16]); 3767 idx = 4*(*vi++); 3768 /* idx = *vi++; */ 3769 3770 /* 4x4 Matrix-Vector Product with negative accumulation: */ 3771 SSE_INLINE_BEGIN_2(&t[idx],v) 3772 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3773 3774 /* First Column */ 3775 SSE_COPY_PS(XMM0,XMM6) 3776 SSE_SHUFFLE(XMM0,XMM0,0x00) 3777 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3778 SSE_SUB_PS(XMM7,XMM0) 3779 3780 /* Second Column */ 3781 SSE_COPY_PS(XMM1,XMM6) 3782 SSE_SHUFFLE(XMM1,XMM1,0x55) 3783 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3784 SSE_SUB_PS(XMM7,XMM1) 3785 3786 
SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3787 3788 /* Third Column */ 3789 SSE_COPY_PS(XMM2,XMM6) 3790 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3791 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3792 SSE_SUB_PS(XMM7,XMM2) 3793 3794 /* Fourth Column */ 3795 SSE_COPY_PS(XMM3,XMM6) 3796 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3797 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3798 SSE_SUB_PS(XMM7,XMM3) 3799 SSE_INLINE_END_2 3800 v += 16; 3801 } 3802 v = aa + ai16; 3803 ai16 = 16*diag[--i]; 3804 PREFETCH_NTA(aa+ai16+16); 3805 /* 3806 Scale the result by the diagonal 4x4 block, 3807 which was inverted as part of the factorization 3808 */ 3809 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 3810 /* First Column */ 3811 SSE_COPY_PS(XMM0,XMM7) 3812 SSE_SHUFFLE(XMM0,XMM0,0x00) 3813 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 3814 3815 /* Second Column */ 3816 SSE_COPY_PS(XMM1,XMM7) 3817 SSE_SHUFFLE(XMM1,XMM1,0x55) 3818 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 3819 SSE_ADD_PS(XMM0,XMM1) 3820 3821 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 3822 3823 /* Third Column */ 3824 SSE_COPY_PS(XMM2,XMM7) 3825 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3826 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 3827 SSE_ADD_PS(XMM0,XMM2) 3828 3829 /* Fourth Column */ 3830 SSE_COPY_PS(XMM3,XMM7) 3831 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3832 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 3833 SSE_ADD_PS(XMM0,XMM3) 3834 3835 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 3836 SSE_INLINE_END_3 3837 3838 v = aa + ai16 + 16; 3839 idt -= 4; 3840 } 3841 3842 /* Convert t from single precision back to double precision (inplace)*/ 3843 idt = 4*(n-1); 3844 for (i=n-1;i>=0;i--) { 3845 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 3846 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 3847 PetscScalar *xtemp=&x[idt]; 3848 MatScalar *ttemp=&t[idt]; 3849 xtemp[3] = (PetscScalar)ttemp[3]; 3850 xtemp[2] = (PetscScalar)ttemp[2]; 3851 xtemp[1] = (PetscScalar)ttemp[1]; 3852 xtemp[0] = (PetscScalar)ttemp[0]; 3853 idt -= 4; 3854 } 3855 3856 } /* End of artificial scope. 
*/ 3857 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3858 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3859 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3860 SSE_SCOPE_END; 3861 PetscFunctionReturn(0); 3862 } 3863 3864 #endif 3865 3866 #undef __FUNCT__ 3867 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 3868 /* MatSolve_SeqBAIJ_3: solves with the factored matrix A for 3x3 blocks, using the row and column index sets a->row and a->col. bb is read through the row indices, the forward and backward block sweeps run in the work array a->solve_work, and xx is written through the column indices. Blocks are indexed column-major (v[0], v[3], v[6] form the first block row). The block at diag[i] is applied by multiplication; presumably it holds the already inverted diagonal block - confirm against the numeric factorization. Errors from PETSc calls propagate through CHKERRQ. NOTE(review): rout[0], b and t[0] are read before any check on n, so an empty matrix (n == 0) is not guarded here. */ PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 3869 { 3870 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3871 IS iscol=a->col,isrow=a->row; 3872 PetscErrorCode ierr; 3873 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3874 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3875 const MatScalar *aa=a->a,*v; 3876 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 3877 const PetscScalar *b; 3878 3879 PetscFunctionBegin; 3880 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3881 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3882 t = a->solve_work; 3883 3884 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3885 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); /* c starts at the last column index and is walked backwards in the upper solve */ 3886 3887 /* forward solve the lower triangular */ 3888 idx = 3*(*r++); 3889 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 3890 for (i=1; i<n; i++) { 3891 v = aa + 9*ai[i]; 3892 vi = aj + ai[i]; 3893 nz = diag[i] - ai[i]; 3894 idx = 3*(*r++); 3895 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 3896 while (nz--) { 3897 idx = 3*(*vi++); 3898 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3899 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3900 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3901 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3902 v += 9; 3903 } 3904 idx = 3*i; 3905 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 3906 } 3907 /* backward solve the upper triangular */ 3908 for (i=n-1; i>=0; i--){ 3909 v = aa + 9*diag[i] + 9; 3910 vi = aj + diag[i] + 1; 3911 nz = ai[i+1] - diag[i] - 1; 3912 idt = 3*i; 3913 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 3914 while (nz--) { 3915 idx = 3*(*vi++); 3916 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3917 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3918 s2 -= v[1]*x1 + v[4]*x2 + 
v[7]*x3; 3919 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3920 v += 9; 3921 } 3922 idc = 3*(*c--); 3923 v = aa + 9*diag[i]; 3924 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3925 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3926 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 3927 } 3928 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3929 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3930 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3931 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3932 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 3933 PetscFunctionReturn(0); 3934 } 3935 3936 #undef __FUNCT__ 3937 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct" 3938 /* MatSolve_SeqBAIJ_3_newdatastruct: same 3x3 block solve with row and column index sets, but for the layout where row i of L occupies ai[i]..ai[i+1]-1 and row i of U is found at ai[2*n-i]. The U row is read as nz off-diagonal blocks followed by the block used for the final scaling, since v is not reset after the inner loop. NOTE(review): the first row is loaded before any check on n. */ PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx) 3939 { 3940 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3941 IS iscol=a->col,isrow=a->row; 3942 PetscErrorCode ierr; 3943 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 3944 const PetscInt *r,*c,*rout,*cout; 3945 const MatScalar *aa=a->a,*v; 3946 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 3947 const PetscScalar *b; 3948 3949 PetscFunctionBegin; 3950 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3951 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3952 t = a->solve_work; 3953 3954 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3955 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3956 3957 /* forward solve the lower triangular */ 3958 idx = 3*r[0]; 3959 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 3960 for (i=1; i<n; i++) { 3961 v = aa + 9*ai[i]; 3962 vi = aj + ai[i]; 3963 nz = ai[i+1] - ai[i]; 3964 idx = 3*r[i]; 3965 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 3966 for(m=0;m<nz;m++){ 3967 idx = 3*vi[m]; 3968 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3969 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3970 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3971 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3972 v += 9; 3973 } 3974 idx = 3*i; 3975 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 3976 } 3977 /* 
backward solve the upper triangular */ 3978 for (i=n-1; i>=0; i--){ 3979 k = 2*n-i; 3980 v = aa + 9*ai[k]; 3981 vi = aj + ai[k]; 3982 nz = ai[k +1] - ai[k] - 1; 3983 idt = 3*i; 3984 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 3985 for(m=0;m<nz;m++){ 3986 idx = 3*vi[m]; 3987 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3988 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3989 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3990 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3991 v += 9; 3992 } 3993 idc = 3*c[i]; 3994 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3995 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3996 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 3997 } 3998 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3999 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4000 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4001 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4002 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4003 PetscFunctionReturn(0); 4004 } 4005 4006 #undef __FUNCT__ 4007 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct_v2" 4008 /* MatSolve_SeqBAIJ_3_newdatastruct_v2: 3x3 block solve with row and column index sets where the U part is addressed through adiag: the off-diagonal blocks of row i span adiag[i+1]+1 .. adiag[i]-1 and the scaling block sits at adiag[i], which is where v ends up after the inner loop. The L part is read from ai[i]..ai[i+1]-1. NOTE(review): the first row is loaded before any check on n. */ PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4009 { 4010 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4011 IS iscol=a->col,isrow=a->row; 4012 PetscErrorCode ierr; 4013 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 4014 const PetscInt *r,*c,*rout,*cout; 4015 const MatScalar *aa=a->a,*v; 4016 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4017 const PetscScalar *b; 4018 4019 PetscFunctionBegin; 4020 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4021 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4022 t = a->solve_work; 4023 4024 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4025 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4026 4027 /* forward solve the lower triangular */ 4028 idx = 3*r[0]; 4029 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4030 for (i=1; i<n; i++) { 4031 v = aa + 9*ai[i]; 4032 vi = aj + ai[i]; 4033 nz = ai[i+1] - ai[i]; 
4034 idx = 3*r[i]; 4035 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4036 for(m=0;m<nz;m++){ 4037 idx = 3*vi[m]; 4038 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4039 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4040 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4041 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4042 v += 9; 4043 } 4044 idx = 3*i; 4045 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4046 } 4047 /* backward solve the upper triangular */ 4048 for (i=n-1; i>=0; i--){ 4049 v = aa + 9*(adiag[i+1]+1); 4050 vi = aj + adiag[i+1]+1; 4051 nz = adiag[i] - adiag[i+1] - 1; 4052 idt = 3*i; 4053 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4054 for(m=0;m<nz;m++){ 4055 idx = 3*vi[m]; 4056 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4057 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4058 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4059 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4060 v += 9; 4061 } 4062 idc = 3*c[i]; 4063 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4064 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4065 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4066 } 4067 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4068 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4069 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4070 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4071 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4072 PetscFunctionReturn(0); 4073 } 4074 4075 /* 4076 Special case where the matrix was ILU(0) factored in the natural 4077 ordering. This eliminates the need for the column and row permutation. 
4078 */ 4079 #undef __FUNCT__ 4080 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 4081 /* MatSolve_SeqBAIJ_3_NaturalOrdering: 3x3 block solve without index sets; the sweeps run directly in the array of xx, so no work vector is used. Row i of L is ai[i]..diag[i]-1, the scaling block is at diag[i] and the rest of the row up to ai[i+1] is U. NOTE(review): x[0..2] = b[0..2] is executed before any check on n. */ PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 4082 { 4083 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4084 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4085 PetscErrorCode ierr; 4086 PetscInt *diag = a->diag; 4087 const MatScalar *aa=a->a,*v; 4088 PetscScalar *x,s1,s2,s3,x1,x2,x3; 4089 const PetscScalar *b; 4090 PetscInt jdx,idt,idx,nz,*vi,i; 4091 4092 PetscFunctionBegin; 4093 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4094 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4095 4096 /* forward solve the lower triangular */ 4097 idx = 0; 4098 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 4099 for (i=1; i<n; i++) { 4100 v = aa + 9*ai[i]; 4101 vi = aj + ai[i]; 4102 nz = diag[i] - ai[i]; 4103 idx += 3; 4104 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4105 while (nz--) { 4106 jdx = 3*(*vi++); 4107 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4108 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4109 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4110 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4111 v += 9; 4112 } 4113 x[idx] = s1; 4114 x[1+idx] = s2; 4115 x[2+idx] = s3; 4116 } 4117 /* backward solve the upper triangular */ 4118 for (i=n-1; i>=0; i--){ 4119 v = aa + 9*diag[i] + 9; 4120 vi = aj + diag[i] + 1; 4121 nz = ai[i+1] - diag[i] - 1; 4122 idt = 3*i; 4123 s1 = x[idt]; s2 = x[1+idt]; 4124 s3 = x[2+idt]; 4125 while (nz--) { 4126 idx = 3*(*vi++); 4127 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 4128 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4129 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4130 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4131 v += 9; 4132 } 4133 v = aa + 9*diag[i]; 4134 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4135 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4136 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4137 } 4138 4139 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4140 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4141 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4142 
PetscFunctionReturn(0); 4143 } 4144 4145 #undef __FUNCT__ 4146 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct" 4147 /* MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct: in-place 3x3 block solve without index sets for the layout with L rows at ai[i] and U rows at ai[2*n-i]. Strides come from bs and bs2, but the kernel itself reads exactly v[0..8] and three vector entries, so it is only meaningful for bs == 3 and bs2 == 9. */ PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4148 { 4149 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4150 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 4151 PetscErrorCode ierr; 4152 PetscInt idx,jdx,idt; 4153 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4154 const MatScalar *aa=a->a,*v; 4155 PetscScalar *x; 4156 const PetscScalar *b; 4157 PetscScalar s1,s2,s3,x1,x2,x3; 4158 4159 PetscFunctionBegin; 4160 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4161 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4162 /* forward solve the lower triangular */ 4163 idx = 0; 4164 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4165 for (i=1; i<n; i++) { 4166 v = aa + bs2*ai[i]; 4167 vi = aj + ai[i]; 4168 nz = ai[i+1] - ai[i]; 4169 idx = bs*i; 4170 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4171 for(k=0;k<nz;k++){ 4172 jdx = bs*vi[k]; 4173 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4174 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4175 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4176 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4177 4178 v += bs2; 4179 } 4180 4181 x[idx] = s1; 4182 x[1+idx] = s2; 4183 x[2+idx] = s3; 4184 } 4185 4186 /* backward solve the upper triangular */ 4187 for (i=n-1; i>=0; i--){ 4188 v = aa + bs2*ai[2*n-i]; 4189 vi = aj + ai[2*n-i]; 4190 nz = ai[2*n-i +1] - ai[2*n-i]-1; 4191 idt = bs*i; 4192 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4193 4194 for(k=0;k<nz;k++){ 4195 idx = bs*vi[k]; 4196 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4197 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4198 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4199 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4200 4201 v += bs2; 4202 } 4203 /* x = inv_diagonal*x */ 4204 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4205 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4206 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4207 4208 } 4209 4210 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4211 
ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4212 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4213 PetscFunctionReturn(0); 4214 } 4215 4216 #undef __FUNCT__ 4217 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2" 4218 /* MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2: in-place 3x3 block solve without index sets where U row i is addressed through adiag (off-diagonal blocks from adiag[i+1]+1, scaling block at adiag[i]). Strides come from bs and bs2, but the kernel reads exactly v[0..8], so bs == 3 and bs2 == 9 are assumed. */ PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4219 { 4220 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4221 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 4222 PetscErrorCode ierr; 4223 PetscInt idx,jdx,idt; 4224 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4225 const MatScalar *aa=a->a,*v; 4226 PetscScalar *x; 4227 const PetscScalar *b; 4228 PetscScalar s1,s2,s3,x1,x2,x3; 4229 4230 PetscFunctionBegin; 4231 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4232 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4233 /* forward solve the lower triangular */ 4234 idx = 0; 4235 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4236 for (i=1; i<n; i++) { 4237 v = aa + bs2*ai[i]; 4238 vi = aj + ai[i]; 4239 nz = ai[i+1] - ai[i]; 4240 idx = bs*i; 4241 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4242 for(k=0;k<nz;k++){ 4243 jdx = bs*vi[k]; 4244 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4245 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4246 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4247 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4248 4249 v += bs2; 4250 } 4251 4252 x[idx] = s1; 4253 x[1+idx] = s2; 4254 x[2+idx] = s3; 4255 } 4256 4257 /* backward solve the upper triangular */ 4258 for (i=n-1; i>=0; i--){ 4259 v = aa + bs2*(adiag[i+1]+1); 4260 vi = aj + adiag[i+1]+1; 4261 nz = adiag[i] - adiag[i+1]-1; 4262 idt = bs*i; 4263 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4264 4265 for(k=0;k<nz;k++){ 4266 idx = bs*vi[k]; 4267 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4268 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4269 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4270 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4271 4272 v += bs2; 4273 } 4274 /* x = inv_diagonal*x */ 4275 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4276 x[1+idt] = v[1]*s1 + 
v[4]*s2 + v[7]*s3; 4277 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4278 4279 } 4280 4281 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4282 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4283 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4284 PetscFunctionReturn(0); 4285 } 4286 4287 #undef __FUNCT__ 4288 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 4289 /* MatSolve_SeqBAIJ_2: solves with the factored matrix A for 2x2 blocks, using the row and column index sets a->row and a->col. bb is read through the row indices, the sweeps run in a->solve_work and xx is written through the column indices walked from last to first. Blocks are indexed column-major (v[0], v[2] form the first block row); the block at diag[i] is applied by multiplication. NOTE(review): the first row is loaded before any check on n. */ PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 4290 { 4291 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4292 IS iscol=a->col,isrow=a->row; 4293 PetscErrorCode ierr; 4294 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 4295 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4296 const MatScalar *aa=a->a,*v; 4297 PetscScalar *x,s1,s2,x1,x2,*t; 4298 const PetscScalar *b; 4299 4300 PetscFunctionBegin; 4301 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4302 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4303 t = a->solve_work; 4304 4305 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4306 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4307 4308 /* forward solve the lower triangular */ 4309 idx = 2*(*r++); 4310 t[0] = b[idx]; t[1] = b[1+idx]; 4311 for (i=1; i<n; i++) { 4312 v = aa + 4*ai[i]; 4313 vi = aj + ai[i]; 4314 nz = diag[i] - ai[i]; 4315 idx = 2*(*r++); 4316 s1 = b[idx]; s2 = b[1+idx]; 4317 while (nz--) { 4318 idx = 2*(*vi++); 4319 x1 = t[idx]; x2 = t[1+idx]; 4320 s1 -= v[0]*x1 + v[2]*x2; 4321 s2 -= v[1]*x1 + v[3]*x2; 4322 v += 4; 4323 } 4324 idx = 2*i; 4325 t[idx] = s1; t[1+idx] = s2; 4326 } 4327 /* backward solve the upper triangular */ 4328 for (i=n-1; i>=0; i--){ 4329 v = aa + 4*diag[i] + 4; 4330 vi = aj + diag[i] + 1; 4331 nz = ai[i+1] - diag[i] - 1; 4332 idt = 2*i; 4333 s1 = t[idt]; s2 = t[1+idt]; 4334 while (nz--) { 4335 idx = 2*(*vi++); 4336 x1 = t[idx]; x2 = t[1+idx]; 4337 s1 -= v[0]*x1 + v[2]*x2; 4338 s2 -= v[1]*x1 + v[3]*x2; 4339 v += 4; 4340 } 4341 idc = 2*(*c--); 4342 v = aa + 4*diag[i]; 4343 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4344 
x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 4345 } 4346 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4347 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4348 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4349 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4350 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4351 PetscFunctionReturn(0); 4352 } 4353 4354 #undef __FUNCT__ 4355 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct" 4356 /* MatSolve_SeqBAIJ_2_newdatastruct: 2x2 block solve with row and column index sets for the layout with L rows at ai[i]..ai[i+1]-1 and U rows at ai[2*n-i]; the scaling block follows the nz off-diagonal blocks of the U row because v is not reset after the inner loop. */ PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx) 4357 { 4358 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4359 IS iscol=a->col,isrow=a->row; 4360 PetscErrorCode ierr; 4361 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,jdx,idt,idc,k,m; 4362 const PetscInt *r,*c,*rout,*cout; 4363 const MatScalar *aa=a->a,*v; 4364 PetscScalar *x,s1,s2,x1,x2,*t; 4365 const PetscScalar *b; 4366 4367 PetscFunctionBegin; 4368 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4369 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4370 t = a->solve_work; 4371 4372 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4373 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4374 4375 /* forward solve the lower triangular */ 4376 idx = 2*r[0]; 4377 t[0] = b[idx]; t[1] = b[1+idx]; 4378 for (i=1; i<n; i++) { 4379 v = aa + 4*ai[i]; 4380 vi = aj + ai[i]; 4381 nz = ai[i+1] - ai[i]; 4382 idx = 2*r[i]; 4383 s1 = b[idx]; s2 = b[1+idx]; 4384 for(m=0;m<nz;m++){ 4385 jdx = 2*vi[m]; 4386 x1 = t[jdx]; x2 = t[1+jdx]; 4387 s1 -= v[0]*x1 + v[2]*x2; 4388 s2 -= v[1]*x1 + v[3]*x2; 4389 v += 4; 4390 } 4391 idx = 2*i; 4392 t[idx] = s1; t[1+idx] = s2; 4393 } 4394 /* backward solve the upper triangular */ 4395 for (i=n-1; i>=0; i--){ 4396 k = 2*n-i; 4397 v = aa + 4*ai[k]; 4398 vi = aj + ai[k]; 4399 nz = ai[k +1] - ai[k] - 1; 4400 idt = 2*i; 4401 s1 = t[idt]; s2 = t[1+idt]; 4402 for(m=0;m<nz;m++){ 4403 idx = 2*vi[m]; 4404 x1 = t[idx]; x2 = t[1+idx]; 4405 s1 -= v[0]*x1 + v[2]*x2; 4406 s2 -= v[1]*x1 + v[3]*x2; 4407 v += 4; 4408 
} 4409 idc = 2*c[i]; 4410 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4411 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 4412 } 4413 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4414 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4415 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4416 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4417 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4418 PetscFunctionReturn(0); 4419 } 4420 4421 #undef __FUNCT__ 4422 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct_v2" 4423 /* MatSolve_SeqBAIJ_2_newdatastruct_v2: 2x2 block solve with row and column index sets where U row i is addressed through adiag: off-diagonal blocks from adiag[i+1]+1 and the scaling block at adiag[i]. */ PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4424 { 4425 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4426 IS iscol=a->col,isrow=a->row; 4427 PetscErrorCode ierr; 4428 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m; 4429 const PetscInt *r,*c,*rout,*cout; 4430 const MatScalar *aa=a->a,*v; 4431 PetscScalar *x,s1,s2,x1,x2,*t; 4432 const PetscScalar *b; 4433 4434 PetscFunctionBegin; 4435 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4436 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4437 t = a->solve_work; 4438 4439 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4440 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4441 4442 /* forward solve the lower triangular */ 4443 idx = 2*r[0]; 4444 t[0] = b[idx]; t[1] = b[1+idx]; 4445 for (i=1; i<n; i++) { 4446 v = aa + 4*ai[i]; 4447 vi = aj + ai[i]; 4448 nz = ai[i+1] - ai[i]; 4449 idx = 2*r[i]; 4450 s1 = b[idx]; s2 = b[1+idx]; 4451 for(m=0;m<nz;m++){ 4452 jdx = 2*vi[m]; 4453 x1 = t[jdx]; x2 = t[1+jdx]; 4454 s1 -= v[0]*x1 + v[2]*x2; 4455 s2 -= v[1]*x1 + v[3]*x2; 4456 v += 4; 4457 } 4458 idx = 2*i; 4459 t[idx] = s1; t[1+idx] = s2; 4460 } 4461 /* backward solve the upper triangular */ 4462 for (i=n-1; i>=0; i--){ 4463 v = aa + 4*(adiag[i+1]+1); 4464 vi = aj + adiag[i+1]+1; 4465 nz = adiag[i] - adiag[i+1] - 1; 4466 idt = 2*i; 4467 s1 = t[idt]; s2 = t[1+idt]; 4468 for(m=0;m<nz;m++){ 4469 idx = 2*vi[m]; 4470 x1 = t[idx]; 
x2 = t[1+idx]; 4471 s1 -= v[0]*x1 + v[2]*x2; 4472 s2 -= v[1]*x1 + v[3]*x2; 4473 v += 4; 4474 } 4475 idc = 2*c[i]; 4476 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4477 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 4478 } 4479 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4480 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4481 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4482 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4483 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4484 PetscFunctionReturn(0); 4485 } 4486 4487 /* 4488 Special case where the matrix was ILU(0) factored in the natural 4489 ordering. This eliminates the need for the column and row permutation. 4490 */ 4491 #undef __FUNCT__ 4492 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 4493 /* MatSolve_SeqBAIJ_2_NaturalOrdering: 2x2 block solve without index sets; both sweeps run directly in the array of xx. Row i of L is ai[i]..diag[i]-1, the scaling block is at diag[i], the remainder of the row up to ai[i+1] is U. */ PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 4494 { 4495 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4496 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4497 PetscErrorCode ierr; 4498 PetscInt *diag = a->diag; 4499 const MatScalar *aa=a->a,*v; 4500 PetscScalar *x,s1,s2,x1,x2; 4501 const PetscScalar *b; 4502 PetscInt jdx,idt,idx,nz,*vi,i; 4503 4504 PetscFunctionBegin; 4505 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4506 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4507 4508 /* forward solve the lower triangular */ 4509 idx = 0; 4510 x[0] = b[0]; x[1] = b[1]; 4511 for (i=1; i<n; i++) { 4512 v = aa + 4*ai[i]; 4513 vi = aj + ai[i]; 4514 nz = diag[i] - ai[i]; 4515 idx += 2; 4516 s1 = b[idx];s2 = b[1+idx]; 4517 while (nz--) { 4518 jdx = 2*(*vi++); 4519 x1 = x[jdx];x2 = x[1+jdx]; 4520 s1 -= v[0]*x1 + v[2]*x2; 4521 s2 -= v[1]*x1 + v[3]*x2; 4522 v += 4; 4523 } 4524 x[idx] = s1; 4525 x[1+idx] = s2; 4526 } 4527 /* backward solve the upper triangular */ 4528 for (i=n-1; i>=0; i--){ 4529 v = aa + 4*diag[i] + 4; 4530 vi = aj + diag[i] + 1; 4531 nz = ai[i+1] - diag[i] - 1; 4532 idt = 2*i; 4533 s1 = x[idt]; s2 = x[1+idt]; 4534 while (nz--) { 4535 idx = 2*(*vi++); 4536 x1 = 
x[idx]; x2 = x[1+idx]; 4537 s1 -= v[0]*x1 + v[2]*x2; 4538 s2 -= v[1]*x1 + v[3]*x2; 4539 v += 4; 4540 } 4541 v = aa + 4*diag[i]; 4542 x[idt] = v[0]*s1 + v[2]*s2; 4543 x[1+idt] = v[1]*s1 + v[3]*s2; 4544 } 4545 4546 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4547 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4548 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4549 PetscFunctionReturn(0); 4550 } 4551 4552 #undef __FUNCT__ 4553 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct" 4554 /* MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct: in-place 2x2 block solve without index sets for the layout with L rows at ai[i] and U rows at ai[2*n-i], scaling block last in each U row. */ PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4555 { 4556 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4557 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 4558 PetscErrorCode ierr; 4559 PetscInt jdx; 4560 const MatScalar *aa=a->a,*v; 4561 PetscScalar *x,s1,s2,x1,x2; 4562 const PetscScalar *b; 4563 4564 PetscFunctionBegin; 4565 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4566 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4567 /* forward solve the lower triangular */ 4568 idx = 0; 4569 x[0] = b[idx]; x[1] = b[1+idx]; 4570 for (i=1; i<n; i++) { 4571 v = aa + 4*ai[i]; 4572 vi = aj + ai[i]; 4573 nz = ai[i+1] - ai[i]; 4574 idx = 2*i; 4575 s1 = b[idx];s2 = b[1+idx]; 4576 for(k=0;k<nz;k++){ 4577 jdx = 2*vi[k]; 4578 x1 = x[jdx];x2 = x[1+jdx]; 4579 s1 -= v[0]*x1 + v[2]*x2; 4580 s2 -= v[1]*x1 + v[3]*x2; 4581 v += 4; 4582 } 4583 x[idx] = s1; 4584 x[1+idx] = s2; 4585 } 4586 4587 /* backward solve the upper triangular */ 4588 for (i=n-1; i>=0; i--){ 4589 v = aa + 4*ai[2*n-i]; 4590 vi = aj + ai[2*n-i]; 4591 nz = ai[2*n-i +1] - ai[2*n-i]-1; 4592 idt = 2*i; 4593 s1 = x[idt]; s2 = x[1+idt]; 4594 for(k=0;k<nz;k++){ 4595 idx = 2*vi[k]; 4596 x1 = x[idx]; x2 = x[1+idx]; 4597 s1 -= v[0]*x1 + v[2]*x2; 4598 s2 -= v[1]*x1 + v[3]*x2; 4599 v += 4; 4600 } 4601 /* x = inv_diagonal*x */ 4602 x[idt] = v[0]*s1 + v[2]*s2; 4603 x[1+idt] = v[1]*s1 + v[3]*s2; 4604 } 4605 4606 ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4607 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4608 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4609 PetscFunctionReturn(0); 4610 } 4611 4612 #undef __FUNCT__ 4613 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2" 4614 /* MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2: in-place 2x2 block solve without index sets where U row i is addressed through adiag (off-diagonal blocks from adiag[i+1]+1, scaling block at adiag[i]). */ PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4615 { 4616 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4617 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 4618 PetscErrorCode ierr; 4619 PetscInt jdx; 4620 const MatScalar *aa=a->a,*v; 4621 PetscScalar *x,s1,s2,x1,x2; 4622 const PetscScalar *b; 4623 4624 PetscFunctionBegin; 4625 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4626 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4627 /* forward solve the lower triangular */ 4628 idx = 0; 4629 x[0] = b[idx]; x[1] = b[1+idx]; 4630 for (i=1; i<n; i++) { 4631 v = aa + 4*ai[i]; 4632 vi = aj + ai[i]; 4633 nz = ai[i+1] - ai[i]; 4634 idx = 2*i; 4635 s1 = b[idx];s2 = b[1+idx]; 4636 for(k=0;k<nz;k++){ 4637 jdx = 2*vi[k]; 4638 x1 = x[jdx];x2 = x[1+jdx]; 4639 s1 -= v[0]*x1 + v[2]*x2; 4640 s2 -= v[1]*x1 + v[3]*x2; 4641 v += 4; 4642 } 4643 x[idx] = s1; 4644 x[1+idx] = s2; 4645 } 4646 4647 /* backward solve the upper triangular */ 4648 for (i=n-1; i>=0; i--){ 4649 v = aa + 4*(adiag[i+1]+1); 4650 vi = aj + adiag[i+1]+1; 4651 nz = adiag[i] - adiag[i+1]-1; 4652 idt = 2*i; 4653 s1 = x[idt]; s2 = x[1+idt]; 4654 for(k=0;k<nz;k++){ 4655 idx = 2*vi[k]; 4656 x1 = x[idx]; x2 = x[1+idx]; 4657 s1 -= v[0]*x1 + v[2]*x2; 4658 s2 -= v[1]*x1 + v[3]*x2; 4659 v += 4; 4660 } 4661 /* x = inv_diagonal*x */ 4662 x[idt] = v[0]*s1 + v[2]*s2; 4663 x[1+idt] = v[1]*s1 + v[3]*s2; 4664 } 4665 4666 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4667 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4668 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4669 PetscFunctionReturn(0); 4670 } 4671 4672 #undef 
__FUNCT__ 4673 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 4674 /* MatSolve_SeqBAIJ_1: scalar (block size 1) solve with the row and column index sets a->row and a->col. Returns immediately when n is 0. bb is read through the row indices into a->solve_work, and xx is written through the column indices walked from last to first; the entry aa[diag[i]] is applied by multiplication, so it presumably holds the inverted pivot - confirm against the numeric factorization. */ PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 4675 { 4676 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4677 IS iscol=a->col,isrow=a->row; 4678 PetscErrorCode ierr; 4679 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 4680 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4681 MatScalar *aa=a->a,*v; 4682 PetscScalar *x,*b,s1,*t; 4683 4684 PetscFunctionBegin; 4685 if (!n) PetscFunctionReturn(0); /* nothing to solve; also keeps the unconditional first-row load below from touching empty arrays */ 4686 4687 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4688 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4689 t = a->solve_work; 4690 4691 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4692 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4693 4694 /* forward solve the lower triangular */ 4695 t[0] = b[*r++]; 4696 for (i=1; i<n; i++) { 4697 v = aa + ai[i]; 4698 vi = aj + ai[i]; 4699 nz = diag[i] - ai[i]; 4700 s1 = b[*r++]; 4701 while (nz--) { 4702 s1 -= (*v++)*t[*vi++]; 4703 } 4704 t[i] = s1; 4705 } 4706 /* backward solve the upper triangular */ 4707 for (i=n-1; i>=0; i--){ 4708 v = aa + diag[i] + 1; 4709 vi = aj + diag[i] + 1; 4710 nz = ai[i+1] - diag[i] - 1; 4711 s1 = t[i]; 4712 while (nz--) { 4713 s1 -= (*v++)*t[*vi++]; 4714 } 4715 x[*c--] = t[i] = aa[diag[i]]*s1; 4716 } 4717 4718 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4719 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4720 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4721 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4722 ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 4723 PetscFunctionReturn(0); 4724 } 4725 /* 4726 Special case where the matrix was ILU(0) factored in the natural 4727 ordering. This eliminates the need for the column and row permutation. 
4728 */ 4729 #undef __FUNCT__ 4730 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 4731 /* MatSolve_SeqBAIJ_1_NaturalOrdering: scalar solve without index sets; both sweeps run directly in the array of xx. NOTE(review): x[0] = b[0] is executed before any check on n, unlike MatSolve_SeqBAIJ_1 which returns early for n == 0. */ PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 4732 { 4733 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4734 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4735 PetscErrorCode ierr; 4736 PetscInt *diag = a->diag; 4737 MatScalar *aa=a->a; 4738 PetscScalar *x,*b; 4739 PetscScalar s1,x1; 4740 MatScalar *v; 4741 PetscInt jdx,idt,idx,nz,*vi,i; 4742 4743 PetscFunctionBegin; 4744 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4745 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4746 4747 /* forward solve the lower triangular */ 4748 idx = 0; 4749 x[0] = b[0]; 4750 for (i=1; i<n; i++) { 4751 v = aa + ai[i]; 4752 vi = aj + ai[i]; 4753 nz = diag[i] - ai[i]; 4754 idx += 1; 4755 s1 = b[idx]; 4756 while (nz--) { 4757 jdx = *vi++; 4758 x1 = x[jdx]; 4759 s1 -= v[0]*x1; 4760 v += 1; 4761 } 4762 x[idx] = s1; 4763 } 4764 /* backward solve the upper triangular */ 4765 for (i=n-1; i>=0; i--){ 4766 v = aa + diag[i] + 1; 4767 vi = aj + diag[i] + 1; 4768 nz = ai[i+1] - diag[i] - 1; 4769 idt = i; 4770 s1 = x[idt]; 4771 while (nz--) { 4772 idx = *vi++; 4773 x1 = x[idx]; 4774 s1 -= v[0]*x1; 4775 v += 1; 4776 } 4777 v = aa + diag[i]; 4778 x[idt] = v[0]*s1; 4779 } 4780 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4781 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4782 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 4783 PetscFunctionReturn(0); 4784 } 4785 4786 /* ----------------------------------------------------------------*/ 4787 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 4788 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 4789 4790 #undef __FUNCT__ 4791 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 4792 /* MatLUFactorNumeric_SeqBAIJ_N_newdatastruct: numeric LU factorization of A into B for a general block size bs, with L rows at bi[i] and U rows at bi[2*n-i]. For each row it zeroes and loads a dense work row rtmp of n blocks, eliminates against earlier rows, copies the L and U parts into b->a and stores the inverted diagonal block at bdiag[i]. info is not referenced, and ics is assigned but not used afterwards. */ PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 4793 { 4794 Mat C=B; 4795 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 4796 IS 
isrow = b->row,isicol = b->icol; 4797 PetscErrorCode ierr; 4798 const PetscInt *r,*ic,*ics; 4799 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 4800 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 4801 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 4802 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 4803 MatScalar *v_work; 4804 4805 PetscFunctionBegin; 4806 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 4807 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 4808 ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 4809 ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr); 4810 ics = ic; 4811 4812 /* generate work space needed by dense LU factorization */ 4813 ierr = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr); 4814 mwork = v_work + bs; 4815 v_pivots = (PetscInt*)(mwork + bs2); /* one allocation holds v_work (bs scalars), mwork (bs2 scalars) and v_pivots (bs integers), in that order */ 4816 4817 for (i=0; i<n; i++){ 4818 /* zero rtmp */ 4819 /* L part */ 4820 nz = bi[i+1] - bi[i]; 4821 bjtmp = bj + bi[i]; 4822 for (j=0; j<nz; j++){ 4823 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4824 } 4825 4826 /* U part */ 4827 nz = bi[2*n-i+1] - bi[2*n-i]; 4828 bjtmp = bj + bi[2*n-i]; 4829 for (j=0; j<nz; j++){ 4830 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4831 } 4832 4833 /* load in initial (unfactored row) */ 4834 nz = ai[r[i]+1] - ai[r[i]]; 4835 ajtmp = aj + ai[r[i]]; 4836 v = aa + bs2*ai[r[i]]; 4837 for (j=0; j<nz; j++) { 4838 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 4839 } 4840 4841 /* elimination */ 4842 bjtmp = bj + bi[i]; 4843 nzL = bi[i+1] - bi[i]; 4844 for(k=0;k < nzL;k++) { 4845 row = bjtmp[k]; 4846 pc = rtmp + bs2*row; 4847 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 4848 if (flg) { 4849 pv = b->a + bs2*bdiag[row]; 4850 Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 4851 pj = b->j + bi[2*n-row]; /* beginning of U(row,:) */ 4852 pv = b->a 
+ bs2*bi[2*n-row]; 4853 nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries in U(row,:), excluding diag */ 4854 for (j=0; j<nz; j++) { 4855 Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 4856 } 4857 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 4858 } 4859 } 4860 4861 /* finished row so stick it into b->a */ 4862 /* L part */ 4863 pv = b->a + bs2*bi[i] ; 4864 pj = b->j + bi[i] ; 4865 nz = bi[i+1] - bi[i]; 4866 for (j=0; j<nz; j++) { 4867 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4868 } 4869 4870 /* Mark diagonal and invert diagonal for simpler triangular solves */ 4871 pv = b->a + bs2*bdiag[i]; 4872 pj = b->j + bdiag[i]; 4873 /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 4874 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4875 ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 4876 4877 /* U part */ 4878 pv = b->a + bs2*bi[2*n-i]; 4879 pj = b->j + bi[2*n-i]; 4880 nz = bi[2*n-i+1] - bi[2*n-i] - 1; 4881 for (j=0; j<nz; j++){ 4882 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4883 } 4884 } 4885 4886 ierr = PetscFree(rtmp);CHKERRQ(ierr); 4887 ierr = PetscFree(v_work);CHKERRQ(ierr); 4888 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 4889 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 4890 4891 C->assembled = PETSC_TRUE; 4892 ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 4893 PetscFunctionReturn(0); 4894 } 4895 4896 #undef __FUNCT__ 4897 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct_v2" 4898 /* MatLUFactorNumeric_SeqBAIJ_N_newdatastruct_v2: numeric LU factorization of A into B for a general block size bs where U row i is addressed through bdiag: off-diagonal blocks from bdiag[i+1]+1 and the diagonal block at bdiag[i]. Otherwise it follows the same zero, load, eliminate and copy-back sequence per row, and the diagonal block is inverted in place. info is not referenced, and ics is assigned but not used afterwards. */ PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info) 4899 { 4900 Mat C=B; 4901 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 4902 IS isrow = b->row,isicol = b->icol; 4903 PetscErrorCode ierr; 4904 const PetscInt *r,*ic,*ics; 4905 PetscInt 
i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 4906 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 4907 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 4908 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 4909 MatScalar *v_work; 4910 4911 PetscFunctionBegin; 4912 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 4913 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 4914 ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 4915 ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr); 4916 ics = ic; 4917 4918 /* generate work space needed by dense LU factorization */ 4919 ierr = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr); 4920 mwork = v_work + bs; 4921 v_pivots = (PetscInt*)(mwork + bs2); 4922 4923 for (i=0; i<n; i++){ 4924 /* zero rtmp */ 4925 /* L part */ 4926 nz = bi[i+1] - bi[i]; 4927 bjtmp = bj + bi[i]; 4928 for (j=0; j<nz; j++){ 4929 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4930 } 4931 4932 /* U part */ 4933 nz = bdiag[i] - bdiag[i+1]; 4934 bjtmp = bj + bdiag[i+1]+1; 4935 for (j=0; j<nz; j++){ 4936 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4937 } 4938 4939 /* load in initial (unfactored row) */ 4940 nz = ai[r[i]+1] - ai[r[i]]; 4941 ajtmp = aj + ai[r[i]]; 4942 v = aa + bs2*ai[r[i]]; 4943 for (j=0; j<nz; j++) { 4944 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 4945 } 4946 4947 /* elimination */ 4948 bjtmp = bj + bi[i]; 4949 nzL = bi[i+1] - bi[i]; 4950 for(k=0;k < nzL;k++) { 4951 row = bjtmp[k]; 4952 pc = rtmp + bs2*row; 4953 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 4954 if (flg) { 4955 pv = b->a + bs2*bdiag[row]; 4956 Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 4957 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 4958 pv = b->a + bs2*(bdiag[row+1]+1); 4959 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), 
excluding diag */ 4960 for (j=0; j<nz; j++) { 4961 Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 4962 } 4963 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 4964 } 4965 } 4966 4967 /* finished row so stick it into b->a */ 4968 /* L part */ 4969 pv = b->a + bs2*bi[i] ; 4970 pj = b->j + bi[i] ; 4971 nz = bi[i+1] - bi[i]; 4972 for (j=0; j<nz; j++) { 4973 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4974 } 4975 4976 /* Mark diagonal and invert diagonal for simpler triangular solves */ 4977 pv = b->a + bs2*bdiag[i]; 4978 pj = b->j + bdiag[i]; 4979 /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 4980 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4981 ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 4982 4983 /* U part */ 4984 pv = b->a + bs2*(bdiag[i+1]+1); 4985 pj = b->j + bdiag[i+1]+1; 4986 nz = bdiag[i] - bdiag[i+1] - 1; 4987 for (j=0; j<nz; j++){ 4988 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4989 } 4990 } 4991 4992 ierr = PetscFree(rtmp);CHKERRQ(ierr); 4993 ierr = PetscFree(v_work);CHKERRQ(ierr); 4994 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 4995 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 4996 4997 C->assembled = PETSC_TRUE; 4998 ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 4999 PetscFunctionReturn(0); 5000 } 5001 5002 /* 5003 ilu(0) with natural ordering under new data structure. 5004 See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description 5005 because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct(). 
5006 */ 5007 #undef __FUNCT__ 5008 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 5009 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5010 { 5011 5012 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5013 PetscErrorCode ierr; 5014 PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 5015 PetscInt i,j,nz,*bi,*bj,*bdiag; 5016 5017 PetscFunctionBegin; 5018 /* printf("MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct...\n"); */ 5019 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 5020 b = (Mat_SeqBAIJ*)(fact)->data; 5021 5022 /* allocate matrix arrays for new data structure */ 5023 ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,2*n+2,PetscInt,&b->i);CHKERRQ(ierr); 5024 ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(2*n+2)*sizeof(PetscInt));CHKERRQ(ierr); 5025 b->singlemalloc = PETSC_TRUE; 5026 if (!b->diag){ 5027 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 5028 } 5029 bdiag = b->diag; 5030 5031 if (n > 0) { 5032 ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 5033 } 5034 5035 /* set bi and bj with new data structure */ 5036 bi = b->i; 5037 bj = b->j; 5038 5039 /* L part */ 5040 bi[0] = 0; 5041 for (i=0; i<n; i++){ 5042 nz = adiag[i] - ai[i]; 5043 bi[i+1] = bi[i] + nz; 5044 aj = a->j + ai[i]; 5045 for (j=0; j<nz; j++){ 5046 *bj = aj[j]; bj++; 5047 } 5048 } 5049 5050 /* U part */ 5051 bi[n+1] = bi[n]; 5052 for (i=n-1; i>=0; i--){ 5053 nz = ai[i+1] - adiag[i] - 1; 5054 bi[2*n-i+1] = bi[2*n-i] + nz + 1; 5055 aj = a->j + adiag[i] + 1; 5056 for (j=0; j<nz; j++){ 5057 *bj = aj[j]; bj++; 5058 } 5059 /* diag[i] */ 5060 *bj = i; bj++; 5061 bdiag[i] = bi[2*n-i+1]-1; 5062 } 5063 PetscFunctionReturn(0); 5064 } 5065 5066 #undef __FUNCT__ 5067 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct" 5068 PetscErrorCode 
MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5069 { 5070 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5071 IS isicol; 5072 PetscErrorCode ierr; 5073 const PetscInt *r,*ic; 5074 PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 5075 PetscInt *bi,*cols,nnz,*cols_lvl; 5076 PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 5077 PetscInt i,levels,diagonal_fill; 5078 PetscTruth col_identity,row_identity,both_identity; 5079 PetscReal f; 5080 PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 5081 PetscBT lnkbt; 5082 PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 5083 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 5084 PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 5085 PetscTruth missing; 5086 PetscInt bs=A->rmap->bs,bs2=a->bs2; 5087 5088 PetscFunctionBegin; 5089 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 5090 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 5091 if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 5092 5093 f = info->fill; 5094 levels = (PetscInt)info->levels; 5095 diagonal_fill = (PetscInt)info->diagonal_fill; 5096 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 5097 5098 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5099 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5100 both_identity = (PetscTruth) (row_identity && col_identity); 5101 5102 if (!levels && both_identity) { 5103 /* special case: ilu(0) with natural ordering */ 5104 ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5105 (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 5106 /* set MatSolve routines */ 5107 switch (bs){ 5108 case 2: 5109 fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct; 5110 break; 5111 case 3: 5112 fact->ops->solve = 
MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct; 5113 break; 5114 case 4: 5115 fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct; 5116 break; 5117 case 5: 5118 fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct; 5119 break; 5120 case 6: 5121 fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct; 5122 break; 5123 case 7: 5124 fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct; 5125 break; 5126 default: 5127 fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 5128 break; 5129 } 5130 5131 fact->factor = MAT_FACTOR_ILU; 5132 (fact)->info.factor_mallocs = 0; 5133 (fact)->info.fill_ratio_given = info->fill; 5134 (fact)->info.fill_ratio_needed = 1.0; 5135 b = (Mat_SeqBAIJ*)(fact)->data; 5136 b->row = isrow; 5137 b->col = iscol; 5138 b->icol = isicol; 5139 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5140 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5141 b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 5142 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5143 PetscFunctionReturn(0); 5144 } 5145 5146 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5147 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5148 5149 /* get new row pointers */ 5150 ierr = PetscMalloc((2*n+2)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 5151 bi[0] = 0; 5152 /* bdiag is location of diagonal in factor */ 5153 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 5154 bdiag[0] = 0; 5155 5156 ierr = PetscMalloc((2*n+1)*sizeof(PetscInt**),&bj_ptr);CHKERRQ(ierr); 5157 bjlvl_ptr = (PetscInt**)(bj_ptr + n); 5158 5159 /* create a linked list for storing column indices of the active row */ 5160 nlnk = n + 1; 5161 ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5162 5163 /* initial FreeSpace size is f*(ai[n]+1) */ 5164 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 5165 current_space = free_space; 5166 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 5167 current_space_lvl = free_space_lvl; 5168 5169 for (i=0; i<n; i++) { 5170 nzi = 0; 5171 /* copy current row into linked list */ 5172 nnz = ai[r[i]+1] - ai[r[i]]; 5173 if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 5174 cols = aj + ai[r[i]]; 5175 lnk[i] = -1; /* marker to indicate if diagonal exists */ 5176 ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5177 nzi += nlnk; 5178 5179 /* make sure diagonal entry is included */ 5180 if (diagonal_fill && lnk[i] == -1) { 5181 fm = n; 5182 while (lnk[fm] < i) fm = lnk[fm]; 5183 lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 5184 lnk[fm] = i; 5185 lnk_lvl[i] = 0; 5186 nzi++; dcount++; 5187 } 5188 5189 /* add pivot rows into the active row */ 5190 nzbd = 0; 5191 prow = lnk[n]; 5192 while (prow < i) { 5193 nnz = bdiag[prow]; 5194 cols = bj_ptr[prow] 
+ nnz + 1; 5195 cols_lvl = bjlvl_ptr[prow] + nnz + 1; 5196 nnz = bi[prow+1] - bi[prow] - nnz - 1; 5197 ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 5198 nzi += nlnk; 5199 prow = lnk[prow]; 5200 nzbd++; 5201 } 5202 bdiag[i] = nzbd; 5203 bi[i+1] = bi[i] + nzi; 5204 5205 /* if free space is not available, make more free space */ 5206 if (current_space->local_remaining<nzi) { 5207 nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 5208 ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 5209 ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 5210 reallocs++; 5211 } 5212 5213 /* copy data into free_space and free_space_lvl, then initialize lnk */ 5214 ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 5215 bj_ptr[i] = current_space->array; 5216 bjlvl_ptr[i] = current_space_lvl->array; 5217 5218 /* make sure the active row i has diagonal entry */ 5219 if (*(bj_ptr[i]+bdiag[i]) != i) { 5220 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 5221 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 5222 } 5223 5224 current_space->array += nzi; 5225 current_space->local_used += nzi; 5226 current_space->local_remaining -= nzi; 5227 current_space_lvl->array += nzi; 5228 current_space_lvl->local_used += nzi; 5229 current_space_lvl->local_remaining -= nzi; 5230 } 5231 5232 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5233 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5234 5235 /* destroy list of free space and other temporary arrays */ 5236 ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 5237 5238 /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 5239 ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 5240 5241 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 5242 
ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 5243 ierr = PetscFree(bj_ptr);CHKERRQ(ierr); 5244 5245 #if defined(PETSC_USE_INFO) 5246 { 5247 PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 5248 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 5249 ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5250 ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 5251 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5252 if (diagonal_fill) { 5253 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 5254 } 5255 } 5256 #endif 5257 5258 /* put together the new matrix */ 5259 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5260 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5261 b = (Mat_SeqBAIJ*)(fact)->data; 5262 b->free_a = PETSC_TRUE; 5263 b->free_ij = PETSC_TRUE; 5264 b->singlemalloc = PETSC_FALSE; 5265 ierr = PetscMalloc( (bs2*bi[2*n+1] )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 5266 b->j = bj; 5267 b->i = bi; 5268 b->diag = bdiag; 5269 b->free_diag = PETSC_TRUE; 5270 b->ilen = 0; 5271 b->imax = 0; 5272 b->row = isrow; 5273 b->col = iscol; 5274 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5275 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5276 b->icol = isicol; 5277 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5278 /* In b structure: Free imax, ilen, old a, old j. 
5279 Allocate bdiag, solve_work, new a, new j */ 5280 ierr = PetscLogObjectMemory(fact,bi[2*n+1] * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 5281 b->maxnz = b->nz = bi[2*n+1] ; 5282 (fact)->info.factor_mallocs = reallocs; 5283 (fact)->info.fill_ratio_given = f; 5284 (fact)->info.fill_ratio_needed = ((PetscReal)bi[2*n+1])/((PetscReal)ai[n]); 5285 (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 5286 /* set MatSolve routines */ 5287 if (both_identity){ 5288 switch (bs){ 5289 case 2: 5290 fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct; 5291 break; 5292 case 3: 5293 fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct; 5294 break; 5295 case 4: 5296 fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct; 5297 break; 5298 case 5: 5299 fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct; 5300 break; 5301 case 6: 5302 fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct; 5303 break; 5304 case 7: 5305 fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct; 5306 break; 5307 default: 5308 fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 5309 break; 5310 } 5311 } else { 5312 switch (bs){ 5313 case 2: 5314 fact->ops->solve = MatSolve_SeqBAIJ_2_newdatastruct; 5315 break; 5316 case 3: 5317 fact->ops->solve = MatSolve_SeqBAIJ_3_newdatastruct; 5318 break; 5319 case 4: 5320 fact->ops->solve = MatSolve_SeqBAIJ_4_newdatastruct; 5321 break; 5322 case 5: 5323 fact->ops->solve = MatSolve_SeqBAIJ_5_newdatastruct; 5324 break; 5325 case 6: 5326 fact->ops->solve = MatSolve_SeqBAIJ_6_newdatastruct; 5327 break; 5328 case 7: 5329 fact->ops->solve = MatSolve_SeqBAIJ_7_newdatastruct; 5330 break; 5331 default: 5332 fact->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct; 5333 break; 5334 } 5335 } 5336 PetscFunctionReturn(0); 5337 } 5338 5339 /* 5340 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 5341 except that the data structure 
of Mat_SeqAIJ is slightly different. 5342 Not a good example of code reuse. 5343 */ 5344 #undef __FUNCT__ 5345 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 5346 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5347 { 5348 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5349 IS isicol; 5350 PetscErrorCode ierr; 5351 const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 5352 PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 5353 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 5354 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 5355 PetscTruth col_identity,row_identity,both_identity,flg; 5356 PetscReal f; 5357 PetscTruth newdatastruct=PETSC_FALSE; 5358 5359 PetscFunctionBegin; 5360 ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 5361 if (newdatastruct){ 5362 ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5363 PetscFunctionReturn(0); 5364 } 5365 5366 ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 5367 if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 5368 5369 f = info->fill; 5370 levels = (PetscInt)info->levels; 5371 diagonal_fill = (PetscInt)info->diagonal_fill; 5372 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 5373 5374 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5375 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5376 both_identity = (PetscTruth) (row_identity && col_identity); 5377 5378 if (!levels && both_identity) { /* special case copy the nonzero structure */ 5379 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 5380 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5381 5382 fact->factor = MAT_FACTOR_ILU; 5383 b = (Mat_SeqBAIJ*)(fact)->data; 5384 b->row = isrow; 5385 b->col = iscol; 
5386 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5387 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5388 b->icol = isicol; 5389 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5390 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5391 PetscFunctionReturn(0); 5392 } 5393 5394 /* general case perform the symbolic factorization */ 5395 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5396 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5397 5398 /* get new row pointers */ 5399 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 5400 ainew[0] = 0; 5401 /* don't know how many column pointers are needed so estimate */ 5402 jmax = (PetscInt)(f*ai[n] + 1); 5403 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 5404 /* ajfill is level of fill for each fill entry */ 5405 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 5406 /* fill is a linked list of nonzeros in active row */ 5407 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 5408 /* im is level for each filled value */ 5409 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 5410 /* dloc is location of diagonal in factor */ 5411 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 5412 dloc[0] = 0; 5413 for (prow=0; prow<n; prow++) { 5414 5415 /* copy prow into linked list */ 5416 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 5417 if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 5418 xi = aj + ai[r[prow]]; 5419 fill[n] = n; 5420 fill[prow] = -1; /* marker for diagonal entry */ 5421 while (nz--) { 5422 fm = n; 5423 idx = ic[*xi++]; 5424 do { 5425 m = fm; 5426 fm = fill[m]; 5427 } while (fm < idx); 5428 fill[m] = idx; 5429 fill[idx] = fm; 5430 im[idx] = 0; 5431 } 5432 5433 /* make sure diagonal entry is included */ 5434 if (diagonal_fill && fill[prow] == -1) { 5435 fm = n; 5436 while 
(fill[fm] < prow) fm = fill[fm]; 5437 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 5438 fill[fm] = prow; 5439 im[prow] = 0; 5440 nzf++; 5441 dcount++; 5442 } 5443 5444 nzi = 0; 5445 row = fill[n]; 5446 while (row < prow) { 5447 incrlev = im[row] + 1; 5448 nz = dloc[row]; 5449 xi = ajnew + ainew[row] + nz + 1; 5450 flev = ajfill + ainew[row] + nz + 1; 5451 nnz = ainew[row+1] - ainew[row] - nz - 1; 5452 fm = row; 5453 while (nnz-- > 0) { 5454 idx = *xi++; 5455 if (*flev + incrlev > levels) { 5456 flev++; 5457 continue; 5458 } 5459 do { 5460 m = fm; 5461 fm = fill[m]; 5462 } while (fm < idx); 5463 if (fm != idx) { 5464 im[idx] = *flev + incrlev; 5465 fill[m] = idx; 5466 fill[idx] = fm; 5467 fm = idx; 5468 nzf++; 5469 } else { 5470 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 5471 } 5472 flev++; 5473 } 5474 row = fill[row]; 5475 nzi++; 5476 } 5477 /* copy new filled row into permanent storage */ 5478 ainew[prow+1] = ainew[prow] + nzf; 5479 if (ainew[prow+1] > jmax) { 5480 5481 /* estimate how much additional space we will need */ 5482 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 5483 /* just double the memory each time */ 5484 PetscInt maxadd = jmax; 5485 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 5486 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 5487 jmax += maxadd; 5488 5489 /* allocate a longer ajnew and ajfill */ 5490 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 5491 ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5492 ierr = PetscFree(ajnew);CHKERRQ(ierr); 5493 ajnew = xitmp; 5494 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 5495 ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5496 ierr = PetscFree(ajfill);CHKERRQ(ierr); 5497 ajfill = xitmp; 5498 reallocate++; /* count how many reallocations are needed */ 5499 } 5500 xitmp = ajnew + ainew[prow]; 5501 flev = ajfill + ainew[prow]; 5502 dloc[prow] = nzi; 5503 
fm = fill[n]; 5504 while (nzf--) { 5505 *xitmp++ = fm; 5506 *flev++ = im[fm]; 5507 fm = fill[fm]; 5508 } 5509 /* make sure row has diagonal entry */ 5510 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 5511 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 5512 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 5513 } 5514 } 5515 ierr = PetscFree(ajfill);CHKERRQ(ierr); 5516 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5517 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5518 ierr = PetscFree(fill);CHKERRQ(ierr); 5519 ierr = PetscFree(im);CHKERRQ(ierr); 5520 5521 #if defined(PETSC_USE_INFO) 5522 { 5523 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 5524 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 5525 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5526 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 5527 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5528 if (diagonal_fill) { 5529 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 5530 } 5531 } 5532 #endif 5533 5534 /* put together the new matrix */ 5535 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5536 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5537 b = (Mat_SeqBAIJ*)(fact)->data; 5538 b->free_a = PETSC_TRUE; 5539 b->free_ij = PETSC_TRUE; 5540 b->singlemalloc = PETSC_FALSE; 5541 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 5542 b->j = ajnew; 5543 b->i = ainew; 5544 for (i=0; i<n; i++) dloc[i] += ainew[i]; 5545 b->diag = dloc; 5546 b->free_diag = PETSC_TRUE; 5547 b->ilen = 0; 5548 b->imax = 0; 5549 b->row = isrow; 5550 b->col = iscol; 5551 b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 5552 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5553 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5554 b->icol = isicol; 5555 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5556 /* In b structure: Free imax, ilen, old a, old j. 5557 Allocate dloc, solve_work, new a, new j */ 5558 ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 5559 b->maxnz = b->nz = ainew[n]; 5560 5561 (fact)->info.factor_mallocs = reallocate; 5562 (fact)->info.fill_ratio_given = f; 5563 (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 5564 5565 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5566 PetscFunctionReturn(0); 5567 } 5568 5569 #undef __FUNCT__ 5570 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 5571 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 5572 { 5573 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 5574 /* int i,*AJ=a->j,nz=a->nz; */ 5575 PetscFunctionBegin; 5576 /* Undo Column scaling */ 5577 /* while (nz--) { */ 5578 /* AJ[i] = AJ[i]/4; */ 5579 /* } */ 5580 /* This should really invoke a push/pop logic, but we don't have that yet. */ 5581 A->ops->setunfactored = PETSC_NULL; 5582 PetscFunctionReturn(0); 5583 } 5584 5585 #undef __FUNCT__ 5586 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 5587 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 5588 { 5589 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5590 PetscInt *AJ=a->j,nz=a->nz; 5591 unsigned short *aj=(unsigned short *)AJ; 5592 PetscFunctionBegin; 5593 /* Is this really necessary? */ 5594 while (nz--) { 5595 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 5596 } 5597 A->ops->setunfactored = PETSC_NULL; 5598 PetscFunctionReturn(0); 5599 } 5600 5601 5602