1 #define PETSCMAT_DLL 2 3 4 /* 5 Factorization code for BAIJ format. 6 */ 7 8 #include "../src/mat/impls/baij/seq/baij.h" 9 #include "../src/mat/blockinvert.h" 10 #include "petscbt.h" 11 #include "../src/mat/utils/freespace.h" 12 13 #undef __FUNCT__ 14 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 15 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 16 { 17 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 18 PetscErrorCode ierr; 19 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 20 PetscInt *diag = a->diag; 21 MatScalar *aa=a->a,*v; 22 PetscScalar s1,*x,*b; 23 24 PetscFunctionBegin; 25 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 26 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 27 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28 29 /* forward solve the U^T */ 30 for (i=0; i<n; i++) { 31 32 v = aa + diag[i]; 33 /* multiply by the inverse of the block diagonal */ 34 s1 = (*v++)*x[i]; 35 vi = aj + diag[i] + 1; 36 nz = ai[i+1] - diag[i] - 1; 37 while (nz--) { 38 x[*vi++] -= (*v++)*s1; 39 } 40 x[i] = s1; 41 } 42 /* backward solve the L^T */ 43 for (i=n-1; i>=0; i--){ 44 v = aa + diag[i] - 1; 45 vi = aj + diag[i] - 1; 46 nz = diag[i] - ai[i]; 47 s1 = x[i]; 48 while (nz--) { 49 x[*vi--] -= (*v--)*s1; 50 } 51 } 52 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 53 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 54 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 55 PetscFunctionReturn(0); 56 } 57 58 #undef __FUNCT__ 59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 61 { 62 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 63 PetscErrorCode ierr; 64 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 65 PetscInt *diag = a->diag,oidx; 66 MatScalar *aa=a->a,*v; 67 PetscScalar s1,s2,x1,x2; 68 PetscScalar *x,*b; 69 70 PetscFunctionBegin; 71 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 72 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 73 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 
74 75 /* forward solve the U^T */ 76 idx = 0; 77 for (i=0; i<n; i++) { 78 79 v = aa + 4*diag[i]; 80 /* multiply by the inverse of the block diagonal */ 81 x1 = x[idx]; x2 = x[1+idx]; 82 s1 = v[0]*x1 + v[1]*x2; 83 s2 = v[2]*x1 + v[3]*x2; 84 v += 4; 85 86 vi = aj + diag[i] + 1; 87 nz = ai[i+1] - diag[i] - 1; 88 while (nz--) { 89 oidx = 2*(*vi++); 90 x[oidx] -= v[0]*s1 + v[1]*s2; 91 x[oidx+1] -= v[2]*s1 + v[3]*s2; 92 v += 4; 93 } 94 x[idx] = s1;x[1+idx] = s2; 95 idx += 2; 96 } 97 /* backward solve the L^T */ 98 for (i=n-1; i>=0; i--){ 99 v = aa + 4*diag[i] - 4; 100 vi = aj + diag[i] - 1; 101 nz = diag[i] - ai[i]; 102 idt = 2*i; 103 s1 = x[idt]; s2 = x[1+idt]; 104 while (nz--) { 105 idx = 2*(*vi--); 106 x[idx] -= v[0]*s1 + v[1]*s2; 107 x[idx+1] -= v[2]*s1 + v[3]*s2; 108 v -= 4; 109 } 110 } 111 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 112 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 113 ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 114 PetscFunctionReturn(0); 115 } 116 117 #undef __FUNCT__ 118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 119 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 120 { 121 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 122 PetscErrorCode ierr; 123 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 124 PetscInt *diag = a->diag,oidx; 125 MatScalar *aa=a->a,*v; 126 PetscScalar s1,s2,s3,x1,x2,x3; 127 PetscScalar *x,*b; 128 129 PetscFunctionBegin; 130 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 131 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 132 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 133 134 /* forward solve the U^T */ 135 idx = 0; 136 for (i=0; i<n; i++) { 137 138 v = aa + 9*diag[i]; 139 /* multiply by the inverse of the block diagonal */ 140 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 141 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 142 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 143 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 144 v += 9; 145 146 vi = aj + diag[i] + 1; 147 nz = ai[i+1] - diag[i] - 1; 148 while 
(nz--) { 149 oidx = 3*(*vi++); 150 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 151 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 152 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 153 v += 9; 154 } 155 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 156 idx += 3; 157 } 158 /* backward solve the L^T */ 159 for (i=n-1; i>=0; i--){ 160 v = aa + 9*diag[i] - 9; 161 vi = aj + diag[i] - 1; 162 nz = diag[i] - ai[i]; 163 idt = 3*i; 164 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 165 while (nz--) { 166 idx = 3*(*vi--); 167 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 168 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 169 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 170 v -= 9; 171 } 172 } 173 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 174 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 175 ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 176 PetscFunctionReturn(0); 177 } 178 179 #undef __FUNCT__ 180 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 181 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 182 { 183 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 184 PetscErrorCode ierr; 185 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 186 PetscInt *diag = a->diag,oidx; 187 MatScalar *aa=a->a,*v; 188 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 189 PetscScalar *x,*b; 190 191 PetscFunctionBegin; 192 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 193 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 194 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 195 196 /* forward solve the U^T */ 197 idx = 0; 198 for (i=0; i<n; i++) { 199 200 v = aa + 16*diag[i]; 201 /* multiply by the inverse of the block diagonal */ 202 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 203 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 204 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 205 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 206 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 207 v += 16; 208 209 vi = aj + diag[i] + 1; 210 nz = ai[i+1] - diag[i] - 1; 211 while (nz--) { 212 oidx = 4*(*vi++); 213 
x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 214 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 215 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 216 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 217 v += 16; 218 } 219 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 220 idx += 4; 221 } 222 /* backward solve the L^T */ 223 for (i=n-1; i>=0; i--){ 224 v = aa + 16*diag[i] - 16; 225 vi = aj + diag[i] - 1; 226 nz = diag[i] - ai[i]; 227 idt = 4*i; 228 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 229 while (nz--) { 230 idx = 4*(*vi--); 231 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 232 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 233 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 234 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 235 v -= 16; 236 } 237 } 238 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 239 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 240 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 241 PetscFunctionReturn(0); 242 } 243 244 #undef __FUNCT__ 245 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 246 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 247 { 248 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 249 PetscErrorCode ierr; 250 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 251 PetscInt *diag = a->diag,oidx; 252 MatScalar *aa=a->a,*v; 253 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 254 PetscScalar *x,*b; 255 256 PetscFunctionBegin; 257 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 258 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 259 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 260 261 /* forward solve the U^T */ 262 idx = 0; 263 for (i=0; i<n; i++) { 264 265 v = aa + 25*diag[i]; 266 /* multiply by the inverse of the block diagonal */ 267 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 268 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 269 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 270 s3 = 
v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 271 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 272 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 273 v += 25; 274 275 vi = aj + diag[i] + 1; 276 nz = ai[i+1] - diag[i] - 1; 277 while (nz--) { 278 oidx = 5*(*vi++); 279 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 280 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 281 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 282 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 283 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 284 v += 25; 285 } 286 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 287 idx += 5; 288 } 289 /* backward solve the L^T */ 290 for (i=n-1; i>=0; i--){ 291 v = aa + 25*diag[i] - 25; 292 vi = aj + diag[i] - 1; 293 nz = diag[i] - ai[i]; 294 idt = 5*i; 295 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 296 while (nz--) { 297 idx = 5*(*vi--); 298 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 299 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 300 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 301 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 302 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 303 v -= 25; 304 } 305 } 306 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 307 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 308 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 309 PetscFunctionReturn(0); 310 } 311 312 #undef __FUNCT__ 313 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 314 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 315 { 316 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 317 PetscErrorCode ierr; 318 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 319 PetscInt *diag = a->diag,oidx; 320 MatScalar *aa=a->a,*v; 321 PetscScalar 
s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 322 PetscScalar *x,*b; 323 324 PetscFunctionBegin; 325 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 326 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 327 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 328 329 /* forward solve the U^T */ 330 idx = 0; 331 for (i=0; i<n; i++) { 332 333 v = aa + 36*diag[i]; 334 /* multiply by the inverse of the block diagonal */ 335 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 336 x6 = x[5+idx]; 337 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 338 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 339 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 340 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 341 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 342 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 343 v += 36; 344 345 vi = aj + diag[i] + 1; 346 nz = ai[i+1] - diag[i] - 1; 347 while (nz--) { 348 oidx = 6*(*vi++); 349 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 350 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 351 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 352 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 353 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 354 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 355 v += 36; 356 } 357 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 358 x[5+idx] = s6; 359 idx += 6; 360 } 361 /* backward solve the L^T */ 362 for (i=n-1; i>=0; i--){ 363 v = aa + 36*diag[i] - 36; 364 vi = aj + diag[i] - 1; 365 nz = diag[i] - ai[i]; 366 idt = 6*i; 367 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 368 s6 = x[5+idt]; 369 while (nz--) { 370 idx = 6*(*vi--); 371 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + 
v[5]*s6; 372 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 373 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 374 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 375 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 376 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 377 v -= 36; 378 } 379 } 380 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 381 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 382 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 383 PetscFunctionReturn(0); 384 } 385 386 #undef __FUNCT__ 387 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 388 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 389 { 390 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 391 PetscErrorCode ierr; 392 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 393 PetscInt *diag = a->diag,oidx; 394 MatScalar *aa=a->a,*v; 395 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 396 PetscScalar *x,*b; 397 398 PetscFunctionBegin; 399 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 400 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 401 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 402 403 /* forward solve the U^T */ 404 idx = 0; 405 for (i=0; i<n; i++) { 406 407 v = aa + 49*diag[i]; 408 /* multiply by the inverse of the block diagonal */ 409 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 410 x6 = x[5+idx]; x7 = x[6+idx]; 411 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 412 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 413 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 414 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 415 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 416 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 
+ v[40]*x6 + v[41]*x7; 417 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 418 v += 49; 419 420 vi = aj + diag[i] + 1; 421 nz = ai[i+1] - diag[i] - 1; 422 while (nz--) { 423 oidx = 7*(*vi++); 424 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 425 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 426 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 427 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 428 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 429 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 430 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 431 v += 49; 432 } 433 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 434 x[5+idx] = s6;x[6+idx] = s7; 435 idx += 7; 436 } 437 /* backward solve the L^T */ 438 for (i=n-1; i>=0; i--){ 439 v = aa + 49*diag[i] - 49; 440 vi = aj + diag[i] - 1; 441 nz = diag[i] - ai[i]; 442 idt = 7*i; 443 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 444 s6 = x[5+idt];s7 = x[6+idt]; 445 while (nz--) { 446 idx = 7*(*vi--); 447 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 448 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 449 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 450 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 451 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 452 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 453 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 454 v -= 49; 455 } 456 } 457 ierr = 
VecRestoreArray(bb,&b);CHKERRQ(ierr); 458 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 459 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 460 PetscFunctionReturn(0); 461 } 462 463 /*---------------------------------------------------------------------------------------------*/ 464 #undef __FUNCT__ 465 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 466 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 467 { 468 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 469 IS iscol=a->col,isrow=a->row; 470 PetscErrorCode ierr; 471 const PetscInt *r,*c,*rout,*cout; 472 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 473 PetscInt *diag = a->diag; 474 MatScalar *aa=a->a,*v; 475 PetscScalar s1,*x,*b,*t; 476 477 PetscFunctionBegin; 478 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 479 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 480 t = a->solve_work; 481 482 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 483 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 484 485 /* copy the b into temp work space according to permutation */ 486 for (i=0; i<n; i++) { 487 t[i] = b[c[i]]; 488 } 489 490 /* forward solve the U^T */ 491 for (i=0; i<n; i++) { 492 493 v = aa + diag[i]; 494 /* multiply by the inverse of the block diagonal */ 495 s1 = (*v++)*t[i]; 496 vi = aj + diag[i] + 1; 497 nz = ai[i+1] - diag[i] - 1; 498 while (nz--) { 499 t[*vi++] -= (*v++)*s1; 500 } 501 t[i] = s1; 502 } 503 /* backward solve the L^T */ 504 for (i=n-1; i>=0; i--){ 505 v = aa + diag[i] - 1; 506 vi = aj + diag[i] - 1; 507 nz = diag[i] - ai[i]; 508 s1 = t[i]; 509 while (nz--) { 510 t[*vi--] -= (*v--)*s1; 511 } 512 } 513 514 /* copy t into x according to permutation */ 515 for (i=0; i<n; i++) { 516 x[r[i]] = t[i]; 517 } 518 519 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 520 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 521 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 522 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 523 ierr = PetscLogFlops(2.0*(a->nz) - 
A->cmap->n);CHKERRQ(ierr); 524 PetscFunctionReturn(0); 525 } 526 527 #undef __FUNCT__ 528 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 529 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 530 { 531 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 532 IS iscol=a->col,isrow=a->row; 533 PetscErrorCode ierr; 534 const PetscInt *r,*c,*rout,*cout; 535 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 536 PetscInt *diag = a->diag,ii,ic,ir,oidx; 537 MatScalar *aa=a->a,*v; 538 PetscScalar s1,s2,x1,x2; 539 PetscScalar *x,*b,*t; 540 541 PetscFunctionBegin; 542 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 543 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 544 t = a->solve_work; 545 546 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 547 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 548 549 /* copy the b into temp work space according to permutation */ 550 ii = 0; 551 for (i=0; i<n; i++) { 552 ic = 2*c[i]; 553 t[ii] = b[ic]; 554 t[ii+1] = b[ic+1]; 555 ii += 2; 556 } 557 558 /* forward solve the U^T */ 559 idx = 0; 560 for (i=0; i<n; i++) { 561 562 v = aa + 4*diag[i]; 563 /* multiply by the inverse of the block diagonal */ 564 x1 = t[idx]; x2 = t[1+idx]; 565 s1 = v[0]*x1 + v[1]*x2; 566 s2 = v[2]*x1 + v[3]*x2; 567 v += 4; 568 569 vi = aj + diag[i] + 1; 570 nz = ai[i+1] - diag[i] - 1; 571 while (nz--) { 572 oidx = 2*(*vi++); 573 t[oidx] -= v[0]*s1 + v[1]*s2; 574 t[oidx+1] -= v[2]*s1 + v[3]*s2; 575 v += 4; 576 } 577 t[idx] = s1;t[1+idx] = s2; 578 idx += 2; 579 } 580 /* backward solve the L^T */ 581 for (i=n-1; i>=0; i--){ 582 v = aa + 4*diag[i] - 4; 583 vi = aj + diag[i] - 1; 584 nz = diag[i] - ai[i]; 585 idt = 2*i; 586 s1 = t[idt]; s2 = t[1+idt]; 587 while (nz--) { 588 idx = 2*(*vi--); 589 t[idx] -= v[0]*s1 + v[1]*s2; 590 t[idx+1] -= v[2]*s1 + v[3]*s2; 591 v -= 4; 592 } 593 } 594 595 /* copy t into x according to permutation */ 596 ii = 0; 597 for (i=0; i<n; i++) { 598 ir = 2*r[i]; 599 x[ir] = t[ii]; 600 x[ir+1] = t[ii+1]; 601 ii += 2; 602 } 603 604 ierr = 
ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 605 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 606 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 607 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 608 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 609 PetscFunctionReturn(0); 610 } 611 612 #undef __FUNCT__ 613 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 614 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 615 { 616 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 617 IS iscol=a->col,isrow=a->row; 618 PetscErrorCode ierr; 619 const PetscInt *r,*c,*rout,*cout; 620 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 621 PetscInt *diag = a->diag,ii,ic,ir,oidx; 622 MatScalar *aa=a->a,*v; 623 PetscScalar s1,s2,s3,x1,x2,x3; 624 PetscScalar *x,*b,*t; 625 626 PetscFunctionBegin; 627 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 628 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 629 t = a->solve_work; 630 631 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 632 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 633 634 /* copy the b into temp work space according to permutation */ 635 ii = 0; 636 for (i=0; i<n; i++) { 637 ic = 3*c[i]; 638 t[ii] = b[ic]; 639 t[ii+1] = b[ic+1]; 640 t[ii+2] = b[ic+2]; 641 ii += 3; 642 } 643 644 /* forward solve the U^T */ 645 idx = 0; 646 for (i=0; i<n; i++) { 647 648 v = aa + 9*diag[i]; 649 /* multiply by the inverse of the block diagonal */ 650 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 651 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 652 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 653 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 654 v += 9; 655 656 vi = aj + diag[i] + 1; 657 nz = ai[i+1] - diag[i] - 1; 658 while (nz--) { 659 oidx = 3*(*vi++); 660 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 661 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 662 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 663 v += 9; 664 } 665 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 666 idx += 3; 667 } 668 /* backward solve the L^T */ 669 for (i=n-1; i>=0; i--){ 670 v = aa + 
9*diag[i] - 9; 671 vi = aj + diag[i] - 1; 672 nz = diag[i] - ai[i]; 673 idt = 3*i; 674 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 675 while (nz--) { 676 idx = 3*(*vi--); 677 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 678 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 679 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 680 v -= 9; 681 } 682 } 683 684 /* copy t into x according to permutation */ 685 ii = 0; 686 for (i=0; i<n; i++) { 687 ir = 3*r[i]; 688 x[ir] = t[ii]; 689 x[ir+1] = t[ii+1]; 690 x[ir+2] = t[ii+2]; 691 ii += 3; 692 } 693 694 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 695 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 696 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 697 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 698 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 699 PetscFunctionReturn(0); 700 } 701 702 #undef __FUNCT__ 703 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 704 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 705 { 706 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 707 IS iscol=a->col,isrow=a->row; 708 PetscErrorCode ierr; 709 const PetscInt *r,*c,*rout,*cout; 710 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 711 PetscInt *diag = a->diag,ii,ic,ir,oidx; 712 MatScalar *aa=a->a,*v; 713 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 714 PetscScalar *x,*b,*t; 715 716 PetscFunctionBegin; 717 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 718 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 719 t = a->solve_work; 720 721 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 722 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 723 724 /* copy the b into temp work space according to permutation */ 725 ii = 0; 726 for (i=0; i<n; i++) { 727 ic = 4*c[i]; 728 t[ii] = b[ic]; 729 t[ii+1] = b[ic+1]; 730 t[ii+2] = b[ic+2]; 731 t[ii+3] = b[ic+3]; 732 ii += 4; 733 } 734 735 /* forward solve the U^T */ 736 idx = 0; 737 for (i=0; i<n; i++) { 738 739 v = aa + 16*diag[i]; 740 /* multiply by the inverse of the block diagonal */ 741 x1 = 
t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 742 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 743 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 744 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 745 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 746 v += 16; 747 748 vi = aj + diag[i] + 1; 749 nz = ai[i+1] - diag[i] - 1; 750 while (nz--) { 751 oidx = 4*(*vi++); 752 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 753 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 754 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 755 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 756 v += 16; 757 } 758 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 759 idx += 4; 760 } 761 /* backward solve the L^T */ 762 for (i=n-1; i>=0; i--){ 763 v = aa + 16*diag[i] - 16; 764 vi = aj + diag[i] - 1; 765 nz = diag[i] - ai[i]; 766 idt = 4*i; 767 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 768 while (nz--) { 769 idx = 4*(*vi--); 770 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 771 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 772 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 773 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 774 v -= 16; 775 } 776 } 777 778 /* copy t into x according to permutation */ 779 ii = 0; 780 for (i=0; i<n; i++) { 781 ir = 4*r[i]; 782 x[ir] = t[ii]; 783 x[ir+1] = t[ii+1]; 784 x[ir+2] = t[ii+2]; 785 x[ir+3] = t[ii+3]; 786 ii += 4; 787 } 788 789 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 790 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 791 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 792 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 793 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 794 PetscFunctionReturn(0); 795 } 796 797 #undef __FUNCT__ 798 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 799 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 800 { 801 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 802 IS iscol=a->col,isrow=a->row; 803 PetscErrorCode ierr; 804 
const PetscInt *r,*c,*rout,*cout; 805 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 806 PetscInt *diag = a->diag,ii,ic,ir,oidx; 807 MatScalar *aa=a->a,*v; 808 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 809 PetscScalar *x,*b,*t; 810 811 PetscFunctionBegin; 812 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 813 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 814 t = a->solve_work; 815 816 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 817 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 818 819 /* copy the b into temp work space according to permutation */ 820 ii = 0; 821 for (i=0; i<n; i++) { 822 ic = 5*c[i]; 823 t[ii] = b[ic]; 824 t[ii+1] = b[ic+1]; 825 t[ii+2] = b[ic+2]; 826 t[ii+3] = b[ic+3]; 827 t[ii+4] = b[ic+4]; 828 ii += 5; 829 } 830 831 /* forward solve the U^T */ 832 idx = 0; 833 for (i=0; i<n; i++) { 834 835 v = aa + 25*diag[i]; 836 /* multiply by the inverse of the block diagonal */ 837 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 838 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 839 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 840 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 841 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 842 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 843 v += 25; 844 845 vi = aj + diag[i] + 1; 846 nz = ai[i+1] - diag[i] - 1; 847 while (nz--) { 848 oidx = 5*(*vi++); 849 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 850 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 851 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 852 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 853 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 854 v += 25; 855 } 856 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 857 idx += 5; 858 } 859 /* backward solve the L^T */ 860 for (i=n-1; i>=0; i--){ 861 v = aa + 25*diag[i] - 25; 862 vi = aj + diag[i] - 1; 863 
nz = diag[i] - ai[i]; 864 idt = 5*i; 865 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 866 while (nz--) { 867 idx = 5*(*vi--); 868 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 869 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 870 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 871 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 872 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 873 v -= 25; 874 } 875 } 876 877 /* copy t into x according to permutation */ 878 ii = 0; 879 for (i=0; i<n; i++) { 880 ir = 5*r[i]; 881 x[ir] = t[ii]; 882 x[ir+1] = t[ii+1]; 883 x[ir+2] = t[ii+2]; 884 x[ir+3] = t[ii+3]; 885 x[ir+4] = t[ii+4]; 886 ii += 5; 887 } 888 889 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 890 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 891 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 892 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 893 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 894 PetscFunctionReturn(0); 895 } 896 897 #undef __FUNCT__ 898 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 899 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 900 { 901 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 902 IS iscol=a->col,isrow=a->row; 903 PetscErrorCode ierr; 904 const PetscInt *r,*c,*rout,*cout; 905 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 906 PetscInt *diag = a->diag,ii,ic,ir,oidx; 907 MatScalar *aa=a->a,*v; 908 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 909 PetscScalar *x,*b,*t; 910 911 PetscFunctionBegin; 912 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 913 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 914 t = a->solve_work; 915 916 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 917 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 918 919 /* copy the b into temp work space according to permutation */ 920 ii = 0; 921 for (i=0; i<n; i++) { 922 ic = 6*c[i]; 923 t[ii] = b[ic]; 924 t[ii+1] 
= b[ic+1]; 925 t[ii+2] = b[ic+2]; 926 t[ii+3] = b[ic+3]; 927 t[ii+4] = b[ic+4]; 928 t[ii+5] = b[ic+5]; 929 ii += 6; 930 } 931 932 /* forward solve the U^T */ 933 idx = 0; 934 for (i=0; i<n; i++) { 935 936 v = aa + 36*diag[i]; 937 /* multiply by the inverse of the block diagonal */ 938 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 939 x6 = t[5+idx]; 940 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 941 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 942 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 943 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 944 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 945 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 946 v += 36; 947 948 vi = aj + diag[i] + 1; 949 nz = ai[i+1] - diag[i] - 1; 950 while (nz--) { 951 oidx = 6*(*vi++); 952 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 953 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 954 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 955 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 956 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 957 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 958 v += 36; 959 } 960 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 961 t[5+idx] = s6; 962 idx += 6; 963 } 964 /* backward solve the L^T */ 965 for (i=n-1; i>=0; i--){ 966 v = aa + 36*diag[i] - 36; 967 vi = aj + diag[i] - 1; 968 nz = diag[i] - ai[i]; 969 idt = 6*i; 970 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 971 s6 = t[5+idt]; 972 while (nz--) { 973 idx = 6*(*vi--); 974 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 975 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 976 t[idx+2] 
-= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 977 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 978 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 979 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 980 v -= 36; 981 } 982 } 983 984 /* copy t into x according to permutation */ 985 ii = 0; 986 for (i=0; i<n; i++) { 987 ir = 6*r[i]; 988 x[ir] = t[ii]; 989 x[ir+1] = t[ii+1]; 990 x[ir+2] = t[ii+2]; 991 x[ir+3] = t[ii+3]; 992 x[ir+4] = t[ii+4]; 993 x[ir+5] = t[ii+5]; 994 ii += 6; 995 } 996 997 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 998 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 999 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1000 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1001 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1002 PetscFunctionReturn(0); 1003 } 1004 1005 #undef __FUNCT__ 1006 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1007 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1008 { 1009 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1010 IS iscol=a->col,isrow=a->row; 1011 PetscErrorCode ierr; 1012 const PetscInt *r,*c,*rout,*cout; 1013 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1014 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1015 MatScalar *aa=a->a,*v; 1016 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1017 PetscScalar *x,*b,*t; 1018 1019 PetscFunctionBegin; 1020 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1021 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1022 t = a->solve_work; 1023 1024 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1025 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1026 1027 /* copy the b into temp work space according to permutation */ 1028 ii = 0; 1029 for (i=0; i<n; i++) { 1030 ic = 7*c[i]; 1031 t[ii] = b[ic]; 1032 t[ii+1] = b[ic+1]; 1033 t[ii+2] = b[ic+2]; 1034 t[ii+3] = b[ic+3]; 1035 t[ii+4] = b[ic+4]; 1036 t[ii+5] = b[ic+5]; 1037 
t[ii+6] = b[ic+6]; 1038 ii += 7; 1039 } 1040 1041 /* forward solve the U^T */ 1042 idx = 0; 1043 for (i=0; i<n; i++) { 1044 1045 v = aa + 49*diag[i]; 1046 /* multiply by the inverse of the block diagonal */ 1047 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1048 x6 = t[5+idx]; x7 = t[6+idx]; 1049 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1050 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1051 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1052 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1053 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1054 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1055 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1056 v += 49; 1057 1058 vi = aj + diag[i] + 1; 1059 nz = ai[i+1] - diag[i] - 1; 1060 while (nz--) { 1061 oidx = 7*(*vi++); 1062 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1063 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1064 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1065 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1066 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1067 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1068 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1069 v += 49; 1070 } 1071 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1072 t[5+idx] = s6;t[6+idx] = s7; 1073 idx += 7; 1074 } 1075 /* backward solve the L^T */ 1076 for (i=n-1; i>=0; i--){ 1077 v = aa + 49*diag[i] - 49; 1078 vi = aj + diag[i] - 1; 1079 nz = diag[i] - ai[i]; 1080 idt = 7*i; 
1081 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1082 s6 = t[5+idt];s7 = t[6+idt]; 1083 while (nz--) { 1084 idx = 7*(*vi--); 1085 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1086 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1087 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1088 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1089 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1090 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1091 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1092 v -= 49; 1093 } 1094 } 1095 1096 /* copy t into x according to permutation */ 1097 ii = 0; 1098 for (i=0; i<n; i++) { 1099 ir = 7*r[i]; 1100 x[ir] = t[ii]; 1101 x[ir+1] = t[ii+1]; 1102 x[ir+2] = t[ii+2]; 1103 x[ir+3] = t[ii+3]; 1104 x[ir+4] = t[ii+4]; 1105 x[ir+5] = t[ii+5]; 1106 x[ir+6] = t[ii+6]; 1107 ii += 7; 1108 } 1109 1110 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1111 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1112 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1113 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1114 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1115 PetscFunctionReturn(0); 1116 } 1117 1118 /* ----------------------------------------------------------- */ 1119 #undef __FUNCT__ 1120 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1121 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1122 { 1123 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1124 IS iscol=a->col,isrow=a->row; 1125 PetscErrorCode ierr; 1126 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 1127 PetscInt i,n=a->mbs; 1128 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 1129 MatScalar *aa=a->a,*v; 1130 PetscScalar *x,*b,*s,*t,*ls; 1131 1132 PetscFunctionBegin; 1133 ierr = 
VecGetArray(bb,&b);CHKERRQ(ierr); 1134 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1135 t = a->solve_work; 1136 1137 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1138 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1139 1140 /* forward solve the lower triangular */ 1141 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1142 for (i=1; i<n; i++) { 1143 v = aa + bs2*ai[i]; 1144 vi = aj + ai[i]; 1145 nz = a->diag[i] - ai[i]; 1146 s = t + bs*i; 1147 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1148 while (nz--) { 1149 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1150 v += bs2; 1151 } 1152 } 1153 /* backward solve the upper triangular */ 1154 ls = a->solve_work + A->cmap->n; 1155 for (i=n-1; i>=0; i--){ 1156 v = aa + bs2*(a->diag[i] + 1); 1157 vi = aj + a->diag[i] + 1; 1158 nz = ai[i+1] - a->diag[i] - 1; 1159 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1160 while (nz--) { 1161 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1162 v += bs2; 1163 } 1164 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1165 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1166 } 1167 1168 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1169 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1170 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1171 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1172 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 1173 PetscFunctionReturn(0); 1174 } 1175 1176 /* ----------------------------------------------------------- */ 1177 #undef __FUNCT__ 1178 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N" 1179 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1180 { 1181 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1182 IS iscol=a->col,isrow=a->row; 1183 PetscErrorCode ierr; 1184 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 1185 PetscInt i,n=a->mbs,j; 1186 
PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 1187 const MatScalar *aa=a->a,*v; 1188 PetscScalar *x,*t,*ls; 1189 const PetscScalar *b; 1190 PetscFunctionBegin; 1191 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1192 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1193 t = a->solve_work; 1194 1195 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1196 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1197 1198 /* copy the b into temp work space according to permutation */ 1199 for (i=0; i<n; i++) { 1200 for (j=0; j<bs; j++) { 1201 t[i*bs+j] = b[c[i]*bs+j]; 1202 } 1203 } 1204 1205 1206 /* forward solve the upper triangular transpose */ 1207 ls = a->solve_work + A->cmap->n; 1208 for (i=0; i<n; i++){ 1209 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1210 Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1211 v = aa + bs2*(a->diag[i] + 1); 1212 vi = aj + a->diag[i] + 1; 1213 nz = ai[i+1] - a->diag[i] - 1; 1214 while (nz--) { 1215 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 1216 v += bs2; 1217 } 1218 } 1219 1220 /* backward solve the lower triangular transpose */ 1221 for (i=n-1; i>=0; i--) { 1222 v = aa + bs2*ai[i]; 1223 vi = aj + ai[i]; 1224 nz = a->diag[i] - ai[i]; 1225 while (nz--) { 1226 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 1227 v += bs2; 1228 } 1229 } 1230 1231 /* copy t into x according to permutation */ 1232 for (i=0; i<n; i++) { 1233 for (j=0; j<bs; j++) { 1234 x[bs*r[i]+j] = t[bs*i+j]; 1235 } 1236 } 1237 1238 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1239 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1240 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1241 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1242 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 1243 PetscFunctionReturn(0); 1244 } 1245 1246 #undef __FUNCT__ 1247 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1248 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec 
bb,Vec xx) 1249 { 1250 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1251 IS iscol=a->col,isrow=a->row; 1252 PetscErrorCode ierr; 1253 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 1254 PetscInt i,n=a->mbs,nz,idx,idt,idc; 1255 MatScalar *aa=a->a,*v; 1256 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1257 PetscScalar *x,*b,*t; 1258 1259 PetscFunctionBegin; 1260 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1261 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1262 t = a->solve_work; 1263 1264 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1265 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1266 1267 /* forward solve the lower triangular */ 1268 idx = 7*(*r++); 1269 t[0] = b[idx]; t[1] = b[1+idx]; 1270 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1271 t[5] = b[5+idx]; t[6] = b[6+idx]; 1272 1273 for (i=1; i<n; i++) { 1274 v = aa + 49*ai[i]; 1275 vi = aj + ai[i]; 1276 nz = diag[i] - ai[i]; 1277 idx = 7*(*r++); 1278 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1279 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1280 while (nz--) { 1281 idx = 7*(*vi++); 1282 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1283 x4 = t[3+idx];x5 = t[4+idx]; 1284 x6 = t[5+idx];x7 = t[6+idx]; 1285 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1286 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1287 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1288 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1289 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1290 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1291 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1292 v += 49; 1293 } 1294 idx = 7*i; 1295 t[idx] = s1;t[1+idx] = s2; 1296 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1297 t[5+idx] = s6;t[6+idx] = s7; 1298 } 1299 
/* backward solve the upper triangular */ 1300 for (i=n-1; i>=0; i--){ 1301 v = aa + 49*diag[i] + 49; 1302 vi = aj + diag[i] + 1; 1303 nz = ai[i+1] - diag[i] - 1; 1304 idt = 7*i; 1305 s1 = t[idt]; s2 = t[1+idt]; 1306 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1307 s6 = t[5+idt];s7 = t[6+idt]; 1308 while (nz--) { 1309 idx = 7*(*vi++); 1310 x1 = t[idx]; x2 = t[1+idx]; 1311 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1312 x6 = t[5+idx]; x7 = t[6+idx]; 1313 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1314 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1315 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1316 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1317 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1318 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1319 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1320 v += 49; 1321 } 1322 idc = 7*(*c--); 1323 v = aa + 49*diag[i]; 1324 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1325 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1326 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1327 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1328 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1329 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1330 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1331 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1332 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1333 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1334 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1335 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1336 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1337 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1338 } 1339 1340 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1341 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1342 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1343 ierr = 
VecRestoreArray(xx,&x);CHKERRQ(ierr); 1344 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1345 PetscFunctionReturn(0); 1346 } 1347 1348 #undef __FUNCT__ 1349 #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct" 1350 PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx) 1351 { 1352 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1353 IS iscol=a->col,isrow=a->row; 1354 PetscErrorCode ierr; 1355 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*vi; 1356 PetscInt i,n=a->mbs,nz,idx,idt,idc,k,m; 1357 MatScalar *aa=a->a,*v; 1358 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1359 PetscScalar *x,*b,*t; 1360 1361 PetscFunctionBegin; 1362 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1363 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1364 t = a->solve_work; 1365 1366 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1367 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1368 1369 /* forward solve the lower triangular */ 1370 idx = 7*r[0]; 1371 t[0] = b[idx]; t[1] = b[1+idx]; 1372 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1373 t[5] = b[5+idx]; t[6] = b[6+idx]; 1374 1375 for (i=1; i<n; i++) { 1376 v = aa + 49*ai[i]; 1377 vi = aj + ai[i]; 1378 nz = ai[i+1] - ai[i]; 1379 idx = 7*r[i]; 1380 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1381 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1382 for(m=0;m<nz;m++){ 1383 idx = 7*vi[m]; 1384 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1385 x4 = t[3+idx];x5 = t[4+idx]; 1386 x6 = t[5+idx];x7 = t[6+idx]; 1387 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1388 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1389 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1390 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1391 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1392 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 
+ v[40]*x6 + v[47]*x7; 1393 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1394 v += 49; 1395 } 1396 idx = 7*i; 1397 t[idx] = s1;t[1+idx] = s2; 1398 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1399 t[5+idx] = s6;t[6+idx] = s7; 1400 } 1401 /* backward solve the upper triangular */ 1402 for (i=n-1; i>=0; i--){ 1403 k = 2*n-i; 1404 v = aa + 49*ai[k]; 1405 vi = aj + ai[k]; 1406 nz = ai[k+1] - ai[k] - 1; 1407 idt = 7*i; 1408 s1 = t[idt]; s2 = t[1+idt]; 1409 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1410 s6 = t[5+idt];s7 = t[6+idt]; 1411 for(m=0;m<nz;m++){ 1412 idx = 7*vi[m]; 1413 x1 = t[idx]; x2 = t[1+idx]; 1414 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1415 x6 = t[5+idx]; x7 = t[6+idx]; 1416 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1417 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1418 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1419 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1420 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1421 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1422 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1423 v += 49; 1424 } 1425 idc = 7*c[i]; 1426 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1427 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1428 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1429 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1430 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1431 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1432 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1433 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1434 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1435 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1436 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1437 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1438 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1439 
v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1440 } 1441 1442 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1443 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1444 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1445 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1446 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1447 PetscFunctionReturn(0); 1448 } 1449 1450 #undef __FUNCT__ 1451 #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct_v2" 1452 PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct_v2(Mat A,Vec bb,Vec xx) 1453 { 1454 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1455 IS iscol=a->col,isrow=a->row; 1456 PetscErrorCode ierr; 1457 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi; 1458 PetscInt i,n=a->mbs,nz,idx,idt,idc,m; 1459 MatScalar *aa=a->a,*v; 1460 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1461 PetscScalar *x,*b,*t; 1462 1463 PetscFunctionBegin; 1464 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1465 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1466 t = a->solve_work; 1467 1468 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1469 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1470 1471 /* forward solve the lower triangular */ 1472 idx = 7*r[0]; 1473 t[0] = b[idx]; t[1] = b[1+idx]; 1474 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1475 t[5] = b[5+idx]; t[6] = b[6+idx]; 1476 1477 for (i=1; i<n; i++) { 1478 v = aa + 49*ai[i]; 1479 vi = aj + ai[i]; 1480 nz = ai[i+1] - ai[i]; 1481 idx = 7*r[i]; 1482 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1483 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1484 for(m=0;m<nz;m++){ 1485 idx = 7*vi[m]; 1486 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1487 x4 = t[3+idx];x5 = t[4+idx]; 1488 x6 = t[5+idx];x7 = t[6+idx]; 1489 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1490 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1491 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + 
v[44]*x7; 1492 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1493 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1494 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1495 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1496 v += 49; 1497 } 1498 idx = 7*i; 1499 t[idx] = s1;t[1+idx] = s2; 1500 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1501 t[5+idx] = s6;t[6+idx] = s7; 1502 } 1503 /* backward solve the upper triangular */ 1504 for (i=n-1; i>=0; i--){ 1505 v = aa + 49*(adiag[i+1]+1); 1506 vi = aj + adiag[i+1]+1; 1507 nz = adiag[i] - adiag[i+1] - 1; 1508 idt = 7*i; 1509 s1 = t[idt]; s2 = t[1+idt]; 1510 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1511 s6 = t[5+idt];s7 = t[6+idt]; 1512 for(m=0;m<nz;m++){ 1513 idx = 7*vi[m]; 1514 x1 = t[idx]; x2 = t[1+idx]; 1515 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1516 x6 = t[5+idx]; x7 = t[6+idx]; 1517 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1518 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1519 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1520 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1521 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1522 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1523 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1524 v += 49; 1525 } 1526 idc = 7*c[i]; 1527 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1528 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1529 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1530 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1531 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1532 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1533 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1534 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 
1535 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1536 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1537 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1538 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1539 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1540 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1541 } 1542 1543 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1544 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1545 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1546 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1547 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1548 PetscFunctionReturn(0); 1549 } 1550 1551 #undef __FUNCT__ 1552 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1553 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1554 { 1555 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1556 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1557 PetscErrorCode ierr; 1558 PetscInt *diag = a->diag,jdx; 1559 const MatScalar *aa=a->a,*v; 1560 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1561 const PetscScalar *b; 1562 1563 PetscFunctionBegin; 1564 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1565 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1566 /* forward solve the lower triangular */ 1567 idx = 0; 1568 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1569 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1570 x[6] = b[6+idx]; 1571 for (i=1; i<n; i++) { 1572 v = aa + 49*ai[i]; 1573 vi = aj + ai[i]; 1574 nz = diag[i] - ai[i]; 1575 idx = 7*i; 1576 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1577 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1578 s7 = b[6+idx]; 1579 while (nz--) { 1580 jdx = 7*(*vi++); 1581 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1582 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1583 x7 = x[6+jdx]; 1584 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1585 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1586 s3 -= v[2]*x1 + 
v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1587 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1588 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1589 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1590 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1591 v += 49; 1592 } 1593 x[idx] = s1; 1594 x[1+idx] = s2; 1595 x[2+idx] = s3; 1596 x[3+idx] = s4; 1597 x[4+idx] = s5; 1598 x[5+idx] = s6; 1599 x[6+idx] = s7; 1600 } 1601 /* backward solve the upper triangular */ 1602 for (i=n-1; i>=0; i--){ 1603 v = aa + 49*diag[i] + 49; 1604 vi = aj + diag[i] + 1; 1605 nz = ai[i+1] - diag[i] - 1; 1606 idt = 7*i; 1607 s1 = x[idt]; s2 = x[1+idt]; 1608 s3 = x[2+idt]; s4 = x[3+idt]; 1609 s5 = x[4+idt]; s6 = x[5+idt]; 1610 s7 = x[6+idt]; 1611 while (nz--) { 1612 idx = 7*(*vi++); 1613 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1614 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1615 x7 = x[6+idx]; 1616 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1617 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1618 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1619 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1620 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1621 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1622 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1623 v += 49; 1624 } 1625 v = aa + 49*diag[i]; 1626 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1627 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1628 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1629 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1630 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1631 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1632 x[3+idt] = v[3]*s1 + 
v[10]*s2 + v[17]*s3 + v[24]*s4 1633 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1634 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1635 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1636 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1637 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1638 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1639 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1640 } 1641 1642 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1643 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1644 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1645 PetscFunctionReturn(0); 1646 } 1647 1648 #undef __FUNCT__ 1649 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct" 1650 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1651 { 1652 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1653 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 1654 PetscErrorCode ierr; 1655 PetscInt idx,jdx,idt; 1656 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1657 const MatScalar *aa=a->a,*v; 1658 PetscScalar *x; 1659 const PetscScalar *b; 1660 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1661 1662 PetscFunctionBegin; 1663 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1664 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1665 /* forward solve the lower triangular */ 1666 idx = 0; 1667 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1668 x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 1669 for (i=1; i<n; i++) { 1670 v = aa + bs2*ai[i]; 1671 vi = aj + ai[i]; 1672 nz = ai[i+1] - ai[i]; 1673 idx = bs*i; 1674 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1675 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1676 for(k=0;k<nz;k++) { 1677 jdx = bs*vi[k]; 1678 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1679 x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 1680 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1681 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + 
v[43]*x7; 1682 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1683 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1684 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1685 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1686 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1687 v += bs2; 1688 } 1689 1690 x[idx] = s1; 1691 x[1+idx] = s2; 1692 x[2+idx] = s3; 1693 x[3+idx] = s4; 1694 x[4+idx] = s5; 1695 x[5+idx] = s6; 1696 x[6+idx] = s7; 1697 } 1698 1699 /* backward solve the upper triangular */ 1700 for (i=n-1; i>=0; i--){ 1701 v = aa + bs2*ai[2*n-i]; 1702 vi = aj + ai[2*n-i]; 1703 nz = ai[2*n-i +1] - ai[2*n-i]-1; 1704 idt = bs*i; 1705 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1706 s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 1707 for(k=0;k<nz;k++) { 1708 idx = bs*vi[k]; 1709 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1710 x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 1711 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1712 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1713 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1714 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1715 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1716 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1717 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1718 v += bs2; 1719 } 1720 /* x = inv_diagonal*x */ 1721 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1722 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1723 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1724 x[3+idt] 
= v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1725 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1726 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1727 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1728 } 1729 1730 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1731 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1732 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1733 PetscFunctionReturn(0); 1734 } 1735 1736 #undef __FUNCT__ 1737 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2" 1738 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 1739 { 1740 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1741 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 1742 PetscErrorCode ierr; 1743 PetscInt idx,jdx,idt; 1744 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1745 const MatScalar *aa=a->a,*v; 1746 PetscScalar *x; 1747 const PetscScalar *b; 1748 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1749 1750 PetscFunctionBegin; 1751 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1752 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1753 /* forward solve the lower triangular */ 1754 idx = 0; 1755 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1756 x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 1757 for (i=1; i<n; i++) { 1758 v = aa + bs2*ai[i]; 1759 vi = aj + ai[i]; 1760 nz = ai[i+1] - ai[i]; 1761 idx = bs*i; 1762 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1763 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1764 for(k=0;k<nz;k++) { 1765 jdx = bs*vi[k]; 1766 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1767 x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 1768 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1769 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + 
v[36]*x6 + v[43]*x7; 1770 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1771 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1772 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1773 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1774 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1775 v += bs2; 1776 } 1777 1778 x[idx] = s1; 1779 x[1+idx] = s2; 1780 x[2+idx] = s3; 1781 x[3+idx] = s4; 1782 x[4+idx] = s5; 1783 x[5+idx] = s6; 1784 x[6+idx] = s7; 1785 } 1786 1787 /* backward solve the upper triangular */ 1788 for (i=n-1; i>=0; i--){ 1789 v = aa + bs2*(adiag[i+1]+1); 1790 vi = aj + adiag[i+1]+1; 1791 nz = adiag[i] - adiag[i+1]-1; 1792 idt = bs*i; 1793 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1794 s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 1795 for(k=0;k<nz;k++) { 1796 idx = bs*vi[k]; 1797 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1798 x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 1799 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1800 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1801 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1802 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1803 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1804 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1805 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1806 v += bs2; 1807 } 1808 /* x = inv_diagonal*x */ 1809 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1810 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1811 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + 
v[44]*s7; 1812 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1813 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1814 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1815 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1816 } 1817 1818 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1819 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1820 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1821 PetscFunctionReturn(0); 1822 } 1823

/*
   MatSolve_SeqBAIJ_6 - Forward and backward block sweeps over the factor data
   held in a Mat_SeqBAIJ, for blocks of 6x6 scalars (36 values per block,
   6 vector entries per block row).

   Input:
     A  - matrix whose data pointer is a Mat_SeqBAIJ holding i, j, a, diag,
          the row/col index sets and the solve_work scratch array
     bb - right-hand side, read through the row index set
   Output:
     xx - result, written through the column index set

   A block v is applied as the 6x6 matrix whose entry (p,q) is v[p+6*q].
   The block at position diag[row] is multiplied into the result rather than
   solved with (presumably it already holds the inverted diagonal block -
   confirm against the factorization routine).
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_6"
PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ *)A->data;
  IS                colis = fact->col,rowis = fact->row;
  PetscErrorCode    ierr;
  const PetscInt    *rperm,*cperm;
  PetscInt          *bdiag = fact->diag,*bi = fact->i,*bj = fact->j,*cols;
  PetscInt          mbs = fact->mbs,row,left,off,base,dest;
  const MatScalar   *ba = fact->a,*blk;
  const PetscScalar *rhs;
  PetscScalar       *sol,*work;
  PetscScalar       u1,u2,u3,u4,u5,u6,w1,w2,w3,w4,w5,w6;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);
  work = fact->solve_work;

  ierr = ISGetIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISGetIndices(colis,&cperm);CHKERRQ(ierr);

  /* forward sweep: block row 0 is a plain copy of its permuted right-hand side */
  off     = 6*rperm[0];
  work[0] = rhs[off];   work[1] = rhs[off+1];
  work[2] = rhs[off+2]; work[3] = rhs[off+3];
  work[4] = rhs[off+4]; work[5] = rhs[off+5];
  for (row=1; row<mbs; row++) {
    blk  = ba + 36*bi[row];
    cols = bj + bi[row];
    left = bdiag[row] - bi[row];   /* number of blocks stored before position diag[row] */
    off  = 6*rperm[row];
    u1 = rhs[off];   u2 = rhs[off+1]; u3 = rhs[off+2];
    u4 = rhs[off+3]; u5 = rhs[off+4]; u6 = rhs[off+5];
    for (; left; left--) {
      off = 6*(*cols++);
      w1 = work[off];   w2 = work[off+1]; w3 = work[off+2];
      w4 = work[off+3]; w5 = work[off+4]; w6 = work[off+5];
      u1 -= blk[0]*w1 + blk[6]*w2 + blk[12]*w3 + blk[18]*w4 + blk[24]*w5 + blk[30]*w6;
      u2 -= blk[1]*w1 + blk[7]*w2 + blk[13]*w3 + blk[19]*w4 + blk[25]*w5 + blk[31]*w6;
      u3 -= blk[2]*w1 + blk[8]*w2 + blk[14]*w3 + blk[20]*w4 + blk[26]*w5 + blk[32]*w6;
      u4 -= blk[3]*w1 + blk[9]*w2 + blk[15]*w3 + blk[21]*w4 + blk[27]*w5 + blk[33]*w6;
      u5 -= blk[4]*w1 + blk[10]*w2 + blk[16]*w3 + blk[22]*w4 + blk[28]*w5 + blk[34]*w6;
      u6 -= blk[5]*w1 + blk[11]*w2 + blk[17]*w3 + blk[23]*w4 + blk[29]*w5 + blk[35]*w6;
      blk += 36;
    }
    base = 6*row;
    work[base]   = u1; work[base+1] = u2;
    work[base+2] = u3; work[base+3] = u4;
    work[base+4] = u5; work[base+5] = u6;
  }

  /* backward sweep: last block row first, using the blocks stored after position diag[row] */
  for (row=mbs-1; row>=0; row--) {
    blk  = ba + 36*bdiag[row] + 36;
    cols = bj + bdiag[row] + 1;
    left = bi[row+1] - bdiag[row] - 1;
    base = 6*row;
    u1 = work[base];   u2 = work[base+1];
    u3 = work[base+2]; u4 = work[base+3];
    u5 = work[base+4]; u6 = work[base+5];
    for (; left; left--) {
      off = 6*(*cols++);
      w1 = work[off];   w2 = work[off+1];
      w3 = work[off+2]; w4 = work[off+3];
      w5 = work[off+4]; w6 = work[off+5];
      u1 -= blk[0]*w1 + blk[6]*w2 + blk[12]*w3 + blk[18]*w4 + blk[24]*w5 + blk[30]*w6;
      u2 -= blk[1]*w1 + blk[7]*w2 + blk[13]*w3 + blk[19]*w4 + blk[25]*w5 + blk[31]*w6;
      u3 -= blk[2]*w1 + blk[8]*w2 + blk[14]*w3 + blk[20]*w4 + blk[26]*w5 + blk[32]*w6;
      u4 -= blk[3]*w1 + blk[9]*w2 + blk[15]*w3 + blk[21]*w4 + blk[27]*w5 + blk[33]*w6;
      u5 -= blk[4]*w1 + blk[10]*w2 + blk[16]*w3 + blk[22]*w4 + blk[28]*w5 + blk[34]*w6;
      u6 -= blk[5]*w1 + blk[11]*w2 + blk[17]*w3 + blk[23]*w4 + blk[29]*w5 + blk[35]*w6;
      blk += 36;
    }
    /* multiply by the block at diag[row]; keep the result in work and scatter it into xx */
    dest = 6*cperm[row];
    blk  = ba + 36*bdiag[row];
    sol[dest]   = work[base]   = blk[0]*u1+blk[6]*u2+blk[12]*u3+
                                 blk[18]*u4+blk[24]*u5+blk[30]*u6;
    sol[dest+1] = work[base+1] = blk[1]*u1+blk[7]*u2+blk[13]*u3+
                                 blk[19]*u4+blk[25]*u5+blk[31]*u6;
    sol[dest+2] = work[base+2] = blk[2]*u1+blk[8]*u2+blk[14]*u3+
                                 blk[20]*u4+blk[26]*u5+blk[32]*u6;
    sol[dest+3] = work[base+3] = blk[3]*u1+blk[9]*u2+blk[15]*u3+
                                 blk[21]*u4+blk[27]*u5+blk[33]*u6;
    sol[dest+4] = work[base+4] = blk[4]*u1+blk[10]*u2+blk[16]*u3+
                                 blk[22]*u4+blk[28]*u5+blk[34]*u6;
    sol[dest+5] = work[base+5] = blk[5]*u1+blk[11]*u2+blk[17]*u3+
                                 blk[23]*u4+blk[29]*u5+blk[35]*u6;
  }

  ierr = ISRestoreIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISRestoreIndices(colis,&cperm);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*36*(fact->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

1919 #undef __FUNCT__ 1920 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct" 1921 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx) 1922 { 1923 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1924 IS iscol=a->col,isrow=a->row; 1925 PetscErrorCode ierr; 1926 const PetscInt *r,*c,*rout,*cout; 1927 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 1928 const MatScalar *aa=a->a,*v; 1929 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1930 const PetscScalar *b; 1931 PetscFunctionBegin; 1932 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1933 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1934 t = a->solve_work; 1935 1936 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1937 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1938 1939 /* forward solve the lower triangular */ 1940 idx = 6*r[0]; 1941 t[0] = b[idx]; t[1] = b[1+idx]; 1942 t[2] = b[2+idx]; t[3] = b[3+idx]; 1943 t[4] = b[4+idx]; t[5] = b[5+idx]; 1944 for (i=1; i<n; i++) { 1945 v = aa + 36*ai[i]; 1946 vi = aj + ai[i]; 1947 nz = ai[i+1] - ai[i]; 1948 idx = 6*r[i]; 1949 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1950 s5 = b[4+idx]; s6 = b[5+idx]; 1951 for(m=0;m<nz;m++){ 1952 idx = 6*vi[m]; 1953 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1954 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1955 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1956 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1957 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1958 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1959 s5 -= v[4]*x1 + v[10]*x2 +
v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1960 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1961 v += 36; 1962 } 1963 idx = 6*i; 1964 t[idx] = s1;t[1+idx] = s2; 1965 t[2+idx] = s3;t[3+idx] = s4; 1966 t[4+idx] = s5;t[5+idx] = s6; 1967 } 1968 /* backward solve the upper triangular */ 1969 for (i=n-1; i>=0; i--){ 1970 k = 2*n-i; 1971 v = aa + 36*ai[k]; 1972 vi = aj + ai[k]; 1973 nz = ai[k+1] - ai[k] - 1; 1974 idt = 6*i; 1975 s1 = t[idt]; s2 = t[1+idt]; 1976 s3 = t[2+idt];s4 = t[3+idt]; 1977 s5 = t[4+idt];s6 = t[5+idt]; 1978 for(m=0;m<nz;m++){ 1979 idx = 6*vi[m]; 1980 x1 = t[idx]; x2 = t[1+idx]; 1981 x3 = t[2+idx]; x4 = t[3+idx]; 1982 x5 = t[4+idx]; x6 = t[5+idx]; 1983 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1984 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1985 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1986 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1987 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1988 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1989 v += 36; 1990 } 1991 idc = 6*c[i]; 1992 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1993 v[18]*s4+v[24]*s5+v[30]*s6; 1994 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1995 v[19]*s4+v[25]*s5+v[31]*s6; 1996 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1997 v[20]*s4+v[26]*s5+v[32]*s6; 1998 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1999 v[21]*s4+v[27]*s5+v[33]*s6; 2000 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 2001 v[22]*s4+v[28]*s5+v[34]*s6; 2002 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 2003 v[23]*s4+v[29]*s5+v[35]*s6; 2004 } 2005 2006 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2007 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2008 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2009 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2010 ierr = PetscLogFlops(2.0*36*(a->nz) - 
6.0*A->cmap->n);CHKERRQ(ierr); 2011 PetscFunctionReturn(0); 2012 } 2013 2014 #undef __FUNCT__ 2015 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct_v2" 2016 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct_v2(Mat A,Vec bb,Vec xx) 2017 { 2018 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2019 IS iscol=a->col,isrow=a->row; 2020 PetscErrorCode ierr; 2021 const PetscInt *r,*c,*rout,*cout; 2022 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 2023 const MatScalar *aa=a->a,*v; 2024 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 2025 const PetscScalar *b; 2026 PetscFunctionBegin; 2027 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2028 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2029 t = a->solve_work; 2030 2031 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2032 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2033 2034 /* forward solve the lower triangular */ 2035 idx = 6*r[0]; 2036 t[0] = b[idx]; t[1] = b[1+idx]; 2037 t[2] = b[2+idx]; t[3] = b[3+idx]; 2038 t[4] = b[4+idx]; t[5] = b[5+idx]; 2039 for (i=1; i<n; i++) { 2040 v = aa + 36*ai[i]; 2041 vi = aj + ai[i]; 2042 nz = ai[i+1] - ai[i]; 2043 idx = 6*r[i]; 2044 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2045 s5 = b[4+idx]; s6 = b[5+idx]; 2046 for(m=0;m<nz;m++){ 2047 idx = 6*vi[m]; 2048 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2049 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 2050 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2051 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2052 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2053 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2054 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2055 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2056 v += 36; 2057 } 2058 idx = 6*i; 2059 t[idx] = s1;t[1+idx] = s2; 2060 t[2+idx] = s3;t[3+idx] = s4; 2061 t[4+idx] = s5;t[5+idx] = s6; 2062 
} 2063 /* backward solve the upper triangular */ 2064 for (i=n-1; i>=0; i--){ 2065 v = aa + 36*(adiag[i+1]+1); 2066 vi = aj + adiag[i+1]+1; 2067 nz = adiag[i] - adiag[i+1] - 1; 2068 idt = 6*i; 2069 s1 = t[idt]; s2 = t[1+idt]; 2070 s3 = t[2+idt];s4 = t[3+idt]; 2071 s5 = t[4+idt];s6 = t[5+idt]; 2072 for(m=0;m<nz;m++){ 2073 idx = 6*vi[m]; 2074 x1 = t[idx]; x2 = t[1+idx]; 2075 x3 = t[2+idx]; x4 = t[3+idx]; 2076 x5 = t[4+idx]; x6 = t[5+idx]; 2077 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2078 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2079 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2080 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2081 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2082 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2083 v += 36; 2084 } 2085 idc = 6*c[i]; 2086 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 2087 v[18]*s4+v[24]*s5+v[30]*s6; 2088 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 2089 v[19]*s4+v[25]*s5+v[31]*s6; 2090 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 2091 v[20]*s4+v[26]*s5+v[32]*s6; 2092 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 2093 v[21]*s4+v[27]*s5+v[33]*s6; 2094 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 2095 v[22]*s4+v[28]*s5+v[34]*s6; 2096 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 2097 v[23]*s4+v[29]*s5+v[35]*s6; 2098 } 2099 2100 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2101 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2102 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2103 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2104 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2105 PetscFunctionReturn(0); 2106 } 2107 2108 #undef __FUNCT__ 2109 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 2110 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 2111 { 2112 Mat_SeqBAIJ *a = (Mat_SeqBAIJ 
*)A->data; 2113 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2114 PetscErrorCode ierr; 2115 PetscInt *diag = a->diag,jdx; 2116 const MatScalar *aa=a->a,*v; 2117 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2118 const PetscScalar *b; 2119 2120 PetscFunctionBegin; 2121 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2122 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2123 /* forward solve the lower triangular */ 2124 idx = 0; 2125 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 2126 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 2127 for (i=1; i<n; i++) { 2128 v = aa + 36*ai[i]; 2129 vi = aj + ai[i]; 2130 nz = diag[i] - ai[i]; 2131 idx = 6*i; 2132 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2133 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 2134 while (nz--) { 2135 jdx = 6*(*vi++); 2136 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 2137 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 2138 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2139 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2140 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2141 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2142 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2143 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2144 v += 36; 2145 } 2146 x[idx] = s1; 2147 x[1+idx] = s2; 2148 x[2+idx] = s3; 2149 x[3+idx] = s4; 2150 x[4+idx] = s5; 2151 x[5+idx] = s6; 2152 } 2153 /* backward solve the upper triangular */ 2154 for (i=n-1; i>=0; i--){ 2155 v = aa + 36*diag[i] + 36; 2156 vi = aj + diag[i] + 1; 2157 nz = ai[i+1] - diag[i] - 1; 2158 idt = 6*i; 2159 s1 = x[idt]; s2 = x[1+idt]; 2160 s3 = x[2+idt]; s4 = x[3+idt]; 2161 s5 = x[4+idt]; s6 = x[5+idt]; 2162 while (nz--) { 2163 idx = 6*(*vi++); 2164 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 2165 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 2166 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + 
v[30]*x6; 2167 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2168 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2169 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2170 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2171 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2172 v += 36; 2173 } 2174 v = aa + 36*diag[i]; 2175 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2176 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2177 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2178 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2179 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2180 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 2181 } 2182 2183 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2184 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2185 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2186 PetscFunctionReturn(0); 2187 } 2188

/*
   MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct - Forward and backward
   block sweeps with no index-set permutation: block row `row` of bb and xx is
   addressed directly at offset bs*row, and the sweeps work in place in xx.

   Layout as used by this routine (n = a->mbs):
     - forward sweep: block row `row` uses the blocks at positions
       i[row] .. i[row+1]-1
     - backward sweep: block row `row` uses the row pointer entry 2*n-row;
       all but the last block of that range are subtracted, and the last one
       is multiplied into the result (the original comment calls this step
       "x = inv_diagonal*x")

   The arithmetic is unrolled for 6 entries per block row with entry (p,q) of
   a block at v[p+6*q]; the strides come from A->rmap->bs and a->bs2
   (presumably 6 and 36 for this kernel - confirm with the caller).
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ *)A->data;
  PetscErrorCode    ierr;
  PetscInt          bs = A->rmap->bs,bs2 = fact->bs2;
  PetscInt          mbs = fact->mbs,*bi = fact->i,*bj = fact->j,*cols;
  PetscInt          row,urow,cnt,off,base;
  const MatScalar   *ba = fact->a,*blk;
  const PetscScalar *rhs;
  PetscScalar       *sol;
  PetscScalar       u1,u2,u3,u4,u5,u6,w1,w2,w3,w4,w5,w6;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);

  /* forward sweep: block row 0 is a plain copy */
  sol[0] = rhs[0]; sol[1] = rhs[1]; sol[2] = rhs[2];
  sol[3] = rhs[3]; sol[4] = rhs[4]; sol[5] = rhs[5];
  for (row=1; row<mbs; row++) {
    blk  = ba + bs2*bi[row];
    cols = bj + bi[row];
    base = bs*row;
    /* read the whole right-hand-side block before anything is stored for this row */
    u1 = rhs[base];   u2 = rhs[base+1]; u3 = rhs[base+2];
    u4 = rhs[base+3]; u5 = rhs[base+4]; u6 = rhs[base+5];
    for (cnt=bi[row+1]-bi[row]; cnt>0; cnt--) {
      off = bs*(*cols++);
      w1 = sol[off];   w2 = sol[off+1]; w3 = sol[off+2];
      w4 = sol[off+3]; w5 = sol[off+4]; w6 = sol[off+5];
      u1 -= blk[0]*w1 + blk[6]*w2 + blk[12]*w3 + blk[18]*w4 + blk[24]*w5 + blk[30]*w6;
      u2 -= blk[1]*w1 + blk[7]*w2 + blk[13]*w3 + blk[19]*w4 + blk[25]*w5 + blk[31]*w6;
      u3 -= blk[2]*w1 + blk[8]*w2 + blk[14]*w3 + blk[20]*w4 + blk[26]*w5 + blk[32]*w6;
      u4 -= blk[3]*w1 + blk[9]*w2 + blk[15]*w3 + blk[21]*w4 + blk[27]*w5 + blk[33]*w6;
      u5 -= blk[4]*w1 + blk[10]*w2 + blk[16]*w3 + blk[22]*w4 + blk[28]*w5 + blk[34]*w6;
      u6 -= blk[5]*w1 + blk[11]*w2 + blk[17]*w3 + blk[23]*w4 + blk[29]*w5 + blk[35]*w6;
      blk += bs2;
    }
    sol[base]   = u1;
    sol[base+1] = u2;
    sol[base+2] = u3;
    sol[base+3] = u4;
    sol[base+4] = u5;
    sol[base+5] = u6;
  }

  /* backward sweep: last block row first */
  for (row=mbs-1; row>=0; row--) {
    urow = 2*mbs - row;            /* row pointer entry used for this block row */
    blk  = ba + bs2*bi[urow];
    cols = bj + bi[urow];
    base = bs*row;
    u1 = sol[base];   u2 = sol[base+1]; u3 = sol[base+2];
    u4 = sol[base+3]; u5 = sol[base+4]; u6 = sol[base+5];
    for (cnt=bi[urow+1]-bi[urow]-1; cnt>0; cnt--) {
      off = bs*(*cols++);
      w1 = sol[off];   w2 = sol[off+1]; w3 = sol[off+2];
      w4 = sol[off+3]; w5 = sol[off+4]; w6 = sol[off+5];
      u1 -= blk[0]*w1 + blk[6]*w2 + blk[12]*w3 + blk[18]*w4 + blk[24]*w5 + blk[30]*w6;
      u2 -= blk[1]*w1 + blk[7]*w2 + blk[13]*w3 + blk[19]*w4 + blk[25]*w5 + blk[31]*w6;
      u3 -= blk[2]*w1 + blk[8]*w2 + blk[14]*w3 + blk[20]*w4 + blk[26]*w5 + blk[32]*w6;
      u4 -= blk[3]*w1 + blk[9]*w2 + blk[15]*w3 + blk[21]*w4 + blk[27]*w5 + blk[33]*w6;
      u5 -= blk[4]*w1 + blk[10]*w2 + blk[16]*w3 + blk[22]*w4 + blk[28]*w5 + blk[34]*w6;
      u6 -= blk[5]*w1 + blk[11]*w2 + blk[17]*w3 + blk[23]*w4 + blk[29]*w5 + blk[35]*w6;
      blk += bs2;
    }
    /* blk now points at the remaining block of the range: x = inv_diagonal*x */
    sol[base]   = blk[0]*u1 + blk[6]*u2 + blk[12]*u3 + blk[18]*u4 + blk[24]*u5 + blk[30]*u6;
    sol[base+1] = blk[1]*u1 + blk[7]*u2 + blk[13]*u3 + blk[19]*u4 + blk[25]*u5 + blk[31]*u6;
    sol[base+2] = blk[2]*u1 + blk[8]*u2 + blk[14]*u3 + blk[20]*u4 + blk[26]*u5 + blk[32]*u6;
    sol[base+3] = blk[3]*u1 + blk[9]*u2 + blk[15]*u3 + blk[21]*u4 + blk[27]*u5 + blk[33]*u6;
    sol[base+4] = blk[4]*u1 + blk[10]*u2 + blk[16]*u3 + blk[22]*u4 + blk[28]*u5 + blk[34]*u6;
    sol[base+5] = blk[5]*u1 + blk[11]*u2 + blk[17]*u3 + blk[23]*u4 + blk[29]*u5 + blk[35]*u6;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(fact->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

2273 #undef __FUNCT__ 2274 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2" 2275 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 2276 { 2277 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2278 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 2279 PetscErrorCode ierr; 2280 PetscInt idx,jdx,idt; 2281 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2282 const MatScalar *aa=a->a,*v; 2283 PetscScalar *x; 2284 const PetscScalar *b; 2285 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2286 2287 PetscFunctionBegin; 2288 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2289 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2290 /* forward solve the lower triangular */ 2291 idx = 0; 2292 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2293 x[4] = b[4+idx];x[5] = b[5+idx]; 2294 for (i=1; i<n; i++) { 2295 v = aa + bs2*ai[i]; 2296 vi = aj + ai[i]; 2297 nz = ai[i+1] - ai[i]; 2298 idx = bs*i; 2299 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2300 s5 = b[4+idx];s6 = b[5+idx]; 2301 for(k=0;k<nz;k++){ 2302 jdx = bs*vi[k]; 2303 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2304 x5 = x[4+jdx]; x6 = x[5+jdx]; 2305 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2306 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2307 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2308
s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2309 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2310 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2311 v += bs2; 2312 } 2313 2314 x[idx] = s1; 2315 x[1+idx] = s2; 2316 x[2+idx] = s3; 2317 x[3+idx] = s4; 2318 x[4+idx] = s5; 2319 x[5+idx] = s6; 2320 } 2321 2322 /* backward solve the upper triangular */ 2323 for (i=n-1; i>=0; i--){ 2324 v = aa + bs2*(adiag[i+1]+1); 2325 vi = aj + adiag[i+1]+1; 2326 nz = adiag[i] - adiag[i+1]-1; 2327 idt = bs*i; 2328 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2329 s5 = x[4+idt];s6 = x[5+idt]; 2330 for(k=0;k<nz;k++){ 2331 idx = bs*vi[k]; 2332 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2333 x5 = x[4+idx];x6 = x[5+idx]; 2334 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2335 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2336 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2337 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2338 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2339 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2340 v += bs2; 2341 } 2342 /* x = inv_diagonal*x */ 2343 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2344 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2345 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2346 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2347 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2348 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 2349 } 2350 2351 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2352 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2353 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2354 
PetscFunctionReturn(0); 2355 } 2356

/*
   MatSolve_SeqBAIJ_5 - Forward and backward block sweeps over the factor data
   held in a Mat_SeqBAIJ, for blocks of 5x5 scalars (25 values per block,
   5 vector entries per block row).

   Input:
     A  - matrix whose data pointer is a Mat_SeqBAIJ holding i, j, a, diag,
          the row/col index sets and the solve_work scratch array
     bb - right-hand side, read through the row index set
   Output:
     xx - result, written through the column index set

   A block v is applied as the 5x5 matrix whose entry (p,q) is v[p+5*q].
   The block at position diag[row] is multiplied into the result rather than
   solved with (presumably it already holds the inverted diagonal block -
   confirm against the factorization routine).
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_5"
PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ *)A->data;
  IS                colis = fact->col,rowis = fact->row;
  PetscErrorCode    ierr;
  const PetscInt    *rperm,*cperm,*bdiag = fact->diag;
  PetscInt          *bi = fact->i,*bj = fact->j,*cols;
  PetscInt          mbs = fact->mbs,row,left,off,base,dest;
  const MatScalar   *ba = fact->a,*blk;
  const PetscScalar *rhs;
  PetscScalar       *sol,*work;
  PetscScalar       u1,u2,u3,u4,u5,w1,w2,w3,w4,w5;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);
  work = fact->solve_work;

  ierr = ISGetIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISGetIndices(colis,&cperm);CHKERRQ(ierr);

  /* forward sweep: block row 0 is a plain copy of its permuted right-hand side */
  off     = 5*rperm[0];
  work[0] = rhs[off];   work[1] = rhs[off+1];
  work[2] = rhs[off+2]; work[3] = rhs[off+3]; work[4] = rhs[off+4];
  for (row=1; row<mbs; row++) {
    blk  = ba + 25*bi[row];
    cols = bj + bi[row];
    left = bdiag[row] - bi[row];   /* number of blocks stored before position diag[row] */
    off  = 5*rperm[row];
    u1 = rhs[off];   u2 = rhs[off+1]; u3 = rhs[off+2];
    u4 = rhs[off+3]; u5 = rhs[off+4];
    for (; left; left--) {
      off = 5*(*cols++);
      w1 = work[off];   w2 = work[off+1]; w3 = work[off+2];
      w4 = work[off+3]; w5 = work[off+4];
      u1 -= blk[0]*w1 + blk[5]*w2 + blk[10]*w3 + blk[15]*w4 + blk[20]*w5;
      u2 -= blk[1]*w1 + blk[6]*w2 + blk[11]*w3 + blk[16]*w4 + blk[21]*w5;
      u3 -= blk[2]*w1 + blk[7]*w2 + blk[12]*w3 + blk[17]*w4 + blk[22]*w5;
      u4 -= blk[3]*w1 + blk[8]*w2 + blk[13]*w3 + blk[18]*w4 + blk[23]*w5;
      u5 -= blk[4]*w1 + blk[9]*w2 + blk[14]*w3 + blk[19]*w4 + blk[24]*w5;
      blk += 25;
    }
    base = 5*row;
    work[base]   = u1; work[base+1] = u2;
    work[base+2] = u3; work[base+3] = u4; work[base+4] = u5;
  }

  /* backward sweep: last block row first, using the blocks stored after position diag[row] */
  for (row=mbs-1; row>=0; row--) {
    blk  = ba + 25*bdiag[row] + 25;
    cols = bj + bdiag[row] + 1;
    left = bi[row+1] - bdiag[row] - 1;
    base = 5*row;
    u1 = work[base];   u2 = work[base+1];
    u3 = work[base+2]; u4 = work[base+3]; u5 = work[base+4];
    for (; left; left--) {
      off = 5*(*cols++);
      w1 = work[off];   w2 = work[off+1];
      w3 = work[off+2]; w4 = work[off+3]; w5 = work[off+4];
      u1 -= blk[0]*w1 + blk[5]*w2 + blk[10]*w3 + blk[15]*w4 + blk[20]*w5;
      u2 -= blk[1]*w1 + blk[6]*w2 + blk[11]*w3 + blk[16]*w4 + blk[21]*w5;
      u3 -= blk[2]*w1 + blk[7]*w2 + blk[12]*w3 + blk[17]*w4 + blk[22]*w5;
      u4 -= blk[3]*w1 + blk[8]*w2 + blk[13]*w3 + blk[18]*w4 + blk[23]*w5;
      u5 -= blk[4]*w1 + blk[9]*w2 + blk[14]*w3 + blk[19]*w4 + blk[24]*w5;
      blk += 25;
    }
    /* multiply by the block at diag[row]; keep the result in work and scatter it into xx */
    dest = 5*cperm[row];
    blk  = ba + 25*bdiag[row];
    sol[dest]   = work[base]   = blk[0]*u1+blk[5]*u2+blk[10]*u3+
                                 blk[15]*u4+blk[20]*u5;
    sol[dest+1] = work[base+1] = blk[1]*u1+blk[6]*u2+blk[11]*u3+
                                 blk[16]*u4+blk[21]*u5;
    sol[dest+2] = work[base+2] = blk[2]*u1+blk[7]*u2+blk[12]*u3+
                                 blk[17]*u4+blk[22]*u5;
    sol[dest+3] = work[base+3] = blk[3]*u1+blk[8]*u2+blk[13]*u3+
                                 blk[18]*u4+blk[23]*u5;
    sol[dest+4] = work[base+4] = blk[4]*u1+blk[9]*u2+blk[14]*u3+
                                 blk[19]*u4+blk[24]*u5;
  }

  ierr = ISRestoreIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISRestoreIndices(colis,&cperm);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*25*(fact->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

2445 #undef __FUNCT__ 2446 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct" 2447 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx) 2448 { 2449 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2450 IS iscol=a->col,isrow=a->row; 2451 PetscErrorCode ierr; 2452 const PetscInt *r,*c,*rout,*cout; 2453 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 2454 const MatScalar *aa=a->a,*v; 2455 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2456 const PetscScalar *b; 2457 2458 PetscFunctionBegin; 2459 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2460 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2461 t = a->solve_work; 2462 2463 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2464 ierr =
ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2465 2466 /* forward solve the lower triangular */ 2467 idx = 5*r[0]; 2468 t[0] = b[idx]; t[1] = b[1+idx]; 2469 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2470 for (i=1; i<n; i++) { 2471 v = aa + 25*ai[i]; 2472 vi = aj + ai[i]; 2473 nz = ai[i+1] - ai[i]; 2474 idx = 5*r[i]; 2475 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2476 s5 = b[4+idx]; 2477 for(m=0;m<nz;m++){ 2478 idx = 5*vi[m]; 2479 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2480 x4 = t[3+idx];x5 = t[4+idx]; 2481 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2482 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2483 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2484 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2485 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2486 v += 25; 2487 } 2488 idx = 5*i; 2489 t[idx] = s1;t[1+idx] = s2; 2490 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2491 } 2492 /* backward solve the upper triangular */ 2493 for (i=n-1; i>=0; i--){ 2494 k = 2*n-i; 2495 v = aa + 25*ai[k]; 2496 vi = aj + ai[k]; 2497 nz = ai[k+1] - ai[k] - 1; 2498 idt = 5*i; 2499 s1 = t[idt]; s2 = t[1+idt]; 2500 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2501 for(m=0;m<nz;m++){ 2502 idx = 5*vi[m]; 2503 x1 = t[idx]; x2 = t[1+idx]; 2504 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2505 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2506 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2507 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2508 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2509 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2510 v += 25; 2511 } 2512 idc = 5*c[i]; 2513 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2514 v[15]*s4+v[20]*s5; 2515 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 2516 v[16]*s4+v[21]*s5; 2517 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2518 v[17]*s4+v[22]*s5; 2519 x[3+idc] = t[3+idt] = 
v[3]*s1+v[8]*s2+v[13]*s3+ 2520 v[18]*s4+v[23]*s5; 2521 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2522 v[19]*s4+v[24]*s5; 2523 } 2524 2525 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2526 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2527 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2528 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2529 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2530 PetscFunctionReturn(0); 2531 } 2532

/*
   MatSolve_SeqBAIJ_5_newdatastruct_v2 - Forward and backward block sweeps for
   5x5 blocks, reading the right-hand side through the row index set and
   writing the result through the column index set.

   Layout as used by this routine:
     - forward sweep: block row `row` uses the blocks at positions
       i[row] .. i[row+1]-1
     - backward sweep: block row `row` uses the blocks at positions
       diag[row+1]+1 .. diag[row]-1, and the block that follows them is
       multiplied into the result (presumably the inverted diagonal block -
       confirm against the factorization routine)

   A block v is applied as the 5x5 matrix whose entry (p,q) is v[p+5*q].
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct_v2"
PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct_v2(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ *)A->data;
  IS                colis = fact->col,rowis = fact->row;
  PetscErrorCode    ierr;
  const PetscInt    *rperm,*cperm;
  PetscInt          *bi = fact->i,*bj = fact->j,*bdiag = fact->diag,*cols;
  PetscInt          mbs = fact->mbs,row,cnt,off,base,dest;
  const MatScalar   *ba = fact->a,*blk;
  const PetscScalar *rhs;
  PetscScalar       *sol,*work;
  PetscScalar       u1,u2,u3,u4,u5,w1,w2,w3,w4,w5;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);
  work = fact->solve_work;

  ierr = ISGetIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISGetIndices(colis,&cperm);CHKERRQ(ierr);

  /* forward sweep: block row 0 is a plain copy of its permuted right-hand side */
  off     = 5*rperm[0];
  work[0] = rhs[off];   work[1] = rhs[off+1];
  work[2] = rhs[off+2]; work[3] = rhs[off+3]; work[4] = rhs[off+4];
  for (row=1; row<mbs; row++) {
    blk  = ba + 25*bi[row];
    cols = bj + bi[row];
    off  = 5*rperm[row];
    u1 = rhs[off];   u2 = rhs[off+1]; u3 = rhs[off+2];
    u4 = rhs[off+3]; u5 = rhs[off+4];
    for (cnt=bi[row+1]-bi[row]; cnt>0; cnt--) {
      off = 5*(*cols++);
      w1 = work[off];   w2 = work[off+1]; w3 = work[off+2];
      w4 = work[off+3]; w5 = work[off+4];
      u1 -= blk[0]*w1 + blk[5]*w2 + blk[10]*w3 + blk[15]*w4 + blk[20]*w5;
      u2 -= blk[1]*w1 + blk[6]*w2 + blk[11]*w3 + blk[16]*w4 + blk[21]*w5;
      u3 -= blk[2]*w1 + blk[7]*w2 + blk[12]*w3 + blk[17]*w4 + blk[22]*w5;
      u4 -= blk[3]*w1 + blk[8]*w2 + blk[13]*w3 + blk[18]*w4 + blk[23]*w5;
      u5 -= blk[4]*w1 + blk[9]*w2 + blk[14]*w3 + blk[19]*w4 + blk[24]*w5;
      blk += 25;
    }
    base = 5*row;
    work[base]   = u1; work[base+1] = u2;
    work[base+2] = u3; work[base+3] = u4; work[base+4] = u5;
  }

  /* backward sweep: last block row first */
  for (row=mbs-1; row>=0; row--) {
    blk  = ba + 25*(bdiag[row+1]+1);
    cols = bj + bdiag[row+1]+1;
    base = 5*row;
    u1 = work[base];   u2 = work[base+1];
    u3 = work[base+2]; u4 = work[base+3]; u5 = work[base+4];
    for (cnt=bdiag[row]-bdiag[row+1]-1; cnt>0; cnt--) {
      off = 5*(*cols++);
      w1 = work[off];   w2 = work[off+1];
      w3 = work[off+2]; w4 = work[off+3]; w5 = work[off+4];
      u1 -= blk[0]*w1 + blk[5]*w2 + blk[10]*w3 + blk[15]*w4 + blk[20]*w5;
      u2 -= blk[1]*w1 + blk[6]*w2 + blk[11]*w3 + blk[16]*w4 + blk[21]*w5;
      u3 -= blk[2]*w1 + blk[7]*w2 + blk[12]*w3 + blk[17]*w4 + blk[22]*w5;
      u4 -= blk[3]*w1 + blk[8]*w2 + blk[13]*w3 + blk[18]*w4 + blk[23]*w5;
      u5 -= blk[4]*w1 + blk[9]*w2 + blk[14]*w3 + blk[19]*w4 + blk[24]*w5;
      blk += 25;
    }
    /* blk was advanced past the blocks consumed above; multiply by the block it now points at */
    dest = 5*cperm[row];
    sol[dest]   = work[base]   = blk[0]*u1+blk[5]*u2+blk[10]*u3+
                                 blk[15]*u4+blk[20]*u5;
    sol[dest+1] = work[base+1] = blk[1]*u1+blk[6]*u2+blk[11]*u3+
                                 blk[16]*u4+blk[21]*u5;
    sol[dest+2] = work[base+2] = blk[2]*u1+blk[7]*u2+blk[12]*u3+
                                 blk[17]*u4+blk[22]*u5;
    sol[dest+3] = work[base+3] = blk[3]*u1+blk[8]*u2+blk[13]*u3+
                                 blk[18]*u4+blk[23]*u5;
    sol[dest+4] = work[base+4] = blk[4]*u1+blk[9]*u2+blk[14]*u3+
                                 blk[19]*u4+blk[24]*u5;
  }

  ierr = ISRestoreIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISRestoreIndices(colis,&cperm);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*25*(fact->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

2620 #undef __FUNCT__ 2621 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 2622 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 2623 { 2624 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2625 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2626 PetscErrorCode ierr;
2627 PetscInt *diag = a->diag,jdx; 2628 const MatScalar *aa=a->a,*v; 2629 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2630 const PetscScalar *b; 2631 2632 PetscFunctionBegin; 2633 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2634 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2635 /* forward solve the lower triangular */ 2636 idx = 0; 2637 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2638 for (i=1; i<n; i++) { 2639 v = aa + 25*ai[i]; 2640 vi = aj + ai[i]; 2641 nz = diag[i] - ai[i]; 2642 idx = 5*i; 2643 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 2644 while (nz--) { 2645 jdx = 5*(*vi++); 2646 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2647 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2648 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2649 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2650 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2651 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2652 v += 25; 2653 } 2654 x[idx] = s1; 2655 x[1+idx] = s2; 2656 x[2+idx] = s3; 2657 x[3+idx] = s4; 2658 x[4+idx] = s5; 2659 } 2660 /* backward solve the upper triangular */ 2661 for (i=n-1; i>=0; i--){ 2662 v = aa + 25*diag[i] + 25; 2663 vi = aj + diag[i] + 1; 2664 nz = ai[i+1] - diag[i] - 1; 2665 idt = 5*i; 2666 s1 = x[idt]; s2 = x[1+idt]; 2667 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 2668 while (nz--) { 2669 idx = 5*(*vi++); 2670 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2671 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2672 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2673 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2674 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2675 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2676 v += 25; 2677 } 2678 v = aa + 25*diag[i]; 2679 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2680 
x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2681 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2682 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2683 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2684 } 2685 2686 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2687 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2688 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2689 PetscFunctionReturn(0); 2690 } 2691 2692 #undef __FUNCT__ 2693 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct" 2694 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2695 { 2696 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2697 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2698 PetscErrorCode ierr; 2699 PetscInt jdx; 2700 const MatScalar *aa=a->a,*v; 2701 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2702 const PetscScalar *b; 2703 2704 PetscFunctionBegin; 2705 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2706 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2707 /* forward solve the lower triangular */ 2708 idx = 0; 2709 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2710 for (i=1; i<n; i++) { 2711 v = aa + 25*ai[i]; 2712 vi = aj + ai[i]; 2713 nz = ai[i+1] - ai[i]; 2714 idx = 5*i; 2715 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 2716 for(k=0;k<nz;k++) { 2717 jdx = 5*vi[k]; 2718 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2719 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2720 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2721 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2722 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2723 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2724 v += 25; 2725 } 2726 x[idx] = s1; 2727 x[1+idx] = s2; 2728 x[2+idx] = s3; 2729 x[3+idx] = s4; 2730 x[4+idx] = s5; 2731 } 
2732 2733 /* backward solve the upper triangular */ 2734 for (i=n-1; i>=0; i--){ 2735 v = aa + 25*ai[2*n-i]; 2736 vi = aj + ai[2*n-i]; 2737 nz = ai[2*n-i +1] - ai[2*n-i]-1; 2738 idt = 5*i; 2739 s1 = x[idt]; s2 = x[1+idt]; 2740 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 2741 for(k=0;k<nz;k++){ 2742 idx = 5*vi[k]; 2743 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2744 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2745 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2746 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2747 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2748 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2749 v += 25; 2750 } 2751 /* x = inv_diagonal*x */ 2752 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2753 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2754 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2755 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2756 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2757 } 2758 2759 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2760 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2761 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2762 PetscFunctionReturn(0); 2763 } 2764 2765 #undef __FUNCT__ 2766 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2" 2767 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 2768 { 2769 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2770 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 2771 PetscErrorCode ierr; 2772 PetscInt jdx; 2773 const MatScalar *aa=a->a,*v; 2774 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2775 const PetscScalar *b; 2776 2777 PetscFunctionBegin; 2778 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2779 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2780 /* forward solve the lower triangular */ 
2781 idx = 0; 2782 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2783 for (i=1; i<n; i++) { 2784 v = aa + 25*ai[i]; 2785 vi = aj + ai[i]; 2786 nz = ai[i+1] - ai[i]; 2787 idx = 5*i; 2788 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 2789 for(k=0;k<nz;k++) { 2790 jdx = 5*vi[k]; 2791 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2792 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2793 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2794 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2795 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2796 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2797 v += 25; 2798 } 2799 x[idx] = s1; 2800 x[1+idx] = s2; 2801 x[2+idx] = s3; 2802 x[3+idx] = s4; 2803 x[4+idx] = s5; 2804 } 2805 2806 /* backward solve the upper triangular */ 2807 for (i=n-1; i>=0; i--){ 2808 v = aa + 25*(adiag[i+1]+1); 2809 vi = aj + adiag[i+1]+1; 2810 nz = adiag[i] - adiag[i+1]-1; 2811 idt = 5*i; 2812 s1 = x[idt]; s2 = x[1+idt]; 2813 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 2814 for(k=0;k<nz;k++){ 2815 idx = 5*vi[k]; 2816 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2817 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2818 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2819 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2820 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2821 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2822 v += 25; 2823 } 2824 /* x = inv_diagonal*x */ 2825 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2826 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2827 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2828 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2829 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2830 } 2831 2832 ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2833 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2834 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2835 PetscFunctionReturn(0); 2836 } 2837 2838 #undef __FUNCT__ 2839 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 2840 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 2841 { 2842 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2843 IS iscol=a->col,isrow=a->row; 2844 PetscErrorCode ierr; 2845 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2846 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2847 const MatScalar *aa=a->a,*v; 2848 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2849 const PetscScalar *b; 2850 2851 PetscFunctionBegin; 2852 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2853 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2854 t = a->solve_work; 2855 2856 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2857 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2858 2859 /* forward solve the lower triangular */ 2860 idx = 4*(*r++); 2861 t[0] = b[idx]; t[1] = b[1+idx]; 2862 t[2] = b[2+idx]; t[3] = b[3+idx]; 2863 for (i=1; i<n; i++) { 2864 v = aa + 16*ai[i]; 2865 vi = aj + ai[i]; 2866 nz = diag[i] - ai[i]; 2867 idx = 4*(*r++); 2868 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2869 while (nz--) { 2870 idx = 4*(*vi++); 2871 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2872 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2873 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2874 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2875 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2876 v += 16; 2877 } 2878 idx = 4*i; 2879 t[idx] = s1;t[1+idx] = s2; 2880 t[2+idx] = s3;t[3+idx] = s4; 2881 } 2882 /* backward solve the upper triangular */ 2883 for (i=n-1; i>=0; i--){ 2884 v = aa + 16*diag[i] + 16; 2885 vi = aj + diag[i] + 1; 2886 nz = ai[i+1] - diag[i] - 1; 2887 idt = 4*i; 2888 s1 = t[idt]; s2 = t[1+idt]; 2889 s3 = t[2+idt];s4 = t[3+idt]; 2890 while 
(nz--) { 2891 idx = 4*(*vi++); 2892 x1 = t[idx]; x2 = t[1+idx]; 2893 x3 = t[2+idx]; x4 = t[3+idx]; 2894 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2895 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2896 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2897 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2898 v += 16; 2899 } 2900 idc = 4*(*c--); 2901 v = aa + 16*diag[i]; 2902 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2903 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2904 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2905 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2906 } 2907 2908 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2909 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2910 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2911 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2912 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2913 PetscFunctionReturn(0); 2914 } 2915 2916 #undef __FUNCT__ 2917 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct" 2918 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx) 2919 { 2920 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2921 IS iscol=a->col,isrow=a->row; 2922 PetscErrorCode ierr; 2923 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 2924 const PetscInt *r,*c,*rout,*cout; 2925 const MatScalar *aa=a->a,*v; 2926 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2927 const PetscScalar *b; 2928 2929 PetscFunctionBegin; 2930 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2931 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2932 t = a->solve_work; 2933 2934 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2935 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2936 2937 /* forward solve the lower triangular */ 2938 idx = 4*r[0]; 2939 t[0] = b[idx]; t[1] = b[1+idx]; 2940 t[2] = b[2+idx]; t[3] = b[3+idx]; 2941 for (i=1; i<n; i++) { 2942 v = aa + 16*ai[i]; 2943 vi = aj + ai[i]; 2944 nz = ai[i+1] - ai[i]; 2945 
idx = 4*r[i]; 2946 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2947 for(m=0;m<nz;m++){ 2948 idx = 4*vi[m]; 2949 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2950 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2951 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2952 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2953 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2954 v += 16; 2955 } 2956 idx = 4*i; 2957 t[idx] = s1;t[1+idx] = s2; 2958 t[2+idx] = s3;t[3+idx] = s4; 2959 } 2960 /* backward solve the upper triangular */ 2961 for (i=n-1; i>=0; i--){ 2962 k = 2*n-i; 2963 v = aa + 16*ai[k]; 2964 vi = aj + ai[k]; 2965 nz = ai[k+1] - ai[k] - 1; 2966 idt = 4*i; 2967 s1 = t[idt]; s2 = t[1+idt]; 2968 s3 = t[2+idt];s4 = t[3+idt]; 2969 for(m=0;m<nz;m++){ 2970 idx = 4*vi[m]; 2971 x1 = t[idx]; x2 = t[1+idx]; 2972 x3 = t[2+idx]; x4 = t[3+idx]; 2973 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2974 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2975 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2976 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2977 v += 16; 2978 } 2979 idc = 4*c[i]; 2980 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2981 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2982 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2983 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2984 } 2985 2986 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2987 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2988 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2989 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2990 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2991 PetscFunctionReturn(0); 2992 } 2993 2994 #undef __FUNCT__ 2995 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct_v2" 2996 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct_v2(Mat A,Vec bb,Vec xx) 2997 { 2998 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2999 IS iscol=a->col,isrow=a->row; 3000 PetscErrorCode ierr; 3001 PetscInt 
i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 3002 const PetscInt *r,*c,*rout,*cout; 3003 const MatScalar *aa=a->a,*v; 3004 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3005 const PetscScalar *b; 3006 3007 PetscFunctionBegin; 3008 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3009 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3010 t = a->solve_work; 3011 3012 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3013 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3014 3015 /* forward solve the lower triangular */ 3016 idx = 4*r[0]; 3017 t[0] = b[idx]; t[1] = b[1+idx]; 3018 t[2] = b[2+idx]; t[3] = b[3+idx]; 3019 for (i=1; i<n; i++) { 3020 v = aa + 16*ai[i]; 3021 vi = aj + ai[i]; 3022 nz = ai[i+1] - ai[i]; 3023 idx = 4*r[i]; 3024 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3025 for(m=0;m<nz;m++){ 3026 idx = 4*vi[m]; 3027 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3028 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3029 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3030 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3031 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3032 v += 16; 3033 } 3034 idx = 4*i; 3035 t[idx] = s1;t[1+idx] = s2; 3036 t[2+idx] = s3;t[3+idx] = s4; 3037 } 3038 /* backward solve the upper triangular */ 3039 for (i=n-1; i>=0; i--){ 3040 v = aa + 16*(adiag[i+1]+1); 3041 vi = aj + adiag[i+1]+1; 3042 nz = adiag[i] - adiag[i+1] - 1; 3043 idt = 4*i; 3044 s1 = t[idt]; s2 = t[1+idt]; 3045 s3 = t[2+idt];s4 = t[3+idt]; 3046 for(m=0;m<nz;m++){ 3047 idx = 4*vi[m]; 3048 x1 = t[idx]; x2 = t[1+idx]; 3049 x3 = t[2+idx]; x4 = t[3+idx]; 3050 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3051 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3052 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3053 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3054 v += 16; 3055 } 3056 idc = 4*c[i]; 3057 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3058 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3059 x[2+idc] 
= t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3060 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3061 } 3062 3063 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3064 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3065 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3066 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3067 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3068 PetscFunctionReturn(0); 3069 } 3070 3071 #undef __FUNCT__ 3072 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 3073 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 3074 { 3075 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3076 IS iscol=a->col,isrow=a->row; 3077 PetscErrorCode ierr; 3078 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3079 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3080 const MatScalar *aa=a->a,*v; 3081 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 3082 PetscScalar *x; 3083 const PetscScalar *b; 3084 3085 PetscFunctionBegin; 3086 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3087 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3088 t = (MatScalar *)a->solve_work; 3089 3090 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3091 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3092 3093 /* forward solve the lower triangular */ 3094 idx = 4*(*r++); 3095 t[0] = (MatScalar)b[idx]; 3096 t[1] = (MatScalar)b[1+idx]; 3097 t[2] = (MatScalar)b[2+idx]; 3098 t[3] = (MatScalar)b[3+idx]; 3099 for (i=1; i<n; i++) { 3100 v = aa + 16*ai[i]; 3101 vi = aj + ai[i]; 3102 nz = diag[i] - ai[i]; 3103 idx = 4*(*r++); 3104 s1 = (MatScalar)b[idx]; 3105 s2 = (MatScalar)b[1+idx]; 3106 s3 = (MatScalar)b[2+idx]; 3107 s4 = (MatScalar)b[3+idx]; 3108 while (nz--) { 3109 idx = 4*(*vi++); 3110 x1 = t[idx]; 3111 x2 = t[1+idx]; 3112 x3 = t[2+idx]; 3113 x4 = t[3+idx]; 3114 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3115 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3116 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 
3117 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3118 v += 16; 3119 } 3120 idx = 4*i; 3121 t[idx] = s1; 3122 t[1+idx] = s2; 3123 t[2+idx] = s3; 3124 t[3+idx] = s4; 3125 } 3126 /* backward solve the upper triangular */ 3127 for (i=n-1; i>=0; i--){ 3128 v = aa + 16*diag[i] + 16; 3129 vi = aj + diag[i] + 1; 3130 nz = ai[i+1] - diag[i] - 1; 3131 idt = 4*i; 3132 s1 = t[idt]; 3133 s2 = t[1+idt]; 3134 s3 = t[2+idt]; 3135 s4 = t[3+idt]; 3136 while (nz--) { 3137 idx = 4*(*vi++); 3138 x1 = t[idx]; 3139 x2 = t[1+idx]; 3140 x3 = t[2+idx]; 3141 x4 = t[3+idx]; 3142 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3143 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3144 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3145 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3146 v += 16; 3147 } 3148 idc = 4*(*c--); 3149 v = aa + 16*diag[i]; 3150 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3151 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3152 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3153 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3154 x[idc] = (PetscScalar)t[idt]; 3155 x[1+idc] = (PetscScalar)t[1+idt]; 3156 x[2+idc] = (PetscScalar)t[2+idt]; 3157 x[3+idc] = (PetscScalar)t[3+idt]; 3158 } 3159 3160 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3161 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3162 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3163 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3164 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3165 PetscFunctionReturn(0); 3166 } 3167 3168 #if defined (PETSC_HAVE_SSE) 3169 3170 #include PETSC_HAVE_SSE 3171 3172 #undef __FUNCT__ 3173 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 3174 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 3175 { 3176 /* 3177 Note: This code uses demotion of double 3178 to float when performing the mixed-mode computation. 3179 This may not be numerically reasonable for all applications. 
3180 */ 3181 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3182 IS iscol=a->col,isrow=a->row; 3183 PetscErrorCode ierr; 3184 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 3185 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3186 MatScalar *aa=a->a,*v; 3187 PetscScalar *x,*b,*t; 3188 3189 /* Make space in temp stack for 16 Byte Aligned arrays */ 3190 float ssealignedspace[11],*tmps,*tmpx; 3191 unsigned long offset; 3192 3193 PetscFunctionBegin; 3194 SSE_SCOPE_BEGIN; 3195 3196 offset = (unsigned long)ssealignedspace % 16; 3197 if (offset) offset = (16 - offset)/4; 3198 tmps = &ssealignedspace[offset]; 3199 tmpx = &ssealignedspace[offset+4]; 3200 PREFETCH_NTA(aa+16*ai[1]); 3201 3202 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3203 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3204 t = a->solve_work; 3205 3206 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3207 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3208 3209 /* forward solve the lower triangular */ 3210 idx = 4*(*r++); 3211 t[0] = b[idx]; t[1] = b[1+idx]; 3212 t[2] = b[2+idx]; t[3] = b[3+idx]; 3213 v = aa + 16*ai[1]; 3214 3215 for (i=1; i<n;) { 3216 PREFETCH_NTA(&v[8]); 3217 vi = aj + ai[i]; 3218 nz = diag[i] - ai[i]; 3219 idx = 4*(*r++); 3220 3221 /* Demote sum from double to float */ 3222 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 3223 LOAD_PS(tmps,XMM7); 3224 3225 while (nz--) { 3226 PREFETCH_NTA(&v[16]); 3227 idx = 4*(*vi++); 3228 3229 /* Demote solution (so far) from double to float */ 3230 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 3231 3232 /* 4x4 Matrix-Vector product with negative accumulation: */ 3233 SSE_INLINE_BEGIN_2(tmpx,v) 3234 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3235 3236 /* First Column */ 3237 SSE_COPY_PS(XMM0,XMM6) 3238 SSE_SHUFFLE(XMM0,XMM0,0x00) 3239 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3240 SSE_SUB_PS(XMM7,XMM0) 3241 3242 /* Second Column */ 3243 SSE_COPY_PS(XMM1,XMM6) 3244 SSE_SHUFFLE(XMM1,XMM1,0x55) 3245 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3246 
SSE_SUB_PS(XMM7,XMM1) 3247 3248 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3249 3250 /* Third Column */ 3251 SSE_COPY_PS(XMM2,XMM6) 3252 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3253 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3254 SSE_SUB_PS(XMM7,XMM2) 3255 3256 /* Fourth Column */ 3257 SSE_COPY_PS(XMM3,XMM6) 3258 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3259 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3260 SSE_SUB_PS(XMM7,XMM3) 3261 SSE_INLINE_END_2 3262 3263 v += 16; 3264 } 3265 idx = 4*i; 3266 v = aa + 16*ai[++i]; 3267 PREFETCH_NTA(v); 3268 STORE_PS(tmps,XMM7); 3269 3270 /* Promote result from float to double */ 3271 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 3272 } 3273 /* backward solve the upper triangular */ 3274 idt = 4*(n-1); 3275 ai16 = 16*diag[n-1]; 3276 v = aa + ai16 + 16; 3277 for (i=n-1; i>=0;){ 3278 PREFETCH_NTA(&v[8]); 3279 vi = aj + diag[i] + 1; 3280 nz = ai[i+1] - diag[i] - 1; 3281 3282 /* Demote accumulator from double to float */ 3283 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 3284 LOAD_PS(tmps,XMM7); 3285 3286 while (nz--) { 3287 PREFETCH_NTA(&v[16]); 3288 idx = 4*(*vi++); 3289 3290 /* Demote solution (so far) from double to float */ 3291 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 3292 3293 /* 4x4 Matrix-Vector Product with negative accumulation: */ 3294 SSE_INLINE_BEGIN_2(tmpx,v) 3295 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3296 3297 /* First Column */ 3298 SSE_COPY_PS(XMM0,XMM6) 3299 SSE_SHUFFLE(XMM0,XMM0,0x00) 3300 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3301 SSE_SUB_PS(XMM7,XMM0) 3302 3303 /* Second Column */ 3304 SSE_COPY_PS(XMM1,XMM6) 3305 SSE_SHUFFLE(XMM1,XMM1,0x55) 3306 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3307 SSE_SUB_PS(XMM7,XMM1) 3308 3309 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3310 3311 /* Third Column */ 3312 SSE_COPY_PS(XMM2,XMM6) 3313 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3314 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3315 SSE_SUB_PS(XMM7,XMM2) 3316 3317 /* Fourth Column */ 3318 SSE_COPY_PS(XMM3,XMM6) 3319 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3320 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3321 SSE_SUB_PS(XMM7,XMM3) 
3322 SSE_INLINE_END_2 3323 v += 16; 3324 } 3325 v = aa + ai16; 3326 ai16 = 16*diag[--i]; 3327 PREFETCH_NTA(aa+ai16+16); 3328 /* 3329 Scale the result by the diagonal 4x4 block, 3330 which was inverted as part of the factorization 3331 */ 3332 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 3333 /* First Column */ 3334 SSE_COPY_PS(XMM0,XMM7) 3335 SSE_SHUFFLE(XMM0,XMM0,0x00) 3336 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 3337 3338 /* Second Column */ 3339 SSE_COPY_PS(XMM1,XMM7) 3340 SSE_SHUFFLE(XMM1,XMM1,0x55) 3341 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 3342 SSE_ADD_PS(XMM0,XMM1) 3343 3344 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 3345 3346 /* Third Column */ 3347 SSE_COPY_PS(XMM2,XMM7) 3348 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3349 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 3350 SSE_ADD_PS(XMM0,XMM2) 3351 3352 /* Fourth Column */ 3353 SSE_COPY_PS(XMM3,XMM7) 3354 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3355 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 3356 SSE_ADD_PS(XMM0,XMM3) 3357 3358 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 3359 SSE_INLINE_END_3 3360 3361 /* Promote solution from float to double */ 3362 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 3363 3364 /* Apply reordering to t and stream into x. */ 3365 /* This way, x doesn't pollute the cache. */ 3366 /* Be careful with size: 2 doubles = 4 floats! 
*/ 3367 idc = 4*(*c--); 3368 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 3369 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 3370 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 3371 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 3372 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 3373 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 3374 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 3375 SSE_INLINE_END_2 3376 v = aa + ai16 + 16; 3377 idt -= 4; 3378 } 3379 3380 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3381 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3382 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3383 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3384 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3385 SSE_SCOPE_END; 3386 PetscFunctionReturn(0); 3387 } 3388 3389 #endif 3390 3391 3392 /* 3393 Special case where the matrix was ILU(0) factored in the natural 3394 ordering. This eliminates the need for the column and row permutation. 3395 */ 3396 #undef __FUNCT__ 3397 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 3398 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 3399 { 3400 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3401 PetscInt n=a->mbs; 3402 const PetscInt *ai=a->i,*aj=a->j; 3403 PetscErrorCode ierr; 3404 const PetscInt *diag = a->diag; 3405 const MatScalar *aa=a->a; 3406 PetscScalar *x; 3407 const PetscScalar *b; 3408 3409 PetscFunctionBegin; 3410 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3411 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3412 3413 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 3414 { 3415 static PetscScalar w[2000]; /* very BAD need to fix */ 3416 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 3417 } 3418 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 3419 { 3420 static PetscScalar w[2000]; /* very BAD need to fix */ 3421 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 3422 } 3423 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 3424 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 3425 
#else 3426 { 3427 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3428 const MatScalar *v; 3429 PetscInt jdx,idt,idx,nz,i,ai16; 3430 const PetscInt *vi; 3431 3432 /* forward solve the lower triangular */ 3433 idx = 0; 3434 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 3435 for (i=1; i<n; i++) { 3436 v = aa + 16*ai[i]; 3437 vi = aj + ai[i]; 3438 nz = diag[i] - ai[i]; 3439 idx += 4; 3440 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3441 while (nz--) { 3442 jdx = 4*(*vi++); 3443 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 3444 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3445 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3446 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3447 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3448 v += 16; 3449 } 3450 x[idx] = s1; 3451 x[1+idx] = s2; 3452 x[2+idx] = s3; 3453 x[3+idx] = s4; 3454 } 3455 /* backward solve the upper triangular */ 3456 idt = 4*(n-1); 3457 for (i=n-1; i>=0; i--){ 3458 ai16 = 16*diag[i]; 3459 v = aa + ai16 + 16; 3460 vi = aj + diag[i] + 1; 3461 nz = ai[i+1] - diag[i] - 1; 3462 s1 = x[idt]; s2 = x[1+idt]; 3463 s3 = x[2+idt];s4 = x[3+idt]; 3464 while (nz--) { 3465 idx = 4*(*vi++); 3466 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 3467 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3468 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3469 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3470 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3471 v += 16; 3472 } 3473 v = aa + ai16; 3474 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3475 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 3476 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3477 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3478 idt -= 4; 3479 } 3480 } 3481 #endif 3482 3483 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3484 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3485 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3486 PetscFunctionReturn(0); 3487 } 3488 3489 #undef __FUNCT__ 
3490 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct" 3491 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3492 { 3493 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3494 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 3495 PetscErrorCode ierr; 3496 PetscInt idx,jdx,idt; 3497 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3498 const MatScalar *aa=a->a,*v; 3499 PetscScalar *x; 3500 const PetscScalar *b; 3501 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3502 3503 PetscFunctionBegin; 3504 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3505 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3506 /* forward solve the lower triangular */ 3507 idx = 0; 3508 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3509 for (i=1; i<n; i++) { 3510 v = aa + bs2*ai[i]; 3511 vi = aj + ai[i]; 3512 nz = ai[i+1] - ai[i]; 3513 idx = bs*i; 3514 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3515 for(k=0;k<nz;k++) { 3516 jdx = bs*vi[k]; 3517 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3518 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3519 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3520 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3521 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3522 3523 v += bs2; 3524 } 3525 3526 x[idx] = s1; 3527 x[1+idx] = s2; 3528 x[2+idx] = s3; 3529 x[3+idx] = s4; 3530 } 3531 3532 /* backward solve the upper triangular */ 3533 for (i=n-1; i>=0; i--){ 3534 v = aa + bs2*ai[2*n-i]; 3535 vi = aj + ai[2*n-i]; 3536 nz = ai[2*n-i +1] - ai[2*n-i]-1; 3537 idt = bs*i; 3538 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3539 3540 for(k=0;k<nz;k++){ 3541 idx = bs*vi[k]; 3542 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3543 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3544 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3545 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3546 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3547 3548 v += bs2; 3549 } 3550 /* x = inv_diagonal*x */ 3551 
x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3552 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 3553 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3554 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3555 3556 } 3557 3558 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3559 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3560 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3561 PetscFunctionReturn(0); 3562 } 3563 3564 #undef __FUNCT__ 3565 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2" 3566 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 3567 { 3568 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3569 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 3570 PetscErrorCode ierr; 3571 PetscInt idx,jdx,idt; 3572 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3573 const MatScalar *aa=a->a,*v; 3574 PetscScalar *x; 3575 const PetscScalar *b; 3576 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3577 3578 PetscFunctionBegin; 3579 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3580 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3581 /* forward solve the lower triangular */ 3582 idx = 0; 3583 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3584 for (i=1; i<n; i++) { 3585 v = aa + bs2*ai[i]; 3586 vi = aj + ai[i]; 3587 nz = ai[i+1] - ai[i]; 3588 idx = bs*i; 3589 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3590 for(k=0;k<nz;k++) { 3591 jdx = bs*vi[k]; 3592 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3593 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3594 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3595 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3596 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3597 3598 v += bs2; 3599 } 3600 3601 x[idx] = s1; 3602 x[1+idx] = s2; 3603 x[2+idx] = s3; 3604 x[3+idx] = s4; 3605 } 3606 3607 /* backward solve the upper triangular */ 3608 for (i=n-1; i>=0; i--){ 3609 v = aa + bs2*(adiag[i+1]+1); 3610 vi = 
aj + adiag[i+1]+1; 3611 nz = adiag[i] - adiag[i+1]-1; 3612 idt = bs*i; 3613 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3614 3615 for(k=0;k<nz;k++){ 3616 idx = bs*vi[k]; 3617 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3618 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3619 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3620 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3621 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3622 3623 v += bs2; 3624 } 3625 /* x = inv_diagonal*x */ 3626 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3627 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 3628 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3629 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3630 3631 } 3632 3633 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3634 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3635 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3636 PetscFunctionReturn(0); 3637 } 3638 3639 #undef __FUNCT__ 3640 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 3641 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 3642 { 3643 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3644 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3645 PetscErrorCode ierr; 3646 PetscInt *diag = a->diag; 3647 MatScalar *aa=a->a; 3648 PetscScalar *x,*b; 3649 3650 PetscFunctionBegin; 3651 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3652 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3653 3654 { 3655 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 3656 MatScalar *v,*t=(MatScalar *)x; 3657 PetscInt jdx,idt,idx,nz,*vi,i,ai16; 3658 3659 /* forward solve the lower triangular */ 3660 idx = 0; 3661 t[0] = (MatScalar)b[0]; 3662 t[1] = (MatScalar)b[1]; 3663 t[2] = (MatScalar)b[2]; 3664 t[3] = (MatScalar)b[3]; 3665 for (i=1; i<n; i++) { 3666 v = aa + 16*ai[i]; 3667 vi = aj + ai[i]; 3668 nz = diag[i] - ai[i]; 3669 idx += 4; 3670 s1 = (MatScalar)b[idx]; 3671 s2 = (MatScalar)b[1+idx]; 3672 s3 = (MatScalar)b[2+idx]; 3673 
s4 = (MatScalar)b[3+idx]; 3674 while (nz--) { 3675 jdx = 4*(*vi++); 3676 x1 = t[jdx]; 3677 x2 = t[1+jdx]; 3678 x3 = t[2+jdx]; 3679 x4 = t[3+jdx]; 3680 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3681 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3682 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3683 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3684 v += 16; 3685 } 3686 t[idx] = s1; 3687 t[1+idx] = s2; 3688 t[2+idx] = s3; 3689 t[3+idx] = s4; 3690 } 3691 /* backward solve the upper triangular */ 3692 idt = 4*(n-1); 3693 for (i=n-1; i>=0; i--){ 3694 ai16 = 16*diag[i]; 3695 v = aa + ai16 + 16; 3696 vi = aj + diag[i] + 1; 3697 nz = ai[i+1] - diag[i] - 1; 3698 s1 = t[idt]; 3699 s2 = t[1+idt]; 3700 s3 = t[2+idt]; 3701 s4 = t[3+idt]; 3702 while (nz--) { 3703 idx = 4*(*vi++); 3704 x1 = (MatScalar)x[idx]; 3705 x2 = (MatScalar)x[1+idx]; 3706 x3 = (MatScalar)x[2+idx]; 3707 x4 = (MatScalar)x[3+idx]; 3708 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3709 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3710 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3711 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3712 v += 16; 3713 } 3714 v = aa + ai16; 3715 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 3716 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 3717 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 3718 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 3719 idt -= 4; 3720 } 3721 } 3722 3723 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3724 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3725 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3726 PetscFunctionReturn(0); 3727 } 3728 3729 #if defined (PETSC_HAVE_SSE) 3730 3731 #include PETSC_HAVE_SSE 3732 #undef __FUNCT__ 3733 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 3734 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 3735 { 3736 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 
3737 unsigned short *aj=(unsigned short *)a->j; 3738 PetscErrorCode ierr; 3739 int *ai=a->i,n=a->mbs,*diag = a->diag; 3740 MatScalar *aa=a->a; 3741 PetscScalar *x,*b; 3742 3743 PetscFunctionBegin; 3744 SSE_SCOPE_BEGIN; 3745 /* 3746 Note: This code currently uses demotion of double 3747 to float when performing the mixed-mode computation. 3748 This may not be numerically reasonable for all applications. 3749 */ 3750 PREFETCH_NTA(aa+16*ai[1]); 3751 3752 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3753 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3754 { 3755 /* x will first be computed in single precision then promoted inplace to double */ 3756 MatScalar *v,*t=(MatScalar *)x; 3757 int nz,i,idt,ai16; 3758 unsigned int jdx,idx; 3759 unsigned short *vi; 3760 /* Forward solve the lower triangular factor. */ 3761 3762 /* First block is the identity. */ 3763 idx = 0; 3764 CONVERT_DOUBLE4_FLOAT4(t,b); 3765 v = aa + 16*((unsigned int)ai[1]); 3766 3767 for (i=1; i<n;) { 3768 PREFETCH_NTA(&v[8]); 3769 vi = aj + ai[i]; 3770 nz = diag[i] - ai[i]; 3771 idx += 4; 3772 3773 /* Demote RHS from double to float. 
*/ 3774 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 3775 LOAD_PS(&t[idx],XMM7); 3776 3777 while (nz--) { 3778 PREFETCH_NTA(&v[16]); 3779 jdx = 4*((unsigned int)(*vi++)); 3780 3781 /* 4x4 Matrix-Vector product with negative accumulation: */ 3782 SSE_INLINE_BEGIN_2(&t[jdx],v) 3783 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3784 3785 /* First Column */ 3786 SSE_COPY_PS(XMM0,XMM6) 3787 SSE_SHUFFLE(XMM0,XMM0,0x00) 3788 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3789 SSE_SUB_PS(XMM7,XMM0) 3790 3791 /* Second Column */ 3792 SSE_COPY_PS(XMM1,XMM6) 3793 SSE_SHUFFLE(XMM1,XMM1,0x55) 3794 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3795 SSE_SUB_PS(XMM7,XMM1) 3796 3797 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3798 3799 /* Third Column */ 3800 SSE_COPY_PS(XMM2,XMM6) 3801 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3802 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3803 SSE_SUB_PS(XMM7,XMM2) 3804 3805 /* Fourth Column */ 3806 SSE_COPY_PS(XMM3,XMM6) 3807 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3808 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3809 SSE_SUB_PS(XMM7,XMM3) 3810 SSE_INLINE_END_2 3811 3812 v += 16; 3813 } 3814 v = aa + 16*ai[++i]; 3815 PREFETCH_NTA(v); 3816 STORE_PS(&t[idx],XMM7); 3817 } 3818 3819 /* Backward solve the upper triangular factor.*/ 3820 3821 idt = 4*(n-1); 3822 ai16 = 16*diag[n-1]; 3823 v = aa + ai16 + 16; 3824 for (i=n-1; i>=0;){ 3825 PREFETCH_NTA(&v[8]); 3826 vi = aj + diag[i] + 1; 3827 nz = ai[i+1] - diag[i] - 1; 3828 3829 LOAD_PS(&t[idt],XMM7); 3830 3831 while (nz--) { 3832 PREFETCH_NTA(&v[16]); 3833 idx = 4*((unsigned int)(*vi++)); 3834 3835 /* 4x4 Matrix-Vector Product with negative accumulation: */ 3836 SSE_INLINE_BEGIN_2(&t[idx],v) 3837 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3838 3839 /* First Column */ 3840 SSE_COPY_PS(XMM0,XMM6) 3841 SSE_SHUFFLE(XMM0,XMM0,0x00) 3842 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3843 SSE_SUB_PS(XMM7,XMM0) 3844 3845 /* Second Column */ 3846 SSE_COPY_PS(XMM1,XMM6) 3847 SSE_SHUFFLE(XMM1,XMM1,0x55) 3848 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3849 SSE_SUB_PS(XMM7,XMM1) 3850 3851 
SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3852 3853 /* Third Column */ 3854 SSE_COPY_PS(XMM2,XMM6) 3855 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3856 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3857 SSE_SUB_PS(XMM7,XMM2) 3858 3859 /* Fourth Column */ 3860 SSE_COPY_PS(XMM3,XMM6) 3861 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3862 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3863 SSE_SUB_PS(XMM7,XMM3) 3864 SSE_INLINE_END_2 3865 v += 16; 3866 } 3867 v = aa + ai16; 3868 ai16 = 16*diag[--i]; 3869 PREFETCH_NTA(aa+ai16+16); 3870 /* 3871 Scale the result by the diagonal 4x4 block, 3872 which was inverted as part of the factorization 3873 */ 3874 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 3875 /* First Column */ 3876 SSE_COPY_PS(XMM0,XMM7) 3877 SSE_SHUFFLE(XMM0,XMM0,0x00) 3878 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 3879 3880 /* Second Column */ 3881 SSE_COPY_PS(XMM1,XMM7) 3882 SSE_SHUFFLE(XMM1,XMM1,0x55) 3883 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 3884 SSE_ADD_PS(XMM0,XMM1) 3885 3886 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 3887 3888 /* Third Column */ 3889 SSE_COPY_PS(XMM2,XMM7) 3890 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3891 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 3892 SSE_ADD_PS(XMM0,XMM2) 3893 3894 /* Fourth Column */ 3895 SSE_COPY_PS(XMM3,XMM7) 3896 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3897 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 3898 SSE_ADD_PS(XMM0,XMM3) 3899 3900 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 3901 SSE_INLINE_END_3 3902 3903 v = aa + ai16 + 16; 3904 idt -= 4; 3905 } 3906 3907 /* Convert t from single precision back to double precision (inplace)*/ 3908 idt = 4*(n-1); 3909 for (i=n-1;i>=0;i--) { 3910 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 3911 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 3912 PetscScalar *xtemp=&x[idt]; 3913 MatScalar *ttemp=&t[idt]; 3914 xtemp[3] = (PetscScalar)ttemp[3]; 3915 xtemp[2] = (PetscScalar)ttemp[2]; 3916 xtemp[1] = (PetscScalar)ttemp[1]; 3917 xtemp[0] = (PetscScalar)ttemp[0]; 3918 idt -= 4; 3919 } 3920 3921 } /* End of artificial scope. 
*/ 3922 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3923 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3924 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3925 SSE_SCOPE_END; 3926 PetscFunctionReturn(0); 3927 } 3928 3929 #undef __FUNCT__ 3930 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 3931 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 3932 { 3933 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3934 int *aj=a->j; 3935 PetscErrorCode ierr; 3936 int *ai=a->i,n=a->mbs,*diag = a->diag; 3937 MatScalar *aa=a->a; 3938 PetscScalar *x,*b; 3939 3940 PetscFunctionBegin; 3941 SSE_SCOPE_BEGIN; 3942 /* 3943 Note: This code currently uses demotion of double 3944 to float when performing the mixed-mode computation. 3945 This may not be numerically reasonable for all applications. 3946 */ 3947 PREFETCH_NTA(aa+16*ai[1]); 3948 3949 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3950 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3951 { 3952 /* x will first be computed in single precision then promoted inplace to double */ 3953 MatScalar *v,*t=(MatScalar *)x; 3954 int nz,i,idt,ai16; 3955 int jdx,idx; 3956 int *vi; 3957 /* Forward solve the lower triangular factor. */ 3958 3959 /* First block is the identity. */ 3960 idx = 0; 3961 CONVERT_DOUBLE4_FLOAT4(t,b); 3962 v = aa + 16*ai[1]; 3963 3964 for (i=1; i<n;) { 3965 PREFETCH_NTA(&v[8]); 3966 vi = aj + ai[i]; 3967 nz = diag[i] - ai[i]; 3968 idx += 4; 3969 3970 /* Demote RHS from double to float. 
*/ 3971 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 3972 LOAD_PS(&t[idx],XMM7); 3973 3974 while (nz--) { 3975 PREFETCH_NTA(&v[16]); 3976 jdx = 4*(*vi++); 3977 /* jdx = *vi++; */ 3978 3979 /* 4x4 Matrix-Vector product with negative accumulation: */ 3980 SSE_INLINE_BEGIN_2(&t[jdx],v) 3981 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3982 3983 /* First Column */ 3984 SSE_COPY_PS(XMM0,XMM6) 3985 SSE_SHUFFLE(XMM0,XMM0,0x00) 3986 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3987 SSE_SUB_PS(XMM7,XMM0) 3988 3989 /* Second Column */ 3990 SSE_COPY_PS(XMM1,XMM6) 3991 SSE_SHUFFLE(XMM1,XMM1,0x55) 3992 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3993 SSE_SUB_PS(XMM7,XMM1) 3994 3995 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3996 3997 /* Third Column */ 3998 SSE_COPY_PS(XMM2,XMM6) 3999 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4000 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4001 SSE_SUB_PS(XMM7,XMM2) 4002 4003 /* Fourth Column */ 4004 SSE_COPY_PS(XMM3,XMM6) 4005 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4006 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4007 SSE_SUB_PS(XMM7,XMM3) 4008 SSE_INLINE_END_2 4009 4010 v += 16; 4011 } 4012 v = aa + 16*ai[++i]; 4013 PREFETCH_NTA(v); 4014 STORE_PS(&t[idx],XMM7); 4015 } 4016 4017 /* Backward solve the upper triangular factor.*/ 4018 4019 idt = 4*(n-1); 4020 ai16 = 16*diag[n-1]; 4021 v = aa + ai16 + 16; 4022 for (i=n-1; i>=0;){ 4023 PREFETCH_NTA(&v[8]); 4024 vi = aj + diag[i] + 1; 4025 nz = ai[i+1] - diag[i] - 1; 4026 4027 LOAD_PS(&t[idt],XMM7); 4028 4029 while (nz--) { 4030 PREFETCH_NTA(&v[16]); 4031 idx = 4*(*vi++); 4032 /* idx = *vi++; */ 4033 4034 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4035 SSE_INLINE_BEGIN_2(&t[idx],v) 4036 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4037 4038 /* First Column */ 4039 SSE_COPY_PS(XMM0,XMM6) 4040 SSE_SHUFFLE(XMM0,XMM0,0x00) 4041 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4042 SSE_SUB_PS(XMM7,XMM0) 4043 4044 /* Second Column */ 4045 SSE_COPY_PS(XMM1,XMM6) 4046 SSE_SHUFFLE(XMM1,XMM1,0x55) 4047 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4048 SSE_SUB_PS(XMM7,XMM1) 4049 4050 
SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4051 4052 /* Third Column */ 4053 SSE_COPY_PS(XMM2,XMM6) 4054 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4055 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4056 SSE_SUB_PS(XMM7,XMM2) 4057 4058 /* Fourth Column */ 4059 SSE_COPY_PS(XMM3,XMM6) 4060 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4061 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4062 SSE_SUB_PS(XMM7,XMM3) 4063 SSE_INLINE_END_2 4064 v += 16; 4065 } 4066 v = aa + ai16; 4067 ai16 = 16*diag[--i]; 4068 PREFETCH_NTA(aa+ai16+16); 4069 /* 4070 Scale the result by the diagonal 4x4 block, 4071 which was inverted as part of the factorization 4072 */ 4073 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 4074 /* First Column */ 4075 SSE_COPY_PS(XMM0,XMM7) 4076 SSE_SHUFFLE(XMM0,XMM0,0x00) 4077 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4078 4079 /* Second Column */ 4080 SSE_COPY_PS(XMM1,XMM7) 4081 SSE_SHUFFLE(XMM1,XMM1,0x55) 4082 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4083 SSE_ADD_PS(XMM0,XMM1) 4084 4085 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4086 4087 /* Third Column */ 4088 SSE_COPY_PS(XMM2,XMM7) 4089 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4090 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4091 SSE_ADD_PS(XMM0,XMM2) 4092 4093 /* Fourth Column */ 4094 SSE_COPY_PS(XMM3,XMM7) 4095 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4096 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4097 SSE_ADD_PS(XMM0,XMM3) 4098 4099 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4100 SSE_INLINE_END_3 4101 4102 v = aa + ai16 + 16; 4103 idt -= 4; 4104 } 4105 4106 /* Convert t from single precision back to double precision (inplace)*/ 4107 idt = 4*(n-1); 4108 for (i=n-1;i>=0;i--) { 4109 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4110 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 4111 PetscScalar *xtemp=&x[idt]; 4112 MatScalar *ttemp=&t[idt]; 4113 xtemp[3] = (PetscScalar)ttemp[3]; 4114 xtemp[2] = (PetscScalar)ttemp[2]; 4115 xtemp[1] = (PetscScalar)ttemp[1]; 4116 xtemp[0] = (PetscScalar)ttemp[0]; 4117 idt -= 4; 4118 } 4119 4120 } /* End of artificial scope. 
*/ 4121 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4122 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4123 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4124 SSE_SCOPE_END; 4125 PetscFunctionReturn(0); 4126 } 4127 4128 #endif 4129 4130 #undef __FUNCT__ 4131 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 4132 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 4133 { 4134 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4135 IS iscol=a->col,isrow=a->row; 4136 PetscErrorCode ierr; 4137 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 4138 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4139 const MatScalar *aa=a->a,*v; 4140 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4141 const PetscScalar *b; 4142 4143 PetscFunctionBegin; 4144 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4145 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4146 t = a->solve_work; 4147 4148 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4149 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4150 4151 /* forward solve the lower triangular */ 4152 idx = 3*(*r++); 4153 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4154 for (i=1; i<n; i++) { 4155 v = aa + 9*ai[i]; 4156 vi = aj + ai[i]; 4157 nz = diag[i] - ai[i]; 4158 idx = 3*(*r++); 4159 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4160 while (nz--) { 4161 idx = 3*(*vi++); 4162 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4163 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4164 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4165 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4166 v += 9; 4167 } 4168 idx = 3*i; 4169 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4170 } 4171 /* backward solve the upper triangular */ 4172 for (i=n-1; i>=0; i--){ 4173 v = aa + 9*diag[i] + 9; 4174 vi = aj + diag[i] + 1; 4175 nz = ai[i+1] - diag[i] - 1; 4176 idt = 3*i; 4177 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4178 while (nz--) { 4179 idx = 3*(*vi++); 4180 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4181 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4182 s2 -= v[1]*x1 + v[4]*x2 + 
v[7]*x3; 4183 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4184 v += 9; 4185 } 4186 idc = 3*(*c--); 4187 v = aa + 9*diag[i]; 4188 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4189 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4190 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4191 } 4192 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4193 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4194 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4195 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4196 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4197 PetscFunctionReturn(0); 4198 } 4199 4200 #undef __FUNCT__ 4201 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct" 4202 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx) 4203 { 4204 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4205 IS iscol=a->col,isrow=a->row; 4206 PetscErrorCode ierr; 4207 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 4208 const PetscInt *r,*c,*rout,*cout; 4209 const MatScalar *aa=a->a,*v; 4210 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4211 const PetscScalar *b; 4212 4213 PetscFunctionBegin; 4214 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4215 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4216 t = a->solve_work; 4217 4218 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4219 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4220 4221 /* forward solve the lower triangular */ 4222 idx = 3*r[0]; 4223 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4224 for (i=1; i<n; i++) { 4225 v = aa + 9*ai[i]; 4226 vi = aj + ai[i]; 4227 nz = ai[i+1] - ai[i]; 4228 idx = 3*r[i]; 4229 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4230 for(m=0;m<nz;m++){ 4231 idx = 3*vi[m]; 4232 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4233 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4234 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4235 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4236 v += 9; 4237 } 4238 idx = 3*i; 4239 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4240 } 4241 /* 
backward solve the upper triangular */ 4242 for (i=n-1; i>=0; i--){ 4243 k = 2*n-i; 4244 v = aa + 9*ai[k]; 4245 vi = aj + ai[k]; 4246 nz = ai[k +1] - ai[k] - 1; 4247 idt = 3*i; 4248 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4249 for(m=0;m<nz;m++){ 4250 idx = 3*vi[m]; 4251 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4252 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4253 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4254 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4255 v += 9; 4256 } 4257 idc = 3*c[i]; 4258 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4259 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4260 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4261 } 4262 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4263 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4264 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4265 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4266 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4267 PetscFunctionReturn(0); 4268 } 4269 4270 #undef __FUNCT__ 4271 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct_v2" 4272 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4273 { 4274 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4275 IS iscol=a->col,isrow=a->row; 4276 PetscErrorCode ierr; 4277 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 4278 const PetscInt *r,*c,*rout,*cout; 4279 const MatScalar *aa=a->a,*v; 4280 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4281 const PetscScalar *b; 4282 4283 PetscFunctionBegin; 4284 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4285 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4286 t = a->solve_work; 4287 4288 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4289 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4290 4291 /* forward solve the lower triangular */ 4292 idx = 3*r[0]; 4293 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4294 for (i=1; i<n; i++) { 4295 v = aa + 9*ai[i]; 4296 vi = aj + ai[i]; 4297 nz = ai[i+1] - ai[i]; 
4298 idx = 3*r[i]; 4299 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4300 for(m=0;m<nz;m++){ 4301 idx = 3*vi[m]; 4302 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4303 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4304 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4305 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4306 v += 9; 4307 } 4308 idx = 3*i; 4309 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4310 } 4311 /* backward solve the upper triangular */ 4312 for (i=n-1; i>=0; i--){ 4313 v = aa + 9*(adiag[i+1]+1); 4314 vi = aj + adiag[i+1]+1; 4315 nz = adiag[i] - adiag[i+1] - 1; 4316 idt = 3*i; 4317 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4318 for(m=0;m<nz;m++){ 4319 idx = 3*vi[m]; 4320 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4321 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4322 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4323 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4324 v += 9; 4325 } 4326 idc = 3*c[i]; 4327 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4328 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4329 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4330 } 4331 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4332 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4333 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4334 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4335 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4336 PetscFunctionReturn(0); 4337 } 4338 4339 /* 4340 Special case where the matrix was ILU(0) factored in the natural 4341 ordering. This eliminates the need for the column and row permutation. 
4342 */ 4343 #undef __FUNCT__ 4344 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 4345 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 4346 { 4347 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4348 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4349 PetscErrorCode ierr; 4350 PetscInt *diag = a->diag; 4351 const MatScalar *aa=a->a,*v; 4352 PetscScalar *x,s1,s2,s3,x1,x2,x3; 4353 const PetscScalar *b; 4354 PetscInt jdx,idt,idx,nz,*vi,i; 4355 4356 PetscFunctionBegin; 4357 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4358 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4359 4360 /* forward solve the lower triangular */ 4361 idx = 0; 4362 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 4363 for (i=1; i<n; i++) { 4364 v = aa + 9*ai[i]; 4365 vi = aj + ai[i]; 4366 nz = diag[i] - ai[i]; 4367 idx += 3; 4368 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4369 while (nz--) { 4370 jdx = 3*(*vi++); 4371 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4372 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4373 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4374 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4375 v += 9; 4376 } 4377 x[idx] = s1; 4378 x[1+idx] = s2; 4379 x[2+idx] = s3; 4380 } 4381 /* backward solve the upper triangular */ 4382 for (i=n-1; i>=0; i--){ 4383 v = aa + 9*diag[i] + 9; 4384 vi = aj + diag[i] + 1; 4385 nz = ai[i+1] - diag[i] - 1; 4386 idt = 3*i; 4387 s1 = x[idt]; s2 = x[1+idt]; 4388 s3 = x[2+idt]; 4389 while (nz--) { 4390 idx = 3*(*vi++); 4391 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 4392 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4393 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4394 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4395 v += 9; 4396 } 4397 v = aa + 9*diag[i]; 4398 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4399 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4400 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4401 } 4402 4403 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4404 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4405 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4406 
PetscFunctionReturn(0); 4407 } 4408 4409 #undef __FUNCT__ 4410 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct" 4411 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4412 { 4413 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4414 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 4415 PetscErrorCode ierr; 4416 PetscInt idx,jdx,idt; 4417 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4418 const MatScalar *aa=a->a,*v; 4419 PetscScalar *x; 4420 const PetscScalar *b; 4421 PetscScalar s1,s2,s3,x1,x2,x3; 4422 4423 PetscFunctionBegin; 4424 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4425 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4426 /* forward solve the lower triangular */ 4427 idx = 0; 4428 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4429 for (i=1; i<n; i++) { 4430 v = aa + bs2*ai[i]; 4431 vi = aj + ai[i]; 4432 nz = ai[i+1] - ai[i]; 4433 idx = bs*i; 4434 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4435 for(k=0;k<nz;k++){ 4436 jdx = bs*vi[k]; 4437 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4438 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4439 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4440 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4441 4442 v += bs2; 4443 } 4444 4445 x[idx] = s1; 4446 x[1+idx] = s2; 4447 x[2+idx] = s3; 4448 } 4449 4450 /* backward solve the upper triangular */ 4451 for (i=n-1; i>=0; i--){ 4452 v = aa + bs2*ai[2*n-i]; 4453 vi = aj + ai[2*n-i]; 4454 nz = ai[2*n-i +1] - ai[2*n-i]-1; 4455 idt = bs*i; 4456 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4457 4458 for(k=0;k<nz;k++){ 4459 idx = bs*vi[k]; 4460 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4461 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4462 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4463 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4464 4465 v += bs2; 4466 } 4467 /* x = inv_diagonal*x */ 4468 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4469 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4470 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4471 4472 } 4473 4474 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4475 
ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4476 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4477 PetscFunctionReturn(0); 4478 } 4479 4480 #undef __FUNCT__ 4481 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2" 4482 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4483 { 4484 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4485 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 4486 PetscErrorCode ierr; 4487 PetscInt idx,jdx,idt; 4488 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4489 const MatScalar *aa=a->a,*v; 4490 PetscScalar *x; 4491 const PetscScalar *b; 4492 PetscScalar s1,s2,s3,x1,x2,x3; 4493 4494 PetscFunctionBegin; 4495 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4496 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4497 /* forward solve the lower triangular */ 4498 idx = 0; 4499 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4500 for (i=1; i<n; i++) { 4501 v = aa + bs2*ai[i]; 4502 vi = aj + ai[i]; 4503 nz = ai[i+1] - ai[i]; 4504 idx = bs*i; 4505 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4506 for(k=0;k<nz;k++){ 4507 jdx = bs*vi[k]; 4508 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4509 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4510 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4511 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4512 4513 v += bs2; 4514 } 4515 4516 x[idx] = s1; 4517 x[1+idx] = s2; 4518 x[2+idx] = s3; 4519 } 4520 4521 /* backward solve the upper triangular */ 4522 for (i=n-1; i>=0; i--){ 4523 v = aa + bs2*(adiag[i+1]+1); 4524 vi = aj + adiag[i+1]+1; 4525 nz = adiag[i] - adiag[i+1]-1; 4526 idt = bs*i; 4527 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4528 4529 for(k=0;k<nz;k++){ 4530 idx = bs*vi[k]; 4531 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4532 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4533 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4534 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4535 4536 v += bs2; 4537 } 4538 /* x = inv_diagonal*x */ 4539 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4540 x[1+idt] = v[1]*s1 + 
v[4]*s2 + v[7]*s3; 4541 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4542 4543 } 4544 4545 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4546 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4547 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4548 PetscFunctionReturn(0); 4549 } 4550 4551 #undef __FUNCT__ 4552 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 4553 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 4554 { 4555 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4556 IS iscol=a->col,isrow=a->row; 4557 PetscErrorCode ierr; 4558 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 4559 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4560 const MatScalar *aa=a->a,*v; 4561 PetscScalar *x,s1,s2,x1,x2,*t; 4562 const PetscScalar *b; 4563 4564 PetscFunctionBegin; 4565 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4566 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4567 t = a->solve_work; 4568 4569 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4570 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4571 4572 /* forward solve the lower triangular */ 4573 idx = 2*(*r++); 4574 t[0] = b[idx]; t[1] = b[1+idx]; 4575 for (i=1; i<n; i++) { 4576 v = aa + 4*ai[i]; 4577 vi = aj + ai[i]; 4578 nz = diag[i] - ai[i]; 4579 idx = 2*(*r++); 4580 s1 = b[idx]; s2 = b[1+idx]; 4581 while (nz--) { 4582 idx = 2*(*vi++); 4583 x1 = t[idx]; x2 = t[1+idx]; 4584 s1 -= v[0]*x1 + v[2]*x2; 4585 s2 -= v[1]*x1 + v[3]*x2; 4586 v += 4; 4587 } 4588 idx = 2*i; 4589 t[idx] = s1; t[1+idx] = s2; 4590 } 4591 /* backward solve the upper triangular */ 4592 for (i=n-1; i>=0; i--){ 4593 v = aa + 4*diag[i] + 4; 4594 vi = aj + diag[i] + 1; 4595 nz = ai[i+1] - diag[i] - 1; 4596 idt = 2*i; 4597 s1 = t[idt]; s2 = t[1+idt]; 4598 while (nz--) { 4599 idx = 2*(*vi++); 4600 x1 = t[idx]; x2 = t[1+idx]; 4601 s1 -= v[0]*x1 + v[2]*x2; 4602 s2 -= v[1]*x1 + v[3]*x2; 4603 v += 4; 4604 } 4605 idc = 2*(*c--); 4606 v = aa + 4*diag[i]; 4607 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4608 
x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 4609 } 4610 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4611 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4612 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4613 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4614 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4615 PetscFunctionReturn(0); 4616 } 4617 4618 #undef __FUNCT__ 4619 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct" 4620 PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx) 4621 { 4622 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4623 IS iscol=a->col,isrow=a->row; 4624 PetscErrorCode ierr; 4625 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,jdx,idt,idc,k,m; 4626 const PetscInt *r,*c,*rout,*cout; 4627 const MatScalar *aa=a->a,*v; 4628 PetscScalar *x,s1,s2,x1,x2,*t; 4629 const PetscScalar *b; 4630 4631 PetscFunctionBegin; 4632 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4633 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4634 t = a->solve_work; 4635 4636 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4637 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4638 4639 /* forward solve the lower triangular */ 4640 idx = 2*r[0]; 4641 t[0] = b[idx]; t[1] = b[1+idx]; 4642 for (i=1; i<n; i++) { 4643 v = aa + 4*ai[i]; 4644 vi = aj + ai[i]; 4645 nz = ai[i+1] - ai[i]; 4646 idx = 2*r[i]; 4647 s1 = b[idx]; s2 = b[1+idx]; 4648 for(m=0;m<nz;m++){ 4649 jdx = 2*vi[m]; 4650 x1 = t[jdx]; x2 = t[1+jdx]; 4651 s1 -= v[0]*x1 + v[2]*x2; 4652 s2 -= v[1]*x1 + v[3]*x2; 4653 v += 4; 4654 } 4655 idx = 2*i; 4656 t[idx] = s1; t[1+idx] = s2; 4657 } 4658 /* backward solve the upper triangular */ 4659 for (i=n-1; i>=0; i--){ 4660 k = 2*n-i; 4661 v = aa + 4*ai[k]; 4662 vi = aj + ai[k]; 4663 nz = ai[k +1] - ai[k] - 1; 4664 idt = 2*i; 4665 s1 = t[idt]; s2 = t[1+idt]; 4666 for(m=0;m<nz;m++){ 4667 idx = 2*vi[m]; 4668 x1 = t[idx]; x2 = t[1+idx]; 4669 s1 -= v[0]*x1 + v[2]*x2; 4670 s2 -= v[1]*x1 + v[3]*x2; 4671 v += 4; 4672 
} 4673 idc = 2*c[i]; 4674 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4675 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 4676 } 4677 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4678 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4679 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4680 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4681 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4682 PetscFunctionReturn(0); 4683 } 4684 4685 #undef __FUNCT__ 4686 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct_v2" 4687 PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4688 { 4689 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4690 IS iscol=a->col,isrow=a->row; 4691 PetscErrorCode ierr; 4692 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m; 4693 const PetscInt *r,*c,*rout,*cout; 4694 const MatScalar *aa=a->a,*v; 4695 PetscScalar *x,s1,s2,x1,x2,*t; 4696 const PetscScalar *b; 4697 4698 PetscFunctionBegin; 4699 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4700 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4701 t = a->solve_work; 4702 4703 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4704 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4705 4706 /* forward solve the lower triangular */ 4707 idx = 2*r[0]; 4708 t[0] = b[idx]; t[1] = b[1+idx]; 4709 for (i=1; i<n; i++) { 4710 v = aa + 4*ai[i]; 4711 vi = aj + ai[i]; 4712 nz = ai[i+1] - ai[i]; 4713 idx = 2*r[i]; 4714 s1 = b[idx]; s2 = b[1+idx]; 4715 for(m=0;m<nz;m++){ 4716 jdx = 2*vi[m]; 4717 x1 = t[jdx]; x2 = t[1+jdx]; 4718 s1 -= v[0]*x1 + v[2]*x2; 4719 s2 -= v[1]*x1 + v[3]*x2; 4720 v += 4; 4721 } 4722 idx = 2*i; 4723 t[idx] = s1; t[1+idx] = s2; 4724 } 4725 /* backward solve the upper triangular */ 4726 for (i=n-1; i>=0; i--){ 4727 v = aa + 4*(adiag[i+1]+1); 4728 vi = aj + adiag[i+1]+1; 4729 nz = adiag[i] - adiag[i+1] - 1; 4730 idt = 2*i; 4731 s1 = t[idt]; s2 = t[1+idt]; 4732 for(m=0;m<nz;m++){ 4733 idx = 2*vi[m]; 4734 x1 = t[idx]; 
x2 = t[1+idx]; 4735 s1 -= v[0]*x1 + v[2]*x2; 4736 s2 -= v[1]*x1 + v[3]*x2; 4737 v += 4; 4738 } 4739 idc = 2*c[i]; 4740 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4741 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 4742 } 4743 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4744 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4745 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4746 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4747 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4748 PetscFunctionReturn(0); 4749 } 4750 4751 /* 4752 Special case where the matrix was ILU(0) factored in the natural 4753 ordering. This eliminates the need for the column and row permutation. 4754 */ 4755 #undef __FUNCT__ 4756 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 4757 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 4758 { 4759 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4760 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4761 PetscErrorCode ierr; 4762 PetscInt *diag = a->diag; 4763 const MatScalar *aa=a->a,*v; 4764 PetscScalar *x,s1,s2,x1,x2; 4765 const PetscScalar *b; 4766 PetscInt jdx,idt,idx,nz,*vi,i; 4767 4768 PetscFunctionBegin; 4769 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4770 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4771 4772 /* forward solve the lower triangular */ 4773 idx = 0; 4774 x[0] = b[0]; x[1] = b[1]; 4775 for (i=1; i<n; i++) { 4776 v = aa + 4*ai[i]; 4777 vi = aj + ai[i]; 4778 nz = diag[i] - ai[i]; 4779 idx += 2; 4780 s1 = b[idx];s2 = b[1+idx]; 4781 while (nz--) { 4782 jdx = 2*(*vi++); 4783 x1 = x[jdx];x2 = x[1+jdx]; 4784 s1 -= v[0]*x1 + v[2]*x2; 4785 s2 -= v[1]*x1 + v[3]*x2; 4786 v += 4; 4787 } 4788 x[idx] = s1; 4789 x[1+idx] = s2; 4790 } 4791 /* backward solve the upper triangular */ 4792 for (i=n-1; i>=0; i--){ 4793 v = aa + 4*diag[i] + 4; 4794 vi = aj + diag[i] + 1; 4795 nz = ai[i+1] - diag[i] - 1; 4796 idt = 2*i; 4797 s1 = x[idt]; s2 = x[1+idt]; 4798 while (nz--) { 4799 idx = 2*(*vi++); 4800 x1 = 
x[idx]; x2 = x[1+idx]; 4801 s1 -= v[0]*x1 + v[2]*x2; 4802 s2 -= v[1]*x1 + v[3]*x2; 4803 v += 4; 4804 } 4805 v = aa + 4*diag[i]; 4806 x[idt] = v[0]*s1 + v[2]*s2; 4807 x[1+idt] = v[1]*s1 + v[3]*s2; 4808 } 4809 4810 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4811 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4812 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4813 PetscFunctionReturn(0); 4814 } 4815 4816 #undef __FUNCT__ 4817 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct" 4818 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4819 { 4820 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4821 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 4822 PetscErrorCode ierr; 4823 PetscInt jdx; 4824 const MatScalar *aa=a->a,*v; 4825 PetscScalar *x,s1,s2,x1,x2; 4826 const PetscScalar *b; 4827 4828 PetscFunctionBegin; 4829 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4830 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4831 /* forward solve the lower triangular */ 4832 idx = 0; 4833 x[0] = b[idx]; x[1] = b[1+idx]; 4834 for (i=1; i<n; i++) { 4835 v = aa + 4*ai[i]; 4836 vi = aj + ai[i]; 4837 nz = ai[i+1] - ai[i]; 4838 idx = 2*i; 4839 s1 = b[idx];s2 = b[1+idx]; 4840 for(k=0;k<nz;k++){ 4841 jdx = 2*vi[k]; 4842 x1 = x[jdx];x2 = x[1+jdx]; 4843 s1 -= v[0]*x1 + v[2]*x2; 4844 s2 -= v[1]*x1 + v[3]*x2; 4845 v += 4; 4846 } 4847 x[idx] = s1; 4848 x[1+idx] = s2; 4849 } 4850 4851 /* backward solve the upper triangular */ 4852 for (i=n-1; i>=0; i--){ 4853 v = aa + 4*ai[2*n-i]; 4854 vi = aj + ai[2*n-i]; 4855 nz = ai[2*n-i +1] - ai[2*n-i]-1; 4856 idt = 2*i; 4857 s1 = x[idt]; s2 = x[1+idt]; 4858 for(k=0;k<nz;k++){ 4859 idx = 2*vi[k]; 4860 x1 = x[idx]; x2 = x[1+idx]; 4861 s1 -= v[0]*x1 + v[2]*x2; 4862 s2 -= v[1]*x1 + v[3]*x2; 4863 v += 4; 4864 } 4865 /* x = inv_diagonal*x */ 4866 x[idt] = v[0]*s1 + v[2]*s2; 4867 x[1+idt] = v[1]*s1 + v[3]*s2; 4868 } 4869 4870 ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4871 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4872 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4873 PetscFunctionReturn(0); 4874 } 4875 4876 #undef __FUNCT__ 4877 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2" 4878 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4879 { 4880 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4881 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 4882 PetscErrorCode ierr; 4883 PetscInt jdx; 4884 const MatScalar *aa=a->a,*v; 4885 PetscScalar *x,s1,s2,x1,x2; 4886 const PetscScalar *b; 4887 4888 PetscFunctionBegin; 4889 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4890 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4891 /* forward solve the lower triangular */ 4892 idx = 0; 4893 x[0] = b[idx]; x[1] = b[1+idx]; 4894 for (i=1; i<n; i++) { 4895 v = aa + 4*ai[i]; 4896 vi = aj + ai[i]; 4897 nz = ai[i+1] - ai[i]; 4898 idx = 2*i; 4899 s1 = b[idx];s2 = b[1+idx]; 4900 for(k=0;k<nz;k++){ 4901 jdx = 2*vi[k]; 4902 x1 = x[jdx];x2 = x[1+jdx]; 4903 s1 -= v[0]*x1 + v[2]*x2; 4904 s2 -= v[1]*x1 + v[3]*x2; 4905 v += 4; 4906 } 4907 x[idx] = s1; 4908 x[1+idx] = s2; 4909 } 4910 4911 /* backward solve the upper triangular */ 4912 for (i=n-1; i>=0; i--){ 4913 v = aa + 4*(adiag[i+1]+1); 4914 vi = aj + adiag[i+1]+1; 4915 nz = adiag[i] - adiag[i+1]-1; 4916 idt = 2*i; 4917 s1 = x[idt]; s2 = x[1+idt]; 4918 for(k=0;k<nz;k++){ 4919 idx = 2*vi[k]; 4920 x1 = x[idx]; x2 = x[1+idx]; 4921 s1 -= v[0]*x1 + v[2]*x2; 4922 s2 -= v[1]*x1 + v[3]*x2; 4923 v += 4; 4924 } 4925 /* x = inv_diagonal*x */ 4926 x[idt] = v[0]*s1 + v[2]*s2; 4927 x[1+idt] = v[1]*s1 + v[3]*s2; 4928 } 4929 4930 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4931 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4932 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4933 PetscFunctionReturn(0); 4934 } 4935 4936 #undef 
__FUNCT__ 4937 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 4938 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 4939 { 4940 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4941 IS iscol=a->col,isrow=a->row; 4942 PetscErrorCode ierr; 4943 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 4944 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4945 MatScalar *aa=a->a,*v; 4946 PetscScalar *x,*b,s1,*t; 4947 4948 PetscFunctionBegin; 4949 if (!n) PetscFunctionReturn(0); 4950 4951 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4952 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4953 t = a->solve_work; 4954 4955 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4956 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4957 4958 /* forward solve the lower triangular */ 4959 t[0] = b[*r++]; 4960 for (i=1; i<n; i++) { 4961 v = aa + ai[i]; 4962 vi = aj + ai[i]; 4963 nz = diag[i] - ai[i]; 4964 s1 = b[*r++]; 4965 while (nz--) { 4966 s1 -= (*v++)*t[*vi++]; 4967 } 4968 t[i] = s1; 4969 } 4970 /* backward solve the upper triangular */ 4971 for (i=n-1; i>=0; i--){ 4972 v = aa + diag[i] + 1; 4973 vi = aj + diag[i] + 1; 4974 nz = ai[i+1] - diag[i] - 1; 4975 s1 = t[i]; 4976 while (nz--) { 4977 s1 -= (*v++)*t[*vi++]; 4978 } 4979 x[*c--] = t[i] = aa[diag[i]]*s1; 4980 } 4981 4982 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4983 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4984 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4985 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4986 ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 4987 PetscFunctionReturn(0); 4988 } 4989 /* 4990 Special case where the matrix was ILU(0) factored in the natural 4991 ordering. This eliminates the need for the column and row permutation. 
4992 */ 4993 #undef __FUNCT__ 4994 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 4995 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 4996 { 4997 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4998 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4999 PetscErrorCode ierr; 5000 PetscInt *diag = a->diag; 5001 MatScalar *aa=a->a; 5002 PetscScalar *x,*b; 5003 PetscScalar s1,x1; 5004 MatScalar *v; 5005 PetscInt jdx,idt,idx,nz,*vi,i; 5006 5007 PetscFunctionBegin; 5008 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5009 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5010 5011 /* forward solve the lower triangular */ 5012 idx = 0; 5013 x[0] = b[0]; 5014 for (i=1; i<n; i++) { 5015 v = aa + ai[i]; 5016 vi = aj + ai[i]; 5017 nz = diag[i] - ai[i]; 5018 idx += 1; 5019 s1 = b[idx]; 5020 while (nz--) { 5021 jdx = *vi++; 5022 x1 = x[jdx]; 5023 s1 -= v[0]*x1; 5024 v += 1; 5025 } 5026 x[idx] = s1; 5027 } 5028 /* backward solve the upper triangular */ 5029 for (i=n-1; i>=0; i--){ 5030 v = aa + diag[i] + 1; 5031 vi = aj + diag[i] + 1; 5032 nz = ai[i+1] - diag[i] - 1; 5033 idt = i; 5034 s1 = x[idt]; 5035 while (nz--) { 5036 idx = *vi++; 5037 x1 = x[idx]; 5038 s1 -= v[0]*x1; 5039 v += 1; 5040 } 5041 v = aa + diag[i]; 5042 x[idt] = v[0]*s1; 5043 } 5044 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5045 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5046 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 5047 PetscFunctionReturn(0); 5048 } 5049 5050 /* ----------------------------------------------------------------*/ 5051 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 5052 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 5053 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization_newdatastruct(Mat,PetscTruth); 5054 5055 #undef __FUNCT__ 5056 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 5057 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 5058 { 
5059 Mat C=B; 5060 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 5061 IS isrow = b->row,isicol = b->icol; 5062 PetscErrorCode ierr; 5063 const PetscInt *r,*ic,*ics; 5064 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 5065 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 5066 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 5067 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 5068 MatScalar *v_work; 5069 PetscTruth col_identity,row_identity,both_identity; 5070 5071 PetscFunctionBegin; 5072 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5073 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5074 5075 ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 5076 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 5077 ics = ic; 5078 5079 /* generate work space needed by dense LU factorization */ 5080 ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 5081 5082 for (i=0; i<n; i++){ 5083 /* zero rtmp */ 5084 /* L part */ 5085 nz = bi[i+1] - bi[i]; 5086 bjtmp = bj + bi[i]; 5087 for (j=0; j<nz; j++){ 5088 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5089 } 5090 5091 /* U part */ 5092 nz = bdiag[i] - bdiag[i+1]; 5093 bjtmp = bj + bdiag[i+1]+1; 5094 for (j=0; j<nz; j++){ 5095 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5096 } 5097 5098 /* load in initial (unfactored row) */ 5099 nz = ai[r[i]+1] - ai[r[i]]; 5100 ajtmp = aj + ai[r[i]]; 5101 v = aa + bs2*ai[r[i]]; 5102 for (j=0; j<nz; j++) { 5103 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 5104 } 5105 5106 /* elimination */ 5107 bjtmp = bj + bi[i]; 5108 nzL = bi[i+1] - bi[i]; 5109 for(k=0;k < nzL;k++) { 5110 row = bjtmp[k]; 5111 pc = rtmp + bs2*row; 5112 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 5113 if (flg) { 5114 pv = b->a + bs2*bdiag[row]; 5115 Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = 
*pc * (*pv); */ 5116 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 5117 pv = b->a + bs2*(bdiag[row+1]+1); 5118 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 5119 for (j=0; j<nz; j++) { 5120 Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 5121 } 5122 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 5123 } 5124 } 5125 5126 /* finished row so stick it into b->a */ 5127 /* L part */ 5128 pv = b->a + bs2*bi[i] ; 5129 pj = b->j + bi[i] ; 5130 nz = bi[i+1] - bi[i]; 5131 for (j=0; j<nz; j++) { 5132 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5133 } 5134 5135 /* Mark diagonal and invert diagonal for simplier triangular solves */ 5136 pv = b->a + bs2*bdiag[i]; 5137 pj = b->j + bdiag[i]; 5138 /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 5139 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5140 ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 5141 5142 /* U part */ 5143 pv = b->a + bs2*(bdiag[i+1]+1); 5144 pj = b->j + bdiag[i+1]+1; 5145 nz = bdiag[i] - bdiag[i+1] - 1; 5146 for (j=0; j<nz; j++){ 5147 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5148 } 5149 } 5150 5151 ierr = PetscFree(rtmp);CHKERRQ(ierr); 5152 ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 5153 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5154 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5155 5156 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5157 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 5158 both_identity = (PetscTruth) (row_identity && col_identity); 5159 if (both_identity){ 5160 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct_v2; 5161 } else { 5162 C->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct_v2; 5163 } 5164 5165 C->assembled = PETSC_TRUE; 5166 ierr = 
PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 5167 PetscFunctionReturn(0); 5168 } 5169 5170 /* 5171 ilu(0) with natural ordering under new data structure. 5172 See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description 5173 because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct(). 5174 */ 5175 5176 #undef __FUNCT__ 5177 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 5178 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5179 { 5180 5181 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5182 PetscErrorCode ierr; 5183 PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 5184 PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 5185 5186 PetscFunctionBegin; 5187 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 5188 b = (Mat_SeqBAIJ*)(fact)->data; 5189 5190 /* allocate matrix arrays for new data structure */ 5191 ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 5192 ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5193 b->singlemalloc = PETSC_TRUE; 5194 if (!b->diag){ 5195 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 5196 ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5197 } 5198 bdiag = b->diag; 5199 5200 if (n > 0) { 5201 ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 5202 } 5203 5204 /* set bi and bj with new data structure */ 5205 bi = b->i; 5206 bj = b->j; 5207 5208 /* L part */ 5209 bi[0] = 0; 5210 for (i=0; i<n; i++){ 5211 nz = adiag[i] - ai[i]; 5212 bi[i+1] = bi[i] + nz; 5213 aj = a->j + ai[i]; 5214 for (j=0; j<nz; j++){ 5215 *bj = aj[j]; bj++; 5216 } 5217 } 5218 5219 /* U part */ 5220 bi_temp = bi[n]; 5221 bdiag[n] = bi[n]-1; 5222 for (i=n-1; i>=0; 
i--){ 5223 nz = ai[i+1] - adiag[i] - 1; 5224 bi_temp = bi_temp + nz + 1; 5225 aj = a->j + adiag[i] + 1; 5226 for (j=0; j<nz; j++){ 5227 *bj = aj[j]; bj++; 5228 } 5229 /* diag[i] */ 5230 *bj = i; bj++; 5231 bdiag[i] = bi_temp - 1; 5232 } 5233 PetscFunctionReturn(0); 5234 } 5235 5236 #undef __FUNCT__ 5237 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct" 5238 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5239 { 5240 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5241 IS isicol; 5242 PetscErrorCode ierr; 5243 const PetscInt *r,*ic; 5244 PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 5245 PetscInt *bi,*cols,nnz,*cols_lvl; 5246 PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 5247 PetscInt i,levels,diagonal_fill; 5248 PetscTruth col_identity,row_identity,both_identity; 5249 PetscReal f; 5250 PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 5251 PetscBT lnkbt; 5252 PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 5253 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 5254 PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 5255 PetscTruth missing; 5256 PetscInt bs=A->rmap->bs,bs2=a->bs2; 5257 5258 PetscFunctionBegin; 5259 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 5260 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 5261 if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 5262 5263 f = info->fill; 5264 levels = (PetscInt)info->levels; 5265 diagonal_fill = (PetscInt)info->diagonal_fill; 5266 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 5267 5268 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5269 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5270 both_identity = (PetscTruth) (row_identity && col_identity); 5271 5272 if (!levels && both_identity) { 5273 /* special case: ilu(0) with natural ordering */ 5274 ierr = 
MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5275 ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr); 5276 5277 fact->factor = MAT_FACTOR_ILU; 5278 (fact)->info.factor_mallocs = 0; 5279 (fact)->info.fill_ratio_given = info->fill; 5280 (fact)->info.fill_ratio_needed = 1.0; 5281 b = (Mat_SeqBAIJ*)(fact)->data; 5282 b->row = isrow; 5283 b->col = iscol; 5284 b->icol = isicol; 5285 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5286 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5287 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5288 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5289 PetscFunctionReturn(0); 5290 } 5291 5292 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5293 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5294 5295 /* get new row pointers */ 5296 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 5297 bi[0] = 0; 5298 /* bdiag is location of diagonal in factor */ 5299 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 5300 bdiag[0] = 0; 5301 5302 ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 5303 5304 /* create a linked list for storing column indices of the active row */ 5305 nlnk = n + 1; 5306 ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5307 5308 /* initial FreeSpace size is f*(ai[n]+1) */ 5309 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 5310 current_space = free_space; 5311 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 5312 current_space_lvl = free_space_lvl; 5313 5314 for (i=0; i<n; i++) { 5315 nzi = 0; 5316 /* copy current row into linked list */ 5317 nnz = ai[r[i]+1] - ai[r[i]]; 5318 if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 5319 cols = aj + 
ai[r[i]]; 5320 lnk[i] = -1; /* marker to indicate if diagonal exists */ 5321 ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5322 nzi += nlnk; 5323 5324 /* make sure diagonal entry is included */ 5325 if (diagonal_fill && lnk[i] == -1) { 5326 fm = n; 5327 while (lnk[fm] < i) fm = lnk[fm]; 5328 lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 5329 lnk[fm] = i; 5330 lnk_lvl[i] = 0; 5331 nzi++; dcount++; 5332 } 5333 5334 /* add pivot rows into the active row */ 5335 nzbd = 0; 5336 prow = lnk[n]; 5337 while (prow < i) { 5338 nnz = bdiag[prow]; 5339 cols = bj_ptr[prow] + nnz + 1; 5340 cols_lvl = bjlvl_ptr[prow] + nnz + 1; 5341 nnz = bi[prow+1] - bi[prow] - nnz - 1; 5342 ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 5343 nzi += nlnk; 5344 prow = lnk[prow]; 5345 nzbd++; 5346 } 5347 bdiag[i] = nzbd; 5348 bi[i+1] = bi[i] + nzi; 5349 5350 /* if free space is not available, make more free space */ 5351 if (current_space->local_remaining<nzi) { 5352 nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 5353 ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 5354 ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 5355 reallocs++; 5356 } 5357 5358 /* copy data into free_space and free_space_lvl, then initialize lnk */ 5359 ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 5360 bj_ptr[i] = current_space->array; 5361 bjlvl_ptr[i] = current_space_lvl->array; 5362 5363 /* make sure the active row i has diagonal entry */ 5364 if (*(bj_ptr[i]+bdiag[i]) != i) { 5365 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 5366 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 5367 } 5368 5369 current_space->array += nzi; 5370 current_space->local_used += nzi; 5371 current_space->local_remaining -= nzi; 5372 current_space_lvl->array += nzi; 
5373 current_space_lvl->local_used += nzi; 5374 current_space_lvl->local_remaining -= nzi; 5375 } 5376 5377 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5378 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5379 5380 /* destroy list of free space and other temporary arrays */ 5381 ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 5382 5383 /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 5384 ierr = PetscFreeSpaceContiguous_LU_v2(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 5385 5386 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 5387 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 5388 ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 5389 5390 #if defined(PETSC_USE_INFO) 5391 { 5392 PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 5393 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 5394 ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5395 ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 5396 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5397 if (diagonal_fill) { 5398 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 5399 } 5400 } 5401 #endif 5402 5403 /* put together the new matrix */ 5404 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5405 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5406 b = (Mat_SeqBAIJ*)(fact)->data; 5407 b->free_a = PETSC_TRUE; 5408 b->free_ij = PETSC_TRUE; 5409 b->singlemalloc = PETSC_FALSE; 5410 ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 5411 b->j = bj; 5412 b->i = bi; 5413 b->diag = bdiag; 5414 b->free_diag = PETSC_TRUE; 5415 b->ilen = 0; 5416 b->imax = 0; 5417 b->row = isrow; 5418 b->col = iscol; 5419 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5420 ierr = 
PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5421 b->icol = isicol; 5422 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5423 /* In b structure: Free imax, ilen, old a, old j. 5424 Allocate bdiag, solve_work, new a, new j */ 5425 ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 5426 b->maxnz = b->nz = bdiag[0]+1; 5427 fact->info.factor_mallocs = reallocs; 5428 fact->info.fill_ratio_given = f; 5429 fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 5430 ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr); 5431 PetscFunctionReturn(0); 5432 } 5433 5434 5435 /* 5436 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 5437 except that the data structure of Mat_SeqAIJ is slightly different. 5438 Not a good example of code reuse. 5439 */ 5440 #undef __FUNCT__ 5441 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 5442 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5443 { 5444 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5445 IS isicol; 5446 PetscErrorCode ierr; 5447 const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 5448 PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 5449 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 5450 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 5451 PetscTruth col_identity,row_identity,both_identity,flg; 5452 PetscReal f; 5453 PetscTruth newdatastruct = PETSC_FALSE; 5454 5455 PetscFunctionBegin; 5456 ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 5457 if (newdatastruct){ 5458 ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5459 PetscFunctionReturn(0); 5460 } 5461 5462 ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 5463 if (flg) 
SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 5464 5465 f = info->fill; 5466 levels = (PetscInt)info->levels; 5467 diagonal_fill = (PetscInt)info->diagonal_fill; 5468 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 5469 5470 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5471 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5472 both_identity = (PetscTruth) (row_identity && col_identity); 5473 5474 if (!levels && both_identity) { /* special case copy the nonzero structure */ 5475 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 5476 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5477 5478 fact->factor = MAT_FACTOR_ILU; 5479 b = (Mat_SeqBAIJ*)fact->data; 5480 b->row = isrow; 5481 b->col = iscol; 5482 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5483 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5484 b->icol = isicol; 5485 b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 5486 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5487 PetscFunctionReturn(0); 5488 } 5489 5490 /* general case perform the symbolic factorization */ 5491 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5492 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5493 5494 /* get new row pointers */ 5495 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 5496 ainew[0] = 0; 5497 /* don't know how many column pointers are needed so estimate */ 5498 jmax = (PetscInt)(f*ai[n] + 1); 5499 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 5500 /* ajfill is level of fill for each fill entry */ 5501 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 5502 /* fill is a linked list of nonzeros in active row */ 5503 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 5504 /* im is level for each filled value */ 5505 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 5506 /* dloc is location of diagonal in factor */ 5507 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 5508 dloc[0] = 0; 5509 for (prow=0; prow<n; prow++) { 5510 5511 /* copy prow into linked list */ 5512 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 5513 if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 5514 xi = aj + ai[r[prow]]; 5515 fill[n] = n; 5516 fill[prow] = -1; /* marker for diagonal entry */ 5517 while (nz--) { 5518 fm = n; 5519 idx = ic[*xi++]; 5520 do { 5521 m = fm; 5522 fm = fill[m]; 5523 } while (fm < idx); 5524 fill[m] = idx; 5525 fill[idx] = fm; 5526 im[idx] = 0; 5527 } 5528 5529 /* make sure diagonal entry is included */ 5530 if (diagonal_fill && fill[prow] == -1) { 5531 fm = n; 5532 while (fill[fm] < prow) fm = fill[fm]; 5533 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 5534 fill[fm] = prow; 5535 im[prow] = 0; 5536 nzf++; 5537 dcount++; 5538 } 5539 5540 nzi = 0; 5541 row = 
fill[n]; 5542 while (row < prow) { 5543 incrlev = im[row] + 1; 5544 nz = dloc[row]; 5545 xi = ajnew + ainew[row] + nz + 1; 5546 flev = ajfill + ainew[row] + nz + 1; 5547 nnz = ainew[row+1] - ainew[row] - nz - 1; 5548 fm = row; 5549 while (nnz-- > 0) { 5550 idx = *xi++; 5551 if (*flev + incrlev > levels) { 5552 flev++; 5553 continue; 5554 } 5555 do { 5556 m = fm; 5557 fm = fill[m]; 5558 } while (fm < idx); 5559 if (fm != idx) { 5560 im[idx] = *flev + incrlev; 5561 fill[m] = idx; 5562 fill[idx] = fm; 5563 fm = idx; 5564 nzf++; 5565 } else { 5566 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 5567 } 5568 flev++; 5569 } 5570 row = fill[row]; 5571 nzi++; 5572 } 5573 /* copy new filled row into permanent storage */ 5574 ainew[prow+1] = ainew[prow] + nzf; 5575 if (ainew[prow+1] > jmax) { 5576 5577 /* estimate how much additional space we will need */ 5578 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 5579 /* just double the memory each time */ 5580 PetscInt maxadd = jmax; 5581 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 5582 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 5583 jmax += maxadd; 5584 5585 /* allocate a longer ajnew and ajfill */ 5586 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 5587 ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5588 ierr = PetscFree(ajnew);CHKERRQ(ierr); 5589 ajnew = xitmp; 5590 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 5591 ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5592 ierr = PetscFree(ajfill);CHKERRQ(ierr); 5593 ajfill = xitmp; 5594 reallocate++; /* count how many reallocations are needed */ 5595 } 5596 xitmp = ajnew + ainew[prow]; 5597 flev = ajfill + ainew[prow]; 5598 dloc[prow] = nzi; 5599 fm = fill[n]; 5600 while (nzf--) { 5601 *xitmp++ = fm; 5602 *flev++ = im[fm]; 5603 fm = fill[fm]; 5604 } 5605 /* make sure row has diagonal entry */ 5606 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 5607 
SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 5608 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 5609 } 5610 } 5611 ierr = PetscFree(ajfill);CHKERRQ(ierr); 5612 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5613 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5614 ierr = PetscFree(fill);CHKERRQ(ierr); 5615 ierr = PetscFree(im);CHKERRQ(ierr); 5616 5617 #if defined(PETSC_USE_INFO) 5618 { 5619 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 5620 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 5621 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5622 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 5623 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5624 if (diagonal_fill) { 5625 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 5626 } 5627 } 5628 #endif 5629 5630 /* put together the new matrix */ 5631 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5632 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5633 b = (Mat_SeqBAIJ*)fact->data; 5634 b->free_a = PETSC_TRUE; 5635 b->free_ij = PETSC_TRUE; 5636 b->singlemalloc = PETSC_FALSE; 5637 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 5638 b->j = ajnew; 5639 b->i = ainew; 5640 for (i=0; i<n; i++) dloc[i] += ainew[i]; 5641 b->diag = dloc; 5642 b->free_diag = PETSC_TRUE; 5643 b->ilen = 0; 5644 b->imax = 0; 5645 b->row = isrow; 5646 b->col = iscol; 5647 b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 5648 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5649 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5650 b->icol = isicol; 5651 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5652 /* In b structure: Free imax, ilen, old a, old j. 5653 Allocate dloc, solve_work, new a, new j */ 5654 ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 5655 b->maxnz = b->nz = ainew[n]; 5656 5657 fact->info.factor_mallocs = reallocate; 5658 fact->info.fill_ratio_given = f; 5659 fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 5660 5661 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5662 PetscFunctionReturn(0); 5663 } 5664 5665 #undef __FUNCT__ 5666 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 5667 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 5668 { 5669 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 5670 /* int i,*AJ=a->j,nz=a->nz; */ 5671 PetscFunctionBegin; 5672 /* Undo Column scaling */ 5673 /* while (nz--) { */ 5674 /* AJ[i] = AJ[i]/4; */ 5675 /* } */ 5676 /* This should really invoke a push/pop logic, but we don't have that yet. */ 5677 A->ops->setunfactored = PETSC_NULL; 5678 PetscFunctionReturn(0); 5679 } 5680 5681 #undef __FUNCT__ 5682 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 5683 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 5684 { 5685 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5686 PetscInt *AJ=a->j,nz=a->nz; 5687 unsigned short *aj=(unsigned short *)AJ; 5688 PetscFunctionBegin; 5689 /* Is this really necessary? */ 5690 while (nz--) { 5691 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 5692 } 5693 A->ops->setunfactored = PETSC_NULL; 5694 PetscFunctionReturn(0); 5695 } 5696 5697 5698