1 #define PETSCMAT_DLL 2 3 4 /* 5 Factorization code for BAIJ format. 6 */ 7 8 #include "../src/mat/impls/baij/seq/baij.h" 9 #include "../src/mat/blockinvert.h" 10 #include "petscbt.h" 11 #include "../src/mat/utils/freespace.h" 12 13 #undef __FUNCT__ 14 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 15 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 16 { 17 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 18 PetscErrorCode ierr; 19 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 20 PetscInt *diag = a->diag; 21 MatScalar *aa=a->a,*v; 22 PetscScalar s1,*x,*b; 23 24 PetscFunctionBegin; 25 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 26 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 27 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28 29 /* forward solve the U^T */ 30 for (i=0; i<n; i++) { 31 32 v = aa + diag[i]; 33 /* multiply by the inverse of the block diagonal */ 34 s1 = (*v++)*x[i]; 35 vi = aj + diag[i] + 1; 36 nz = ai[i+1] - diag[i] - 1; 37 while (nz--) { 38 x[*vi++] -= (*v++)*s1; 39 } 40 x[i] = s1; 41 } 42 /* backward solve the L^T */ 43 for (i=n-1; i>=0; i--){ 44 v = aa + diag[i] - 1; 45 vi = aj + diag[i] - 1; 46 nz = diag[i] - ai[i]; 47 s1 = x[i]; 48 while (nz--) { 49 x[*vi--] -= (*v--)*s1; 50 } 51 } 52 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 53 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 54 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 55 PetscFunctionReturn(0); 56 } 57 58 #undef __FUNCT__ 59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 61 { 62 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 63 PetscErrorCode ierr; 64 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 65 PetscInt *diag = a->diag,oidx; 66 MatScalar *aa=a->a,*v; 67 PetscScalar s1,s2,x1,x2; 68 PetscScalar *x,*b; 69 70 PetscFunctionBegin; 71 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 72 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 73 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 74 75 /* forward solve the U^T */ 76 idx = 0; 77 for (i=0; i<n; i++) { 78 79 v = aa + 4*diag[i]; 80 /* multiply by the inverse of the block diagonal */ 81 x1 = x[idx]; x2 = x[1+idx]; 82 s1 = v[0]*x1 + v[1]*x2; 83 s2 = v[2]*x1 + v[3]*x2; 84 v += 4; 85 86 vi = aj + diag[i] + 1; 87 nz = ai[i+1] - diag[i] - 1; 88 while (nz--) { 89 oidx = 2*(*vi++); 90 x[oidx] -= v[0]*s1 + v[1]*s2; 91 x[oidx+1] -= v[2]*s1 + v[3]*s2; 92 v += 4; 93 } 94 x[idx] = s1;x[1+idx] = s2; 95 idx += 2; 96 } 97 /* backward solve the L^T */ 98 for (i=n-1; i>=0; i--){ 99 v = aa + 4*diag[i] - 4; 100 vi = aj + diag[i] - 1; 101 nz = diag[i] - ai[i]; 102 idt = 2*i; 103 s1 = x[idt]; s2 = x[1+idt]; 104 while (nz--) { 105 idx = 2*(*vi--); 106 x[idx] -= v[0]*s1 + v[1]*s2; 107 x[idx+1] -= v[2]*s1 + v[3]*s2; 108 v -= 4; 109 } 110 } 111 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 112 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 113 ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 114 PetscFunctionReturn(0); 115 } 116 117 #undef __FUNCT__ 118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_newdatastruct" 119 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 120 { 121 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 122 PetscErrorCode ierr; 123 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 124 PetscInt nz,idx,idt,j,i,oidx; 125 PetscInt bs=A->rmap->bs,bs2=a->bs2; 126 MatScalar *aa=a->a,*v; 127 PetscScalar s1,s2,x1,x2; 128 PetscScalar *x,*b; 129 130 PetscFunctionBegin; 131 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 132 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 133 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 134 135 /* forward solve the U^T */ 136 idx = 0; 137 for (i=0; i<n; i++) { 138 v = aa + bs2*diag[i]; 139 /* multiply by the inverse of the block diagonal */ 140 x1 = x[idx]; x2 = x[1+idx]; 141 s1 = v[0]*x1 + v[1]*x2; 142 s2 = v[2]*x1 + v[3]*x2; 143 v -= bs2; 144 145 vi = aj + diag[i] - 1; 146 nz = diag[i] - diag[i+1] - 1; 147 for(j=0;j>-nz;j--){ 148 oidx = bs*vi[j]; 149 x[oidx] -= v[0]*s1 + v[1]*s2; 150 x[oidx+1] -= v[2]*s1 + v[3]*s2; 151 v -= bs2; 152 } 153 x[idx] = s1;x[1+idx] = s2; 154 idx += bs; 155 } 156 /* backward solve the L^T */ 157 for (i=n-1; i>=0; i--){ 158 v = aa + bs2*ai[i]; 159 vi = aj + ai[i]; 160 nz = ai[i+1] - ai[i]; 161 idt = bs*i; 162 s1 = x[idt]; s2 = x[1+idt]; 163 for(j=0;j<nz;j++){ 164 idx = bs*vi[j]; 165 x[idx] -= v[0]*s1 + v[1]*s2; 166 x[idx+1] -= v[2]*s1 + v[3]*s2; 167 v += bs2; 168 } 169 } 170 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 171 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 172 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 173 PetscFunctionReturn(0); 174 } 175 176 #undef __FUNCT__ 177 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 178 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 179 { 180 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 181 PetscErrorCode ierr; 182 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 183 PetscInt *diag = a->diag,oidx; 184 MatScalar *aa=a->a,*v; 185 PetscScalar s1,s2,s3,x1,x2,x3; 186 PetscScalar *x,*b; 187 188 PetscFunctionBegin; 189 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 190 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 191 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 192 193 /* forward solve the U^T */ 194 idx = 0; 195 for (i=0; i<n; i++) { 196 197 v = aa + 9*diag[i]; 198 /* multiply by the inverse of the block diagonal */ 199 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 200 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 201 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 202 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 203 v += 9; 204 205 vi = aj + diag[i] + 1; 206 nz = ai[i+1] - diag[i] - 1; 207 while (nz--) { 208 oidx = 3*(*vi++); 209 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 210 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 211 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 212 v += 9; 213 } 214 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 215 idx += 3; 216 } 217 /* backward solve the L^T */ 218 for (i=n-1; i>=0; i--){ 219 v = aa + 9*diag[i] - 9; 220 vi = aj + diag[i] - 1; 221 nz = diag[i] - ai[i]; 222 idt = 3*i; 223 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 224 while (nz--) { 225 idx = 3*(*vi--); 226 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 227 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 228 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 229 v -= 9; 230 } 231 } 232 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 233 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 234 ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 235 PetscFunctionReturn(0); 236 } 237 238 #undef __FUNCT__ 239 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 240 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 241 { 242 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 243 PetscErrorCode ierr; 244 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 245 PetscInt *diag = a->diag,oidx; 246 MatScalar *aa=a->a,*v; 247 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 248 PetscScalar *x,*b; 249 250 PetscFunctionBegin; 251 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 252 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 253 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 254 255 /* forward solve the U^T */ 256 idx = 0; 257 for (i=0; i<n; i++) { 258 259 v = aa + 16*diag[i]; 260 /* multiply by the inverse of the block diagonal */ 261 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 262 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 263 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 264 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 265 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 266 v += 16; 267 268 vi = aj + diag[i] + 1; 269 nz = ai[i+1] - diag[i] - 1; 270 while (nz--) { 271 oidx = 4*(*vi++); 272 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 273 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 274 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 275 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 276 v += 16; 277 } 278 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 279 idx += 4; 280 } 281 /* backward solve the L^T */ 282 for (i=n-1; i>=0; i--){ 283 v = aa + 16*diag[i] - 16; 284 vi = aj + diag[i] - 1; 285 nz = diag[i] - ai[i]; 286 idt = 4*i; 287 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 288 while (nz--) { 289 idx = 4*(*vi--); 290 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 291 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 292 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 293 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 294 v -= 16; 295 } 296 } 297 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 298 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 299 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 300 PetscFunctionReturn(0); 301 } 302 303 #undef __FUNCT__ 304 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 305 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 306 { 307 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 308 PetscErrorCode ierr; 309 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 310 PetscInt *diag = a->diag,oidx; 311 MatScalar *aa=a->a,*v; 312 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 313 PetscScalar *x,*b; 314 315 PetscFunctionBegin; 316 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 317 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 318 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 319 320 /* forward solve the U^T */ 321 idx = 0; 322 for (i=0; i<n; i++) { 323 324 v = aa + 25*diag[i]; 325 /* multiply by the inverse of the block diagonal */ 326 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 327 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 328 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 329 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 330 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 331 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 332 v += 25; 333 334 vi = aj + diag[i] + 1; 335 nz = ai[i+1] - diag[i] - 1; 336 while (nz--) { 337 oidx = 5*(*vi++); 338 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 339 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 340 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 341 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 342 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 343 v += 25; 344 } 345 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 346 idx += 5; 347 } 348 /* backward solve the L^T */ 349 for (i=n-1; i>=0; i--){ 350 v = aa + 25*diag[i] - 25; 351 vi = aj + diag[i] - 1; 352 nz = diag[i] - ai[i]; 353 idt = 5*i; 354 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 355 while (nz--) { 356 idx = 5*(*vi--); 357 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 358 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 359 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 360 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 361 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 362 v -= 25; 363 } 364 } 365 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 366 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 367 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 368 PetscFunctionReturn(0); 369 } 370 371 #undef __FUNCT__ 372 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 373 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 374 { 375 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 376 PetscErrorCode ierr; 377 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 378 PetscInt *diag = a->diag,oidx; 379 MatScalar *aa=a->a,*v; 380 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 381 PetscScalar *x,*b; 382 383 PetscFunctionBegin; 384 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 385 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 386 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 387 388 /* forward solve the U^T */ 389 idx = 0; 390 for (i=0; i<n; i++) { 391 392 v = aa + 36*diag[i]; 393 /* multiply by the inverse of the block diagonal */ 394 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 395 x6 = x[5+idx]; 396 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 397 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 398 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 399 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 400 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 401 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 402 v += 36; 403 404 vi = aj + diag[i] + 1; 405 nz = ai[i+1] - diag[i] - 1; 406 while (nz--) { 407 oidx = 6*(*vi++); 408 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 409 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 410 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 411 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 412 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 413 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 414 v += 36; 415 } 416 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 417 x[5+idx] = s6; 418 idx += 6; 419 } 420 /* backward solve the L^T */ 421 for (i=n-1; i>=0; i--){ 422 v = aa + 36*diag[i] - 36; 423 vi = aj + diag[i] - 1; 424 nz = diag[i] - ai[i]; 425 idt = 6*i; 426 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 427 s6 = x[5+idt]; 428 while (nz--) { 429 idx = 6*(*vi--); 430 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 431 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 432 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 433 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 434 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 435 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 436 v -= 36; 437 } 438 } 439 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 440 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 441 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 442 PetscFunctionReturn(0); 443 } 444 445 #undef __FUNCT__ 446 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 447 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 448 { 449 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 450 PetscErrorCode ierr; 451 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 452 PetscInt *diag = a->diag,oidx; 453 MatScalar *aa=a->a,*v; 454 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 455 PetscScalar *x,*b; 456 457 PetscFunctionBegin; 458 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 459 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 460 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 461 462 /* forward solve the U^T */ 463 idx = 0; 464 for (i=0; i<n; i++) { 465 466 v = aa + 49*diag[i]; 467 /* multiply by the inverse of the block diagonal */ 468 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 469 x6 = x[5+idx]; x7 = x[6+idx]; 470 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 471 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 472 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 473 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 474 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 475 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 476 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 477 v += 49; 478 479 vi = aj + diag[i] + 1; 480 nz = ai[i+1] - diag[i] - 1; 481 while (nz--) { 482 oidx = 7*(*vi++); 483 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 484 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 485 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 486 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 487 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 488 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 489 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 490 v += 49; 491 } 492 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 493 x[5+idx] = s6;x[6+idx] = s7; 494 idx += 7; 495 } 496 /* backward solve the L^T */ 497 for (i=n-1; i>=0; i--){ 498 v = aa + 49*diag[i] - 49; 499 vi = aj + diag[i] - 1; 500 nz = diag[i] - ai[i]; 501 idt = 7*i; 502 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 503 s6 = x[5+idt];s7 = x[6+idt]; 504 while (nz--) { 505 idx = 7*(*vi--); 506 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 507 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 508 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 509 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 510 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 511 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 512 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 513 v -= 49; 514 } 515 } 516 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 517 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 518 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 519 PetscFunctionReturn(0); 520 } 521 522 /*---------------------------------------------------------------------------------------------*/ 523 #undef __FUNCT__ 524 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 525 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 526 { 527 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 528 IS iscol=a->col,isrow=a->row; 529 PetscErrorCode ierr; 530 const PetscInt *r,*c,*rout,*cout; 531 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 532 PetscInt *diag = a->diag; 533 MatScalar *aa=a->a,*v; 534 PetscScalar s1,*x,*b,*t; 535 536 PetscFunctionBegin; 537 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 538 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 539 t = a->solve_work; 540 541 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 542 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 543 544 /* copy the b into temp work space according to permutation */ 545 for (i=0; i<n; i++) { 546 t[i] = b[c[i]]; 547 } 548 549 /* forward solve the U^T */ 550 for (i=0; i<n; i++) { 551 552 v = aa + diag[i]; 553 /* multiply by the inverse of the block diagonal */ 554 s1 = (*v++)*t[i]; 555 vi = aj + diag[i] + 1; 556 nz = ai[i+1] - diag[i] - 1; 557 while (nz--) { 558 t[*vi++] -= (*v++)*s1; 559 } 560 t[i] = s1; 561 } 562 /* backward solve the L^T */ 563 for (i=n-1; i>=0; i--){ 564 v = aa + diag[i] - 1; 565 vi = aj + diag[i] - 1; 566 nz = diag[i] - ai[i]; 567 s1 = t[i]; 568 while (nz--) { 569 t[*vi--] -= (*v--)*s1; 570 } 571 } 572 573 /* copy t into x according to permutation */ 574 for (i=0; i<n; i++) { 575 x[r[i]] = t[i]; 576 } 577 578 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 579 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 580 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 581 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 582 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 583 PetscFunctionReturn(0); 584 } 585 586 #undef __FUNCT__ 587 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 588 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 589 { 590 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 591 IS iscol=a->col,isrow=a->row; 592 PetscErrorCode ierr; 593 const PetscInt *r,*c,*rout,*cout; 594 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 595 PetscInt *diag = a->diag,ii,ic,ir,oidx; 596 MatScalar *aa=a->a,*v; 597 PetscScalar s1,s2,x1,x2; 598 PetscScalar *x,*b,*t; 599 600 PetscFunctionBegin; 601 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 602 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 603 t = a->solve_work; 604 605 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 606 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 607 608 /* copy the b into temp work space according to permutation */ 609 ii = 0; 610 for (i=0; i<n; i++) { 611 ic = 2*c[i]; 612 t[ii] = b[ic]; 613 t[ii+1] = b[ic+1]; 614 ii += 2; 615 } 616 617 /* forward solve the U^T */ 618 idx = 0; 619 for (i=0; i<n; i++) { 620 621 v = aa + 4*diag[i]; 622 /* multiply by the inverse of the block diagonal */ 623 x1 = t[idx]; x2 = t[1+idx]; 624 s1 = v[0]*x1 + v[1]*x2; 625 s2 = v[2]*x1 + v[3]*x2; 626 v += 4; 627 628 vi = aj + diag[i] + 1; 629 nz = ai[i+1] - diag[i] - 1; 630 while (nz--) { 631 oidx = 2*(*vi++); 632 t[oidx] -= v[0]*s1 + v[1]*s2; 633 t[oidx+1] -= v[2]*s1 + v[3]*s2; 634 v += 4; 635 } 636 t[idx] = s1;t[1+idx] = s2; 637 idx += 2; 638 } 639 /* backward solve the L^T */ 640 for (i=n-1; i>=0; i--){ 641 v = aa + 4*diag[i] - 4; 642 vi = aj + diag[i] - 1; 643 nz = diag[i] - ai[i]; 644 idt = 2*i; 645 s1 = t[idt]; s2 = t[1+idt]; 646 while (nz--) { 647 idx = 2*(*vi--); 648 t[idx] -= v[0]*s1 + v[1]*s2; 649 t[idx+1] -= v[2]*s1 + v[3]*s2; 650 v -= 4; 651 } 652 } 653 654 /* copy t into x according to permutation */ 655 ii = 0; 656 for (i=0; i<n; i++) { 657 ir = 2*r[i]; 658 x[ir] = t[ii]; 659 x[ir+1] = t[ii+1]; 660 ii += 2; 661 } 662 663 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 664 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 665 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 666 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 667 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 668 PetscFunctionReturn(0); 669 } 670 671 #undef __FUNCT__ 672 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 673 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 674 { 675 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 676 IS iscol=a->col,isrow=a->row; 677 PetscErrorCode ierr; 678 const PetscInt *r,*c,*rout,*cout; 679 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 680 PetscInt *diag = a->diag,ii,ic,ir,oidx; 681 MatScalar *aa=a->a,*v; 682 PetscScalar s1,s2,s3,x1,x2,x3; 683 PetscScalar *x,*b,*t; 684 685 PetscFunctionBegin; 686 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 687 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 688 t = a->solve_work; 689 690 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 691 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 692 693 /* copy the b into temp work space according to permutation */ 694 ii = 0; 695 for (i=0; i<n; i++) { 696 ic = 3*c[i]; 697 t[ii] = b[ic]; 698 t[ii+1] = b[ic+1]; 699 t[ii+2] = b[ic+2]; 700 ii += 3; 701 } 702 703 /* forward solve the U^T */ 704 idx = 0; 705 for (i=0; i<n; i++) { 706 707 v = aa + 9*diag[i]; 708 /* multiply by the inverse of the block diagonal */ 709 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 710 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 711 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 712 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 713 v += 9; 714 715 vi = aj + diag[i] + 1; 716 nz = ai[i+1] - diag[i] - 1; 717 while (nz--) { 718 oidx = 3*(*vi++); 719 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 720 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 721 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 722 v += 9; 723 } 724 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 725 idx += 3; 726 } 727 /* backward solve the L^T */ 728 for (i=n-1; i>=0; i--){ 729 v = aa + 9*diag[i] - 9; 730 vi = aj + diag[i] - 1; 731 nz = diag[i] - ai[i]; 732 idt = 3*i; 733 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 734 while (nz--) { 735 idx = 3*(*vi--); 736 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 737 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 738 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 739 v -= 9; 740 } 741 } 742 743 /* copy t into x according to permutation */ 744 ii = 0; 745 for (i=0; i<n; i++) { 746 ir = 3*r[i]; 747 x[ir] = t[ii]; 748 x[ir+1] = t[ii+1]; 749 x[ir+2] = t[ii+2]; 750 ii += 3; 751 } 752 753 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 754 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 755 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 756 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 757 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 758 PetscFunctionReturn(0); 759 } 760 761 #undef __FUNCT__ 762 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 763 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 764 { 765 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 766 IS iscol=a->col,isrow=a->row; 767 PetscErrorCode ierr; 768 const PetscInt *r,*c,*rout,*cout; 769 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 770 PetscInt *diag = a->diag,ii,ic,ir,oidx; 771 MatScalar *aa=a->a,*v; 772 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 773 PetscScalar *x,*b,*t; 774 775 PetscFunctionBegin; 776 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 777 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 778 t = a->solve_work; 779 780 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 781 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 782 783 /* copy the b into temp work space according to permutation */ 784 ii = 0; 785 for (i=0; i<n; i++) { 786 ic = 4*c[i]; 787 t[ii] = b[ic]; 788 t[ii+1] = b[ic+1]; 789 t[ii+2] = b[ic+2]; 790 t[ii+3] = b[ic+3]; 791 ii += 4; 792 } 793 794 /* forward solve the U^T */ 795 idx = 0; 796 for (i=0; i<n; i++) { 797 798 v = aa + 16*diag[i]; 799 /* multiply by the inverse of the block diagonal */ 800 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 801 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 802 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 803 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 804 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 805 v += 16; 806 807 vi = aj + diag[i] + 1; 808 nz = ai[i+1] - diag[i] - 1; 809 while (nz--) { 810 oidx = 4*(*vi++); 811 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 812 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 813 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 814 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 815 v += 16; 816 } 817 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 818 idx += 4; 819 } 820 /* backward solve the L^T */ 821 for (i=n-1; i>=0; i--){ 822 v = aa + 16*diag[i] - 16; 823 vi = aj + diag[i] - 1; 824 nz = diag[i] - ai[i]; 825 idt = 4*i; 826 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 827 while (nz--) { 828 idx = 4*(*vi--); 829 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 830 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 831 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 832 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 833 v -= 16; 834 } 835 } 836 837 /* copy t into x according to permutation */ 838 ii = 0; 839 for (i=0; i<n; i++) { 840 ir = 4*r[i]; 841 x[ir] = t[ii]; 842 x[ir+1] = t[ii+1]; 843 x[ir+2] = t[ii+2]; 844 x[ir+3] = t[ii+3]; 845 ii += 4; 846 } 847 848 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 849 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 850 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 851 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 852 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 853 PetscFunctionReturn(0); 854 } 855 856 #undef __FUNCT__ 857 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 858 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 859 { 860 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 861 IS iscol=a->col,isrow=a->row; 862 PetscErrorCode ierr; 863 const PetscInt *r,*c,*rout,*cout; 864 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 865 PetscInt *diag = a->diag,ii,ic,ir,oidx; 866 MatScalar *aa=a->a,*v; 867 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 868 PetscScalar *x,*b,*t; 869 870 PetscFunctionBegin; 871 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 872 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 873 t = a->solve_work; 874 875 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 876 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 877 878 /* copy the b into temp work space according to permutation */ 879 ii = 0; 880 for (i=0; i<n; i++) { 881 ic = 5*c[i]; 882 t[ii] = b[ic]; 883 t[ii+1] = b[ic+1]; 884 t[ii+2] = b[ic+2]; 885 t[ii+3] = b[ic+3]; 886 t[ii+4] = b[ic+4]; 887 ii += 5; 888 } 889 890 /* forward solve the U^T */ 891 idx = 0; 892 for (i=0; i<n; i++) { 893 894 v = aa + 25*diag[i]; 895 /* multiply by the inverse of the block diagonal */ 896 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 897 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 898 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 899 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 900 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 901 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 902 v += 25; 903 904 vi = aj + diag[i] + 1; 905 nz = ai[i+1] - diag[i] - 1; 906 while (nz--) { 907 oidx = 5*(*vi++); 908 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 909 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 910 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 911 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 912 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 913 v += 25; 914 } 915 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 916 idx += 5; 917 } 918 /* backward solve the L^T */ 919 for (i=n-1; i>=0; i--){ 920 v = aa + 25*diag[i] - 25; 921 vi = aj + diag[i] - 1; 922 nz = diag[i] - ai[i]; 923 idt = 5*i; 924 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 925 while (nz--) { 926 idx = 5*(*vi--); 927 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 928 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 929 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 930 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 931 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 932 v -= 25; 933 } 934 } 935 936 /* copy t into x according to permutation */ 937 ii = 0; 938 for (i=0; i<n; i++) { 939 ir = 5*r[i]; 940 x[ir] = t[ii]; 941 x[ir+1] = t[ii+1]; 942 x[ir+2] = t[ii+2]; 943 x[ir+3] = t[ii+3]; 944 x[ir+4] = t[ii+4]; 945 ii += 5; 946 } 947 948 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 949 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 950 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 951 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 952 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 953 PetscFunctionReturn(0); 954 } 955 956 #undef __FUNCT__ 957 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 958 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 959 { 960 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 961 IS iscol=a->col,isrow=a->row; 962 PetscErrorCode ierr; 963 const PetscInt *r,*c,*rout,*cout; 964 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 965 PetscInt *diag = a->diag,ii,ic,ir,oidx; 966 MatScalar *aa=a->a,*v; 967 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 968 PetscScalar *x,*b,*t; 969 970 PetscFunctionBegin; 971 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 972 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 973 t = a->solve_work; 974 975 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 976 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 977 978 /* copy the b into temp work space according to permutation */ 979 ii = 0; 980 for (i=0; i<n; i++) { 981 ic = 6*c[i]; 982 t[ii] = b[ic]; 983 t[ii+1] = b[ic+1]; 984 t[ii+2] = b[ic+2]; 985 t[ii+3] = b[ic+3]; 986 t[ii+4] = b[ic+4]; 987 t[ii+5] = b[ic+5]; 988 ii += 6; 989 } 990 991 /* forward solve the U^T */ 992 idx = 0; 993 for (i=0; i<n; i++) { 994 995 v = aa + 36*diag[i]; 996 /* multiply by the inverse of the block diagonal */ 997 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 998 x6 = t[5+idx]; 999 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1000 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1001 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1002 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1003 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1004 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1005 v += 36; 1006 1007 vi = aj + diag[i] + 1; 1008 nz = ai[i+1] - diag[i] - 1; 1009 while (nz--) { 1010 oidx = 6*(*vi++); 1011 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1012 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1013 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1014 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1015 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1016 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1017 v += 36; 1018 } 1019 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1020 t[5+idx] = s6; 1021 idx += 6; 1022 } 1023 /* backward solve the L^T */ 1024 for (i=n-1; i>=0; i--){ 1025 v = aa + 36*diag[i] - 36; 1026 vi = aj + diag[i] - 1; 1027 nz = diag[i] - ai[i]; 1028 idt = 6*i; 1029 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1030 s6 = t[5+idt]; 1031 while (nz--) { 1032 idx = 6*(*vi--); 1033 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1034 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1035 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1036 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1037 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1038 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1039 v -= 36; 1040 } 1041 } 1042 1043 /* copy t into x according to permutation */ 1044 ii = 0; 1045 for (i=0; i<n; i++) { 1046 ir = 6*r[i]; 1047 x[ir] = t[ii]; 1048 x[ir+1] = t[ii+1]; 1049 x[ir+2] = t[ii+2]; 1050 x[ir+3] = t[ii+3]; 1051 x[ir+4] = t[ii+4]; 1052 x[ir+5] = t[ii+5]; 1053 ii += 6; 1054 } 1055 1056 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1057 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1058 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1059 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1060 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1061 PetscFunctionReturn(0); 1062 } 1063 1064 #undef __FUNCT__ 1065 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1066 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1067 { 1068 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1069 IS iscol=a->col,isrow=a->row; 1070 PetscErrorCode ierr; 1071 const PetscInt *r,*c,*rout,*cout; 1072 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1073 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1074 MatScalar *aa=a->a,*v; 1075 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1076 PetscScalar *x,*b,*t; 1077 1078 PetscFunctionBegin; 1079 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1080 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1081 t = a->solve_work; 1082 1083 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1084 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1085 1086 /* copy the b into temp work space according to permutation */ 1087 ii = 0; 1088 for (i=0; i<n; i++) { 1089 ic = 7*c[i]; 1090 t[ii] = b[ic]; 1091 t[ii+1] = b[ic+1]; 1092 t[ii+2] = b[ic+2]; 1093 t[ii+3] = b[ic+3]; 1094 t[ii+4] = b[ic+4]; 1095 t[ii+5] = b[ic+5]; 1096 t[ii+6] = b[ic+6]; 1097 ii += 7; 1098 } 1099 1100 /* forward solve the U^T */ 1101 idx = 0; 1102 for (i=0; i<n; i++) { 1103 1104 v = aa + 49*diag[i]; 1105 /* multiply by the inverse of the block diagonal */ 1106 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1107 x6 = t[5+idx]; x7 = t[6+idx]; 1108 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1109 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1110 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1111 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1112 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1113 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1114 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1115 v += 49; 1116 1117 vi = aj + diag[i] + 1; 1118 nz = ai[i+1] - diag[i] - 1; 1119 while (nz--) { 1120 oidx = 7*(*vi++); 1121 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1122 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1123 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1124 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1125 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1126 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1127 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1128 v += 49; 1129 } 1130 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1131 t[5+idx] = s6;t[6+idx] = s7; 1132 idx += 7; 1133 } 1134 /* backward solve the L^T */ 1135 for (i=n-1; i>=0; i--){ 1136 v = aa + 49*diag[i] - 49; 1137 vi = aj + diag[i] - 1; 1138 nz = diag[i] - ai[i]; 1139 idt = 7*i; 1140 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1141 s6 = t[5+idt];s7 = t[6+idt]; 1142 while (nz--) { 1143 idx = 7*(*vi--); 1144 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1145 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1146 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1147 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1148 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1149 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1150 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1151 v -= 49; 1152 } 1153 } 1154 1155 /* copy t into x according to permutation */ 1156 ii = 0; 1157 for (i=0; i<n; i++) { 1158 ir = 7*r[i]; 1159 x[ir] = t[ii]; 1160 x[ir+1] = t[ii+1]; 1161 x[ir+2] = t[ii+2]; 1162 x[ir+3] = t[ii+3]; 1163 x[ir+4] = t[ii+4]; 1164 x[ir+5] = t[ii+5]; 1165 x[ir+6] = t[ii+6]; 1166 ii += 7; 1167 } 1168 1169 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1170 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1171 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1172 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1173 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1174 PetscFunctionReturn(0); 1175 } 1176 1177 /* ----------------------------------------------------------- */ 1178 #undef __FUNCT__ 1179 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1180 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1181 { 1182 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1183 IS iscol=a->col,isrow=a->row; 1184 PetscErrorCode ierr; 1185 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 1186 PetscInt i,n=a->mbs; 1187 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 1188 MatScalar *aa=a->a,*v; 1189 PetscScalar *x,*b,*s,*t,*ls; 1190 1191 PetscFunctionBegin; 1192 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1193 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1194 t = a->solve_work; 1195 1196 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1197 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1198 1199 /* forward solve the lower triangular */ 1200 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1201 for (i=1; i<n; i++) { 1202 v = aa + bs2*ai[i]; 1203 vi = aj + ai[i]; 1204 nz = a->diag[i] - ai[i]; 1205 s = t + bs*i; 1206 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1207 while (nz--) { 1208 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1209 v += bs2; 1210 } 1211 } 1212 /* backward solve the upper triangular */ 1213 ls = a->solve_work + A->cmap->n; 1214 for (i=n-1; i>=0; i--){ 1215 v = aa + bs2*(a->diag[i] + 1); 1216 vi = aj + a->diag[i] + 1; 1217 nz = ai[i+1] - a->diag[i] - 1; 1218 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1219 while (nz--) { 1220 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1221 v += bs2; 1222 } 1223 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1224 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1225 } 1226 1227 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1228 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1229 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1230 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1231 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 1232 PetscFunctionReturn(0); 1233 } 1234 1235 /* ----------------------------------------------------------- */ 1236 #undef __FUNCT__ 1237 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N" 1238 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1239 { 1240 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1241 IS iscol=a->col,isrow=a->row; 1242 PetscErrorCode ierr; 1243 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 1244 PetscInt i,n=a->mbs,j; 1245 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 1246 const MatScalar *aa=a->a,*v; 1247 PetscScalar *x,*t,*ls; 1248 const PetscScalar *b; 1249 PetscFunctionBegin; 1250 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1251 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1252 t = a->solve_work; 1253 1254 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1255 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1256 1257 /* copy the b into temp work space according to permutation */ 1258 for (i=0; i<n; i++) { 1259 for (j=0; j<bs; j++) { 1260 t[i*bs+j] = b[c[i]*bs+j]; 1261 } 1262 } 1263 1264 1265 /* forward solve the upper triangular transpose */ 1266 ls = a->solve_work + A->cmap->n; 1267 for (i=0; i<n; i++){ 1268 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1269 Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1270 v = aa + bs2*(a->diag[i] + 1); 1271 vi = aj + a->diag[i] + 1; 1272 nz = ai[i+1] - a->diag[i] - 1; 1273 while (nz--) { 1274 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 1275 v += bs2; 1276 } 1277 } 1278 1279 /* backward solve the lower triangular transpose */ 1280 for (i=n-1; i>=0; i--) { 1281 v = aa + bs2*ai[i]; 1282 vi = aj + ai[i]; 1283 nz = a->diag[i] - ai[i]; 1284 while (nz--) { 1285 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 1286 v += bs2; 1287 } 1288 } 1289 1290 /* copy t into x according to permutation */ 1291 for (i=0; i<n; i++) { 1292 for (j=0; j<bs; j++) { 1293 x[bs*r[i]+j] = t[bs*i+j]; 1294 } 1295 } 1296 1297 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1298 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1299 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1300 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1301 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 1302 PetscFunctionReturn(0); 1303 } 1304 1305 #undef __FUNCT__ 1306 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1307 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1308 { 1309 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1310 IS iscol=a->col,isrow=a->row; 1311 PetscErrorCode ierr; 1312 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 1313 PetscInt i,n=a->mbs,nz,idx,idt,idc; 1314 MatScalar *aa=a->a,*v; 1315 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1316 PetscScalar *x,*b,*t; 1317 1318 PetscFunctionBegin; 1319 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1320 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1321 t = a->solve_work; 1322 1323 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1324 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1325 1326 /* forward solve the lower triangular */ 1327 idx = 7*(*r++); 1328 t[0] = b[idx]; t[1] = b[1+idx]; 1329 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1330 t[5] = b[5+idx]; t[6] = b[6+idx]; 1331 1332 for (i=1; i<n; i++) { 1333 v = aa + 49*ai[i]; 1334 vi = aj + ai[i]; 1335 nz = diag[i] - ai[i]; 1336 idx = 7*(*r++); 1337 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1338 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1339 while (nz--) { 1340 idx = 7*(*vi++); 1341 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1342 x4 = t[3+idx];x5 = t[4+idx]; 1343 x6 = t[5+idx];x7 = t[6+idx]; 1344 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1345 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1346 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1347 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1348 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1349 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1350 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1351 v += 49; 1352 } 1353 idx = 7*i; 1354 t[idx] = s1;t[1+idx] = s2; 1355 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1356 t[5+idx] = s6;t[6+idx] = s7; 1357 } 1358 /* backward solve the upper triangular */ 1359 for (i=n-1; i>=0; i--){ 1360 v = aa + 49*diag[i] + 49; 1361 vi = aj + diag[i] + 1; 1362 nz = ai[i+1] - diag[i] - 1; 1363 idt = 7*i; 1364 s1 = t[idt]; s2 = t[1+idt]; 1365 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1366 s6 = t[5+idt];s7 = t[6+idt]; 1367 while (nz--) { 1368 idx = 7*(*vi++); 1369 x1 = t[idx]; x2 = t[1+idx]; 1370 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1371 x6 = t[5+idx]; x7 = t[6+idx]; 1372 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1373 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1374 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1375 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1376 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1377 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1378 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1379 v += 49; 1380 } 1381 idc = 7*(*c--); 1382 v = aa + 49*diag[i]; 1383 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1384 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1385 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1386 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1387 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1388 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1389 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1390 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1391 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1392 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1393 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1394 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1395 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1396 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1397 } 1398 1399 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1400 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1401 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1402 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1403 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1404 PetscFunctionReturn(0); 1405 } 1406 1407 #undef __FUNCT__ 1408 #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct" 1409 PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx) 1410 { 1411 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1412 IS iscol=a->col,isrow=a->row; 1413 PetscErrorCode ierr; 1414 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi; 1415 PetscInt i,n=a->mbs,nz,idx,idt,idc,m; 1416 MatScalar *aa=a->a,*v; 1417 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1418 PetscScalar *x,*b,*t; 1419 1420 PetscFunctionBegin; 1421 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1422 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1423 t = a->solve_work; 1424 1425 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1426 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1427 1428 /* forward solve the lower triangular */ 1429 idx = 7*r[0]; 1430 t[0] = b[idx]; t[1] = b[1+idx]; 1431 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1432 t[5] = b[5+idx]; t[6] = b[6+idx]; 1433 1434 for (i=1; i<n; i++) { 1435 v = aa + 49*ai[i]; 1436 vi = aj + ai[i]; 1437 nz = ai[i+1] - ai[i]; 1438 idx = 7*r[i]; 1439 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1440 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1441 for(m=0;m<nz;m++){ 1442 idx = 7*vi[m]; 1443 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1444 x4 = t[3+idx];x5 = t[4+idx]; 1445 x6 = t[5+idx];x7 = t[6+idx]; 1446 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1447 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1448 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1449 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1450 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1451 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1452 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1453 v += 49; 1454 } 1455 idx = 7*i; 1456 t[idx] = s1;t[1+idx] = s2; 1457 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1458 t[5+idx] = s6;t[6+idx] = s7; 1459 } 1460 /* backward solve the upper triangular */ 1461 for (i=n-1; i>=0; i--){ 1462 v = aa + 49*(adiag[i+1]+1); 1463 vi = aj + adiag[i+1]+1; 1464 nz = adiag[i] - adiag[i+1] - 1; 1465 idt = 7*i; 1466 s1 = t[idt]; s2 = t[1+idt]; 1467 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1468 s6 = t[5+idt];s7 = t[6+idt]; 1469 for(m=0;m<nz;m++){ 1470 idx = 7*vi[m]; 1471 x1 = t[idx]; x2 = t[1+idx]; 1472 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1473 x6 = t[5+idx]; x7 = t[6+idx]; 1474 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1475 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1476 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1477 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1478 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1479 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1480 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1481 v += 49; 1482 } 1483 idc = 7*c[i]; 1484 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1485 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1486 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1487 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1488 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1489 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1490 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1491 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1492 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1493 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1494 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1495 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1496 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1497 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1498 } 1499 1500 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1501 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1502 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1503 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1504 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1505 PetscFunctionReturn(0); 1506 } 1507 1508 #undef __FUNCT__ 1509 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1510 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1511 { 1512 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1513 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1514 PetscErrorCode ierr; 1515 PetscInt *diag = a->diag,jdx; 1516 const MatScalar *aa=a->a,*v; 1517 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1518 const PetscScalar *b; 1519 1520 PetscFunctionBegin; 1521 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1522 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1523 /* forward solve the lower triangular */ 1524 idx = 0; 1525 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1526 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1527 x[6] = b[6+idx]; 1528 for (i=1; i<n; i++) { 1529 v = aa + 49*ai[i]; 1530 vi = aj + ai[i]; 1531 nz = diag[i] - ai[i]; 1532 idx = 7*i; 1533 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1534 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1535 s7 = b[6+idx]; 1536 while (nz--) { 1537 jdx = 7*(*vi++); 1538 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1539 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1540 x7 = x[6+jdx]; 1541 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1542 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1543 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1544 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1545 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1546 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1547 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1548 v += 49; 1549 } 1550 x[idx] = s1; 1551 x[1+idx] = s2; 1552 x[2+idx] = s3; 1553 x[3+idx] = s4; 1554 x[4+idx] = s5; 1555 x[5+idx] = s6; 1556 x[6+idx] = s7; 1557 } 1558 /* backward solve the upper triangular */ 1559 for (i=n-1; i>=0; i--){ 1560 v = aa + 49*diag[i] + 49; 1561 vi = aj + diag[i] + 1; 1562 nz = ai[i+1] - diag[i] - 1; 1563 idt = 7*i; 1564 s1 = x[idt]; s2 = x[1+idt]; 1565 s3 = x[2+idt]; s4 = x[3+idt]; 1566 s5 = x[4+idt]; s6 = x[5+idt]; 1567 s7 = x[6+idt]; 1568 while (nz--) { 1569 idx = 7*(*vi++); 1570 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1571 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1572 x7 = x[6+idx]; 1573 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1574 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1575 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1576 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1577 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1578 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1579 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1580 v += 49; 1581 } 1582 v = aa + 49*diag[i]; 1583 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1584 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1585 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1586 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1587 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1588 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1589 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1590 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1591 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1592 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1593 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1594 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1595 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1596 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1597 } 1598 1599 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1600 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1601 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1602 PetscFunctionReturn(0); 1603 } 1604 1605 #undef __FUNCT__ 1606 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct" 1607 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1608 { 1609 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1610 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 1611 PetscErrorCode ierr; 1612 PetscInt idx,jdx,idt; 1613 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1614 const MatScalar *aa=a->a,*v; 1615 PetscScalar *x; 1616 const PetscScalar *b; 1617 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1618 1619 PetscFunctionBegin; 1620 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1621 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1622 /* forward solve the lower triangular */ 1623 idx = 0; 1624 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1625 x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 1626 for (i=1; i<n; i++) { 1627 v = aa + bs2*ai[i]; 1628 vi = aj + ai[i]; 1629 nz = ai[i+1] - ai[i]; 1630 idx = bs*i; 1631 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1632 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1633 for(k=0;k<nz;k++) { 1634 jdx = bs*vi[k]; 1635 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1636 x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 1637 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1638 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1639 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1640 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1641 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1642 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1643 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1644 v += bs2; 1645 } 1646 1647 x[idx] = s1; 1648 x[1+idx] = s2; 1649 x[2+idx] = s3; 1650 x[3+idx] = s4; 1651 x[4+idx] = s5; 1652 x[5+idx] = s6; 1653 x[6+idx] = s7; 1654 } 1655 1656 /* backward solve the upper triangular */ 1657 for (i=n-1; i>=0; i--){ 1658 v = aa + bs2*(adiag[i+1]+1); 1659 vi = aj + adiag[i+1]+1; 1660 nz = adiag[i] - adiag[i+1]-1; 1661 idt = bs*i; 1662 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1663 s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 1664 for(k=0;k<nz;k++) { 1665 idx = bs*vi[k]; 1666 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1667 x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 1668 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1669 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1670 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1671 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1672 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1673 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1674 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1675 v += bs2; 1676 } 1677 /* x = inv_diagonal*x */ 1678 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1679 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1680 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1681 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1682 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1683 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1684 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1685 } 1686 1687 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1688 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1689 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1690 PetscFunctionReturn(0); 1691 } 1692 1693 #undef __FUNCT__ 1694 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1695 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1696 { 1697 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1698 IS iscol=a->col,isrow=a->row; 1699 PetscErrorCode ierr; 1700 const PetscInt *r,*c,*rout,*cout; 1701 PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1702 const MatScalar *aa=a->a,*v; 1703 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1704 const PetscScalar *b; 1705 PetscFunctionBegin; 1706 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1707 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1708 t = a->solve_work; 1709 1710 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1711 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1712 1713 /* forward solve the lower triangular */ 1714 idx = 6*(*r++); 1715 t[0] = b[idx]; t[1] = b[1+idx]; 1716 t[2] = b[2+idx]; t[3] = b[3+idx]; 1717 t[4] = b[4+idx]; t[5] = b[5+idx]; 1718 for (i=1; i<n; i++) { 1719 v = aa + 36*ai[i]; 1720 vi = aj + ai[i]; 1721 nz = diag[i] - ai[i]; 1722 idx = 6*(*r++); 1723 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1724 s5 = b[4+idx]; s6 = b[5+idx]; 1725 while (nz--) { 1726 idx = 6*(*vi++); 1727 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1728 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1729 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1730 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1731 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1732 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1733 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1734 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1735 v += 36; 1736 } 1737 idx = 6*i; 1738 t[idx] = s1;t[1+idx] = s2; 1739 t[2+idx] = s3;t[3+idx] = s4; 1740 t[4+idx] = s5;t[5+idx] = s6; 1741 } 1742 /* backward solve the upper triangular */ 1743 for (i=n-1; i>=0; i--){ 1744 v = aa + 36*diag[i] + 36; 1745 vi = aj + diag[i] + 1; 1746 nz = ai[i+1] - diag[i] - 1; 1747 idt = 6*i; 1748 s1 = t[idt]; s2 = t[1+idt]; 1749 s3 = t[2+idt];s4 = t[3+idt]; 1750 s5 = t[4+idt];s6 = t[5+idt]; 1751 while (nz--) { 1752 idx = 6*(*vi++); 1753 x1 = t[idx]; x2 = t[1+idx]; 1754 x3 = t[2+idx]; x4 = t[3+idx]; 1755 x5 = t[4+idx]; x6 = t[5+idx]; 1756 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1757 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1758 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1759 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1760 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1761 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1762 v += 36; 1763 } 1764 idc = 6*(*c--); 1765 v = aa + 36*diag[i]; 1766 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1767 v[18]*s4+v[24]*s5+v[30]*s6; 1768 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1769 v[19]*s4+v[25]*s5+v[31]*s6; 1770 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1771 v[20]*s4+v[26]*s5+v[32]*s6; 1772 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1773 v[21]*s4+v[27]*s5+v[33]*s6; 1774 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1775 v[22]*s4+v[28]*s5+v[34]*s6; 1776 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1777 v[23]*s4+v[29]*s5+v[35]*s6; 1778 } 1779 1780 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1781 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1782 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1783 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1784 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1785 PetscFunctionReturn(0); 1786 } 1787 1788 #undef __FUNCT__ 1789 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct" 1790 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx) 1791 { 1792 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1793 IS iscol=a->col,isrow=a->row; 1794 PetscErrorCode ierr; 1795 const PetscInt *r,*c,*rout,*cout; 1796 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 1797 const MatScalar *aa=a->a,*v; 1798 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1799 const PetscScalar *b; 1800 PetscFunctionBegin; 1801 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1802 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1803 t = a->solve_work; 1804 1805 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1806 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1807 1808 /* forward solve the lower triangular */ 1809 idx = 6*r[0]; 1810 t[0] = b[idx]; t[1] = b[1+idx]; 1811 t[2] = b[2+idx]; t[3] = b[3+idx]; 1812 t[4] = b[4+idx]; t[5] = b[5+idx]; 1813 for (i=1; i<n; i++) { 1814 v = aa + 36*ai[i]; 1815 vi = aj + ai[i]; 1816 nz = ai[i+1] - ai[i]; 1817 idx = 6*r[i]; 1818 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1819 s5 = b[4+idx]; s6 = b[5+idx]; 1820 for(m=0;m<nz;m++){ 1821 idx = 6*vi[m]; 1822 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1823 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1824 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1825 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1826 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1827 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1828 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1829 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1830 v += 36; 1831 } 1832 idx = 6*i; 1833 t[idx] = s1;t[1+idx] = s2; 1834 t[2+idx] = s3;t[3+idx] = s4; 1835 t[4+idx] = s5;t[5+idx] = s6; 1836 } 1837 /* backward solve the upper triangular */ 1838 for (i=n-1; i>=0; i--){ 1839 v = aa + 36*(adiag[i+1]+1); 1840 vi = aj + adiag[i+1]+1; 1841 nz = adiag[i] - adiag[i+1] - 1; 1842 idt = 6*i; 1843 s1 = t[idt]; s2 = t[1+idt]; 1844 s3 = t[2+idt];s4 = t[3+idt]; 1845 s5 = t[4+idt];s6 = t[5+idt]; 1846 for(m=0;m<nz;m++){ 1847 idx = 6*vi[m]; 1848 x1 = t[idx]; x2 = t[1+idx]; 1849 x3 = t[2+idx]; x4 = t[3+idx]; 1850 x5 = t[4+idx]; x6 = t[5+idx]; 1851 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1852 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1853 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1854 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1855 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1856 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1857 v += 36; 1858 } 1859 idc = 6*c[i]; 1860 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1861 v[18]*s4+v[24]*s5+v[30]*s6; 1862 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1863 v[19]*s4+v[25]*s5+v[31]*s6; 1864 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1865 v[20]*s4+v[26]*s5+v[32]*s6; 1866 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1867 v[21]*s4+v[27]*s5+v[33]*s6; 1868 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1869 v[22]*s4+v[28]*s5+v[34]*s6; 1870 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1871 v[23]*s4+v[29]*s5+v[35]*s6; 1872 } 1873 1874 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1875 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1876 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1877 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1878 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1879 PetscFunctionReturn(0); 1880 } 1881 1882 #undef __FUNCT__ 1883 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1884 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1885 { 1886 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1887 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1888 PetscErrorCode ierr; 1889 PetscInt *diag = a->diag,jdx; 1890 const MatScalar *aa=a->a,*v; 1891 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1892 const PetscScalar *b; 1893 1894 PetscFunctionBegin; 1895 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1896 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1897 /* forward solve the lower triangular */ 1898 idx = 0; 1899 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1900 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1901 for (i=1; i<n; i++) { 1902 v = aa + 36*ai[i]; 1903 vi = aj + ai[i]; 1904 nz = diag[i] - ai[i]; 1905 idx = 6*i; 1906 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1907 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1908 while (nz--) { 1909 jdx = 6*(*vi++); 1910 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1911 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1912 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1913 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1914 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1915 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1916 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1917 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1918 v += 36; 1919 } 1920 x[idx] = s1; 1921 x[1+idx] = s2; 1922 x[2+idx] = s3; 1923 x[3+idx] = s4; 1924 x[4+idx] = s5; 1925 x[5+idx] = s6; 1926 } 1927 /* backward solve the upper triangular */ 1928 for (i=n-1; i>=0; i--){ 1929 v = aa + 36*diag[i] + 36; 1930 vi = aj + diag[i] + 1; 1931 nz = ai[i+1] - diag[i] - 1; 1932 idt = 6*i; 1933 s1 = x[idt]; s2 = x[1+idt]; 1934 s3 = x[2+idt]; s4 = x[3+idt]; 1935 s5 = x[4+idt]; s6 = x[5+idt]; 1936 while (nz--) { 1937 idx = 6*(*vi++); 1938 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1939 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1940 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1941 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1942 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1943 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1944 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1945 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1946 v += 36; 1947 } 1948 v = aa + 36*diag[i]; 1949 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1950 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1951 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1952 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1953 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1954 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1955 } 1956 1957 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1958 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1959 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1960 PetscFunctionReturn(0); 1961 } 1962 1963 #undef __FUNCT__ 1964 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct" 1965 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1966 { 1967 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1968 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 1969 PetscErrorCode ierr; 1970 PetscInt idx,jdx,idt; 1971 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1972 const MatScalar *aa=a->a,*v; 1973 PetscScalar *x; 1974 const PetscScalar *b; 1975 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1976 1977 PetscFunctionBegin; 1978 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1979 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1980 /* forward solve the lower triangular */ 1981 idx = 0; 1982 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1983 x[4] = b[4+idx];x[5] = b[5+idx]; 1984 for (i=1; i<n; i++) { 1985 v = aa + bs2*ai[i]; 1986 vi = aj + ai[i]; 1987 nz = ai[i+1] - ai[i]; 1988 idx = bs*i; 1989 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1990 s5 = b[4+idx];s6 = b[5+idx]; 1991 for(k=0;k<nz;k++){ 1992 jdx = bs*vi[k]; 1993 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1994 x5 = x[4+jdx]; x6 = x[5+jdx]; 1995 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1996 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 1997 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1998 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1999 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2000 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2001 v += bs2; 2002 } 2003 2004 x[idx] = s1; 2005 x[1+idx] = s2; 2006 x[2+idx] = s3; 2007 x[3+idx] = s4; 2008 x[4+idx] = s5; 2009 x[5+idx] = s6; 2010 } 2011 2012 /* backward solve the upper triangular */ 2013 for (i=n-1; i>=0; i--){ 2014 v = aa + bs2*(adiag[i+1]+1); 2015 vi = aj + adiag[i+1]+1; 2016 nz = adiag[i] - adiag[i+1]-1; 2017 idt = bs*i; 2018 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2019 s5 = x[4+idt];s6 = x[5+idt]; 2020 for(k=0;k<nz;k++){ 2021 idx = bs*vi[k]; 2022 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2023 x5 = x[4+idx];x6 = x[5+idx]; 2024 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2025 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2026 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2027 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2028 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2029 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2030 v += bs2; 2031 } 2032 /* x = inv_diagonal*x */ 2033 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2034 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2035 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2036 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2037 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2038 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 2039 } 2040 2041 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2042 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2043 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2044 PetscFunctionReturn(0); 2045 } 2046 2047 #undef __FUNCT__ 2048 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 2049 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 2050 { 2051 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2052 IS iscol=a->col,isrow=a->row; 2053 PetscErrorCode ierr; 2054 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 2055 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2056 const MatScalar *aa=a->a,*v; 2057 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2058 const PetscScalar *b; 2059 2060 PetscFunctionBegin; 2061 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2062 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2063 t = a->solve_work; 2064 2065 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2066 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2067 2068 /* forward solve the lower triangular */ 2069 idx = 5*(*r++); 2070 t[0] = b[idx]; t[1] = b[1+idx]; 2071 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2072 for (i=1; i<n; i++) { 2073 v = aa + 25*ai[i]; 2074 vi = aj + ai[i]; 2075 nz = diag[i] - ai[i]; 2076 idx = 5*(*r++); 2077 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2078 s5 = b[4+idx]; 2079 while (nz--) { 2080 idx = 5*(*vi++); 2081 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2082 x4 = t[3+idx];x5 = t[4+idx]; 2083 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2084 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2085 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2086 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2087 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2088 v += 25; 2089 } 2090 idx = 5*i; 2091 t[idx] = s1;t[1+idx] = s2; 2092 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2093 } 2094 /* backward solve the upper triangular */ 2095 for (i=n-1; i>=0; i--){ 2096 v = aa + 25*diag[i] + 25; 2097 vi = aj + diag[i] + 1; 2098 nz = ai[i+1] - diag[i] - 1; 2099 idt = 5*i; 2100 s1 = t[idt]; s2 = t[1+idt]; 2101 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2102 while (nz--) { 2103 idx = 5*(*vi++); 2104 x1 = t[idx]; x2 = t[1+idx]; 2105 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2106 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2107 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2108 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2109 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2110 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2111 v += 25; 2112 } 2113 idc = 5*(*c--); 2114 v = aa + 25*diag[i]; 2115 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2116 v[15]*s4+v[20]*s5; 2117 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 2118 v[16]*s4+v[21]*s5; 2119 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2120 v[17]*s4+v[22]*s5; 2121 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 2122 v[18]*s4+v[23]*s5; 2123 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2124 v[19]*s4+v[24]*s5; 2125 } 2126 2127 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2128 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2129 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2130 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2131 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2132 PetscFunctionReturn(0); 2133 } 2134 2135 #undef __FUNCT__ 2136 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct" 2137 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx) 2138 { 2139 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2140 IS iscol=a->col,isrow=a->row; 2141 PetscErrorCode ierr; 2142 const PetscInt *r,*c,*rout,*cout; 2143 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 2144 const MatScalar *aa=a->a,*v; 2145 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2146 const PetscScalar *b; 2147 2148 PetscFunctionBegin; 2149 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2150 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2151 t = a->solve_work; 2152 2153 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2154 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2155 2156 /* forward solve the lower triangular */ 2157 idx = 5*r[0]; 2158 t[0] = b[idx]; t[1] = b[1+idx]; 2159 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2160 for (i=1; i<n; i++) { 2161 v = aa + 25*ai[i]; 2162 vi = aj + ai[i]; 2163 nz = ai[i+1] - ai[i]; 2164 idx = 5*r[i]; 2165 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2166 s5 = b[4+idx]; 2167 for(m=0;m<nz;m++){ 2168 idx = 5*vi[m]; 2169 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2170 x4 = t[3+idx];x5 = t[4+idx]; 2171 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2172 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2173 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2174 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2175 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2176 v += 25; 2177 } 2178 idx = 5*i; 2179 t[idx] = s1;t[1+idx] = s2; 2180 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2181 } 2182 /* backward solve the upper triangular */ 2183 for (i=n-1; i>=0; i--){ 2184 v = aa + 25*(adiag[i+1]+1); 2185 vi = aj + adiag[i+1]+1; 2186 nz = adiag[i] - adiag[i+1] - 1; 2187 idt = 5*i; 2188 s1 = t[idt]; s2 = t[1+idt]; 2189 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2190 for(m=0;m<nz;m++){ 2191 idx = 5*vi[m]; 2192 x1 = t[idx]; x2 = t[1+idx]; 2193 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2194 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2195 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2196 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2197 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2198 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2199 v += 25; 2200 } 2201 idc = 5*c[i]; 2202 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2203 v[15]*s4+v[20]*s5; 2204 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 2205 v[16]*s4+v[21]*s5; 2206 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2207 v[17]*s4+v[22]*s5; 2208 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 2209 v[18]*s4+v[23]*s5; 2210 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2211 v[19]*s4+v[24]*s5; 2212 } 2213 2214 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2215 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2216 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2217 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2218 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2219 PetscFunctionReturn(0); 2220 } 2221 2222 #undef __FUNCT__ 2223 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 2224 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 2225 { 2226 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2227 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2228 PetscErrorCode ierr; 2229 PetscInt *diag = a->diag,jdx; 2230 const MatScalar *aa=a->a,*v; 2231 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2232 const PetscScalar *b; 2233 2234 PetscFunctionBegin; 2235 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2236 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2237 /* forward solve the lower triangular */ 2238 idx = 0; 2239 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2240 for (i=1; i<n; i++) { 2241 v = aa + 25*ai[i]; 2242 vi = aj + ai[i]; 2243 nz = diag[i] - ai[i]; 2244 idx = 5*i; 2245 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 2246 while (nz--) { 2247 jdx = 5*(*vi++); 2248 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2249 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2250 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2251 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2252 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2253 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2254 v += 25; 2255 } 2256 x[idx] = s1; 2257 x[1+idx] = s2; 2258 x[2+idx] = s3; 2259 x[3+idx] = s4; 2260 x[4+idx] = s5; 2261 } 2262 /* backward solve the upper triangular */ 2263 for (i=n-1; i>=0; i--){ 2264 v = aa + 25*diag[i] + 25; 2265 vi = aj + diag[i] + 1; 2266 nz = ai[i+1] - diag[i] - 1; 2267 idt = 5*i; 2268 s1 = x[idt]; s2 = x[1+idt]; 2269 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 2270 while (nz--) { 2271 idx = 5*(*vi++); 2272 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2273 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2274 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2275 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2276 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2277 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2278 v += 25; 2279 } 2280 v = aa + 25*diag[i]; 2281 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2282 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2283 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2284 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2285 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2286 } 2287 2288 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2289 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2290 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2291 PetscFunctionReturn(0); 2292 } 2293 2294 #undef __FUNCT__ 2295 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct" 2296 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2297 { 2298 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2299 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 2300 PetscErrorCode ierr; 2301 PetscInt jdx; 2302 const MatScalar *aa=a->a,*v; 2303 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2304 const PetscScalar *b; 2305 2306 PetscFunctionBegin; 2307 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2308 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2309 /* forward solve the lower triangular */ 2310 idx = 0; 2311 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2312 for (i=1; i<n; i++) { 2313 v = aa + 25*ai[i]; 2314 vi = aj + ai[i]; 2315 nz = ai[i+1] - ai[i]; 2316 idx = 5*i; 2317 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 2318 for(k=0;k<nz;k++) { 2319 jdx = 5*vi[k]; 2320 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2321 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2322 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2323 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2324 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2325 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2326 v += 25; 2327 } 2328 x[idx] = s1; 2329 x[1+idx] = s2; 2330 x[2+idx] = s3; 2331 x[3+idx] = s4; 2332 x[4+idx] = s5; 2333 } 2334 2335 /* backward solve the upper triangular */ 2336 for (i=n-1; i>=0; i--){ 2337 v = aa + 25*(adiag[i+1]+1); 2338 vi = aj + adiag[i+1]+1; 2339 nz = adiag[i] - adiag[i+1]-1; 2340 idt = 5*i; 2341 s1 = x[idt]; s2 = x[1+idt]; 2342 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 2343 for(k=0;k<nz;k++){ 2344 idx = 5*vi[k]; 2345 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2346 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2347 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2348 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2349 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2350 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2351 v += 25; 2352 } 2353 /* x = inv_diagonal*x */ 2354 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2355 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2356 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2357 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2358 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2359 } 2360 2361 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2362 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2363 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2364 PetscFunctionReturn(0); 2365 } 2366 2367 #undef __FUNCT__ 2368 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 2369 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 2370 { 2371 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2372 IS iscol=a->col,isrow=a->row; 2373 PetscErrorCode ierr; 2374 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2375 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2376 const MatScalar *aa=a->a,*v; 2377 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2378 const PetscScalar *b; 2379 2380 PetscFunctionBegin; 2381 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2382 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2383 t = a->solve_work; 2384 2385 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2386 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2387 2388 /* forward solve the lower triangular */ 2389 idx = 4*(*r++); 2390 t[0] = b[idx]; t[1] = b[1+idx]; 2391 t[2] = b[2+idx]; t[3] = b[3+idx]; 2392 for (i=1; i<n; i++) { 2393 v = aa + 16*ai[i]; 2394 vi = aj + ai[i]; 2395 nz = diag[i] - ai[i]; 2396 idx = 4*(*r++); 2397 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2398 while (nz--) { 2399 idx = 4*(*vi++); 2400 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2401 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2402 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2403 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2404 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2405 v += 16; 2406 } 2407 idx = 4*i; 2408 t[idx] = s1;t[1+idx] = s2; 2409 t[2+idx] = s3;t[3+idx] = s4; 2410 } 2411 /* backward solve the upper triangular */ 2412 for (i=n-1; i>=0; i--){ 2413 v = aa + 16*diag[i] + 16; 2414 vi = aj + diag[i] + 1; 2415 nz = ai[i+1] - diag[i] - 1; 2416 idt = 4*i; 2417 s1 = t[idt]; s2 = t[1+idt]; 2418 s3 = t[2+idt];s4 = t[3+idt]; 2419 while (nz--) { 2420 idx = 4*(*vi++); 2421 x1 = t[idx]; x2 = t[1+idx]; 2422 x3 = t[2+idx]; x4 = t[3+idx]; 2423 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2424 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2425 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2426 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2427 v += 16; 2428 } 2429 idc = 4*(*c--); 2430 v = aa + 16*diag[i]; 2431 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2432 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2433 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2434 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2435 } 2436 2437 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2438 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2439 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2440 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2441 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2442 PetscFunctionReturn(0); 2443 } 2444 2445 #undef __FUNCT__ 2446 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct" 2447 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx) 2448 { 2449 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2450 IS iscol=a->col,isrow=a->row; 2451 PetscErrorCode ierr; 2452 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 2453 const PetscInt *r,*c,*rout,*cout; 2454 const MatScalar *aa=a->a,*v; 2455 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2456 const PetscScalar *b; 2457 2458 PetscFunctionBegin; 2459 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2460 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2461 t = a->solve_work; 2462 2463 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2464 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2465 2466 /* forward solve the lower triangular */ 2467 idx = 4*r[0]; 2468 t[0] = b[idx]; t[1] = b[1+idx]; 2469 t[2] = b[2+idx]; t[3] = b[3+idx]; 2470 for (i=1; i<n; i++) { 2471 v = aa + 16*ai[i]; 2472 vi = aj + ai[i]; 2473 nz = ai[i+1] - ai[i]; 2474 idx = 4*r[i]; 2475 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2476 for(m=0;m<nz;m++){ 2477 idx = 4*vi[m]; 2478 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2479 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2480 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2481 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2482 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2483 v += 16; 2484 } 2485 idx = 4*i; 2486 t[idx] = s1;t[1+idx] = s2; 2487 t[2+idx] = s3;t[3+idx] = s4; 2488 } 2489 /* backward solve the upper triangular */ 2490 for (i=n-1; i>=0; i--){ 2491 v = aa + 16*(adiag[i+1]+1); 2492 vi = aj + adiag[i+1]+1; 2493 nz = adiag[i] - adiag[i+1] - 1; 2494 idt = 4*i; 2495 s1 = t[idt]; s2 = t[1+idt]; 2496 s3 = t[2+idt];s4 = t[3+idt]; 2497 for(m=0;m<nz;m++){ 2498 idx = 4*vi[m]; 2499 x1 = t[idx]; x2 = t[1+idx]; 2500 x3 = t[2+idx]; x4 = t[3+idx]; 2501 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2502 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2503 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2504 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2505 v += 16; 2506 } 2507 idc = 4*c[i]; 2508 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2509 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2510 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2511 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2512 } 2513 2514 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2515 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2516 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2517 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2518 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2519 PetscFunctionReturn(0); 2520 } 2521 2522 #undef __FUNCT__ 2523 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 2524 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 2525 { 2526 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2527 IS iscol=a->col,isrow=a->row; 2528 PetscErrorCode ierr; 2529 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2530 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2531 const MatScalar *aa=a->a,*v; 2532 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 2533 PetscScalar *x; 2534 const PetscScalar *b; 2535 2536 PetscFunctionBegin; 2537 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2538 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2539 t = (MatScalar *)a->solve_work; 2540 2541 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2542 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2543 2544 /* forward solve the lower triangular */ 2545 idx = 4*(*r++); 2546 t[0] = (MatScalar)b[idx]; 2547 t[1] = (MatScalar)b[1+idx]; 2548 t[2] = (MatScalar)b[2+idx]; 2549 t[3] = (MatScalar)b[3+idx]; 2550 for (i=1; i<n; i++) { 2551 v = aa + 16*ai[i]; 2552 vi = aj + ai[i]; 2553 nz = diag[i] - ai[i]; 2554 idx = 4*(*r++); 2555 s1 = (MatScalar)b[idx]; 2556 s2 = (MatScalar)b[1+idx]; 2557 s3 = (MatScalar)b[2+idx]; 2558 s4 = (MatScalar)b[3+idx]; 2559 while (nz--) { 2560 idx = 4*(*vi++); 2561 x1 = t[idx]; 2562 x2 = t[1+idx]; 2563 x3 = t[2+idx]; 2564 x4 = t[3+idx]; 2565 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2566 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2567 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2568 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2569 v += 16; 2570 } 2571 idx = 4*i; 2572 t[idx] = s1; 2573 t[1+idx] = s2; 2574 t[2+idx] = s3; 2575 t[3+idx] = s4; 2576 } 2577 /* backward solve the upper triangular */ 2578 for (i=n-1; i>=0; i--){ 2579 v = aa + 16*diag[i] + 16; 2580 vi = aj + diag[i] + 1; 2581 nz = ai[i+1] - diag[i] - 1; 2582 idt = 4*i; 2583 s1 = t[idt]; 2584 s2 = t[1+idt]; 2585 s3 = t[2+idt]; 2586 s4 = t[3+idt]; 2587 while (nz--) { 2588 idx = 4*(*vi++); 2589 x1 = t[idx]; 2590 x2 = t[1+idx]; 2591 x3 = t[2+idx]; 2592 x4 = t[3+idx]; 2593 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2594 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2595 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2596 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2597 v += 16; 2598 } 2599 idc = 4*(*c--); 2600 v = aa + 16*diag[i]; 2601 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2602 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2603 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2604 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2605 x[idc] = (PetscScalar)t[idt]; 2606 x[1+idc] = (PetscScalar)t[1+idt]; 2607 x[2+idc] = (PetscScalar)t[2+idt]; 2608 x[3+idc] = (PetscScalar)t[3+idt]; 2609 } 2610 2611 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2612 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2613 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2614 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2615 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2616 PetscFunctionReturn(0); 2617 } 2618 2619 #if defined (PETSC_HAVE_SSE) 2620 2621 #include PETSC_HAVE_SSE 2622 2623 #undef __FUNCT__ 2624 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 2625 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 2626 { 2627 /* 2628 Note: This code uses demotion of double 2629 to float when performing the mixed-mode computation. 2630 This may not be numerically reasonable for all applications. 2631 */ 2632 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2633 IS iscol=a->col,isrow=a->row; 2634 PetscErrorCode ierr; 2635 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 2636 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2637 MatScalar *aa=a->a,*v; 2638 PetscScalar *x,*b,*t; 2639 2640 /* Make space in temp stack for 16 Byte Aligned arrays */ 2641 float ssealignedspace[11],*tmps,*tmpx; 2642 unsigned long offset; 2643 2644 PetscFunctionBegin; 2645 SSE_SCOPE_BEGIN; 2646 2647 offset = (unsigned long)ssealignedspace % 16; 2648 if (offset) offset = (16 - offset)/4; 2649 tmps = &ssealignedspace[offset]; 2650 tmpx = &ssealignedspace[offset+4]; 2651 PREFETCH_NTA(aa+16*ai[1]); 2652 2653 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2654 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2655 t = a->solve_work; 2656 2657 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2658 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2659 2660 /* forward solve the lower triangular */ 2661 idx = 4*(*r++); 2662 t[0] = b[idx]; t[1] = b[1+idx]; 2663 t[2] = b[2+idx]; t[3] = b[3+idx]; 2664 v = aa + 16*ai[1]; 2665 2666 for (i=1; i<n;) { 2667 PREFETCH_NTA(&v[8]); 2668 vi = aj + ai[i]; 2669 nz = diag[i] - ai[i]; 2670 idx = 4*(*r++); 2671 2672 /* Demote sum from double to float */ 2673 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 2674 LOAD_PS(tmps,XMM7); 2675 2676 while (nz--) { 2677 PREFETCH_NTA(&v[16]); 2678 idx = 4*(*vi++); 2679 2680 /* Demote solution (so far) from double to float */ 2681 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 2682 2683 /* 4x4 Matrix-Vector product with negative accumulation: */ 2684 SSE_INLINE_BEGIN_2(tmpx,v) 2685 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2686 2687 /* First Column */ 2688 SSE_COPY_PS(XMM0,XMM6) 2689 SSE_SHUFFLE(XMM0,XMM0,0x00) 2690 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2691 SSE_SUB_PS(XMM7,XMM0) 2692 2693 /* Second Column */ 2694 SSE_COPY_PS(XMM1,XMM6) 2695 SSE_SHUFFLE(XMM1,XMM1,0x55) 2696 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2697 SSE_SUB_PS(XMM7,XMM1) 2698 2699 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2700 2701 /* Third Column */ 2702 SSE_COPY_PS(XMM2,XMM6) 2703 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2704 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2705 SSE_SUB_PS(XMM7,XMM2) 2706 2707 /* Fourth Column */ 2708 SSE_COPY_PS(XMM3,XMM6) 2709 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2710 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2711 SSE_SUB_PS(XMM7,XMM3) 2712 SSE_INLINE_END_2 2713 2714 v += 16; 2715 } 2716 idx = 4*i; 2717 v = aa + 16*ai[++i]; 2718 PREFETCH_NTA(v); 2719 STORE_PS(tmps,XMM7); 2720 2721 /* Promote result from float to double */ 2722 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 2723 } 2724 /* backward solve the upper triangular */ 2725 idt = 4*(n-1); 2726 ai16 = 16*diag[n-1]; 2727 v = aa + ai16 + 16; 2728 for (i=n-1; i>=0;){ 2729 PREFETCH_NTA(&v[8]); 2730 vi = aj + diag[i] + 1; 2731 nz = ai[i+1] - diag[i] - 1; 2732 2733 /* Demote accumulator from double to float */ 2734 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 2735 LOAD_PS(tmps,XMM7); 2736 2737 while (nz--) { 2738 PREFETCH_NTA(&v[16]); 2739 idx = 4*(*vi++); 2740 2741 /* Demote solution (so far) from double to float */ 2742 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 2743 2744 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2745 SSE_INLINE_BEGIN_2(tmpx,v) 2746 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2747 2748 /* First Column */ 2749 SSE_COPY_PS(XMM0,XMM6) 2750 SSE_SHUFFLE(XMM0,XMM0,0x00) 2751 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2752 SSE_SUB_PS(XMM7,XMM0) 2753 2754 /* Second Column */ 2755 SSE_COPY_PS(XMM1,XMM6) 2756 SSE_SHUFFLE(XMM1,XMM1,0x55) 2757 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2758 SSE_SUB_PS(XMM7,XMM1) 2759 2760 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2761 2762 /* Third Column */ 2763 SSE_COPY_PS(XMM2,XMM6) 2764 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2765 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2766 SSE_SUB_PS(XMM7,XMM2) 2767 2768 /* Fourth Column */ 2769 SSE_COPY_PS(XMM3,XMM6) 2770 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2771 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2772 SSE_SUB_PS(XMM7,XMM3) 2773 SSE_INLINE_END_2 2774 v += 16; 2775 } 2776 v = aa + ai16; 2777 ai16 = 16*diag[--i]; 2778 PREFETCH_NTA(aa+ai16+16); 2779 /* 2780 Scale the result by the diagonal 4x4 block, 2781 which was inverted as part of the factorization 2782 */ 2783 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 2784 /* First Column */ 2785 SSE_COPY_PS(XMM0,XMM7) 2786 SSE_SHUFFLE(XMM0,XMM0,0x00) 2787 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2788 2789 /* Second Column */ 2790 SSE_COPY_PS(XMM1,XMM7) 2791 SSE_SHUFFLE(XMM1,XMM1,0x55) 2792 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2793 SSE_ADD_PS(XMM0,XMM1) 2794 2795 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2796 2797 /* Third Column */ 2798 SSE_COPY_PS(XMM2,XMM7) 2799 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2800 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2801 SSE_ADD_PS(XMM0,XMM2) 2802 2803 /* Fourth Column */ 2804 SSE_COPY_PS(XMM3,XMM7) 2805 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2806 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2807 SSE_ADD_PS(XMM0,XMM3) 2808 2809 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2810 SSE_INLINE_END_3 2811 2812 /* Promote solution from float to double */ 2813 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 2814 2815 /* Apply reordering to t and stream into x. */ 2816 /* This way, x doesn't pollute the cache. */ 2817 /* Be careful with size: 2 doubles = 4 floats! */ 2818 idc = 4*(*c--); 2819 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 2820 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 2821 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 2822 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 2823 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 2824 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 2825 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 2826 SSE_INLINE_END_2 2827 v = aa + ai16 + 16; 2828 idt -= 4; 2829 } 2830 2831 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2832 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2833 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2834 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2835 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2836 SSE_SCOPE_END; 2837 PetscFunctionReturn(0); 2838 } 2839 2840 #endif 2841 2842 2843 /* 2844 Special case where the matrix was ILU(0) factored in the natural 2845 ordering. This eliminates the need for the column and row permutation. 2846 */ 2847 #undef __FUNCT__ 2848 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2849 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 2850 { 2851 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2852 PetscInt n=a->mbs; 2853 const PetscInt *ai=a->i,*aj=a->j; 2854 PetscErrorCode ierr; 2855 const PetscInt *diag = a->diag; 2856 const MatScalar *aa=a->a; 2857 PetscScalar *x; 2858 const PetscScalar *b; 2859 2860 PetscFunctionBegin; 2861 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2862 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2863 2864 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 2865 { 2866 static PetscScalar w[2000]; /* very BAD need to fix */ 2867 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 2868 } 2869 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 2870 { 2871 static PetscScalar w[2000]; /* very BAD need to fix */ 2872 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 2873 } 2874 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 2875 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2876 #else 2877 { 2878 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2879 const MatScalar *v; 2880 PetscInt jdx,idt,idx,nz,i,ai16; 2881 const PetscInt *vi; 2882 2883 /* forward solve the lower triangular */ 2884 idx = 0; 2885 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 2886 for (i=1; i<n; i++) { 2887 v = aa + 16*ai[i]; 2888 vi = aj + ai[i]; 2889 nz = diag[i] - ai[i]; 2890 idx += 4; 2891 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2892 while (nz--) { 2893 jdx = 4*(*vi++); 2894 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2895 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2896 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2897 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2898 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2899 v += 16; 2900 } 2901 x[idx] = s1; 2902 x[1+idx] = s2; 2903 x[2+idx] = s3; 2904 x[3+idx] = s4; 2905 } 2906 /* backward solve the upper triangular */ 2907 idt = 4*(n-1); 2908 for (i=n-1; i>=0; i--){ 2909 ai16 = 16*diag[i]; 2910 v = aa + ai16 + 16; 2911 vi = aj + diag[i] + 1; 2912 nz = ai[i+1] - diag[i] - 1; 2913 s1 = x[idt]; s2 = x[1+idt]; 2914 s3 = x[2+idt];s4 = x[3+idt]; 2915 while (nz--) { 2916 idx = 4*(*vi++); 2917 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2918 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2919 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2920 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2921 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2922 v += 16; 2923 } 2924 v = aa + ai16; 2925 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2926 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2927 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2928 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2929 idt -= 4; 2930 } 2931 } 2932 #endif 2933 2934 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2935 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2936 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2937 PetscFunctionReturn(0); 2938 } 2939 2940 #undef __FUNCT__ 2941 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct" 2942 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2943 { 2944 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2945 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 2946 PetscErrorCode ierr; 2947 PetscInt idx,jdx,idt; 2948 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2949 const MatScalar *aa=a->a,*v; 2950 PetscScalar *x; 2951 const PetscScalar *b; 2952 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2953 2954 PetscFunctionBegin; 2955 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2956 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2957 /* forward solve the lower triangular */ 2958 idx = 0; 2959 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2960 for (i=1; i<n; i++) { 2961 v = aa + bs2*ai[i]; 2962 vi = aj + ai[i]; 2963 nz = ai[i+1] - ai[i]; 2964 idx = bs*i; 2965 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2966 for(k=0;k<nz;k++) { 2967 jdx = bs*vi[k]; 2968 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2969 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2970 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2971 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2972 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2973 2974 v += bs2; 2975 } 2976 2977 x[idx] = s1; 2978 x[1+idx] = s2; 2979 x[2+idx] = s3; 2980 x[3+idx] = s4; 2981 } 2982 2983 /* backward solve the upper triangular */ 2984 for (i=n-1; i>=0; i--){ 2985 v = aa + bs2*(adiag[i+1]+1); 2986 vi = aj + adiag[i+1]+1; 2987 nz = adiag[i] - adiag[i+1]-1; 2988 idt = bs*i; 2989 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2990 2991 for(k=0;k<nz;k++){ 2992 idx = bs*vi[k]; 2993 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2994 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2995 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2996 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2997 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2998 2999 v += bs2; 3000 } 3001 /* x = inv_diagonal*x */ 3002 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3003 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 3004 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3005 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3006 3007 } 3008 3009 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3010 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3011 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3012 PetscFunctionReturn(0); 3013 } 3014 3015 #undef __FUNCT__ 3016 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 3017 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 3018 { 3019 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3020 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3021 PetscErrorCode ierr; 3022 PetscInt *diag = a->diag; 3023 MatScalar *aa=a->a; 3024 PetscScalar *x,*b; 3025 3026 PetscFunctionBegin; 3027 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3028 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3029 3030 { 3031 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 3032 MatScalar *v,*t=(MatScalar *)x; 3033 PetscInt jdx,idt,idx,nz,*vi,i,ai16; 3034 3035 /* forward solve the lower triangular */ 3036 idx = 0; 3037 t[0] = (MatScalar)b[0]; 3038 t[1] = (MatScalar)b[1]; 3039 t[2] = (MatScalar)b[2]; 3040 t[3] = (MatScalar)b[3]; 3041 for (i=1; i<n; i++) { 3042 v = aa + 16*ai[i]; 3043 vi = aj + ai[i]; 3044 nz = diag[i] - ai[i]; 3045 idx += 4; 3046 s1 = (MatScalar)b[idx]; 3047 s2 = (MatScalar)b[1+idx]; 3048 s3 = (MatScalar)b[2+idx]; 3049 s4 = (MatScalar)b[3+idx]; 3050 while (nz--) { 3051 jdx = 4*(*vi++); 3052 x1 = t[jdx]; 3053 x2 = t[1+jdx]; 3054 x3 = t[2+jdx]; 3055 x4 = t[3+jdx]; 3056 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3057 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3058 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3059 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3060 v += 16; 3061 } 3062 t[idx] = s1; 3063 t[1+idx] = s2; 3064 t[2+idx] = s3; 3065 t[3+idx] = s4; 3066 } 3067 /* backward solve the upper triangular */ 3068 idt = 4*(n-1); 3069 for (i=n-1; i>=0; i--){ 3070 ai16 = 16*diag[i]; 3071 v = aa + ai16 + 16; 3072 vi = aj + diag[i] + 1; 3073 nz = ai[i+1] - diag[i] - 1; 3074 s1 = t[idt]; 3075 s2 = t[1+idt]; 3076 s3 = t[2+idt]; 3077 s4 = t[3+idt]; 3078 while (nz--) { 3079 idx = 4*(*vi++); 3080 x1 = (MatScalar)x[idx]; 3081 x2 = (MatScalar)x[1+idx]; 3082 x3 = (MatScalar)x[2+idx]; 3083 x4 = (MatScalar)x[3+idx]; 3084 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3085 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3086 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3087 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3088 v += 16; 3089 } 3090 v = aa + ai16; 3091 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 3092 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 3093 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 3094 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 3095 idt -= 4; 3096 } 3097 } 3098 3099 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3100 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3101 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3102 PetscFunctionReturn(0); 3103 } 3104 3105 #if defined (PETSC_HAVE_SSE) 3106 3107 #include PETSC_HAVE_SSE 3108 #undef __FUNCT__ 3109 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 3110 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 3111 { 3112 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3113 unsigned short *aj=(unsigned short *)a->j; 3114 PetscErrorCode ierr; 3115 int *ai=a->i,n=a->mbs,*diag = a->diag; 3116 MatScalar *aa=a->a; 3117 PetscScalar *x,*b; 3118 3119 PetscFunctionBegin; 3120 SSE_SCOPE_BEGIN; 3121 /* 3122 Note: This code currently uses demotion of double 3123 to float when performing the mixed-mode computation. 3124 This may not be numerically reasonable for all applications. 3125 */ 3126 PREFETCH_NTA(aa+16*ai[1]); 3127 3128 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3129 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3130 { 3131 /* x will first be computed in single precision then promoted inplace to double */ 3132 MatScalar *v,*t=(MatScalar *)x; 3133 int nz,i,idt,ai16; 3134 unsigned int jdx,idx; 3135 unsigned short *vi; 3136 /* Forward solve the lower triangular factor. */ 3137 3138 /* First block is the identity. */ 3139 idx = 0; 3140 CONVERT_DOUBLE4_FLOAT4(t,b); 3141 v = aa + 16*((unsigned int)ai[1]); 3142 3143 for (i=1; i<n;) { 3144 PREFETCH_NTA(&v[8]); 3145 vi = aj + ai[i]; 3146 nz = diag[i] - ai[i]; 3147 idx += 4; 3148 3149 /* Demote RHS from double to float. */ 3150 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 3151 LOAD_PS(&t[idx],XMM7); 3152 3153 while (nz--) { 3154 PREFETCH_NTA(&v[16]); 3155 jdx = 4*((unsigned int)(*vi++)); 3156 3157 /* 4x4 Matrix-Vector product with negative accumulation: */ 3158 SSE_INLINE_BEGIN_2(&t[jdx],v) 3159 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3160 3161 /* First Column */ 3162 SSE_COPY_PS(XMM0,XMM6) 3163 SSE_SHUFFLE(XMM0,XMM0,0x00) 3164 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3165 SSE_SUB_PS(XMM7,XMM0) 3166 3167 /* Second Column */ 3168 SSE_COPY_PS(XMM1,XMM6) 3169 SSE_SHUFFLE(XMM1,XMM1,0x55) 3170 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3171 SSE_SUB_PS(XMM7,XMM1) 3172 3173 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3174 3175 /* Third Column */ 3176 SSE_COPY_PS(XMM2,XMM6) 3177 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3178 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3179 SSE_SUB_PS(XMM7,XMM2) 3180 3181 /* Fourth Column */ 3182 SSE_COPY_PS(XMM3,XMM6) 3183 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3184 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3185 SSE_SUB_PS(XMM7,XMM3) 3186 SSE_INLINE_END_2 3187 3188 v += 16; 3189 } 3190 v = aa + 16*ai[++i]; 3191 PREFETCH_NTA(v); 3192 STORE_PS(&t[idx],XMM7); 3193 } 3194 3195 /* Backward solve the upper triangular factor.*/ 3196 3197 idt = 4*(n-1); 3198 ai16 = 16*diag[n-1]; 3199 v = aa + ai16 + 16; 3200 for (i=n-1; i>=0;){ 3201 PREFETCH_NTA(&v[8]); 3202 vi = aj + diag[i] + 1; 3203 nz = ai[i+1] - diag[i] - 1; 3204 3205 LOAD_PS(&t[idt],XMM7); 3206 3207 while (nz--) { 3208 PREFETCH_NTA(&v[16]); 3209 idx = 4*((unsigned int)(*vi++)); 3210 3211 /* 4x4 Matrix-Vector Product with negative accumulation: */ 3212 SSE_INLINE_BEGIN_2(&t[idx],v) 3213 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3214 3215 /* First Column */ 3216 SSE_COPY_PS(XMM0,XMM6) 3217 SSE_SHUFFLE(XMM0,XMM0,0x00) 3218 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3219 SSE_SUB_PS(XMM7,XMM0) 3220 3221 /* Second Column */ 3222 SSE_COPY_PS(XMM1,XMM6) 3223 SSE_SHUFFLE(XMM1,XMM1,0x55) 3224 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3225 SSE_SUB_PS(XMM7,XMM1) 3226 3227 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3228 3229 /* Third Column */ 3230 SSE_COPY_PS(XMM2,XMM6) 3231 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3232 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3233 SSE_SUB_PS(XMM7,XMM2) 3234 3235 /* Fourth Column */ 3236 SSE_COPY_PS(XMM3,XMM6) 3237 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3238 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3239 SSE_SUB_PS(XMM7,XMM3) 3240 SSE_INLINE_END_2 3241 v += 16; 3242 } 3243 v = aa + ai16; 3244 ai16 = 16*diag[--i]; 3245 PREFETCH_NTA(aa+ai16+16); 3246 /* 3247 Scale the result by the diagonal 4x4 block, 3248 which was inverted as part of the factorization 3249 */ 3250 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 3251 /* First Column */ 3252 SSE_COPY_PS(XMM0,XMM7) 3253 SSE_SHUFFLE(XMM0,XMM0,0x00) 3254 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 3255 3256 /* Second Column */ 3257 SSE_COPY_PS(XMM1,XMM7) 3258 SSE_SHUFFLE(XMM1,XMM1,0x55) 3259 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 3260 SSE_ADD_PS(XMM0,XMM1) 3261 3262 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 3263 3264 /* Third Column */ 3265 SSE_COPY_PS(XMM2,XMM7) 3266 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3267 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 3268 SSE_ADD_PS(XMM0,XMM2) 3269 3270 /* Fourth Column */ 3271 SSE_COPY_PS(XMM3,XMM7) 3272 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3273 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 3274 SSE_ADD_PS(XMM0,XMM3) 3275 3276 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 3277 SSE_INLINE_END_3 3278 3279 v = aa + ai16 + 16; 3280 idt -= 4; 3281 } 3282 3283 /* Convert t from single precision back to double precision (inplace)*/ 3284 idt = 4*(n-1); 3285 for (i=n-1;i>=0;i--) { 3286 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 3287 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 3288 PetscScalar *xtemp=&x[idt]; 3289 MatScalar *ttemp=&t[idt]; 3290 xtemp[3] = (PetscScalar)ttemp[3]; 3291 xtemp[2] = (PetscScalar)ttemp[2]; 3292 xtemp[1] = (PetscScalar)ttemp[1]; 3293 xtemp[0] = (PetscScalar)ttemp[0]; 3294 idt -= 4; 3295 } 3296 3297 } /* End of artificial scope. */ 3298 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3299 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3300 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3301 SSE_SCOPE_END; 3302 PetscFunctionReturn(0); 3303 } 3304 3305 #undef __FUNCT__ 3306 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 3307 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 3308 { 3309 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3310 int *aj=a->j; 3311 PetscErrorCode ierr; 3312 int *ai=a->i,n=a->mbs,*diag = a->diag; 3313 MatScalar *aa=a->a; 3314 PetscScalar *x,*b; 3315 3316 PetscFunctionBegin; 3317 SSE_SCOPE_BEGIN; 3318 /* 3319 Note: This code currently uses demotion of double 3320 to float when performing the mixed-mode computation. 3321 This may not be numerically reasonable for all applications. 3322 */ 3323 PREFETCH_NTA(aa+16*ai[1]); 3324 3325 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3326 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3327 { 3328 /* x will first be computed in single precision then promoted inplace to double */ 3329 MatScalar *v,*t=(MatScalar *)x; 3330 int nz,i,idt,ai16; 3331 int jdx,idx; 3332 int *vi; 3333 /* Forward solve the lower triangular factor. */ 3334 3335 /* First block is the identity. */ 3336 idx = 0; 3337 CONVERT_DOUBLE4_FLOAT4(t,b); 3338 v = aa + 16*ai[1]; 3339 3340 for (i=1; i<n;) { 3341 PREFETCH_NTA(&v[8]); 3342 vi = aj + ai[i]; 3343 nz = diag[i] - ai[i]; 3344 idx += 4; 3345 3346 /* Demote RHS from double to float. */ 3347 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 3348 LOAD_PS(&t[idx],XMM7); 3349 3350 while (nz--) { 3351 PREFETCH_NTA(&v[16]); 3352 jdx = 4*(*vi++); 3353 /* jdx = *vi++; */ 3354 3355 /* 4x4 Matrix-Vector product with negative accumulation: */ 3356 SSE_INLINE_BEGIN_2(&t[jdx],v) 3357 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3358 3359 /* First Column */ 3360 SSE_COPY_PS(XMM0,XMM6) 3361 SSE_SHUFFLE(XMM0,XMM0,0x00) 3362 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3363 SSE_SUB_PS(XMM7,XMM0) 3364 3365 /* Second Column */ 3366 SSE_COPY_PS(XMM1,XMM6) 3367 SSE_SHUFFLE(XMM1,XMM1,0x55) 3368 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3369 SSE_SUB_PS(XMM7,XMM1) 3370 3371 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3372 3373 /* Third Column */ 3374 SSE_COPY_PS(XMM2,XMM6) 3375 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3376 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3377 SSE_SUB_PS(XMM7,XMM2) 3378 3379 /* Fourth Column */ 3380 SSE_COPY_PS(XMM3,XMM6) 3381 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3382 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3383 SSE_SUB_PS(XMM7,XMM3) 3384 SSE_INLINE_END_2 3385 3386 v += 16; 3387 } 3388 v = aa + 16*ai[++i]; 3389 PREFETCH_NTA(v); 3390 STORE_PS(&t[idx],XMM7); 3391 } 3392 3393 /* Backward solve the upper triangular factor.*/ 3394 3395 idt = 4*(n-1); 3396 ai16 = 16*diag[n-1]; 3397 v = aa + ai16 + 16; 3398 for (i=n-1; i>=0;){ 3399 PREFETCH_NTA(&v[8]); 3400 vi = aj + diag[i] + 1; 3401 nz = ai[i+1] - diag[i] - 1; 3402 3403 LOAD_PS(&t[idt],XMM7); 3404 3405 while (nz--) { 3406 PREFETCH_NTA(&v[16]); 3407 idx = 4*(*vi++); 3408 /* idx = *vi++; */ 3409 3410 /* 4x4 Matrix-Vector Product with negative accumulation: */ 3411 SSE_INLINE_BEGIN_2(&t[idx],v) 3412 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3413 3414 /* First Column */ 3415 SSE_COPY_PS(XMM0,XMM6) 3416 SSE_SHUFFLE(XMM0,XMM0,0x00) 3417 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3418 SSE_SUB_PS(XMM7,XMM0) 3419 3420 /* Second Column */ 3421 SSE_COPY_PS(XMM1,XMM6) 3422 SSE_SHUFFLE(XMM1,XMM1,0x55) 3423 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3424 SSE_SUB_PS(XMM7,XMM1) 3425 3426 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3427 3428 /* Third Column */ 3429 SSE_COPY_PS(XMM2,XMM6) 3430 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3431 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3432 SSE_SUB_PS(XMM7,XMM2) 3433 3434 /* Fourth Column */ 3435 SSE_COPY_PS(XMM3,XMM6) 3436 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3437 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3438 SSE_SUB_PS(XMM7,XMM3) 3439 SSE_INLINE_END_2 3440 v += 16; 3441 } 3442 v = aa + ai16; 3443 ai16 = 16*diag[--i]; 3444 PREFETCH_NTA(aa+ai16+16); 3445 /* 3446 Scale the result by the diagonal 4x4 block, 3447 which was inverted as part of the factorization 3448 */ 3449 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 3450 /* First Column */ 3451 SSE_COPY_PS(XMM0,XMM7) 3452 SSE_SHUFFLE(XMM0,XMM0,0x00) 3453 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 3454 3455 /* Second Column */ 3456 SSE_COPY_PS(XMM1,XMM7) 3457 SSE_SHUFFLE(XMM1,XMM1,0x55) 3458 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 3459 SSE_ADD_PS(XMM0,XMM1) 3460 3461 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 3462 3463 /* Third Column */ 3464 SSE_COPY_PS(XMM2,XMM7) 3465 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3466 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 3467 SSE_ADD_PS(XMM0,XMM2) 3468 3469 /* Fourth Column */ 3470 SSE_COPY_PS(XMM3,XMM7) 3471 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3472 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 3473 SSE_ADD_PS(XMM0,XMM3) 3474 3475 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 3476 SSE_INLINE_END_3 3477 3478 v = aa + ai16 + 16; 3479 idt -= 4; 3480 } 3481 3482 /* Convert t from single precision back to double precision (inplace)*/ 3483 idt = 4*(n-1); 3484 for (i=n-1;i>=0;i--) { 3485 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 3486 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 3487 PetscScalar *xtemp=&x[idt]; 3488 MatScalar *ttemp=&t[idt]; 3489 xtemp[3] = (PetscScalar)ttemp[3]; 3490 xtemp[2] = (PetscScalar)ttemp[2]; 3491 xtemp[1] = (PetscScalar)ttemp[1]; 3492 xtemp[0] = (PetscScalar)ttemp[0]; 3493 idt -= 4; 3494 } 3495 3496 } /* End of artificial scope. */ 3497 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3498 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3499 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3500 SSE_SCOPE_END; 3501 PetscFunctionReturn(0); 3502 } 3503 3504 #endif 3505 3506 #undef __FUNCT__ 3507 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 3508 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 3509 { 3510 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3511 IS iscol=a->col,isrow=a->row; 3512 PetscErrorCode ierr; 3513 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3514 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3515 const MatScalar *aa=a->a,*v; 3516 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 3517 const PetscScalar *b; 3518 3519 PetscFunctionBegin; 3520 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3521 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3522 t = a->solve_work; 3523 3524 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3525 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3526 3527 /* forward solve the lower triangular */ 3528 idx = 3*(*r++); 3529 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 3530 for (i=1; i<n; i++) { 3531 v = aa + 9*ai[i]; 3532 vi = aj + ai[i]; 3533 nz = diag[i] - ai[i]; 3534 idx = 3*(*r++); 3535 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 3536 while (nz--) { 3537 idx = 3*(*vi++); 3538 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3539 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3540 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3541 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3542 v += 9; 3543 } 3544 idx = 3*i; 3545 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 3546 } 3547 /* backward solve the upper triangular */ 3548 for (i=n-1; i>=0; i--){ 3549 v = aa + 9*diag[i] + 9; 3550 vi = aj + diag[i] + 1; 3551 nz = ai[i+1] - diag[i] - 1; 3552 idt = 3*i; 3553 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 3554 while (nz--) { 3555 idx = 3*(*vi++); 3556 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3557 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3558 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3559 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3560 v += 9; 3561 } 3562 idc = 3*(*c--); 3563 v = aa + 9*diag[i]; 3564 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3565 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3566 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 3567 } 3568 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3569 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3570 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3571 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3572 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 3573 PetscFunctionReturn(0); 3574 } 3575 3576 #undef __FUNCT__ 3577 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct" 3578 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx) 3579 { 3580 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3581 IS iscol=a->col,isrow=a->row; 3582 PetscErrorCode ierr; 3583 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 3584 const PetscInt *r,*c,*rout,*cout; 3585 const MatScalar *aa=a->a,*v; 3586 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 3587 const PetscScalar *b; 3588 3589 PetscFunctionBegin; 3590 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3591 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3592 t = a->solve_work; 3593 3594 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3595 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3596 3597 /* forward solve the lower triangular */ 3598 idx = 3*r[0]; 3599 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 3600 for (i=1; i<n; i++) { 3601 v = aa + 9*ai[i]; 3602 vi = aj + ai[i]; 3603 nz = ai[i+1] - ai[i]; 3604 idx = 3*r[i]; 3605 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 3606 for(m=0;m<nz;m++){ 3607 idx = 3*vi[m]; 3608 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3609 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3610 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3611 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3612 v += 9; 3613 } 3614 idx = 3*i; 3615 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 3616 } 3617 /* backward solve the upper triangular */ 3618 for (i=n-1; i>=0; i--){ 3619 v = aa + 9*(adiag[i+1]+1); 3620 vi = aj + adiag[i+1]+1; 3621 nz = adiag[i] - adiag[i+1] - 1; 3622 idt = 3*i; 3623 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 3624 for(m=0;m<nz;m++){ 3625 idx = 3*vi[m]; 3626 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3627 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3628 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3629 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3630 v += 9; 3631 } 3632 idc = 3*c[i]; 3633 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3634 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3635 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 3636 } 3637 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3638 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3639 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3640 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3641 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 3642 PetscFunctionReturn(0); 3643 } 3644 3645 /* 3646 Special case where the matrix was ILU(0) factored in the natural 3647 ordering. This eliminates the need for the column and row permutation. 3648 */ 3649 #undef __FUNCT__ 3650 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 3651 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 3652 { 3653 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3654 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3655 PetscErrorCode ierr; 3656 PetscInt *diag = a->diag; 3657 const MatScalar *aa=a->a,*v; 3658 PetscScalar *x,s1,s2,s3,x1,x2,x3; 3659 const PetscScalar *b; 3660 PetscInt jdx,idt,idx,nz,*vi,i; 3661 3662 PetscFunctionBegin; 3663 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3664 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3665 3666 /* forward solve the lower triangular */ 3667 idx = 0; 3668 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 3669 for (i=1; i<n; i++) { 3670 v = aa + 9*ai[i]; 3671 vi = aj + ai[i]; 3672 nz = diag[i] - ai[i]; 3673 idx += 3; 3674 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 3675 while (nz--) { 3676 jdx = 3*(*vi++); 3677 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 3678 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3679 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3680 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3681 v += 9; 3682 } 3683 x[idx] = s1; 3684 x[1+idx] = s2; 3685 x[2+idx] = s3; 3686 } 3687 /* backward solve the upper triangular */ 3688 for (i=n-1; i>=0; i--){ 3689 v = aa + 9*diag[i] + 9; 3690 vi = aj + diag[i] + 1; 3691 nz = ai[i+1] - diag[i] - 1; 3692 idt = 3*i; 3693 s1 = x[idt]; s2 = x[1+idt]; 3694 s3 = x[2+idt]; 3695 while (nz--) { 3696 idx = 3*(*vi++); 3697 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 3698 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3699 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3700 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3701 v += 9; 3702 } 3703 v = aa + 9*diag[i]; 3704 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3705 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3706 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 3707 } 3708 3709 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3710 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3711 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 3712 PetscFunctionReturn(0); 3713 } 3714 3715 #undef __FUNCT__ 3716 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct" 3717 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3718 { 3719 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3720 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 3721 PetscErrorCode ierr; 3722 PetscInt idx,jdx,idt; 3723 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3724 const MatScalar *aa=a->a,*v; 3725 PetscScalar *x; 3726 const PetscScalar *b; 3727 PetscScalar s1,s2,s3,x1,x2,x3; 3728 3729 PetscFunctionBegin; 3730 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3731 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3732 /* forward solve the lower triangular */ 3733 idx = 0; 3734 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 3735 for (i=1; i<n; i++) { 3736 v = aa + bs2*ai[i]; 3737 vi = aj + ai[i]; 3738 nz = ai[i+1] - ai[i]; 3739 idx = bs*i; 3740 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 3741 for(k=0;k<nz;k++){ 3742 jdx = bs*vi[k]; 3743 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 3744 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3745 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3746 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3747 3748 v += bs2; 3749 } 3750 3751 x[idx] = s1; 3752 x[1+idx] = s2; 3753 x[2+idx] = s3; 3754 } 3755 3756 /* backward solve the upper triangular */ 3757 for (i=n-1; i>=0; i--){ 3758 v = aa + bs2*(adiag[i+1]+1); 3759 vi = aj + adiag[i+1]+1; 3760 nz = adiag[i] - adiag[i+1]-1; 3761 idt = bs*i; 3762 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 3763 3764 for(k=0;k<nz;k++){ 3765 idx = bs*vi[k]; 3766 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 3767 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3768 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3769 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3770 3771 v += bs2; 3772 } 3773 /* x = inv_diagonal*x */ 3774 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3775 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3776 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 3777 3778 } 3779 3780 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3781 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3782 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3783 PetscFunctionReturn(0); 3784 } 3785 3786 #undef __FUNCT__ 3787 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 3788 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 3789 { 3790 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3791 IS iscol=a->col,isrow=a->row; 3792 PetscErrorCode ierr; 3793 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3794 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3795 const MatScalar *aa=a->a,*v; 3796 PetscScalar *x,s1,s2,x1,x2,*t; 3797 const PetscScalar *b; 3798 3799 PetscFunctionBegin; 3800 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3801 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3802 t = a->solve_work; 3803 3804 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3805 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3806 3807 /* forward solve the lower triangular */ 3808 idx = 2*(*r++); 3809 t[0] = b[idx]; t[1] = b[1+idx]; 3810 for (i=1; i<n; i++) { 3811 v = aa + 4*ai[i]; 3812 vi = aj + ai[i]; 3813 nz = diag[i] - ai[i]; 3814 idx = 2*(*r++); 3815 s1 = b[idx]; s2 = b[1+idx]; 3816 while (nz--) { 3817 idx = 2*(*vi++); 3818 x1 = t[idx]; x2 = t[1+idx]; 3819 s1 -= v[0]*x1 + v[2]*x2; 3820 s2 -= v[1]*x1 + v[3]*x2; 3821 v += 4; 3822 } 3823 idx = 2*i; 3824 t[idx] = s1; t[1+idx] = s2; 3825 } 3826 /* backward solve the upper triangular */ 3827 for (i=n-1; i>=0; i--){ 3828 v = aa + 4*diag[i] + 4; 3829 vi = aj + diag[i] + 1; 3830 nz = ai[i+1] - diag[i] - 1; 3831 idt = 2*i; 3832 s1 = t[idt]; s2 = t[1+idt]; 3833 while (nz--) { 3834 idx = 2*(*vi++); 3835 x1 = t[idx]; x2 = t[1+idx]; 3836 s1 -= v[0]*x1 + v[2]*x2; 3837 s2 -= v[1]*x1 + v[3]*x2; 3838 v += 4; 3839 } 3840 idc = 2*(*c--); 3841 v = aa + 4*diag[i]; 3842 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 3843 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 3844 } 3845 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3846 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3847 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3848 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3849 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 3850 PetscFunctionReturn(0); 3851 } 3852 3853 #undef __FUNCT__ 3854 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct" 3855 PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx) 3856 { 3857 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3858 IS iscol=a->col,isrow=a->row; 3859 PetscErrorCode ierr; 3860 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m; 3861 const PetscInt *r,*c,*rout,*cout; 3862 const MatScalar *aa=a->a,*v; 3863 PetscScalar *x,s1,s2,x1,x2,*t; 3864 const PetscScalar *b; 3865 3866 PetscFunctionBegin; 3867 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3868 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3869 t = a->solve_work; 3870 3871 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3872 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3873 3874 /* forward solve the lower triangular */ 3875 idx = 2*r[0]; 3876 t[0] = b[idx]; t[1] = b[1+idx]; 3877 for (i=1; i<n; i++) { 3878 v = aa + 4*ai[i]; 3879 vi = aj + ai[i]; 3880 nz = ai[i+1] - ai[i]; 3881 idx = 2*r[i]; 3882 s1 = b[idx]; s2 = b[1+idx]; 3883 for(m=0;m<nz;m++){ 3884 jdx = 2*vi[m]; 3885 x1 = t[jdx]; x2 = t[1+jdx]; 3886 s1 -= v[0]*x1 + v[2]*x2; 3887 s2 -= v[1]*x1 + v[3]*x2; 3888 v += 4; 3889 } 3890 idx = 2*i; 3891 t[idx] = s1; t[1+idx] = s2; 3892 } 3893 /* backward solve the upper triangular */ 3894 for (i=n-1; i>=0; i--){ 3895 v = aa + 4*(adiag[i+1]+1); 3896 vi = aj + adiag[i+1]+1; 3897 nz = adiag[i] - adiag[i+1] - 1; 3898 idt = 2*i; 3899 s1 = t[idt]; s2 = t[1+idt]; 3900 for(m=0;m<nz;m++){ 3901 idx = 2*vi[m]; 3902 x1 = t[idx]; x2 = t[1+idx]; 3903 s1 -= v[0]*x1 + v[2]*x2; 3904 s2 -= v[1]*x1 + v[3]*x2; 3905 v += 4; 3906 } 3907 idc = 2*c[i]; 3908 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 3909 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 3910 } 3911 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3912 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3913 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3914 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3915 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 3916 PetscFunctionReturn(0); 3917 } 3918 3919 /* 3920 Special case where the matrix was ILU(0) factored in the natural 3921 ordering. This eliminates the need for the column and row permutation. 3922 */ 3923 #undef __FUNCT__ 3924 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 3925 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 3926 { 3927 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3928 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3929 PetscErrorCode ierr; 3930 PetscInt *diag = a->diag; 3931 const MatScalar *aa=a->a,*v; 3932 PetscScalar *x,s1,s2,x1,x2; 3933 const PetscScalar *b; 3934 PetscInt jdx,idt,idx,nz,*vi,i; 3935 3936 PetscFunctionBegin; 3937 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3938 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3939 3940 /* forward solve the lower triangular */ 3941 idx = 0; 3942 x[0] = b[0]; x[1] = b[1]; 3943 for (i=1; i<n; i++) { 3944 v = aa + 4*ai[i]; 3945 vi = aj + ai[i]; 3946 nz = diag[i] - ai[i]; 3947 idx += 2; 3948 s1 = b[idx];s2 = b[1+idx]; 3949 while (nz--) { 3950 jdx = 2*(*vi++); 3951 x1 = x[jdx];x2 = x[1+jdx]; 3952 s1 -= v[0]*x1 + v[2]*x2; 3953 s2 -= v[1]*x1 + v[3]*x2; 3954 v += 4; 3955 } 3956 x[idx] = s1; 3957 x[1+idx] = s2; 3958 } 3959 /* backward solve the upper triangular */ 3960 for (i=n-1; i>=0; i--){ 3961 v = aa + 4*diag[i] + 4; 3962 vi = aj + diag[i] + 1; 3963 nz = ai[i+1] - diag[i] - 1; 3964 idt = 2*i; 3965 s1 = x[idt]; s2 = x[1+idt]; 3966 while (nz--) { 3967 idx = 2*(*vi++); 3968 x1 = x[idx]; x2 = x[1+idx]; 3969 s1 -= v[0]*x1 + v[2]*x2; 3970 s2 -= v[1]*x1 + v[3]*x2; 3971 v += 4; 3972 } 3973 v = aa + 4*diag[i]; 3974 x[idt] = v[0]*s1 + v[2]*s2; 3975 x[1+idt] = v[1]*s1 + v[3]*s2; 3976 } 3977 3978 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3979 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3980 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 3981 PetscFunctionReturn(0); 3982 } 3983 3984 #undef __FUNCT__ 3985 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct" 3986 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3987 { 3988 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3989 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 3990 PetscErrorCode ierr; 3991 PetscInt jdx; 3992 const MatScalar *aa=a->a,*v; 3993 PetscScalar *x,s1,s2,x1,x2; 3994 const PetscScalar *b; 3995 3996 PetscFunctionBegin; 3997 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3998 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3999 /* forward solve the lower triangular */ 4000 idx = 0; 4001 x[0] = b[idx]; x[1] = b[1+idx]; 4002 for (i=1; i<n; i++) { 4003 v = aa + 4*ai[i]; 4004 vi = aj + ai[i]; 4005 nz = ai[i+1] - ai[i]; 4006 idx = 2*i; 4007 s1 = b[idx];s2 = b[1+idx]; 4008 for(k=0;k<nz;k++){ 4009 jdx = 2*vi[k]; 4010 x1 = x[jdx];x2 = x[1+jdx]; 4011 s1 -= v[0]*x1 + v[2]*x2; 4012 s2 -= v[1]*x1 + v[3]*x2; 4013 v += 4; 4014 } 4015 x[idx] = s1; 4016 x[1+idx] = s2; 4017 } 4018 4019 /* backward solve the upper triangular */ 4020 for (i=n-1; i>=0; i--){ 4021 v = aa + 4*(adiag[i+1]+1); 4022 vi = aj + adiag[i+1]+1; 4023 nz = adiag[i] - adiag[i+1]-1; 4024 idt = 2*i; 4025 s1 = x[idt]; s2 = x[1+idt]; 4026 for(k=0;k<nz;k++){ 4027 idx = 2*vi[k]; 4028 x1 = x[idx]; x2 = x[1+idx]; 4029 s1 -= v[0]*x1 + v[2]*x2; 4030 s2 -= v[1]*x1 + v[3]*x2; 4031 v += 4; 4032 } 4033 /* x = inv_diagonal*x */ 4034 x[idt] = v[0]*s1 + v[2]*s2; 4035 x[1+idt] = v[1]*s1 + v[3]*s2; 4036 } 4037 4038 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4039 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4040 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4041 PetscFunctionReturn(0); 4042 } 4043 4044 #undef __FUNCT__ 4045 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 4046 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 4047 { 4048 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4049 IS iscol=a->col,isrow=a->row; 4050 PetscErrorCode ierr; 4051 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 4052 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4053 MatScalar *aa=a->a,*v; 4054 PetscScalar *x,*b,s1,*t; 4055 4056 PetscFunctionBegin; 4057 if (!n) PetscFunctionReturn(0); 4058 4059 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4060 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4061 t = a->solve_work; 4062 4063 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4064 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4065 4066 /* forward solve the lower triangular */ 4067 t[0] = b[*r++]; 4068 for (i=1; i<n; i++) { 4069 v = aa + ai[i]; 4070 vi = aj + ai[i]; 4071 nz = diag[i] - ai[i]; 4072 s1 = b[*r++]; 4073 while (nz--) { 4074 s1 -= (*v++)*t[*vi++]; 4075 } 4076 t[i] = s1; 4077 } 4078 /* backward solve the upper triangular */ 4079 for (i=n-1; i>=0; i--){ 4080 v = aa + diag[i] + 1; 4081 vi = aj + diag[i] + 1; 4082 nz = ai[i+1] - diag[i] - 1; 4083 s1 = t[i]; 4084 while (nz--) { 4085 s1 -= (*v++)*t[*vi++]; 4086 } 4087 x[*c--] = t[i] = aa[diag[i]]*s1; 4088 } 4089 4090 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4091 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4092 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4093 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4094 ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 4095 PetscFunctionReturn(0); 4096 } 4097 /* 4098 Special case where the matrix was ILU(0) factored in the natural 4099 ordering. This eliminates the need for the column and row permutation. 4100 */ 4101 #undef __FUNCT__ 4102 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 4103 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 4104 { 4105 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4106 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4107 PetscErrorCode ierr; 4108 PetscInt *diag = a->diag; 4109 MatScalar *aa=a->a; 4110 PetscScalar *x,*b; 4111 PetscScalar s1,x1; 4112 MatScalar *v; 4113 PetscInt jdx,idt,idx,nz,*vi,i; 4114 4115 PetscFunctionBegin; 4116 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4117 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4118 4119 /* forward solve the lower triangular */ 4120 idx = 0; 4121 x[0] = b[0]; 4122 for (i=1; i<n; i++) { 4123 v = aa + ai[i]; 4124 vi = aj + ai[i]; 4125 nz = diag[i] - ai[i]; 4126 idx += 1; 4127 s1 = b[idx]; 4128 while (nz--) { 4129 jdx = *vi++; 4130 x1 = x[jdx]; 4131 s1 -= v[0]*x1; 4132 v += 1; 4133 } 4134 x[idx] = s1; 4135 } 4136 /* backward solve the upper triangular */ 4137 for (i=n-1; i>=0; i--){ 4138 v = aa + diag[i] + 1; 4139 vi = aj + diag[i] + 1; 4140 nz = ai[i+1] - diag[i] - 1; 4141 idt = i; 4142 s1 = x[idt]; 4143 while (nz--) { 4144 idx = *vi++; 4145 x1 = x[idx]; 4146 s1 -= v[0]*x1; 4147 v += 1; 4148 } 4149 v = aa + diag[i]; 4150 x[idt] = v[0]*s1; 4151 } 4152 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4153 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4154 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 4155 PetscFunctionReturn(0); 4156 } 4157 4158 /* ----------------------------------------------------------------*/ 4159 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 4160 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 4161 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization_newdatastruct(Mat,PetscTruth); 4162 4163 #undef __FUNCT__ 4164 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 4165 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 4166 { 4167 Mat C=B; 4168 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 4169 IS isrow = b->row,isicol = b->icol; 4170 PetscErrorCode ierr; 4171 const PetscInt *r,*ic,*ics; 4172 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 4173 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 4174 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 4175 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 4176 MatScalar *v_work; 4177 PetscTruth col_identity,row_identity,both_identity; 4178 4179 PetscFunctionBegin; 4180 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 4181 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 4182 4183 ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 4184 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 4185 ics = ic; 4186 4187 /* generate work space needed by dense LU factorization */ 4188 ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 4189 4190 for (i=0; i<n; i++){ 4191 /* zero rtmp */ 4192 /* L part */ 4193 nz = bi[i+1] - bi[i]; 4194 bjtmp = bj + bi[i]; 4195 for (j=0; j<nz; j++){ 4196 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4197 } 4198 4199 /* U part */ 4200 nz = bdiag[i] - bdiag[i+1]; 4201 bjtmp = bj + bdiag[i+1]+1; 4202 for (j=0; j<nz; j++){ 4203 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4204 } 4205 4206 /* load in initial (unfactored row) */ 4207 nz = ai[r[i]+1] - ai[r[i]]; 4208 ajtmp = aj + ai[r[i]]; 4209 v = aa + bs2*ai[r[i]]; 4210 for (j=0; j<nz; j++) { 4211 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 4212 } 4213 4214 /* elimination */ 4215 bjtmp = bj + bi[i]; 4216 nzL = bi[i+1] - bi[i]; 4217 for(k=0;k < nzL;k++) { 4218 row = bjtmp[k]; 4219 pc = rtmp + bs2*row; 4220 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 4221 if (flg) { 4222 pv = b->a + bs2*bdiag[row]; 4223 Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 4224 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 4225 pv = b->a + bs2*(bdiag[row+1]+1); 4226 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 4227 for (j=0; j<nz; j++) { 4228 Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 4229 } 4230 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 4231 } 4232 } 4233 4234 /* finished row so stick it into b->a */ 4235 /* L part */ 4236 pv = b->a + bs2*bi[i] ; 4237 pj = b->j + bi[i] ; 4238 nz = bi[i+1] - bi[i]; 4239 for (j=0; j<nz; j++) { 4240 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4241 } 4242 4243 /* Mark diagonal and invert diagonal for simplier triangular solves */ 4244 pv = b->a + bs2*bdiag[i]; 4245 pj = b->j + bdiag[i]; 4246 /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 4247 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4248 ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 4249 4250 /* U part */ 4251 pv = b->a + bs2*(bdiag[i+1]+1); 4252 pj = b->j + bdiag[i+1]+1; 4253 nz = bdiag[i] - bdiag[i+1] - 1; 4254 for (j=0; j<nz; j++){ 4255 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4256 } 4257 } 4258 4259 ierr = PetscFree(rtmp);CHKERRQ(ierr); 4260 ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 4261 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 4262 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 4263 4264 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 4265 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 4266 both_identity = (PetscTruth) (row_identity && col_identity); 4267 if (both_identity){ 4268 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 4269 } else { 4270 C->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct; 4271 } 4272 4273 C->assembled = PETSC_TRUE; 4274 ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 4275 PetscFunctionReturn(0); 4276 } 4277 4278 /* 4279 ilu(0) with natural ordering under new data structure. 4280 See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description 4281 because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct(). 4282 */ 4283 4284 #undef __FUNCT__ 4285 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 4286 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 4287 { 4288 4289 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 4290 PetscErrorCode ierr; 4291 PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 4292 PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 4293 4294 PetscFunctionBegin; 4295 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 4296 b = (Mat_SeqBAIJ*)(fact)->data; 4297 4298 /* allocate matrix arrays for new data structure */ 4299 ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 4300 ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 4301 b->singlemalloc = PETSC_TRUE; 4302 if (!b->diag){ 4303 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 4304 ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 4305 } 4306 bdiag = b->diag; 4307 4308 if (n > 0) { 4309 ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 4310 } 4311 4312 /* set bi and bj with new data structure */ 4313 bi = b->i; 4314 bj = b->j; 4315 4316 /* L part */ 4317 bi[0] = 0; 4318 for (i=0; i<n; i++){ 4319 nz = adiag[i] - ai[i]; 4320 bi[i+1] = bi[i] + nz; 4321 aj = a->j + ai[i]; 4322 for (j=0; j<nz; j++){ 4323 *bj = aj[j]; bj++; 4324 } 4325 } 4326 4327 /* U part */ 4328 bi_temp = bi[n]; 4329 bdiag[n] = bi[n]-1; 4330 for (i=n-1; i>=0; i--){ 4331 nz = ai[i+1] - adiag[i] - 1; 4332 bi_temp = bi_temp + nz + 1; 4333 aj = a->j + adiag[i] + 1; 4334 for (j=0; j<nz; j++){ 4335 *bj = aj[j]; bj++; 4336 } 4337 /* diag[i] */ 4338 *bj = i; bj++; 4339 bdiag[i] = bi_temp - 1; 4340 } 4341 PetscFunctionReturn(0); 4342 } 4343 4344 #undef __FUNCT__ 4345 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct" 4346 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 4347 { 4348 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 4349 IS isicol; 4350 PetscErrorCode ierr; 4351 const PetscInt *r,*ic; 4352 PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 4353 PetscInt *bi,*cols,nnz,*cols_lvl; 4354 PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 4355 PetscInt i,levels,diagonal_fill; 4356 PetscTruth col_identity,row_identity,both_identity; 4357 PetscReal f; 4358 PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 4359 PetscBT lnkbt; 4360 PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 4361 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 4362 PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 4363 PetscTruth missing; 4364 PetscInt bs=A->rmap->bs,bs2=a->bs2; 4365 4366 PetscFunctionBegin; 4367 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 4368 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 4369 if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 4370 4371 f = info->fill; 4372 levels = (PetscInt)info->levels; 4373 diagonal_fill = (PetscInt)info->diagonal_fill; 4374 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 4375 4376 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 4377 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 4378 both_identity = (PetscTruth) (row_identity && col_identity); 4379 4380 if (!levels && both_identity) { 4381 /* special case: ilu(0) with natural ordering */ 4382 ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 4383 ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr); 4384 4385 fact->factor = MAT_FACTOR_ILU; 4386 (fact)->info.factor_mallocs = 0; 4387 (fact)->info.fill_ratio_given = info->fill; 4388 (fact)->info.fill_ratio_needed = 1.0; 4389 b = (Mat_SeqBAIJ*)(fact)->data; 4390 b->row = isrow; 4391 b->col = iscol; 4392 b->icol = isicol; 4393 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 4394 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 4395 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 4396 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 4397 PetscFunctionReturn(0); 4398 } 4399 4400 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 4401 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 4402 4403 /* get new row pointers */ 4404 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 4405 bi[0] = 0; 4406 /* bdiag is location of diagonal in factor */ 4407 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 4408 bdiag[0] = 0; 4409 4410 ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 4411 4412 /* create a linked list for storing column indices of the active row */ 4413 nlnk = n + 1; 4414 ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 4415 4416 /* initial FreeSpace size is f*(ai[n]+1) */ 4417 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 4418 current_space = free_space; 4419 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 4420 current_space_lvl = free_space_lvl; 4421 4422 for (i=0; i<n; i++) { 4423 nzi = 0; 4424 /* copy current row into linked list */ 4425 nnz = ai[r[i]+1] - ai[r[i]]; 4426 if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 4427 cols = aj + ai[r[i]]; 4428 lnk[i] = -1; /* marker to indicate if diagonal exists */ 4429 ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 4430 nzi += nlnk; 4431 4432 /* make sure diagonal entry is included */ 4433 if (diagonal_fill && lnk[i] == -1) { 4434 fm = n; 4435 while (lnk[fm] < i) fm = lnk[fm]; 4436 lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 4437 lnk[fm] = i; 4438 lnk_lvl[i] = 0; 4439 nzi++; dcount++; 4440 } 4441 4442 /* add pivot rows into the active row */ 4443 nzbd = 0; 4444 prow = lnk[n]; 4445 while (prow < i) { 4446 nnz = bdiag[prow]; 4447 cols = bj_ptr[prow] + nnz + 1; 4448 cols_lvl = bjlvl_ptr[prow] + nnz + 1; 4449 nnz = bi[prow+1] - bi[prow] - nnz - 1; 4450 ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 4451 nzi += nlnk; 4452 prow = lnk[prow]; 4453 nzbd++; 4454 } 4455 bdiag[i] = nzbd; 4456 bi[i+1] = bi[i] + nzi; 4457 4458 /* if free space is not available, make more free space */ 4459 if (current_space->local_remaining<nzi) { 4460 nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 4461 ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 4462 ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 4463 reallocs++; 4464 } 4465 4466 /* copy data into free_space and free_space_lvl, then initialize lnk */ 4467 ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 4468 bj_ptr[i] = current_space->array; 4469 bjlvl_ptr[i] = current_space_lvl->array; 4470 4471 /* make sure the active row i has diagonal entry */ 4472 if (*(bj_ptr[i]+bdiag[i]) != i) { 4473 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 4474 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 4475 } 4476 4477 current_space->array += nzi; 4478 current_space->local_used += nzi; 4479 current_space->local_remaining -= nzi; 4480 current_space_lvl->array += nzi; 4481 current_space_lvl->local_used += nzi; 4482 current_space_lvl->local_remaining -= nzi; 4483 } 4484 4485 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 4486 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 4487 4488 /* destroy list of free space and other temporary arrays */ 4489 ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 4490 4491 /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 4492 ierr = PetscFreeSpaceContiguous_LU_v2(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 4493 4494 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 4495 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 4496 ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 4497 4498 #if defined(PETSC_USE_INFO) 4499 { 4500 PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 4501 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 4502 ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 4503 ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 4504 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 4505 if (diagonal_fill) { 4506 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 4507 } 4508 } 4509 #endif 4510 4511 /* put together the new matrix */ 4512 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 4513 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 4514 b = (Mat_SeqBAIJ*)(fact)->data; 4515 b->free_a = PETSC_TRUE; 4516 b->free_ij = PETSC_TRUE; 4517 b->singlemalloc = PETSC_FALSE; 4518 ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 4519 b->j = bj; 4520 b->i = bi; 4521 b->diag = bdiag; 4522 b->free_diag = PETSC_TRUE; 4523 b->ilen = 0; 4524 b->imax = 0; 4525 b->row = isrow; 4526 b->col = iscol; 4527 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 4528 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 4529 b->icol = isicol; 4530 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 4531 /* In b structure: Free imax, ilen, old a, old j. 4532 Allocate bdiag, solve_work, new a, new j */ 4533 ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 4534 b->maxnz = b->nz = bdiag[0]+1; 4535 fact->info.factor_mallocs = reallocs; 4536 fact->info.fill_ratio_given = f; 4537 fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 4538 ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr); 4539 PetscFunctionReturn(0); 4540 } 4541 4542 4543 /* 4544 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 4545 except that the data structure of Mat_SeqAIJ is slightly different. 4546 Not a good example of code reuse. 4547 */ 4548 #undef __FUNCT__ 4549 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 4550 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 4551 { 4552 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 4553 IS isicol; 4554 PetscErrorCode ierr; 4555 const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 4556 PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 4557 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 4558 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 4559 PetscTruth col_identity,row_identity,both_identity,flg; 4560 PetscReal f; 4561 PetscTruth newdatastruct = PETSC_FALSE; 4562 4563 PetscFunctionBegin; 4564 ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 4565 if (newdatastruct){ 4566 ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 4567 PetscFunctionReturn(0); 4568 } 4569 4570 ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 4571 if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 4572 4573 f = info->fill; 4574 levels = (PetscInt)info->levels; 4575 diagonal_fill = (PetscInt)info->diagonal_fill; 4576 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 4577 4578 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 4579 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 4580 both_identity = (PetscTruth) (row_identity && col_identity); 4581 4582 if (!levels && both_identity) { /* special case copy the nonzero structure */ 4583 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 4584 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 4585 4586 fact->factor = MAT_FACTOR_ILU; 4587 b = (Mat_SeqBAIJ*)fact->data; 4588 b->row = isrow; 4589 b->col = iscol; 4590 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 4591 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 4592 b->icol = isicol; 4593 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 4594 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 4595 PetscFunctionReturn(0); 4596 } 4597 4598 /* general case perform the symbolic factorization */ 4599 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 4600 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 4601 4602 /* get new row pointers */ 4603 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 4604 ainew[0] = 0; 4605 /* don't know how many column pointers are needed so estimate */ 4606 jmax = (PetscInt)(f*ai[n] + 1); 4607 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 4608 /* ajfill is level of fill for each fill entry */ 4609 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 4610 /* fill is a linked list of nonzeros in active row */ 4611 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 4612 /* im is level for each filled value */ 4613 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 4614 /* dloc is location of diagonal in factor */ 4615 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 4616 dloc[0] = 0; 4617 for (prow=0; prow<n; prow++) { 4618 4619 /* copy prow into linked list */ 4620 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 4621 if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 4622 xi = aj + ai[r[prow]]; 4623 fill[n] = n; 4624 fill[prow] = -1; /* marker for diagonal entry */ 4625 while (nz--) { 4626 fm = n; 4627 idx = ic[*xi++]; 4628 do { 4629 m = fm; 4630 fm = fill[m]; 4631 } while (fm < idx); 4632 fill[m] = idx; 4633 fill[idx] = fm; 4634 im[idx] = 0; 4635 } 4636 4637 /* make sure diagonal entry is included */ 4638 if (diagonal_fill && fill[prow] == -1) { 4639 fm = n; 4640 while (fill[fm] < prow) fm = fill[fm]; 4641 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 4642 fill[fm] = prow; 4643 im[prow] = 0; 4644 nzf++; 4645 dcount++; 4646 } 4647 4648 nzi = 0; 4649 row = fill[n]; 4650 while (row < prow) { 4651 incrlev = im[row] + 1; 4652 nz = dloc[row]; 4653 xi = ajnew + ainew[row] + nz + 1; 4654 flev = ajfill + ainew[row] + nz + 1; 4655 nnz = ainew[row+1] - ainew[row] - nz - 1; 4656 fm = row; 4657 while (nnz-- > 0) { 4658 idx = *xi++; 4659 if (*flev + incrlev > levels) { 4660 flev++; 4661 continue; 4662 } 4663 do { 4664 m = fm; 4665 fm = fill[m]; 4666 } while (fm < idx); 4667 if (fm != idx) { 4668 im[idx] = *flev + incrlev; 4669 fill[m] = idx; 4670 fill[idx] = fm; 4671 fm = idx; 4672 nzf++; 4673 } else { 4674 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 4675 } 4676 flev++; 4677 } 4678 row = fill[row]; 4679 nzi++; 4680 } 4681 /* copy new filled row into permanent storage */ 4682 ainew[prow+1] = ainew[prow] + nzf; 4683 if (ainew[prow+1] > jmax) { 4684 4685 /* estimate how much additional space we will need */ 4686 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 4687 /* just double the memory each time */ 4688 PetscInt maxadd = jmax; 4689 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 4690 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 4691 jmax += maxadd; 4692 4693 /* allocate a longer ajnew and ajfill */ 4694 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 4695 ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 4696 ierr = PetscFree(ajnew);CHKERRQ(ierr); 4697 ajnew = xitmp; 4698 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 4699 ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 4700 ierr = PetscFree(ajfill);CHKERRQ(ierr); 4701 ajfill = xitmp; 4702 reallocate++; /* count how many reallocations are needed */ 4703 } 4704 xitmp = ajnew + ainew[prow]; 4705 flev = ajfill + ainew[prow]; 4706 dloc[prow] = nzi; 4707 fm = fill[n]; 4708 while (nzf--) { 4709 *xitmp++ = fm; 4710 *flev++ = im[fm]; 4711 fm = fill[fm]; 4712 } 4713 /* make sure row has diagonal entry */ 4714 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 4715 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 4716 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 4717 } 4718 } 4719 ierr = PetscFree(ajfill);CHKERRQ(ierr); 4720 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 4721 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 4722 ierr = PetscFree(fill);CHKERRQ(ierr); 4723 ierr = PetscFree(im);CHKERRQ(ierr); 4724 4725 #if defined(PETSC_USE_INFO) 4726 { 4727 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 4728 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 4729 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 4730 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 4731 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 4732 if (diagonal_fill) { 4733 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 4734 } 4735 } 4736 #endif 4737 4738 /* put together the new matrix */ 4739 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 4740 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 4741 b = (Mat_SeqBAIJ*)fact->data; 4742 b->free_a = PETSC_TRUE; 4743 b->free_ij = PETSC_TRUE; 4744 b->singlemalloc = PETSC_FALSE; 4745 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 4746 b->j = ajnew; 4747 b->i = ainew; 4748 for (i=0; i<n; i++) dloc[i] += ainew[i]; 4749 b->diag = dloc; 4750 b->free_diag = PETSC_TRUE; 4751 b->ilen = 0; 4752 b->imax = 0; 4753 b->row = isrow; 4754 b->col = iscol; 4755 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 4756 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 4757 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 4758 b->icol = isicol; 4759 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 4760 /* In b structure: Free imax, ilen, old a, old j. 4761 Allocate dloc, solve_work, new a, new j */ 4762 ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 4763 b->maxnz = b->nz = ainew[n]; 4764 4765 fact->info.factor_mallocs = reallocate; 4766 fact->info.fill_ratio_given = f; 4767 fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 4768 4769 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 4770 PetscFunctionReturn(0); 4771 } 4772 4773 #undef __FUNCT__ 4774 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 4775 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 4776 { 4777 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 4778 /* int i,*AJ=a->j,nz=a->nz; */ 4779 PetscFunctionBegin; 4780 /* Undo Column scaling */ 4781 /* while (nz--) { */ 4782 /* AJ[i] = AJ[i]/4; */ 4783 /* } */ 4784 /* This should really invoke a push/pop logic, but we don't have that yet. */ 4785 A->ops->setunfactored = PETSC_NULL; 4786 PetscFunctionReturn(0); 4787 } 4788 4789 #undef __FUNCT__ 4790 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 4791 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 4792 { 4793 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4794 PetscInt *AJ=a->j,nz=a->nz; 4795 unsigned short *aj=(unsigned short *)AJ; 4796 PetscFunctionBegin; 4797 /* Is this really necessary? */ 4798 while (nz--) { 4799 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 4800 } 4801 A->ops->setunfactored = PETSC_NULL; 4802 PetscFunctionReturn(0); 4803 } 4804 4805 4806