1 #define PETSCMAT_DLL 2 3 4 /* 5 Factorization code for BAIJ format. 6 */ 7 8 #include "../src/mat/impls/baij/seq/baij.h" 9 #include "../src/mat/blockinvert.h" 10 #include "petscbt.h" 11 #include "../src/mat/utils/freespace.h" 12 13 #undef __FUNCT__ 14 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 15 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 16 { 17 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 18 PetscErrorCode ierr; 19 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 20 PetscInt *diag = a->diag; 21 MatScalar *aa=a->a,*v; 22 PetscScalar s1,*x,*b; 23 24 PetscFunctionBegin; 25 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 26 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 27 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28 29 /* forward solve the U^T */ 30 for (i=0; i<n; i++) { 31 32 v = aa + diag[i]; 33 /* multiply by the inverse of the block diagonal */ 34 s1 = (*v++)*x[i]; 35 vi = aj + diag[i] + 1; 36 nz = ai[i+1] - diag[i] - 1; 37 while (nz--) { 38 x[*vi++] -= (*v++)*s1; 39 } 40 x[i] = s1; 41 } 42 /* backward solve the L^T */ 43 for (i=n-1; i>=0; i--){ 44 v = aa + diag[i] - 1; 45 vi = aj + diag[i] - 1; 46 nz = diag[i] - ai[i]; 47 s1 = x[i]; 48 while (nz--) { 49 x[*vi--] -= (*v--)*s1; 50 } 51 } 52 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 53 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 54 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 55 PetscFunctionReturn(0); 56 } 57 58 #undef __FUNCT__ 59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 61 { 62 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 63 PetscErrorCode ierr; 64 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 65 PetscInt *diag = a->diag,oidx; 66 MatScalar *aa=a->a,*v; 67 PetscScalar s1,s2,x1,x2; 68 PetscScalar *x,*b; 69 70 PetscFunctionBegin; 71 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 72 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 73 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 
74 75 /* forward solve the U^T */ 76 idx = 0; 77 for (i=0; i<n; i++) { 78 79 v = aa + 4*diag[i]; 80 /* multiply by the inverse of the block diagonal */ 81 x1 = x[idx]; x2 = x[1+idx]; 82 s1 = v[0]*x1 + v[1]*x2; 83 s2 = v[2]*x1 + v[3]*x2; 84 v += 4; 85 86 vi = aj + diag[i] + 1; 87 nz = ai[i+1] - diag[i] - 1; 88 while (nz--) { 89 oidx = 2*(*vi++); 90 x[oidx] -= v[0]*s1 + v[1]*s2; 91 x[oidx+1] -= v[2]*s1 + v[3]*s2; 92 v += 4; 93 } 94 x[idx] = s1;x[1+idx] = s2; 95 idx += 2; 96 } 97 /* backward solve the L^T */ 98 for (i=n-1; i>=0; i--){ 99 v = aa + 4*diag[i] - 4; 100 vi = aj + diag[i] - 1; 101 nz = diag[i] - ai[i]; 102 idt = 2*i; 103 s1 = x[idt]; s2 = x[1+idt]; 104 while (nz--) { 105 idx = 2*(*vi--); 106 x[idx] -= v[0]*s1 + v[1]*s2; 107 x[idx+1] -= v[2]*s1 + v[3]*s2; 108 v -= 4; 109 } 110 } 111 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 112 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 113 ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 114 PetscFunctionReturn(0); 115 } 116 117 #undef __FUNCT__ 118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 119 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 120 { 121 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 122 PetscErrorCode ierr; 123 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 124 PetscInt *diag = a->diag,oidx; 125 MatScalar *aa=a->a,*v; 126 PetscScalar s1,s2,s3,x1,x2,x3; 127 PetscScalar *x,*b; 128 129 PetscFunctionBegin; 130 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 131 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 132 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 133 134 /* forward solve the U^T */ 135 idx = 0; 136 for (i=0; i<n; i++) { 137 138 v = aa + 9*diag[i]; 139 /* multiply by the inverse of the block diagonal */ 140 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 141 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 142 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 143 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 144 v += 9; 145 146 vi = aj + diag[i] + 1; 147 nz = ai[i+1] - diag[i] - 1; 148 while 
(nz--) { 149 oidx = 3*(*vi++); 150 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 151 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 152 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 153 v += 9; 154 } 155 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 156 idx += 3; 157 } 158 /* backward solve the L^T */ 159 for (i=n-1; i>=0; i--){ 160 v = aa + 9*diag[i] - 9; 161 vi = aj + diag[i] - 1; 162 nz = diag[i] - ai[i]; 163 idt = 3*i; 164 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 165 while (nz--) { 166 idx = 3*(*vi--); 167 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 168 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 169 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 170 v -= 9; 171 } 172 } 173 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 174 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 175 ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 176 PetscFunctionReturn(0); 177 } 178 179 #undef __FUNCT__ 180 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 181 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 182 { 183 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 184 PetscErrorCode ierr; 185 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 186 PetscInt *diag = a->diag,oidx; 187 MatScalar *aa=a->a,*v; 188 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 189 PetscScalar *x,*b; 190 191 PetscFunctionBegin; 192 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 193 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 194 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 195 196 /* forward solve the U^T */ 197 idx = 0; 198 for (i=0; i<n; i++) { 199 200 v = aa + 16*diag[i]; 201 /* multiply by the inverse of the block diagonal */ 202 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 203 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 204 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 205 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 206 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 207 v += 16; 208 209 vi = aj + diag[i] + 1; 210 nz = ai[i+1] - diag[i] - 1; 211 while (nz--) { 212 oidx = 4*(*vi++); 213 
x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 214 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 215 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 216 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 217 v += 16; 218 } 219 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 220 idx += 4; 221 } 222 /* backward solve the L^T */ 223 for (i=n-1; i>=0; i--){ 224 v = aa + 16*diag[i] - 16; 225 vi = aj + diag[i] - 1; 226 nz = diag[i] - ai[i]; 227 idt = 4*i; 228 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 229 while (nz--) { 230 idx = 4*(*vi--); 231 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 232 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 233 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 234 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 235 v -= 16; 236 } 237 } 238 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 239 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 240 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 241 PetscFunctionReturn(0); 242 } 243 244 #undef __FUNCT__ 245 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 246 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 247 { 248 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 249 PetscErrorCode ierr; 250 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 251 PetscInt *diag = a->diag,oidx; 252 MatScalar *aa=a->a,*v; 253 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 254 PetscScalar *x,*b; 255 256 PetscFunctionBegin; 257 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 258 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 259 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 260 261 /* forward solve the U^T */ 262 idx = 0; 263 for (i=0; i<n; i++) { 264 265 v = aa + 25*diag[i]; 266 /* multiply by the inverse of the block diagonal */ 267 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 268 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 269 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 270 s3 = 
v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 271 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 272 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 273 v += 25; 274 275 vi = aj + diag[i] + 1; 276 nz = ai[i+1] - diag[i] - 1; 277 while (nz--) { 278 oidx = 5*(*vi++); 279 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 280 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 281 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 282 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 283 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 284 v += 25; 285 } 286 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 287 idx += 5; 288 } 289 /* backward solve the L^T */ 290 for (i=n-1; i>=0; i--){ 291 v = aa + 25*diag[i] - 25; 292 vi = aj + diag[i] - 1; 293 nz = diag[i] - ai[i]; 294 idt = 5*i; 295 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 296 while (nz--) { 297 idx = 5*(*vi--); 298 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 299 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 300 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 301 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 302 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 303 v -= 25; 304 } 305 } 306 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 307 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 308 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 309 PetscFunctionReturn(0); 310 } 311 312 #undef __FUNCT__ 313 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 314 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 315 { 316 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 317 PetscErrorCode ierr; 318 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 319 PetscInt *diag = a->diag,oidx; 320 MatScalar *aa=a->a,*v; 321 PetscScalar 
s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 322 PetscScalar *x,*b; 323 324 PetscFunctionBegin; 325 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 326 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 327 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 328 329 /* forward solve the U^T */ 330 idx = 0; 331 for (i=0; i<n; i++) { 332 333 v = aa + 36*diag[i]; 334 /* multiply by the inverse of the block diagonal */ 335 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 336 x6 = x[5+idx]; 337 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 338 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 339 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 340 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 341 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 342 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 343 v += 36; 344 345 vi = aj + diag[i] + 1; 346 nz = ai[i+1] - diag[i] - 1; 347 while (nz--) { 348 oidx = 6*(*vi++); 349 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 350 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 351 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 352 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 353 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 354 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 355 v += 36; 356 } 357 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 358 x[5+idx] = s6; 359 idx += 6; 360 } 361 /* backward solve the L^T */ 362 for (i=n-1; i>=0; i--){ 363 v = aa + 36*diag[i] - 36; 364 vi = aj + diag[i] - 1; 365 nz = diag[i] - ai[i]; 366 idt = 6*i; 367 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 368 s6 = x[5+idt]; 369 while (nz--) { 370 idx = 6*(*vi--); 371 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + 
v[5]*s6; 372 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 373 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 374 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 375 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 376 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 377 v -= 36; 378 } 379 } 380 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 381 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 382 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 383 PetscFunctionReturn(0); 384 } 385 386 #undef __FUNCT__ 387 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 388 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 389 { 390 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 391 PetscErrorCode ierr; 392 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 393 PetscInt *diag = a->diag,oidx; 394 MatScalar *aa=a->a,*v; 395 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 396 PetscScalar *x,*b; 397 398 PetscFunctionBegin; 399 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 400 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 401 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 402 403 /* forward solve the U^T */ 404 idx = 0; 405 for (i=0; i<n; i++) { 406 407 v = aa + 49*diag[i]; 408 /* multiply by the inverse of the block diagonal */ 409 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 410 x6 = x[5+idx]; x7 = x[6+idx]; 411 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 412 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 413 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 414 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 415 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 416 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 
+ v[40]*x6 + v[41]*x7; 417 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 418 v += 49; 419 420 vi = aj + diag[i] + 1; 421 nz = ai[i+1] - diag[i] - 1; 422 while (nz--) { 423 oidx = 7*(*vi++); 424 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 425 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 426 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 427 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 428 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 429 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 430 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 431 v += 49; 432 } 433 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 434 x[5+idx] = s6;x[6+idx] = s7; 435 idx += 7; 436 } 437 /* backward solve the L^T */ 438 for (i=n-1; i>=0; i--){ 439 v = aa + 49*diag[i] - 49; 440 vi = aj + diag[i] - 1; 441 nz = diag[i] - ai[i]; 442 idt = 7*i; 443 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 444 s6 = x[5+idt];s7 = x[6+idt]; 445 while (nz--) { 446 idx = 7*(*vi--); 447 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 448 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 449 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 450 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 451 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 452 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 453 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 454 v -= 49; 455 } 456 } 457 ierr = 
VecRestoreArray(bb,&b);CHKERRQ(ierr); 458 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 459 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 460 PetscFunctionReturn(0); 461 } 462 463 /*---------------------------------------------------------------------------------------------*/ 464 #undef __FUNCT__ 465 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 466 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 467 { 468 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 469 IS iscol=a->col,isrow=a->row; 470 PetscErrorCode ierr; 471 const PetscInt *r,*c,*rout,*cout; 472 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 473 PetscInt *diag = a->diag; 474 MatScalar *aa=a->a,*v; 475 PetscScalar s1,*x,*b,*t; 476 477 PetscFunctionBegin; 478 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 479 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 480 t = a->solve_work; 481 482 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 483 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 484 485 /* copy the b into temp work space according to permutation */ 486 for (i=0; i<n; i++) { 487 t[i] = b[c[i]]; 488 } 489 490 /* forward solve the U^T */ 491 for (i=0; i<n; i++) { 492 493 v = aa + diag[i]; 494 /* multiply by the inverse of the block diagonal */ 495 s1 = (*v++)*t[i]; 496 vi = aj + diag[i] + 1; 497 nz = ai[i+1] - diag[i] - 1; 498 while (nz--) { 499 t[*vi++] -= (*v++)*s1; 500 } 501 t[i] = s1; 502 } 503 /* backward solve the L^T */ 504 for (i=n-1; i>=0; i--){ 505 v = aa + diag[i] - 1; 506 vi = aj + diag[i] - 1; 507 nz = diag[i] - ai[i]; 508 s1 = t[i]; 509 while (nz--) { 510 t[*vi--] -= (*v--)*s1; 511 } 512 } 513 514 /* copy t into x according to permutation */ 515 for (i=0; i<n; i++) { 516 x[r[i]] = t[i]; 517 } 518 519 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 520 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 521 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 522 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 523 ierr = PetscLogFlops(2.0*(a->nz) - 
A->cmap->n);CHKERRQ(ierr); 524 PetscFunctionReturn(0); 525 } 526 527 #undef __FUNCT__ 528 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 529 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 530 { 531 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 532 IS iscol=a->col,isrow=a->row; 533 PetscErrorCode ierr; 534 const PetscInt *r,*c,*rout,*cout; 535 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 536 PetscInt *diag = a->diag,ii,ic,ir,oidx; 537 MatScalar *aa=a->a,*v; 538 PetscScalar s1,s2,x1,x2; 539 PetscScalar *x,*b,*t; 540 541 PetscFunctionBegin; 542 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 543 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 544 t = a->solve_work; 545 546 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 547 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 548 549 /* copy the b into temp work space according to permutation */ 550 ii = 0; 551 for (i=0; i<n; i++) { 552 ic = 2*c[i]; 553 t[ii] = b[ic]; 554 t[ii+1] = b[ic+1]; 555 ii += 2; 556 } 557 558 /* forward solve the U^T */ 559 idx = 0; 560 for (i=0; i<n; i++) { 561 562 v = aa + 4*diag[i]; 563 /* multiply by the inverse of the block diagonal */ 564 x1 = t[idx]; x2 = t[1+idx]; 565 s1 = v[0]*x1 + v[1]*x2; 566 s2 = v[2]*x1 + v[3]*x2; 567 v += 4; 568 569 vi = aj + diag[i] + 1; 570 nz = ai[i+1] - diag[i] - 1; 571 while (nz--) { 572 oidx = 2*(*vi++); 573 t[oidx] -= v[0]*s1 + v[1]*s2; 574 t[oidx+1] -= v[2]*s1 + v[3]*s2; 575 v += 4; 576 } 577 t[idx] = s1;t[1+idx] = s2; 578 idx += 2; 579 } 580 /* backward solve the L^T */ 581 for (i=n-1; i>=0; i--){ 582 v = aa + 4*diag[i] - 4; 583 vi = aj + diag[i] - 1; 584 nz = diag[i] - ai[i]; 585 idt = 2*i; 586 s1 = t[idt]; s2 = t[1+idt]; 587 while (nz--) { 588 idx = 2*(*vi--); 589 t[idx] -= v[0]*s1 + v[1]*s2; 590 t[idx+1] -= v[2]*s1 + v[3]*s2; 591 v -= 4; 592 } 593 } 594 595 /* copy t into x according to permutation */ 596 ii = 0; 597 for (i=0; i<n; i++) { 598 ir = 2*r[i]; 599 x[ir] = t[ii]; 600 x[ir+1] = t[ii+1]; 601 ii += 2; 602 } 603 604 ierr = 
ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 605 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 606 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 607 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 608 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 609 PetscFunctionReturn(0); 610 } 611 612 #undef __FUNCT__ 613 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 614 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 615 { 616 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 617 IS iscol=a->col,isrow=a->row; 618 PetscErrorCode ierr; 619 const PetscInt *r,*c,*rout,*cout; 620 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 621 PetscInt *diag = a->diag,ii,ic,ir,oidx; 622 MatScalar *aa=a->a,*v; 623 PetscScalar s1,s2,s3,x1,x2,x3; 624 PetscScalar *x,*b,*t; 625 626 PetscFunctionBegin; 627 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 628 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 629 t = a->solve_work; 630 631 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 632 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 633 634 /* copy the b into temp work space according to permutation */ 635 ii = 0; 636 for (i=0; i<n; i++) { 637 ic = 3*c[i]; 638 t[ii] = b[ic]; 639 t[ii+1] = b[ic+1]; 640 t[ii+2] = b[ic+2]; 641 ii += 3; 642 } 643 644 /* forward solve the U^T */ 645 idx = 0; 646 for (i=0; i<n; i++) { 647 648 v = aa + 9*diag[i]; 649 /* multiply by the inverse of the block diagonal */ 650 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 651 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 652 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 653 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 654 v += 9; 655 656 vi = aj + diag[i] + 1; 657 nz = ai[i+1] - diag[i] - 1; 658 while (nz--) { 659 oidx = 3*(*vi++); 660 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 661 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 662 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 663 v += 9; 664 } 665 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 666 idx += 3; 667 } 668 /* backward solve the L^T */ 669 for (i=n-1; i>=0; i--){ 670 v = aa + 
9*diag[i] - 9; 671 vi = aj + diag[i] - 1; 672 nz = diag[i] - ai[i]; 673 idt = 3*i; 674 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 675 while (nz--) { 676 idx = 3*(*vi--); 677 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 678 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 679 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 680 v -= 9; 681 } 682 } 683 684 /* copy t into x according to permutation */ 685 ii = 0; 686 for (i=0; i<n; i++) { 687 ir = 3*r[i]; 688 x[ir] = t[ii]; 689 x[ir+1] = t[ii+1]; 690 x[ir+2] = t[ii+2]; 691 ii += 3; 692 } 693 694 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 695 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 696 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 697 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 698 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 699 PetscFunctionReturn(0); 700 } 701 702 #undef __FUNCT__ 703 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 704 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 705 { 706 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 707 IS iscol=a->col,isrow=a->row; 708 PetscErrorCode ierr; 709 const PetscInt *r,*c,*rout,*cout; 710 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 711 PetscInt *diag = a->diag,ii,ic,ir,oidx; 712 MatScalar *aa=a->a,*v; 713 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 714 PetscScalar *x,*b,*t; 715 716 PetscFunctionBegin; 717 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 718 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 719 t = a->solve_work; 720 721 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 722 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 723 724 /* copy the b into temp work space according to permutation */ 725 ii = 0; 726 for (i=0; i<n; i++) { 727 ic = 4*c[i]; 728 t[ii] = b[ic]; 729 t[ii+1] = b[ic+1]; 730 t[ii+2] = b[ic+2]; 731 t[ii+3] = b[ic+3]; 732 ii += 4; 733 } 734 735 /* forward solve the U^T */ 736 idx = 0; 737 for (i=0; i<n; i++) { 738 739 v = aa + 16*diag[i]; 740 /* multiply by the inverse of the block diagonal */ 741 x1 = 
t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 742 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 743 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 744 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 745 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 746 v += 16; 747 748 vi = aj + diag[i] + 1; 749 nz = ai[i+1] - diag[i] - 1; 750 while (nz--) { 751 oidx = 4*(*vi++); 752 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 753 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 754 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 755 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 756 v += 16; 757 } 758 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 759 idx += 4; 760 } 761 /* backward solve the L^T */ 762 for (i=n-1; i>=0; i--){ 763 v = aa + 16*diag[i] - 16; 764 vi = aj + diag[i] - 1; 765 nz = diag[i] - ai[i]; 766 idt = 4*i; 767 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 768 while (nz--) { 769 idx = 4*(*vi--); 770 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 771 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 772 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 773 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 774 v -= 16; 775 } 776 } 777 778 /* copy t into x according to permutation */ 779 ii = 0; 780 for (i=0; i<n; i++) { 781 ir = 4*r[i]; 782 x[ir] = t[ii]; 783 x[ir+1] = t[ii+1]; 784 x[ir+2] = t[ii+2]; 785 x[ir+3] = t[ii+3]; 786 ii += 4; 787 } 788 789 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 790 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 791 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 792 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 793 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 794 PetscFunctionReturn(0); 795 } 796 797 #undef __FUNCT__ 798 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 799 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 800 { 801 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 802 IS iscol=a->col,isrow=a->row; 803 PetscErrorCode ierr; 804 
const PetscInt *r,*c,*rout,*cout; 805 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 806 PetscInt *diag = a->diag,ii,ic,ir,oidx; 807 MatScalar *aa=a->a,*v; 808 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 809 PetscScalar *x,*b,*t; 810 811 PetscFunctionBegin; 812 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 813 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 814 t = a->solve_work; 815 816 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 817 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 818 819 /* copy the b into temp work space according to permutation */ 820 ii = 0; 821 for (i=0; i<n; i++) { 822 ic = 5*c[i]; 823 t[ii] = b[ic]; 824 t[ii+1] = b[ic+1]; 825 t[ii+2] = b[ic+2]; 826 t[ii+3] = b[ic+3]; 827 t[ii+4] = b[ic+4]; 828 ii += 5; 829 } 830 831 /* forward solve the U^T */ 832 idx = 0; 833 for (i=0; i<n; i++) { 834 835 v = aa + 25*diag[i]; 836 /* multiply by the inverse of the block diagonal */ 837 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 838 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 839 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 840 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 841 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 842 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 843 v += 25; 844 845 vi = aj + diag[i] + 1; 846 nz = ai[i+1] - diag[i] - 1; 847 while (nz--) { 848 oidx = 5*(*vi++); 849 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 850 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 851 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 852 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 853 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 854 v += 25; 855 } 856 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 857 idx += 5; 858 } 859 /* backward solve the L^T */ 860 for (i=n-1; i>=0; i--){ 861 v = aa + 25*diag[i] - 25; 862 vi = aj + diag[i] - 1; 863 
nz = diag[i] - ai[i]; 864 idt = 5*i; 865 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 866 while (nz--) { 867 idx = 5*(*vi--); 868 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 869 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 870 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 871 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 872 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 873 v -= 25; 874 } 875 } 876 877 /* copy t into x according to permutation */ 878 ii = 0; 879 for (i=0; i<n; i++) { 880 ir = 5*r[i]; 881 x[ir] = t[ii]; 882 x[ir+1] = t[ii+1]; 883 x[ir+2] = t[ii+2]; 884 x[ir+3] = t[ii+3]; 885 x[ir+4] = t[ii+4]; 886 ii += 5; 887 } 888 889 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 890 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 891 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 892 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 893 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 894 PetscFunctionReturn(0); 895 } 896 897 #undef __FUNCT__ 898 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 899 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 900 { 901 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 902 IS iscol=a->col,isrow=a->row; 903 PetscErrorCode ierr; 904 const PetscInt *r,*c,*rout,*cout; 905 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 906 PetscInt *diag = a->diag,ii,ic,ir,oidx; 907 MatScalar *aa=a->a,*v; 908 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 909 PetscScalar *x,*b,*t; 910 911 PetscFunctionBegin; 912 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 913 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 914 t = a->solve_work; 915 916 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 917 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 918 919 /* copy the b into temp work space according to permutation */ 920 ii = 0; 921 for (i=0; i<n; i++) { 922 ic = 6*c[i]; 923 t[ii] = b[ic]; 924 t[ii+1] 
= b[ic+1]; 925 t[ii+2] = b[ic+2]; 926 t[ii+3] = b[ic+3]; 927 t[ii+4] = b[ic+4]; 928 t[ii+5] = b[ic+5]; 929 ii += 6; 930 } 931 932 /* forward solve the U^T */ 933 idx = 0; 934 for (i=0; i<n; i++) { 935 936 v = aa + 36*diag[i]; 937 /* multiply by the inverse of the block diagonal */ 938 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 939 x6 = t[5+idx]; 940 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 941 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 942 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 943 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 944 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 945 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 946 v += 36; 947 948 vi = aj + diag[i] + 1; 949 nz = ai[i+1] - diag[i] - 1; 950 while (nz--) { 951 oidx = 6*(*vi++); 952 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 953 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 954 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 955 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 956 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 957 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 958 v += 36; 959 } 960 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 961 t[5+idx] = s6; 962 idx += 6; 963 } 964 /* backward solve the L^T */ 965 for (i=n-1; i>=0; i--){ 966 v = aa + 36*diag[i] - 36; 967 vi = aj + diag[i] - 1; 968 nz = diag[i] - ai[i]; 969 idt = 6*i; 970 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 971 s6 = t[5+idt]; 972 while (nz--) { 973 idx = 6*(*vi--); 974 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 975 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 976 t[idx+2] 
-= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 977 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 978 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 979 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 980 v -= 36; 981 } 982 } 983 984 /* copy t into x according to permutation */ 985 ii = 0; 986 for (i=0; i<n; i++) { 987 ir = 6*r[i]; 988 x[ir] = t[ii]; 989 x[ir+1] = t[ii+1]; 990 x[ir+2] = t[ii+2]; 991 x[ir+3] = t[ii+3]; 992 x[ir+4] = t[ii+4]; 993 x[ir+5] = t[ii+5]; 994 ii += 6; 995 } 996 997 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 998 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 999 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1000 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1001 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1002 PetscFunctionReturn(0); 1003 } 1004 1005 #undef __FUNCT__ 1006 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1007 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1008 { 1009 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1010 IS iscol=a->col,isrow=a->row; 1011 PetscErrorCode ierr; 1012 const PetscInt *r,*c,*rout,*cout; 1013 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1014 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1015 MatScalar *aa=a->a,*v; 1016 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1017 PetscScalar *x,*b,*t; 1018 1019 PetscFunctionBegin; 1020 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1021 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1022 t = a->solve_work; 1023 1024 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1025 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1026 1027 /* copy the b into temp work space according to permutation */ 1028 ii = 0; 1029 for (i=0; i<n; i++) { 1030 ic = 7*c[i]; 1031 t[ii] = b[ic]; 1032 t[ii+1] = b[ic+1]; 1033 t[ii+2] = b[ic+2]; 1034 t[ii+3] = b[ic+3]; 1035 t[ii+4] = b[ic+4]; 1036 t[ii+5] = b[ic+5]; 1037 
t[ii+6] = b[ic+6]; 1038 ii += 7; 1039 } 1040 1041 /* forward solve the U^T */ 1042 idx = 0; 1043 for (i=0; i<n; i++) { 1044 1045 v = aa + 49*diag[i]; 1046 /* multiply by the inverse of the block diagonal */ 1047 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1048 x6 = t[5+idx]; x7 = t[6+idx]; 1049 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1050 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1051 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1052 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1053 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1054 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1055 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1056 v += 49; 1057 1058 vi = aj + diag[i] + 1; 1059 nz = ai[i+1] - diag[i] - 1; 1060 while (nz--) { 1061 oidx = 7*(*vi++); 1062 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1063 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1064 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1065 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1066 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1067 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1068 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1069 v += 49; 1070 } 1071 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1072 t[5+idx] = s6;t[6+idx] = s7; 1073 idx += 7; 1074 } 1075 /* backward solve the L^T */ 1076 for (i=n-1; i>=0; i--){ 1077 v = aa + 49*diag[i] - 49; 1078 vi = aj + diag[i] - 1; 1079 nz = diag[i] - ai[i]; 1080 idt = 7*i; 
1081 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1082 s6 = t[5+idt];s7 = t[6+idt]; 1083 while (nz--) { 1084 idx = 7*(*vi--); 1085 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1086 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1087 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1088 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1089 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1090 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1091 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1092 v -= 49; 1093 } 1094 } 1095 1096 /* copy t into x according to permutation */ 1097 ii = 0; 1098 for (i=0; i<n; i++) { 1099 ir = 7*r[i]; 1100 x[ir] = t[ii]; 1101 x[ir+1] = t[ii+1]; 1102 x[ir+2] = t[ii+2]; 1103 x[ir+3] = t[ii+3]; 1104 x[ir+4] = t[ii+4]; 1105 x[ir+5] = t[ii+5]; 1106 x[ir+6] = t[ii+6]; 1107 ii += 7; 1108 } 1109 1110 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1111 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1112 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1113 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1114 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1115 PetscFunctionReturn(0); 1116 } 1117 1118 /* ----------------------------------------------------------- */ 1119 #undef __FUNCT__ 1120 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1121 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1122 { 1123 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1124 IS iscol=a->col,isrow=a->row; 1125 PetscErrorCode ierr; 1126 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 1127 PetscInt i,n=a->mbs; 1128 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 1129 MatScalar *aa=a->a,*v; 1130 PetscScalar *x,*b,*s,*t,*ls; 1131 1132 PetscFunctionBegin; 1133 ierr = 
VecGetArray(bb,&b);CHKERRQ(ierr); 1134 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1135 t = a->solve_work; 1136 1137 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1138 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1139 1140 /* forward solve the lower triangular */ 1141 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1142 for (i=1; i<n; i++) { 1143 v = aa + bs2*ai[i]; 1144 vi = aj + ai[i]; 1145 nz = a->diag[i] - ai[i]; 1146 s = t + bs*i; 1147 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1148 while (nz--) { 1149 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1150 v += bs2; 1151 } 1152 } 1153 /* backward solve the upper triangular */ 1154 ls = a->solve_work + A->cmap->n; 1155 for (i=n-1; i>=0; i--){ 1156 v = aa + bs2*(a->diag[i] + 1); 1157 vi = aj + a->diag[i] + 1; 1158 nz = ai[i+1] - a->diag[i] - 1; 1159 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1160 while (nz--) { 1161 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1162 v += bs2; 1163 } 1164 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1165 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1166 } 1167 1168 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1169 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1170 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1171 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1172 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 1173 PetscFunctionReturn(0); 1174 } 1175 1176 /* ----------------------------------------------------------- */ 1177 #undef __FUNCT__ 1178 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N" 1179 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1180 { 1181 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1182 IS iscol=a->col,isrow=a->row; 1183 PetscErrorCode ierr; 1184 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 1185 PetscInt i,n=a->mbs,j; 1186 
PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 1187 const MatScalar *aa=a->a,*v; 1188 PetscScalar *x,*t,*ls; 1189 const PetscScalar *b; 1190 PetscFunctionBegin; 1191 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1192 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1193 t = a->solve_work; 1194 1195 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1196 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1197 1198 /* copy the b into temp work space according to permutation */ 1199 for (i=0; i<n; i++) { 1200 for (j=0; j<bs; j++) { 1201 t[i*bs+j] = b[c[i]*bs+j]; 1202 } 1203 } 1204 1205 1206 /* forward solve the upper triangular transpose */ 1207 ls = a->solve_work + A->cmap->n; 1208 for (i=0; i<n; i++){ 1209 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1210 Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1211 v = aa + bs2*(a->diag[i] + 1); 1212 vi = aj + a->diag[i] + 1; 1213 nz = ai[i+1] - a->diag[i] - 1; 1214 while (nz--) { 1215 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 1216 v += bs2; 1217 } 1218 } 1219 1220 /* backward solve the lower triangular transpose */ 1221 for (i=n-1; i>=0; i--) { 1222 v = aa + bs2*ai[i]; 1223 vi = aj + ai[i]; 1224 nz = a->diag[i] - ai[i]; 1225 while (nz--) { 1226 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 1227 v += bs2; 1228 } 1229 } 1230 1231 /* copy t into x according to permutation */ 1232 for (i=0; i<n; i++) { 1233 for (j=0; j<bs; j++) { 1234 x[bs*r[i]+j] = t[bs*i+j]; 1235 } 1236 } 1237 1238 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1239 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1240 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1241 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1242 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 1243 PetscFunctionReturn(0); 1244 } 1245 1246 #undef __FUNCT__ 1247 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1248 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec 
bb,Vec xx) 1249 { 1250 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1251 IS iscol=a->col,isrow=a->row; 1252 PetscErrorCode ierr; 1253 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 1254 PetscInt i,n=a->mbs,nz,idx,idt,idc; 1255 MatScalar *aa=a->a,*v; 1256 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1257 PetscScalar *x,*b,*t; 1258 1259 PetscFunctionBegin; 1260 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1261 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1262 t = a->solve_work; 1263 1264 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1265 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1266 1267 /* forward solve the lower triangular */ 1268 idx = 7*(*r++); 1269 t[0] = b[idx]; t[1] = b[1+idx]; 1270 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1271 t[5] = b[5+idx]; t[6] = b[6+idx]; 1272 1273 for (i=1; i<n; i++) { 1274 v = aa + 49*ai[i]; 1275 vi = aj + ai[i]; 1276 nz = diag[i] - ai[i]; 1277 idx = 7*(*r++); 1278 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1279 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1280 while (nz--) { 1281 idx = 7*(*vi++); 1282 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1283 x4 = t[3+idx];x5 = t[4+idx]; 1284 x6 = t[5+idx];x7 = t[6+idx]; 1285 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1286 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1287 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1288 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1289 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1290 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1291 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1292 v += 49; 1293 } 1294 idx = 7*i; 1295 t[idx] = s1;t[1+idx] = s2; 1296 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1297 t[5+idx] = s6;t[6+idx] = s7; 1298 } 1299 
/* backward solve the upper triangular */ 1300 for (i=n-1; i>=0; i--){ 1301 v = aa + 49*diag[i] + 49; 1302 vi = aj + diag[i] + 1; 1303 nz = ai[i+1] - diag[i] - 1; 1304 idt = 7*i; 1305 s1 = t[idt]; s2 = t[1+idt]; 1306 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1307 s6 = t[5+idt];s7 = t[6+idt]; 1308 while (nz--) { 1309 idx = 7*(*vi++); 1310 x1 = t[idx]; x2 = t[1+idx]; 1311 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1312 x6 = t[5+idx]; x7 = t[6+idx]; 1313 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1314 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1315 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1316 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1317 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1318 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1319 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1320 v += 49; 1321 } 1322 idc = 7*(*c--); 1323 v = aa + 49*diag[i]; 1324 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1325 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1326 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1327 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1328 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1329 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1330 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1331 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1332 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1333 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1334 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1335 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1336 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1337 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1338 } 1339 1340 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1341 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1342 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1343 ierr = 
VecRestoreArray(xx,&x);CHKERRQ(ierr); 1344 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1345 PetscFunctionReturn(0); 1346 } 1347 1348 #if defined(OLD_ROUTINE_TO_BE_REPLACED) 1349 #undef __FUNCT__ 1350 #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct" 1351 PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx) 1352 { 1353 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1354 IS iscol=a->col,isrow=a->row; 1355 PetscErrorCode ierr; 1356 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*vi; 1357 PetscInt i,n=a->mbs,nz,idx,idt,idc,k,m; 1358 MatScalar *aa=a->a,*v; 1359 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1360 PetscScalar *x,*b,*t; 1361 1362 PetscFunctionBegin; 1363 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1364 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1365 t = a->solve_work; 1366 1367 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1368 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1369 1370 /* forward solve the lower triangular */ 1371 idx = 7*r[0]; 1372 t[0] = b[idx]; t[1] = b[1+idx]; 1373 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1374 t[5] = b[5+idx]; t[6] = b[6+idx]; 1375 1376 for (i=1; i<n; i++) { 1377 v = aa + 49*ai[i]; 1378 vi = aj + ai[i]; 1379 nz = ai[i+1] - ai[i]; 1380 idx = 7*r[i]; 1381 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1382 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1383 for(m=0;m<nz;m++){ 1384 idx = 7*vi[m]; 1385 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1386 x4 = t[3+idx];x5 = t[4+idx]; 1387 x6 = t[5+idx];x7 = t[6+idx]; 1388 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1389 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1390 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1391 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1392 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1393 s6 -= 
v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1394 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1395 v += 49; 1396 } 1397 idx = 7*i; 1398 t[idx] = s1;t[1+idx] = s2; 1399 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1400 t[5+idx] = s6;t[6+idx] = s7; 1401 } 1402 /* backward solve the upper triangular */ 1403 for (i=n-1; i>=0; i--){ 1404 k = 2*n-i; 1405 v = aa + 49*ai[k]; 1406 vi = aj + ai[k]; 1407 nz = ai[k+1] - ai[k] - 1; 1408 idt = 7*i; 1409 s1 = t[idt]; s2 = t[1+idt]; 1410 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1411 s6 = t[5+idt];s7 = t[6+idt]; 1412 for(m=0;m<nz;m++){ 1413 idx = 7*vi[m]; 1414 x1 = t[idx]; x2 = t[1+idx]; 1415 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1416 x6 = t[5+idx]; x7 = t[6+idx]; 1417 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1418 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1419 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1420 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1421 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1422 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1423 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1424 v += 49; 1425 } 1426 idc = 7*c[i]; 1427 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1428 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1429 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1430 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1431 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1432 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1433 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1434 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1435 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1436 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1437 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1438 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1439 
x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1440 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1441 } 1442 1443 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1444 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1445 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1446 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1447 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1448 PetscFunctionReturn(0); 1449 } 1450 #endif 1451 1452 #undef __FUNCT__ 1453 #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct" 1454 PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx) 1455 { 1456 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1457 IS iscol=a->col,isrow=a->row; 1458 PetscErrorCode ierr; 1459 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi; 1460 PetscInt i,n=a->mbs,nz,idx,idt,idc,m; 1461 MatScalar *aa=a->a,*v; 1462 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1463 PetscScalar *x,*b,*t; 1464 1465 PetscFunctionBegin; 1466 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1467 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1468 t = a->solve_work; 1469 1470 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1471 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1472 1473 /* forward solve the lower triangular */ 1474 idx = 7*r[0]; 1475 t[0] = b[idx]; t[1] = b[1+idx]; 1476 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1477 t[5] = b[5+idx]; t[6] = b[6+idx]; 1478 1479 for (i=1; i<n; i++) { 1480 v = aa + 49*ai[i]; 1481 vi = aj + ai[i]; 1482 nz = ai[i+1] - ai[i]; 1483 idx = 7*r[i]; 1484 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1485 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1486 for(m=0;m<nz;m++){ 1487 idx = 7*vi[m]; 1488 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1489 x4 = t[3+idx];x5 = t[4+idx]; 1490 x6 = t[5+idx];x7 = t[6+idx]; 1491 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1492 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1493 s3 -= 
v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1494 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1495 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1496 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1497 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1498 v += 49; 1499 } 1500 idx = 7*i; 1501 t[idx] = s1;t[1+idx] = s2; 1502 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1503 t[5+idx] = s6;t[6+idx] = s7; 1504 } 1505 /* backward solve the upper triangular */ 1506 for (i=n-1; i>=0; i--){ 1507 v = aa + 49*(adiag[i+1]+1); 1508 vi = aj + adiag[i+1]+1; 1509 nz = adiag[i] - adiag[i+1] - 1; 1510 idt = 7*i; 1511 s1 = t[idt]; s2 = t[1+idt]; 1512 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1513 s6 = t[5+idt];s7 = t[6+idt]; 1514 for(m=0;m<nz;m++){ 1515 idx = 7*vi[m]; 1516 x1 = t[idx]; x2 = t[1+idx]; 1517 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1518 x6 = t[5+idx]; x7 = t[6+idx]; 1519 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1520 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1521 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1522 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1523 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1524 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1525 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1526 v += 49; 1527 } 1528 idc = 7*c[i]; 1529 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1530 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1531 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1532 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1533 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1534 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1535 x[3+idc] = t[3+idt] = 
v[3]*s1+v[10]*s2+v[17]*s3+ 1536 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1537 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1538 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1539 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1540 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1541 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1542 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1543 } 1544 1545 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1546 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1547 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1548 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1549 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1550 PetscFunctionReturn(0); 1551 } 1552 1553 #undef __FUNCT__ 1554 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1555 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1556 { 1557 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1558 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1559 PetscErrorCode ierr; 1560 PetscInt *diag = a->diag,jdx; 1561 const MatScalar *aa=a->a,*v; 1562 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1563 const PetscScalar *b; 1564 1565 PetscFunctionBegin; 1566 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1567 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1568 /* forward solve the lower triangular */ 1569 idx = 0; 1570 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1571 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1572 x[6] = b[6+idx]; 1573 for (i=1; i<n; i++) { 1574 v = aa + 49*ai[i]; 1575 vi = aj + ai[i]; 1576 nz = diag[i] - ai[i]; 1577 idx = 7*i; 1578 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1579 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1580 s7 = b[6+idx]; 1581 while (nz--) { 1582 jdx = 7*(*vi++); 1583 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1584 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1585 x7 = x[6+jdx]; 1586 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1587 s2 -= v[1]*x1 + v[8]*x2 + 
v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1588 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1589 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1590 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1591 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1592 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1593 v += 49; 1594 } 1595 x[idx] = s1; 1596 x[1+idx] = s2; 1597 x[2+idx] = s3; 1598 x[3+idx] = s4; 1599 x[4+idx] = s5; 1600 x[5+idx] = s6; 1601 x[6+idx] = s7; 1602 } 1603 /* backward solve the upper triangular */ 1604 for (i=n-1; i>=0; i--){ 1605 v = aa + 49*diag[i] + 49; 1606 vi = aj + diag[i] + 1; 1607 nz = ai[i+1] - diag[i] - 1; 1608 idt = 7*i; 1609 s1 = x[idt]; s2 = x[1+idt]; 1610 s3 = x[2+idt]; s4 = x[3+idt]; 1611 s5 = x[4+idt]; s6 = x[5+idt]; 1612 s7 = x[6+idt]; 1613 while (nz--) { 1614 idx = 7*(*vi++); 1615 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1616 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1617 x7 = x[6+idx]; 1618 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1619 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1620 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1621 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1622 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1623 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1624 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1625 v += 49; 1626 } 1627 v = aa + 49*diag[i]; 1628 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1629 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1630 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1631 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1632 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + 
v[23]*s4 1633 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1634 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1635 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1636 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1637 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1638 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1639 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1640 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1641 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1642 } 1643 1644 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1645 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1646 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1647 PetscFunctionReturn(0); 1648 } 1649 1650 #if defined(OLD_ROUTINE_TO_BE_REPLACED) 1651 #undef __FUNCT__ 1652 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct" 1653 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1654 { 1655 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1656 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 1657 PetscErrorCode ierr; 1658 PetscInt idx,jdx,idt; 1659 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1660 const MatScalar *aa=a->a,*v; 1661 PetscScalar *x; 1662 const PetscScalar *b; 1663 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1664 1665 PetscFunctionBegin; 1666 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1667 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1668 /* forward solve the lower triangular */ 1669 idx = 0; 1670 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1671 x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 1672 for (i=1; i<n; i++) { 1673 v = aa + bs2*ai[i]; 1674 vi = aj + ai[i]; 1675 nz = ai[i+1] - ai[i]; 1676 idx = bs*i; 1677 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1678 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1679 for(k=0;k<nz;k++) { 1680 jdx = bs*vi[k]; 1681 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1682 x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 1683 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + 
v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1684 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1685 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1686 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1687 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1688 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1689 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1690 v += bs2; 1691 } 1692 1693 x[idx] = s1; 1694 x[1+idx] = s2; 1695 x[2+idx] = s3; 1696 x[3+idx] = s4; 1697 x[4+idx] = s5; 1698 x[5+idx] = s6; 1699 x[6+idx] = s7; 1700 } 1701 1702 /* backward solve the upper triangular */ 1703 for (i=n-1; i>=0; i--){ 1704 v = aa + bs2*ai[2*n-i]; 1705 vi = aj + ai[2*n-i]; 1706 nz = ai[2*n-i +1] - ai[2*n-i]-1; 1707 idt = bs*i; 1708 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1709 s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 1710 for(k=0;k<nz;k++) { 1711 idx = bs*vi[k]; 1712 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1713 x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 1714 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1715 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1716 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1717 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1718 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1719 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1720 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1721 v += bs2; 1722 } 1723 /* x = inv_diagonal*x */ 1724 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1725 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + 
v[36]*s6 + v[43]*s7; 1726 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1727 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1728 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1729 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1730 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1731 } 1732 1733 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1734 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1735 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1736 PetscFunctionReturn(0); 1737 } 1738 #endif 1739 1740 #undef __FUNCT__ 1741 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct" 1742 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1743 { 1744 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1745 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 1746 PetscErrorCode ierr; 1747 PetscInt idx,jdx,idt; 1748 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1749 const MatScalar *aa=a->a,*v; 1750 PetscScalar *x; 1751 const PetscScalar *b; 1752 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1753 1754 PetscFunctionBegin; 1755 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1756 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1757 /* forward solve the lower triangular */ 1758 idx = 0; 1759 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1760 x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 1761 for (i=1; i<n; i++) { 1762 v = aa + bs2*ai[i]; 1763 vi = aj + ai[i]; 1764 nz = ai[i+1] - ai[i]; 1765 idx = bs*i; 1766 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1767 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1768 for(k=0;k<nz;k++) { 1769 jdx = bs*vi[k]; 1770 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1771 x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 1772 s1 -= v[0]*x1 + 
v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1773 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1774 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1775 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1776 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1777 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1778 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1779 v += bs2; 1780 } 1781 1782 x[idx] = s1; 1783 x[1+idx] = s2; 1784 x[2+idx] = s3; 1785 x[3+idx] = s4; 1786 x[4+idx] = s5; 1787 x[5+idx] = s6; 1788 x[6+idx] = s7; 1789 } 1790 1791 /* backward solve the upper triangular */ 1792 for (i=n-1; i>=0; i--){ 1793 v = aa + bs2*(adiag[i+1]+1); 1794 vi = aj + adiag[i+1]+1; 1795 nz = adiag[i] - adiag[i+1]-1; 1796 idt = bs*i; 1797 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1798 s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 1799 for(k=0;k<nz;k++) { 1800 idx = bs*vi[k]; 1801 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1802 x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 1803 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1804 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1805 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1806 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1807 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1808 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1809 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1810 v += bs2; 1811 } 1812 /* x = inv_diagonal*x */ 1813 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1814 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + 
v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1815 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1816 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1817 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1818 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1819 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1820 } 1821 1822 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1823 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1824 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1825 PetscFunctionReturn(0); 1826 } 1827 1828 #undef __FUNCT__ 1829 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1830 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1831 { 1832 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1833 IS iscol=a->col,isrow=a->row; 1834 PetscErrorCode ierr; 1835 const PetscInt *r,*c,*rout,*cout; 1836 PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1837 const MatScalar *aa=a->a,*v; 1838 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1839 const PetscScalar *b; 1840 PetscFunctionBegin; 1841 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1842 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1843 t = a->solve_work; 1844 1845 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1846 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1847 1848 /* forward solve the lower triangular */ 1849 idx = 6*(*r++); 1850 t[0] = b[idx]; t[1] = b[1+idx]; 1851 t[2] = b[2+idx]; t[3] = b[3+idx]; 1852 t[4] = b[4+idx]; t[5] = b[5+idx]; 1853 for (i=1; i<n; i++) { 1854 v = aa + 36*ai[i]; 1855 vi = aj + ai[i]; 1856 nz = diag[i] - ai[i]; 1857 idx = 6*(*r++); 1858 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1859 s5 = b[4+idx]; s6 = b[5+idx]; 1860 while (nz--) { 1861 idx = 6*(*vi++); 1862 x1 = t[idx]; x2 = t[1+idx]; x3 = 
t[2+idx]; 1863 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1864 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1865 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1866 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1867 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1868 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1869 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1870 v += 36; 1871 } 1872 idx = 6*i; 1873 t[idx] = s1;t[1+idx] = s2; 1874 t[2+idx] = s3;t[3+idx] = s4; 1875 t[4+idx] = s5;t[5+idx] = s6; 1876 } 1877 /* backward solve the upper triangular */ 1878 for (i=n-1; i>=0; i--){ 1879 v = aa + 36*diag[i] + 36; 1880 vi = aj + diag[i] + 1; 1881 nz = ai[i+1] - diag[i] - 1; 1882 idt = 6*i; 1883 s1 = t[idt]; s2 = t[1+idt]; 1884 s3 = t[2+idt];s4 = t[3+idt]; 1885 s5 = t[4+idt];s6 = t[5+idt]; 1886 while (nz--) { 1887 idx = 6*(*vi++); 1888 x1 = t[idx]; x2 = t[1+idx]; 1889 x3 = t[2+idx]; x4 = t[3+idx]; 1890 x5 = t[4+idx]; x6 = t[5+idx]; 1891 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1892 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1893 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1894 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1895 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1896 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1897 v += 36; 1898 } 1899 idc = 6*(*c--); 1900 v = aa + 36*diag[i]; 1901 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1902 v[18]*s4+v[24]*s5+v[30]*s6; 1903 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1904 v[19]*s4+v[25]*s5+v[31]*s6; 1905 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1906 v[20]*s4+v[26]*s5+v[32]*s6; 1907 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1908 v[21]*s4+v[27]*s5+v[33]*s6; 1909 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1910 
v[22]*s4+v[28]*s5+v[34]*s6; 1911 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1912 v[23]*s4+v[29]*s5+v[35]*s6; 1913 } 1914 1915 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1916 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1917 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1918 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1919 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1920 PetscFunctionReturn(0); 1921 } 1922 1923 #if defined(OLD_ROUTINE_TO_BE_REPLACED) 1924 #undef __FUNCT__ 1925 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct" 1926 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx) 1927 { 1928 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1929 IS iscol=a->col,isrow=a->row; 1930 PetscErrorCode ierr; 1931 const PetscInt *r,*c,*rout,*cout; 1932 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 1933 const MatScalar *aa=a->a,*v; 1934 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1935 const PetscScalar *b; 1936 PetscFunctionBegin; 1937 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1938 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1939 t = a->solve_work; 1940 1941 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1942 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1943 1944 /* forward solve the lower triangular */ 1945 idx = 6*r[0]; 1946 t[0] = b[idx]; t[1] = b[1+idx]; 1947 t[2] = b[2+idx]; t[3] = b[3+idx]; 1948 t[4] = b[4+idx]; t[5] = b[5+idx]; 1949 for (i=1; i<n; i++) { 1950 v = aa + 36*ai[i]; 1951 vi = aj + ai[i]; 1952 nz = ai[i+1] - ai[i]; 1953 idx = 6*r[i]; 1954 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1955 s5 = b[4+idx]; s6 = b[5+idx]; 1956 for(m=0;m<nz;m++){ 1957 idx = 6*vi[m]; 1958 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1959 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1960 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1961 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1962 s3 -= 
v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1963 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1964 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1965 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1966 v += 36; 1967 } 1968 idx = 6*i; 1969 t[idx] = s1;t[1+idx] = s2; 1970 t[2+idx] = s3;t[3+idx] = s4; 1971 t[4+idx] = s5;t[5+idx] = s6; 1972 } 1973 /* backward solve the upper triangular */ 1974 for (i=n-1; i>=0; i--){ 1975 k = 2*n-i; 1976 v = aa + 36*ai[k]; 1977 vi = aj + ai[k]; 1978 nz = ai[k+1] - ai[k] - 1; 1979 idt = 6*i; 1980 s1 = t[idt]; s2 = t[1+idt]; 1981 s3 = t[2+idt];s4 = t[3+idt]; 1982 s5 = t[4+idt];s6 = t[5+idt]; 1983 for(m=0;m<nz;m++){ 1984 idx = 6*vi[m]; 1985 x1 = t[idx]; x2 = t[1+idx]; 1986 x3 = t[2+idx]; x4 = t[3+idx]; 1987 x5 = t[4+idx]; x6 = t[5+idx]; 1988 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1989 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1990 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1991 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1992 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1993 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1994 v += 36; 1995 } 1996 idc = 6*c[i]; 1997 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1998 v[18]*s4+v[24]*s5+v[30]*s6; 1999 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 2000 v[19]*s4+v[25]*s5+v[31]*s6; 2001 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 2002 v[20]*s4+v[26]*s5+v[32]*s6; 2003 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 2004 v[21]*s4+v[27]*s5+v[33]*s6; 2005 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 2006 v[22]*s4+v[28]*s5+v[34]*s6; 2007 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 2008 v[23]*s4+v[29]*s5+v[35]*s6; 2009 } 2010 2011 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2012 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2013 ierr 
= VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2014 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2015 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2016 PetscFunctionReturn(0); 2017 } 2018 #endif 2019 2020 #undef __FUNCT__ 2021 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct" 2022 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx) 2023 { 2024 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2025 IS iscol=a->col,isrow=a->row; 2026 PetscErrorCode ierr; 2027 const PetscInt *r,*c,*rout,*cout; 2028 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 2029 const MatScalar *aa=a->a,*v; 2030 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 2031 const PetscScalar *b; 2032 PetscFunctionBegin; 2033 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2034 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2035 t = a->solve_work; 2036 2037 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2038 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2039 2040 /* forward solve the lower triangular */ 2041 idx = 6*r[0]; 2042 t[0] = b[idx]; t[1] = b[1+idx]; 2043 t[2] = b[2+idx]; t[3] = b[3+idx]; 2044 t[4] = b[4+idx]; t[5] = b[5+idx]; 2045 for (i=1; i<n; i++) { 2046 v = aa + 36*ai[i]; 2047 vi = aj + ai[i]; 2048 nz = ai[i+1] - ai[i]; 2049 idx = 6*r[i]; 2050 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2051 s5 = b[4+idx]; s6 = b[5+idx]; 2052 for(m=0;m<nz;m++){ 2053 idx = 6*vi[m]; 2054 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2055 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 2056 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2057 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2058 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2059 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2060 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2061 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 
+ v[35]*x6; 2062 v += 36; 2063 } 2064 idx = 6*i; 2065 t[idx] = s1;t[1+idx] = s2; 2066 t[2+idx] = s3;t[3+idx] = s4; 2067 t[4+idx] = s5;t[5+idx] = s6; 2068 } 2069 /* backward solve the upper triangular */ 2070 for (i=n-1; i>=0; i--){ 2071 v = aa + 36*(adiag[i+1]+1); 2072 vi = aj + adiag[i+1]+1; 2073 nz = adiag[i] - adiag[i+1] - 1; 2074 idt = 6*i; 2075 s1 = t[idt]; s2 = t[1+idt]; 2076 s3 = t[2+idt];s4 = t[3+idt]; 2077 s5 = t[4+idt];s6 = t[5+idt]; 2078 for(m=0;m<nz;m++){ 2079 idx = 6*vi[m]; 2080 x1 = t[idx]; x2 = t[1+idx]; 2081 x3 = t[2+idx]; x4 = t[3+idx]; 2082 x5 = t[4+idx]; x6 = t[5+idx]; 2083 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2084 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2085 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2086 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2087 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2088 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2089 v += 36; 2090 } 2091 idc = 6*c[i]; 2092 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 2093 v[18]*s4+v[24]*s5+v[30]*s6; 2094 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 2095 v[19]*s4+v[25]*s5+v[31]*s6; 2096 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 2097 v[20]*s4+v[26]*s5+v[32]*s6; 2098 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 2099 v[21]*s4+v[27]*s5+v[33]*s6; 2100 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 2101 v[22]*s4+v[28]*s5+v[34]*s6; 2102 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 2103 v[23]*s4+v[29]*s5+v[35]*s6; 2104 } 2105 2106 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2107 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2108 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2109 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2110 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2111 PetscFunctionReturn(0); 2112 } 2113 2114 #undef __FUNCT__ 2115 #define __FUNCT__ 
"MatSolve_SeqBAIJ_6_NaturalOrdering" 2116 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 2117 { 2118 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2119 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2120 PetscErrorCode ierr; 2121 PetscInt *diag = a->diag,jdx; 2122 const MatScalar *aa=a->a,*v; 2123 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2124 const PetscScalar *b; 2125 2126 PetscFunctionBegin; 2127 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2128 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2129 /* forward solve the lower triangular */ 2130 idx = 0; 2131 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 2132 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 2133 for (i=1; i<n; i++) { 2134 v = aa + 36*ai[i]; 2135 vi = aj + ai[i]; 2136 nz = diag[i] - ai[i]; 2137 idx = 6*i; 2138 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2139 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 2140 while (nz--) { 2141 jdx = 6*(*vi++); 2142 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 2143 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 2144 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2145 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2146 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2147 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2148 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2149 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2150 v += 36; 2151 } 2152 x[idx] = s1; 2153 x[1+idx] = s2; 2154 x[2+idx] = s3; 2155 x[3+idx] = s4; 2156 x[4+idx] = s5; 2157 x[5+idx] = s6; 2158 } 2159 /* backward solve the upper triangular */ 2160 for (i=n-1; i>=0; i--){ 2161 v = aa + 36*diag[i] + 36; 2162 vi = aj + diag[i] + 1; 2163 nz = ai[i+1] - diag[i] - 1; 2164 idt = 6*i; 2165 s1 = x[idt]; s2 = x[1+idt]; 2166 s3 = x[2+idt]; s4 = x[3+idt]; 2167 s5 = x[4+idt]; s6 = x[5+idt]; 2168 while (nz--) { 2169 idx = 6*(*vi++); 2170 x1 = 
x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 2171 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 2172 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2173 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2174 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2175 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2176 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2177 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2178 v += 36; 2179 } 2180 v = aa + 36*diag[i]; 2181 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2182 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2183 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2184 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2185 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2186 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 2187 } 2188 2189 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2190 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2191 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2192 PetscFunctionReturn(0); 2193 } 2194 2195 #if defined(OLD_ROUTINE_TO_BE_REPLACED) 2196 #undef __FUNCT__ 2197 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct" 2198 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2199 { 2200 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2201 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 2202 PetscErrorCode ierr; 2203 PetscInt idx,jdx,idt; 2204 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2205 const MatScalar *aa=a->a,*v; 2206 PetscScalar *x; 2207 const PetscScalar *b; 2208 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2209 2210 PetscFunctionBegin; 2211 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2212 ierr = 
VecGetArray(xx,&x);CHKERRQ(ierr); 2213 /* forward solve the lower triangular */ 2214 idx = 0; 2215 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2216 x[4] = b[4+idx];x[5] = b[5+idx]; 2217 for (i=1; i<n; i++) { 2218 v = aa + bs2*ai[i]; 2219 vi = aj + ai[i]; 2220 nz = ai[i+1] - ai[i]; 2221 idx = bs*i; 2222 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2223 s5 = b[4+idx];s6 = b[5+idx]; 2224 for(k=0;k<nz;k++){ 2225 jdx = bs*vi[k]; 2226 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2227 x5 = x[4+jdx]; x6 = x[5+jdx]; 2228 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2229 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2230 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2231 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2232 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2233 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2234 v += bs2; 2235 } 2236 2237 x[idx] = s1; 2238 x[1+idx] = s2; 2239 x[2+idx] = s3; 2240 x[3+idx] = s4; 2241 x[4+idx] = s5; 2242 x[5+idx] = s6; 2243 } 2244 2245 /* backward solve the upper triangular */ 2246 for (i=n-1; i>=0; i--){ 2247 v = aa + bs2*ai[2*n-i]; 2248 vi = aj + ai[2*n-i]; 2249 nz = ai[2*n-i +1] - ai[2*n-i]-1; 2250 idt = bs*i; 2251 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2252 s5 = x[4+idt];s6 = x[5+idt]; 2253 for(k=0;k<nz;k++){ 2254 idx = bs*vi[k]; 2255 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2256 x5 = x[4+idx];x6 = x[5+idx]; 2257 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2258 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2259 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2260 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2261 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2262 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + 
v[23]*x4 + v[29]*x5 + v[35]*x6; 2263 v += bs2; 2264 } 2265 /* x = inv_diagonal*x */ 2266 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2267 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2268 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2269 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2270 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2271 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 2272 } 2273 2274 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2275 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2276 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2277 PetscFunctionReturn(0); 2278 } 2279 #endif 2280 2281 #undef __FUNCT__ 2282 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct" 2283 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2284 { 2285 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2286 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 2287 PetscErrorCode ierr; 2288 PetscInt idx,jdx,idt; 2289 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2290 const MatScalar *aa=a->a,*v; 2291 PetscScalar *x; 2292 const PetscScalar *b; 2293 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2294 2295 PetscFunctionBegin; 2296 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2297 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2298 /* forward solve the lower triangular */ 2299 idx = 0; 2300 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2301 x[4] = b[4+idx];x[5] = b[5+idx]; 2302 for (i=1; i<n; i++) { 2303 v = aa + bs2*ai[i]; 2304 vi = aj + ai[i]; 2305 nz = ai[i+1] - ai[i]; 2306 idx = bs*i; 2307 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2308 s5 = b[4+idx];s6 = b[5+idx]; 2309 for(k=0;k<nz;k++){ 2310 jdx = bs*vi[k]; 2311 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2312 x5 = x[4+jdx]; x6 = 
x[5+jdx]; 2313 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2314 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2315 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2316 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2317 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2318 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2319 v += bs2; 2320 } 2321 2322 x[idx] = s1; 2323 x[1+idx] = s2; 2324 x[2+idx] = s3; 2325 x[3+idx] = s4; 2326 x[4+idx] = s5; 2327 x[5+idx] = s6; 2328 } 2329 2330 /* backward solve the upper triangular */ 2331 for (i=n-1; i>=0; i--){ 2332 v = aa + bs2*(adiag[i+1]+1); 2333 vi = aj + adiag[i+1]+1; 2334 nz = adiag[i] - adiag[i+1]-1; 2335 idt = bs*i; 2336 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2337 s5 = x[4+idt];s6 = x[5+idt]; 2338 for(k=0;k<nz;k++){ 2339 idx = bs*vi[k]; 2340 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2341 x5 = x[4+idx];x6 = x[5+idx]; 2342 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2343 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2344 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2345 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2346 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2347 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2348 v += bs2; 2349 } 2350 /* x = inv_diagonal*x */ 2351 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2352 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2353 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2354 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2355 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2356 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + 
v[29]*s5 + v[35]*s6; 2357 } 2358 2359 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2360 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2361 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2362 PetscFunctionReturn(0); 2363 } 2364 2365 #undef __FUNCT__ 2366 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 2367 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 2368 { 2369 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2370 IS iscol=a->col,isrow=a->row; 2371 PetscErrorCode ierr; 2372 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 2373 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2374 const MatScalar *aa=a->a,*v; 2375 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2376 const PetscScalar *b; 2377 2378 PetscFunctionBegin; 2379 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2380 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2381 t = a->solve_work; 2382 2383 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2384 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2385 2386 /* forward solve the lower triangular */ 2387 idx = 5*(*r++); 2388 t[0] = b[idx]; t[1] = b[1+idx]; 2389 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2390 for (i=1; i<n; i++) { 2391 v = aa + 25*ai[i]; 2392 vi = aj + ai[i]; 2393 nz = diag[i] - ai[i]; 2394 idx = 5*(*r++); 2395 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2396 s5 = b[4+idx]; 2397 while (nz--) { 2398 idx = 5*(*vi++); 2399 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2400 x4 = t[3+idx];x5 = t[4+idx]; 2401 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2402 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2403 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2404 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2405 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2406 v += 25; 2407 } 2408 idx = 5*i; 2409 t[idx] = s1;t[1+idx] = s2; 2410 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2411 } 2412 /* backward solve the upper 
triangular */ 2413 for (i=n-1; i>=0; i--){ 2414 v = aa + 25*diag[i] + 25; 2415 vi = aj + diag[i] + 1; 2416 nz = ai[i+1] - diag[i] - 1; 2417 idt = 5*i; 2418 s1 = t[idt]; s2 = t[1+idt]; 2419 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2420 while (nz--) { 2421 idx = 5*(*vi++); 2422 x1 = t[idx]; x2 = t[1+idx]; 2423 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2424 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2425 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2426 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2427 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2428 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2429 v += 25; 2430 } 2431 idc = 5*(*c--); 2432 v = aa + 25*diag[i]; 2433 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2434 v[15]*s4+v[20]*s5; 2435 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 2436 v[16]*s4+v[21]*s5; 2437 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2438 v[17]*s4+v[22]*s5; 2439 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 2440 v[18]*s4+v[23]*s5; 2441 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2442 v[19]*s4+v[24]*s5; 2443 } 2444 2445 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2446 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2447 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2448 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2449 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2450 PetscFunctionReturn(0); 2451 } 2452 2453 #if defined(OLD_ROUTINE_TO_BE_REPLACED) 2454 #undef __FUNCT__ 2455 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct" 2456 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx) 2457 { 2458 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2459 IS iscol=a->col,isrow=a->row; 2460 PetscErrorCode ierr; 2461 const PetscInt *r,*c,*rout,*cout; 2462 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 2463 const MatScalar *aa=a->a,*v; 2464 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2465 const 
PetscScalar *b; 2466 2467 PetscFunctionBegin; 2468 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2469 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2470 t = a->solve_work; 2471 2472 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2473 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2474 2475 /* forward solve the lower triangular */ 2476 idx = 5*r[0]; 2477 t[0] = b[idx]; t[1] = b[1+idx]; 2478 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2479 for (i=1; i<n; i++) { 2480 v = aa + 25*ai[i]; 2481 vi = aj + ai[i]; 2482 nz = ai[i+1] - ai[i]; 2483 idx = 5*r[i]; 2484 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2485 s5 = b[4+idx]; 2486 for(m=0;m<nz;m++){ 2487 idx = 5*vi[m]; 2488 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2489 x4 = t[3+idx];x5 = t[4+idx]; 2490 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2491 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2492 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2493 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2494 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2495 v += 25; 2496 } 2497 idx = 5*i; 2498 t[idx] = s1;t[1+idx] = s2; 2499 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2500 } 2501 /* backward solve the upper triangular */ 2502 for (i=n-1; i>=0; i--){ 2503 k = 2*n-i; 2504 v = aa + 25*ai[k]; 2505 vi = aj + ai[k]; 2506 nz = ai[k+1] - ai[k] - 1; 2507 idt = 5*i; 2508 s1 = t[idt]; s2 = t[1+idt]; 2509 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2510 for(m=0;m<nz;m++){ 2511 idx = 5*vi[m]; 2512 x1 = t[idx]; x2 = t[1+idx]; 2513 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2514 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2515 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2516 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2517 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2518 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2519 v += 25; 2520 } 2521 idc = 5*c[i]; 2522 x[idc] = 
t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2523 v[15]*s4+v[20]*s5; 2524 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 2525 v[16]*s4+v[21]*s5; 2526 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2527 v[17]*s4+v[22]*s5; 2528 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 2529 v[18]*s4+v[23]*s5; 2530 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2531 v[19]*s4+v[24]*s5; 2532 } 2533 2534 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2535 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2536 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2537 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2538 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2539 PetscFunctionReturn(0); 2540 } 2541 #endif 2542 2543 #undef __FUNCT__ 2544 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct" 2545 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx) 2546 { 2547 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2548 IS iscol=a->col,isrow=a->row; 2549 PetscErrorCode ierr; 2550 const PetscInt *r,*c,*rout,*cout; 2551 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 2552 const MatScalar *aa=a->a,*v; 2553 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2554 const PetscScalar *b; 2555 2556 PetscFunctionBegin; 2557 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2558 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2559 t = a->solve_work; 2560 2561 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2562 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2563 2564 /* forward solve the lower triangular */ 2565 idx = 5*r[0]; 2566 t[0] = b[idx]; t[1] = b[1+idx]; 2567 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2568 for (i=1; i<n; i++) { 2569 v = aa + 25*ai[i]; 2570 vi = aj + ai[i]; 2571 nz = ai[i+1] - ai[i]; 2572 idx = 5*r[i]; 2573 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2574 s5 = b[4+idx]; 2575 for(m=0;m<nz;m++){ 2576 idx = 5*vi[m]; 2577 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2578 x4 = 
t[3+idx];x5 = t[4+idx]; 2579 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2580 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2581 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2582 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2583 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2584 v += 25; 2585 } 2586 idx = 5*i; 2587 t[idx] = s1;t[1+idx] = s2; 2588 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2589 } 2590 /* backward solve the upper triangular */ 2591 for (i=n-1; i>=0; i--){ 2592 v = aa + 25*(adiag[i+1]+1); 2593 vi = aj + adiag[i+1]+1; 2594 nz = adiag[i] - adiag[i+1] - 1; 2595 idt = 5*i; 2596 s1 = t[idt]; s2 = t[1+idt]; 2597 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2598 for(m=0;m<nz;m++){ 2599 idx = 5*vi[m]; 2600 x1 = t[idx]; x2 = t[1+idx]; 2601 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2602 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2603 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2604 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2605 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2606 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2607 v += 25; 2608 } 2609 idc = 5*c[i]; 2610 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2611 v[15]*s4+v[20]*s5; 2612 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 2613 v[16]*s4+v[21]*s5; 2614 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2615 v[17]*s4+v[22]*s5; 2616 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 2617 v[18]*s4+v[23]*s5; 2618 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2619 v[19]*s4+v[24]*s5; 2620 } 2621 2622 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2623 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2624 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2625 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2626 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2627 PetscFunctionReturn(0); 2628 } 2629 2630 #undef __FUNCT__ 2631 #define 
__FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 2632 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 2633 { 2634 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2635 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2636 PetscErrorCode ierr; 2637 PetscInt *diag = a->diag,jdx; 2638 const MatScalar *aa=a->a,*v; 2639 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2640 const PetscScalar *b; 2641 2642 PetscFunctionBegin; 2643 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2644 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2645 /* forward solve the lower triangular */ 2646 idx = 0; 2647 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2648 for (i=1; i<n; i++) { 2649 v = aa + 25*ai[i]; 2650 vi = aj + ai[i]; 2651 nz = diag[i] - ai[i]; 2652 idx = 5*i; 2653 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 2654 while (nz--) { 2655 jdx = 5*(*vi++); 2656 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2657 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2658 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2659 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2660 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2661 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2662 v += 25; 2663 } 2664 x[idx] = s1; 2665 x[1+idx] = s2; 2666 x[2+idx] = s3; 2667 x[3+idx] = s4; 2668 x[4+idx] = s5; 2669 } 2670 /* backward solve the upper triangular */ 2671 for (i=n-1; i>=0; i--){ 2672 v = aa + 25*diag[i] + 25; 2673 vi = aj + diag[i] + 1; 2674 nz = ai[i+1] - diag[i] - 1; 2675 idt = 5*i; 2676 s1 = x[idt]; s2 = x[1+idt]; 2677 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 2678 while (nz--) { 2679 idx = 5*(*vi++); 2680 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2681 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2682 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2683 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + 
v[17]*x4 + v[22]*x5; 2684 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2685 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2686 v += 25; 2687 } 2688 v = aa + 25*diag[i]; 2689 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2690 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2691 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2692 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2693 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2694 } 2695 2696 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2697 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2698 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2699 PetscFunctionReturn(0); 2700 } 2701 2702 #if defined(OLD_ROUTINE_TO_BE_REPLACED) 2703 #undef __FUNCT__ 2704 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct" 2705 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2706 { 2707 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2708 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2709 PetscErrorCode ierr; 2710 PetscInt jdx; 2711 const MatScalar *aa=a->a,*v; 2712 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2713 const PetscScalar *b; 2714 2715 PetscFunctionBegin; 2716 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2717 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2718 /* forward solve the lower triangular */ 2719 idx = 0; 2720 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2721 for (i=1; i<n; i++) { 2722 v = aa + 25*ai[i]; 2723 vi = aj + ai[i]; 2724 nz = ai[i+1] - ai[i]; 2725 idx = 5*i; 2726 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 2727 for(k=0;k<nz;k++) { 2728 jdx = 5*vi[k]; 2729 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2730 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2731 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2732 
s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2733 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2734 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2735 v += 25; 2736 } 2737 x[idx] = s1; 2738 x[1+idx] = s2; 2739 x[2+idx] = s3; 2740 x[3+idx] = s4; 2741 x[4+idx] = s5; 2742 } 2743 2744 /* backward solve the upper triangular */ 2745 for (i=n-1; i>=0; i--){ 2746 v = aa + 25*ai[2*n-i]; 2747 vi = aj + ai[2*n-i]; 2748 nz = ai[2*n-i +1] - ai[2*n-i]-1; 2749 idt = 5*i; 2750 s1 = x[idt]; s2 = x[1+idt]; 2751 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 2752 for(k=0;k<nz;k++){ 2753 idx = 5*vi[k]; 2754 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2755 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2756 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2757 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2758 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2759 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2760 v += 25; 2761 } 2762 /* x = inv_diagonal*x */ 2763 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2764 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2765 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2766 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2767 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2768 } 2769 2770 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2771 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2772 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2773 PetscFunctionReturn(0); 2774 } 2775 #endif 2776 2777 #undef __FUNCT__ 2778 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct" 2779 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2780 { 2781 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2782 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 2783 PetscErrorCode 
ierr; 2784 PetscInt jdx; 2785 const MatScalar *aa=a->a,*v; 2786 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2787 const PetscScalar *b; 2788 2789 PetscFunctionBegin; 2790 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2791 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2792 /* forward solve the lower triangular */ 2793 idx = 0; 2794 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2795 for (i=1; i<n; i++) { 2796 v = aa + 25*ai[i]; 2797 vi = aj + ai[i]; 2798 nz = ai[i+1] - ai[i]; 2799 idx = 5*i; 2800 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 2801 for(k=0;k<nz;k++) { 2802 jdx = 5*vi[k]; 2803 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2804 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2805 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2806 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2807 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2808 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2809 v += 25; 2810 } 2811 x[idx] = s1; 2812 x[1+idx] = s2; 2813 x[2+idx] = s3; 2814 x[3+idx] = s4; 2815 x[4+idx] = s5; 2816 } 2817 2818 /* backward solve the upper triangular */ 2819 for (i=n-1; i>=0; i--){ 2820 v = aa + 25*(adiag[i+1]+1); 2821 vi = aj + adiag[i+1]+1; 2822 nz = adiag[i] - adiag[i+1]-1; 2823 idt = 5*i; 2824 s1 = x[idt]; s2 = x[1+idt]; 2825 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 2826 for(k=0;k<nz;k++){ 2827 idx = 5*vi[k]; 2828 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2829 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2830 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2831 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2832 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2833 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2834 v += 25; 2835 } 2836 /* x = inv_diagonal*x */ 2837 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 
2838 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2839 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2840 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2841 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2842 } 2843 2844 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2845 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2846 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2847 PetscFunctionReturn(0); 2848 } 2849 2850 #undef __FUNCT__ 2851 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 2852 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 2853 { 2854 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2855 IS iscol=a->col,isrow=a->row; 2856 PetscErrorCode ierr; 2857 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2858 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2859 const MatScalar *aa=a->a,*v; 2860 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2861 const PetscScalar *b; 2862 2863 PetscFunctionBegin; 2864 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2865 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2866 t = a->solve_work; 2867 2868 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2869 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2870 2871 /* forward solve the lower triangular */ 2872 idx = 4*(*r++); 2873 t[0] = b[idx]; t[1] = b[1+idx]; 2874 t[2] = b[2+idx]; t[3] = b[3+idx]; 2875 for (i=1; i<n; i++) { 2876 v = aa + 16*ai[i]; 2877 vi = aj + ai[i]; 2878 nz = diag[i] - ai[i]; 2879 idx = 4*(*r++); 2880 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2881 while (nz--) { 2882 idx = 4*(*vi++); 2883 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2884 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2885 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2886 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2887 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2888 v += 16; 2889 } 2890 idx = 4*i; 2891 t[idx] = s1;t[1+idx] = s2; 2892 
t[2+idx] = s3;t[3+idx] = s4; 2893 } 2894 /* backward solve the upper triangular */ 2895 for (i=n-1; i>=0; i--){ 2896 v = aa + 16*diag[i] + 16; 2897 vi = aj + diag[i] + 1; 2898 nz = ai[i+1] - diag[i] - 1; 2899 idt = 4*i; 2900 s1 = t[idt]; s2 = t[1+idt]; 2901 s3 = t[2+idt];s4 = t[3+idt]; 2902 while (nz--) { 2903 idx = 4*(*vi++); 2904 x1 = t[idx]; x2 = t[1+idx]; 2905 x3 = t[2+idx]; x4 = t[3+idx]; 2906 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2907 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2908 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2909 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2910 v += 16; 2911 } 2912 idc = 4*(*c--); 2913 v = aa + 16*diag[i]; 2914 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2915 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2916 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2917 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2918 } 2919 2920 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2921 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2922 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2923 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2924 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2925 PetscFunctionReturn(0); 2926 } 2927 2928 #if defined(OLD_ROUTINE_TO_BE_REPLACED) 2929 #undef __FUNCT__ 2930 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct" 2931 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx) 2932 { 2933 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2934 IS iscol=a->col,isrow=a->row; 2935 PetscErrorCode ierr; 2936 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 2937 const PetscInt *r,*c,*rout,*cout; 2938 const MatScalar *aa=a->a,*v; 2939 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2940 const PetscScalar *b; 2941 2942 PetscFunctionBegin; 2943 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2944 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2945 t = a->solve_work; 2946 2947 ierr = 
ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2948 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2949 2950 /* forward solve the lower triangular */ 2951 idx = 4*r[0]; 2952 t[0] = b[idx]; t[1] = b[1+idx]; 2953 t[2] = b[2+idx]; t[3] = b[3+idx]; 2954 for (i=1; i<n; i++) { 2955 v = aa + 16*ai[i]; 2956 vi = aj + ai[i]; 2957 nz = ai[i+1] - ai[i]; 2958 idx = 4*r[i]; 2959 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2960 for(m=0;m<nz;m++){ 2961 idx = 4*vi[m]; 2962 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2963 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2964 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2965 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2966 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2967 v += 16; 2968 } 2969 idx = 4*i; 2970 t[idx] = s1;t[1+idx] = s2; 2971 t[2+idx] = s3;t[3+idx] = s4; 2972 } 2973 /* backward solve the upper triangular */ 2974 for (i=n-1; i>=0; i--){ 2975 k = 2*n-i; 2976 v = aa + 16*ai[k]; 2977 vi = aj + ai[k]; 2978 nz = ai[k+1] - ai[k] - 1; 2979 idt = 4*i; 2980 s1 = t[idt]; s2 = t[1+idt]; 2981 s3 = t[2+idt];s4 = t[3+idt]; 2982 for(m=0;m<nz;m++){ 2983 idx = 4*vi[m]; 2984 x1 = t[idx]; x2 = t[1+idx]; 2985 x3 = t[2+idx]; x4 = t[3+idx]; 2986 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2987 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2988 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2989 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2990 v += 16; 2991 } 2992 idc = 4*c[i]; 2993 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2994 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2995 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2996 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2997 } 2998 2999 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3000 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3001 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3002 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3003 ierr = PetscLogFlops(2.0*16*(a->nz) - 
4.0*A->cmap->n);CHKERRQ(ierr); 3004 PetscFunctionReturn(0); 3005 } 3006 #endif 3007 3008 #undef __FUNCT__ 3009 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct" 3010 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx) 3011 { 3012 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3013 IS iscol=a->col,isrow=a->row; 3014 PetscErrorCode ierr; 3015 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 3016 const PetscInt *r,*c,*rout,*cout; 3017 const MatScalar *aa=a->a,*v; 3018 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3019 const PetscScalar *b; 3020 3021 PetscFunctionBegin; 3022 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3023 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3024 t = a->solve_work; 3025 3026 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3027 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3028 3029 /* forward solve the lower triangular */ 3030 idx = 4*r[0]; 3031 t[0] = b[idx]; t[1] = b[1+idx]; 3032 t[2] = b[2+idx]; t[3] = b[3+idx]; 3033 for (i=1; i<n; i++) { 3034 v = aa + 16*ai[i]; 3035 vi = aj + ai[i]; 3036 nz = ai[i+1] - ai[i]; 3037 idx = 4*r[i]; 3038 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3039 for(m=0;m<nz;m++){ 3040 idx = 4*vi[m]; 3041 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3042 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3043 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3044 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3045 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3046 v += 16; 3047 } 3048 idx = 4*i; 3049 t[idx] = s1;t[1+idx] = s2; 3050 t[2+idx] = s3;t[3+idx] = s4; 3051 } 3052 /* backward solve the upper triangular */ 3053 for (i=n-1; i>=0; i--){ 3054 v = aa + 16*(adiag[i+1]+1); 3055 vi = aj + adiag[i+1]+1; 3056 nz = adiag[i] - adiag[i+1] - 1; 3057 idt = 4*i; 3058 s1 = t[idt]; s2 = t[1+idt]; 3059 s3 = t[2+idt];s4 = t[3+idt]; 3060 for(m=0;m<nz;m++){ 3061 idx = 4*vi[m]; 3062 x1 = t[idx]; x2 = t[1+idx]; 3063 x3 = t[2+idx]; x4 = t[3+idx]; 3064 s1 -= 
v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3065 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3066 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3067 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3068 v += 16; 3069 } 3070 idc = 4*c[i]; 3071 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3072 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3073 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3074 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3075 } 3076 3077 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3078 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3079 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3080 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3081 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3082 PetscFunctionReturn(0); 3083 } 3084 3085 #undef __FUNCT__ 3086 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 3087 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 3088 { 3089 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3090 IS iscol=a->col,isrow=a->row; 3091 PetscErrorCode ierr; 3092 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3093 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3094 const MatScalar *aa=a->a,*v; 3095 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 3096 PetscScalar *x; 3097 const PetscScalar *b; 3098 3099 PetscFunctionBegin; 3100 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3101 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3102 t = (MatScalar *)a->solve_work; 3103 3104 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3105 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3106 3107 /* forward solve the lower triangular */ 3108 idx = 4*(*r++); 3109 t[0] = (MatScalar)b[idx]; 3110 t[1] = (MatScalar)b[1+idx]; 3111 t[2] = (MatScalar)b[2+idx]; 3112 t[3] = (MatScalar)b[3+idx]; 3113 for (i=1; i<n; i++) { 3114 v = aa + 16*ai[i]; 3115 vi = aj + ai[i]; 3116 nz = diag[i] - ai[i]; 3117 idx = 4*(*r++); 3118 s1 = (MatScalar)b[idx]; 
3119 s2 = (MatScalar)b[1+idx]; 3120 s3 = (MatScalar)b[2+idx]; 3121 s4 = (MatScalar)b[3+idx]; 3122 while (nz--) { 3123 idx = 4*(*vi++); 3124 x1 = t[idx]; 3125 x2 = t[1+idx]; 3126 x3 = t[2+idx]; 3127 x4 = t[3+idx]; 3128 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3129 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3130 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3131 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3132 v += 16; 3133 } 3134 idx = 4*i; 3135 t[idx] = s1; 3136 t[1+idx] = s2; 3137 t[2+idx] = s3; 3138 t[3+idx] = s4; 3139 } 3140 /* backward solve the upper triangular */ 3141 for (i=n-1; i>=0; i--){ 3142 v = aa + 16*diag[i] + 16; 3143 vi = aj + diag[i] + 1; 3144 nz = ai[i+1] - diag[i] - 1; 3145 idt = 4*i; 3146 s1 = t[idt]; 3147 s2 = t[1+idt]; 3148 s3 = t[2+idt]; 3149 s4 = t[3+idt]; 3150 while (nz--) { 3151 idx = 4*(*vi++); 3152 x1 = t[idx]; 3153 x2 = t[1+idx]; 3154 x3 = t[2+idx]; 3155 x4 = t[3+idx]; 3156 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3157 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3158 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3159 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3160 v += 16; 3161 } 3162 idc = 4*(*c--); 3163 v = aa + 16*diag[i]; 3164 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3165 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3166 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3167 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3168 x[idc] = (PetscScalar)t[idt]; 3169 x[1+idc] = (PetscScalar)t[1+idt]; 3170 x[2+idc] = (PetscScalar)t[2+idt]; 3171 x[3+idc] = (PetscScalar)t[3+idt]; 3172 } 3173 3174 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3175 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3176 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3177 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3178 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3179 PetscFunctionReturn(0); 3180 } 3181 3182 #if defined (PETSC_HAVE_SSE) 3183 3184 #include PETSC_HAVE_SSE 3185 3186 #undef __FUNCT__ 
3187 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 3188 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 3189 { 3190 /* 3191 Note: This code uses demotion of double 3192 to float when performing the mixed-mode computation. 3193 This may not be numerically reasonable for all applications. 3194 */ 3195 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3196 IS iscol=a->col,isrow=a->row; 3197 PetscErrorCode ierr; 3198 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 3199 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3200 MatScalar *aa=a->a,*v; 3201 PetscScalar *x,*b,*t; 3202 3203 /* Make space in temp stack for 16 Byte Aligned arrays */ 3204 float ssealignedspace[11],*tmps,*tmpx; 3205 unsigned long offset; 3206 3207 PetscFunctionBegin; 3208 SSE_SCOPE_BEGIN; 3209 3210 offset = (unsigned long)ssealignedspace % 16; 3211 if (offset) offset = (16 - offset)/4; 3212 tmps = &ssealignedspace[offset]; 3213 tmpx = &ssealignedspace[offset+4]; 3214 PREFETCH_NTA(aa+16*ai[1]); 3215 3216 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3217 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3218 t = a->solve_work; 3219 3220 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3221 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3222 3223 /* forward solve the lower triangular */ 3224 idx = 4*(*r++); 3225 t[0] = b[idx]; t[1] = b[1+idx]; 3226 t[2] = b[2+idx]; t[3] = b[3+idx]; 3227 v = aa + 16*ai[1]; 3228 3229 for (i=1; i<n;) { 3230 PREFETCH_NTA(&v[8]); 3231 vi = aj + ai[i]; 3232 nz = diag[i] - ai[i]; 3233 idx = 4*(*r++); 3234 3235 /* Demote sum from double to float */ 3236 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 3237 LOAD_PS(tmps,XMM7); 3238 3239 while (nz--) { 3240 PREFETCH_NTA(&v[16]); 3241 idx = 4*(*vi++); 3242 3243 /* Demote solution (so far) from double to float */ 3244 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 3245 3246 /* 4x4 Matrix-Vector product with negative accumulation: */ 3247 SSE_INLINE_BEGIN_2(tmpx,v) 3248 
SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3249 3250 /* First Column */ 3251 SSE_COPY_PS(XMM0,XMM6) 3252 SSE_SHUFFLE(XMM0,XMM0,0x00) 3253 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3254 SSE_SUB_PS(XMM7,XMM0) 3255 3256 /* Second Column */ 3257 SSE_COPY_PS(XMM1,XMM6) 3258 SSE_SHUFFLE(XMM1,XMM1,0x55) 3259 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3260 SSE_SUB_PS(XMM7,XMM1) 3261 3262 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3263 3264 /* Third Column */ 3265 SSE_COPY_PS(XMM2,XMM6) 3266 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3267 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3268 SSE_SUB_PS(XMM7,XMM2) 3269 3270 /* Fourth Column */ 3271 SSE_COPY_PS(XMM3,XMM6) 3272 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3273 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3274 SSE_SUB_PS(XMM7,XMM3) 3275 SSE_INLINE_END_2 3276 3277 v += 16; 3278 } 3279 idx = 4*i; 3280 v = aa + 16*ai[++i]; 3281 PREFETCH_NTA(v); 3282 STORE_PS(tmps,XMM7); 3283 3284 /* Promote result from float to double */ 3285 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 3286 } 3287 /* backward solve the upper triangular */ 3288 idt = 4*(n-1); 3289 ai16 = 16*diag[n-1]; 3290 v = aa + ai16 + 16; 3291 for (i=n-1; i>=0;){ 3292 PREFETCH_NTA(&v[8]); 3293 vi = aj + diag[i] + 1; 3294 nz = ai[i+1] - diag[i] - 1; 3295 3296 /* Demote accumulator from double to float */ 3297 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 3298 LOAD_PS(tmps,XMM7); 3299 3300 while (nz--) { 3301 PREFETCH_NTA(&v[16]); 3302 idx = 4*(*vi++); 3303 3304 /* Demote solution (so far) from double to float */ 3305 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 3306 3307 /* 4x4 Matrix-Vector Product with negative accumulation: */ 3308 SSE_INLINE_BEGIN_2(tmpx,v) 3309 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3310 3311 /* First Column */ 3312 SSE_COPY_PS(XMM0,XMM6) 3313 SSE_SHUFFLE(XMM0,XMM0,0x00) 3314 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3315 SSE_SUB_PS(XMM7,XMM0) 3316 3317 /* Second Column */ 3318 SSE_COPY_PS(XMM1,XMM6) 3319 SSE_SHUFFLE(XMM1,XMM1,0x55) 3320 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3321 SSE_SUB_PS(XMM7,XMM1) 3322 3323 
SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3324 3325 /* Third Column */ 3326 SSE_COPY_PS(XMM2,XMM6) 3327 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3328 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3329 SSE_SUB_PS(XMM7,XMM2) 3330 3331 /* Fourth Column */ 3332 SSE_COPY_PS(XMM3,XMM6) 3333 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3334 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3335 SSE_SUB_PS(XMM7,XMM3) 3336 SSE_INLINE_END_2 3337 v += 16; 3338 } 3339 v = aa + ai16; 3340 ai16 = 16*diag[--i]; 3341 PREFETCH_NTA(aa+ai16+16); 3342 /* 3343 Scale the result by the diagonal 4x4 block, 3344 which was inverted as part of the factorization 3345 */ 3346 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 3347 /* First Column */ 3348 SSE_COPY_PS(XMM0,XMM7) 3349 SSE_SHUFFLE(XMM0,XMM0,0x00) 3350 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 3351 3352 /* Second Column */ 3353 SSE_COPY_PS(XMM1,XMM7) 3354 SSE_SHUFFLE(XMM1,XMM1,0x55) 3355 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 3356 SSE_ADD_PS(XMM0,XMM1) 3357 3358 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 3359 3360 /* Third Column */ 3361 SSE_COPY_PS(XMM2,XMM7) 3362 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3363 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 3364 SSE_ADD_PS(XMM0,XMM2) 3365 3366 /* Fourth Column */ 3367 SSE_COPY_PS(XMM3,XMM7) 3368 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3369 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 3370 SSE_ADD_PS(XMM0,XMM3) 3371 3372 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 3373 SSE_INLINE_END_3 3374 3375 /* Promote solution from float to double */ 3376 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 3377 3378 /* Apply reordering to t and stream into x. */ 3379 /* This way, x doesn't pollute the cache. */ 3380 /* Be careful with size: 2 doubles = 4 floats! 
*/ 3381 idc = 4*(*c--); 3382 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 3383 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 3384 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 3385 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 3386 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 3387 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 3388 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 3389 SSE_INLINE_END_2 3390 v = aa + ai16 + 16; 3391 idt -= 4; 3392 } 3393 3394 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3395 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3396 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3397 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3398 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3399 SSE_SCOPE_END; 3400 PetscFunctionReturn(0); 3401 } 3402 3403 #endif 3404 3405 3406 /* 3407 Special case where the matrix was ILU(0) factored in the natural 3408 ordering. This eliminates the need for the column and row permutation. 3409 */ 3410 #undef __FUNCT__ 3411 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 3412 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 3413 { 3414 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3415 PetscInt n=a->mbs; 3416 const PetscInt *ai=a->i,*aj=a->j; 3417 PetscErrorCode ierr; 3418 const PetscInt *diag = a->diag; 3419 const MatScalar *aa=a->a; 3420 PetscScalar *x; 3421 const PetscScalar *b; 3422 3423 PetscFunctionBegin; 3424 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3425 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3426 3427 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 3428 { 3429 static PetscScalar w[2000]; /* very BAD need to fix */ 3430 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 3431 } 3432 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 3433 { 3434 static PetscScalar w[2000]; /* very BAD need to fix */ 3435 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 3436 } 3437 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 3438 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 3439 
#else 3440 { 3441 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3442 const MatScalar *v; 3443 PetscInt jdx,idt,idx,nz,i,ai16; 3444 const PetscInt *vi; 3445 3446 /* forward solve the lower triangular */ 3447 idx = 0; 3448 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 3449 for (i=1; i<n; i++) { 3450 v = aa + 16*ai[i]; 3451 vi = aj + ai[i]; 3452 nz = diag[i] - ai[i]; 3453 idx += 4; 3454 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3455 while (nz--) { 3456 jdx = 4*(*vi++); 3457 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 3458 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3459 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3460 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3461 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3462 v += 16; 3463 } 3464 x[idx] = s1; 3465 x[1+idx] = s2; 3466 x[2+idx] = s3; 3467 x[3+idx] = s4; 3468 } 3469 /* backward solve the upper triangular */ 3470 idt = 4*(n-1); 3471 for (i=n-1; i>=0; i--){ 3472 ai16 = 16*diag[i]; 3473 v = aa + ai16 + 16; 3474 vi = aj + diag[i] + 1; 3475 nz = ai[i+1] - diag[i] - 1; 3476 s1 = x[idt]; s2 = x[1+idt]; 3477 s3 = x[2+idt];s4 = x[3+idt]; 3478 while (nz--) { 3479 idx = 4*(*vi++); 3480 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 3481 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3482 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3483 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3484 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3485 v += 16; 3486 } 3487 v = aa + ai16; 3488 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3489 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 3490 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3491 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3492 idt -= 4; 3493 } 3494 } 3495 #endif 3496 3497 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3498 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3499 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3500 PetscFunctionReturn(0); 3501 } 3502 3503 #if 
defined(OLD_ROUTINE_TO_BE_REPLACED) 3504 #undef __FUNCT__ 3505 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct" 3506 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3507 { 3508 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3509 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 3510 PetscErrorCode ierr; 3511 PetscInt idx,jdx,idt; 3512 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3513 const MatScalar *aa=a->a,*v; 3514 PetscScalar *x; 3515 const PetscScalar *b; 3516 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3517 3518 PetscFunctionBegin; 3519 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3520 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3521 /* forward solve the lower triangular */ 3522 idx = 0; 3523 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3524 for (i=1; i<n; i++) { 3525 v = aa + bs2*ai[i]; 3526 vi = aj + ai[i]; 3527 nz = ai[i+1] - ai[i]; 3528 idx = bs*i; 3529 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3530 for(k=0;k<nz;k++) { 3531 jdx = bs*vi[k]; 3532 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3533 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3534 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3535 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3536 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3537 3538 v += bs2; 3539 } 3540 3541 x[idx] = s1; 3542 x[1+idx] = s2; 3543 x[2+idx] = s3; 3544 x[3+idx] = s4; 3545 } 3546 3547 /* backward solve the upper triangular */ 3548 for (i=n-1; i>=0; i--){ 3549 v = aa + bs2*ai[2*n-i]; 3550 vi = aj + ai[2*n-i]; 3551 nz = ai[2*n-i +1] - ai[2*n-i]-1; 3552 idt = bs*i; 3553 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3554 3555 for(k=0;k<nz;k++){ 3556 idx = bs*vi[k]; 3557 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3558 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3559 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3560 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3561 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3562 
3563 v += bs2; 3564 } 3565 /* x = inv_diagonal*x */ 3566 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3567 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 3568 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3569 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3570 3571 } 3572 3573 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3574 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3575 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3576 PetscFunctionReturn(0); 3577 } 3578 #endif 3579 3580 #undef __FUNCT__ 3581 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct" 3582 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3583 { 3584 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3585 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 3586 PetscErrorCode ierr; 3587 PetscInt idx,jdx,idt; 3588 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3589 const MatScalar *aa=a->a,*v; 3590 PetscScalar *x; 3591 const PetscScalar *b; 3592 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3593 3594 PetscFunctionBegin; 3595 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3596 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3597 /* forward solve the lower triangular */ 3598 idx = 0; 3599 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3600 for (i=1; i<n; i++) { 3601 v = aa + bs2*ai[i]; 3602 vi = aj + ai[i]; 3603 nz = ai[i+1] - ai[i]; 3604 idx = bs*i; 3605 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3606 for(k=0;k<nz;k++) { 3607 jdx = bs*vi[k]; 3608 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3609 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3610 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3611 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3612 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3613 3614 v += bs2; 3615 } 3616 3617 x[idx] = s1; 3618 x[1+idx] = s2; 3619 x[2+idx] = s3; 3620 x[3+idx] = s4; 3621 } 3622 3623 /* backward solve the upper triangular */ 3624 for 
(i=n-1; i>=0; i--){ 3625 v = aa + bs2*(adiag[i+1]+1); 3626 vi = aj + adiag[i+1]+1; 3627 nz = adiag[i] - adiag[i+1]-1; 3628 idt = bs*i; 3629 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3630 3631 for(k=0;k<nz;k++){ 3632 idx = bs*vi[k]; 3633 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3634 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3635 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3636 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3637 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3638 3639 v += bs2; 3640 } 3641 /* x = inv_diagonal*x */ 3642 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3643 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 3644 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3645 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3646 3647 } 3648 3649 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3650 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3651 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3652 PetscFunctionReturn(0); 3653 } 3654 3655 #undef __FUNCT__ 3656 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 3657 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 3658 { 3659 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3660 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3661 PetscErrorCode ierr; 3662 PetscInt *diag = a->diag; 3663 MatScalar *aa=a->a; 3664 PetscScalar *x,*b; 3665 3666 PetscFunctionBegin; 3667 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3668 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3669 3670 { 3671 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 3672 MatScalar *v,*t=(MatScalar *)x; 3673 PetscInt jdx,idt,idx,nz,*vi,i,ai16; 3674 3675 /* forward solve the lower triangular */ 3676 idx = 0; 3677 t[0] = (MatScalar)b[0]; 3678 t[1] = (MatScalar)b[1]; 3679 t[2] = (MatScalar)b[2]; 3680 t[3] = (MatScalar)b[3]; 3681 for (i=1; i<n; i++) { 3682 v = aa + 16*ai[i]; 3683 vi = aj + ai[i]; 3684 nz = diag[i] - ai[i]; 3685 idx += 4; 3686 s1 = (MatScalar)b[idx]; 
3687 s2 = (MatScalar)b[1+idx]; 3688 s3 = (MatScalar)b[2+idx]; 3689 s4 = (MatScalar)b[3+idx]; 3690 while (nz--) { 3691 jdx = 4*(*vi++); 3692 x1 = t[jdx]; 3693 x2 = t[1+jdx]; 3694 x3 = t[2+jdx]; 3695 x4 = t[3+jdx]; 3696 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3697 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3698 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3699 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3700 v += 16; 3701 } 3702 t[idx] = s1; 3703 t[1+idx] = s2; 3704 t[2+idx] = s3; 3705 t[3+idx] = s4; 3706 } 3707 /* backward solve the upper triangular */ 3708 idt = 4*(n-1); 3709 for (i=n-1; i>=0; i--){ 3710 ai16 = 16*diag[i]; 3711 v = aa + ai16 + 16; 3712 vi = aj + diag[i] + 1; 3713 nz = ai[i+1] - diag[i] - 1; 3714 s1 = t[idt]; 3715 s2 = t[1+idt]; 3716 s3 = t[2+idt]; 3717 s4 = t[3+idt]; 3718 while (nz--) { 3719 idx = 4*(*vi++); 3720 x1 = (MatScalar)x[idx]; 3721 x2 = (MatScalar)x[1+idx]; 3722 x3 = (MatScalar)x[2+idx]; 3723 x4 = (MatScalar)x[3+idx]; 3724 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3725 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3726 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3727 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3728 v += 16; 3729 } 3730 v = aa + ai16; 3731 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 3732 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 3733 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 3734 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 3735 idt -= 4; 3736 } 3737 } 3738 3739 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3740 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3741 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3742 PetscFunctionReturn(0); 3743 } 3744 3745 #if defined (PETSC_HAVE_SSE) 3746 3747 #include PETSC_HAVE_SSE 3748 #undef __FUNCT__ 3749 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 3750 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat 
A,Vec bb,Vec xx) 3751 { /* Natural-ordering solve for 4x4 blocks written with the SSE macros; this variant reads the column indices in a->j as unsigned short values. */ 3752 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3753 unsigned short *aj=(unsigned short *)a->j; 3754 PetscErrorCode ierr; 3755 int *ai=a->i,n=a->mbs,*diag = a->diag; 3756 MatScalar *aa=a->a; 3757 PetscScalar *x,*b; 3758 3759 PetscFunctionBegin; 3760 SSE_SCOPE_BEGIN; 3761 /* 3762 Note: This code currently uses demotion of double 3763 to float when performing the mixed-mode computation. 3764 This may not be numerically reasonable for all applications. 3765 */ 3766 PREFETCH_NTA(aa+16*ai[1]); 3767 3768 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3769 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3770 { 3771 /* x will first be computed in single precision then promoted inplace to double */ 3772 MatScalar *v,*t=(MatScalar *)x; 3773 int nz,i,idt,ai16; 3774 unsigned int jdx,idx; 3775 unsigned short *vi; 3776 /* Forward solve the lower triangular factor. */ 3777 3778 /* First block is the identity. */ 3779 idx = 0; 3780 CONVERT_DOUBLE4_FLOAT4(t,b); 3781 v = aa + 16*((unsigned int)ai[1]); 3782 3783 /* the loop counter i is advanced inside the body (see ai[++i] below) */ for (i=1; i<n;) { 3784 PREFETCH_NTA(&v[8]); 3785 vi = aj + ai[i]; 3786 nz = diag[i] - ai[i]; 3787 idx += 4; 3788 3789 /* Demote RHS from double to float. 
*/ 3790 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 3791 LOAD_PS(&t[idx],XMM7); 3792 3793 while (nz--) { 3794 PREFETCH_NTA(&v[16]); 3795 jdx = 4*((unsigned int)(*vi++)); 3796 3797 /* 4x4 Matrix-Vector product with negative accumulation: */ 3798 SSE_INLINE_BEGIN_2(&t[jdx],v) 3799 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3800 3801 /* First Column */ 3802 SSE_COPY_PS(XMM0,XMM6) 3803 SSE_SHUFFLE(XMM0,XMM0,0x00) 3804 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3805 SSE_SUB_PS(XMM7,XMM0) 3806 3807 /* Second Column */ 3808 SSE_COPY_PS(XMM1,XMM6) 3809 SSE_SHUFFLE(XMM1,XMM1,0x55) 3810 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3811 SSE_SUB_PS(XMM7,XMM1) 3812 3813 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3814 3815 /* Third Column */ 3816 SSE_COPY_PS(XMM2,XMM6) 3817 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3818 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3819 SSE_SUB_PS(XMM7,XMM2) 3820 3821 /* Fourth Column */ 3822 SSE_COPY_PS(XMM3,XMM6) 3823 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3824 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3825 SSE_SUB_PS(XMM7,XMM3) 3826 SSE_INLINE_END_2 3827 3828 v += 16; 3829 } 3830 v = aa + 16*ai[++i]; 3831 PREFETCH_NTA(v); 3832 STORE_PS(&t[idx],XMM7); 3833 } 3834 3835 /* Backward solve the upper triangular factor.*/ 3836 3837 idt = 4*(n-1); 3838 ai16 = 16*diag[n-1]; 3839 v = aa + ai16 + 16; 3840 for (i=n-1; i>=0;){ 3841 PREFETCH_NTA(&v[8]); 3842 vi = aj + diag[i] + 1; 3843 nz = ai[i+1] - diag[i] - 1; 3844 3845 LOAD_PS(&t[idt],XMM7); 3846 3847 while (nz--) { 3848 PREFETCH_NTA(&v[16]); 3849 idx = 4*((unsigned int)(*vi++)); 3850 3851 /* 4x4 Matrix-Vector Product with negative accumulation: */ 3852 SSE_INLINE_BEGIN_2(&t[idx],v) 3853 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3854 3855 /* First Column */ 3856 SSE_COPY_PS(XMM0,XMM6) 3857 SSE_SHUFFLE(XMM0,XMM0,0x00) 3858 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3859 SSE_SUB_PS(XMM7,XMM0) 3860 3861 /* Second Column */ 3862 SSE_COPY_PS(XMM1,XMM6) 3863 SSE_SHUFFLE(XMM1,XMM1,0x55) 3864 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3865 SSE_SUB_PS(XMM7,XMM1) 3866 3867 
SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3868 3869 /* Third Column */ 3870 SSE_COPY_PS(XMM2,XMM6) 3871 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3872 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3873 SSE_SUB_PS(XMM7,XMM2) 3874 3875 /* Fourth Column */ 3876 SSE_COPY_PS(XMM3,XMM6) 3877 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3878 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3879 SSE_SUB_PS(XMM7,XMM3) 3880 SSE_INLINE_END_2 3881 v += 16; 3882 } 3883 v = aa + ai16; 3884 ai16 = 16*diag[--i]; /* NOTE(review): on the last pass i becomes -1 here, so this reads diag[-1]; the value is only used for prefetch addresses and for a v that is not dereferenced again */ 3885 PREFETCH_NTA(aa+ai16+16); 3886 /* 3887 Scale the result by the diagonal 4x4 block, 3888 which was inverted as part of the factorization 3889 */ 3890 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 3891 /* First Column */ 3892 SSE_COPY_PS(XMM0,XMM7) 3893 SSE_SHUFFLE(XMM0,XMM0,0x00) 3894 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 3895 3896 /* Second Column */ 3897 SSE_COPY_PS(XMM1,XMM7) 3898 SSE_SHUFFLE(XMM1,XMM1,0x55) 3899 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 3900 SSE_ADD_PS(XMM0,XMM1) 3901 3902 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 3903 3904 /* Third Column */ 3905 SSE_COPY_PS(XMM2,XMM7) 3906 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3907 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 3908 SSE_ADD_PS(XMM0,XMM2) 3909 3910 /* Fourth Column */ 3911 SSE_COPY_PS(XMM3,XMM7) 3912 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3913 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 3914 SSE_ADD_PS(XMM0,XMM3) 3915 3916 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 3917 SSE_INLINE_END_3 3918 3919 v = aa + ai16 + 16; 3920 idt -= 4; 3921 } 3922 3923 /* Convert t from single precision back to double precision (inplace)*/ 3924 idt = 4*(n-1); 3925 for (i=n-1;i>=0;i--) { 3926 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 3927 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. 
*/ /* entries are converted from index 3 down to 0 and blocks from last to first, since t aliases x */ 3928 PetscScalar *xtemp=&x[idt]; 3929 MatScalar *ttemp=&t[idt]; 3930 xtemp[3] = (PetscScalar)ttemp[3]; 3931 xtemp[2] = (PetscScalar)ttemp[2]; 3932 xtemp[1] = (PetscScalar)ttemp[1]; 3933 xtemp[0] = (PetscScalar)ttemp[0]; 3934 idt -= 4; 3935 } 3936 3937 } /* End of artificial scope. 
*/ 3938 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3939 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3940 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3941 SSE_SCOPE_END; 3942 PetscFunctionReturn(0); 3943 } 3944 3945 /* The routine below performs the same sequence of steps as the _usj routine that ends just above; it differs in reading the column indices in a->j as int values. */ #undef __FUNCT__ 3946 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 3947 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 3948 { 3949 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3950 int *aj=a->j; 3951 PetscErrorCode ierr; 3952 int *ai=a->i,n=a->mbs,*diag = a->diag; 3953 MatScalar *aa=a->a; 3954 PetscScalar *x,*b; 3955 3956 PetscFunctionBegin; 3957 SSE_SCOPE_BEGIN; 3958 /* 3959 Note: This code currently uses demotion of double 3960 to float when performing the mixed-mode computation. 3961 This may not be numerically reasonable for all applications. 3962 */ 3963 PREFETCH_NTA(aa+16*ai[1]); 3964 3965 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3966 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3967 { 3968 /* x will first be computed in single precision then promoted inplace to double */ 3969 MatScalar *v,*t=(MatScalar *)x; 3970 int nz,i,idt,ai16; 3971 int jdx,idx; 3972 int *vi; 3973 /* Forward solve the lower triangular factor. */ 3974 3975 /* First block is the identity. */ 3976 idx = 0; 3977 CONVERT_DOUBLE4_FLOAT4(t,b); 3978 v = aa + 16*ai[1]; 3979 3980 /* the loop counter i is advanced inside the body (see ai[++i] below) */ for (i=1; i<n;) { 3981 PREFETCH_NTA(&v[8]); 3982 vi = aj + ai[i]; 3983 nz = diag[i] - ai[i]; 3984 idx += 4; 3985 3986 /* Demote RHS from double to float. 
*/ 3987 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 3988 LOAD_PS(&t[idx],XMM7); 3989 3990 while (nz--) { 3991 PREFETCH_NTA(&v[16]); 3992 jdx = 4*(*vi++); 3993 /* jdx = *vi++; */ 3994 3995 /* 4x4 Matrix-Vector product with negative accumulation: */ 3996 SSE_INLINE_BEGIN_2(&t[jdx],v) 3997 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3998 3999 /* First Column */ 4000 SSE_COPY_PS(XMM0,XMM6) 4001 SSE_SHUFFLE(XMM0,XMM0,0x00) 4002 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4003 SSE_SUB_PS(XMM7,XMM0) 4004 4005 /* Second Column */ 4006 SSE_COPY_PS(XMM1,XMM6) 4007 SSE_SHUFFLE(XMM1,XMM1,0x55) 4008 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4009 SSE_SUB_PS(XMM7,XMM1) 4010 4011 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4012 4013 /* Third Column */ 4014 SSE_COPY_PS(XMM2,XMM6) 4015 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4016 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4017 SSE_SUB_PS(XMM7,XMM2) 4018 4019 /* Fourth Column */ 4020 SSE_COPY_PS(XMM3,XMM6) 4021 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4022 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4023 SSE_SUB_PS(XMM7,XMM3) 4024 SSE_INLINE_END_2 4025 4026 v += 16; 4027 } 4028 v = aa + 16*ai[++i]; 4029 PREFETCH_NTA(v); 4030 STORE_PS(&t[idx],XMM7); 4031 } 4032 4033 /* Backward solve the upper triangular factor.*/ 4034 4035 idt = 4*(n-1); 4036 ai16 = 16*diag[n-1]; 4037 v = aa + ai16 + 16; 4038 for (i=n-1; i>=0;){ 4039 PREFETCH_NTA(&v[8]); 4040 vi = aj + diag[i] + 1; 4041 nz = ai[i+1] - diag[i] - 1; 4042 4043 LOAD_PS(&t[idt],XMM7); 4044 4045 while (nz--) { 4046 PREFETCH_NTA(&v[16]); 4047 idx = 4*(*vi++); 4048 /* idx = *vi++; */ 4049 4050 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4051 SSE_INLINE_BEGIN_2(&t[idx],v) 4052 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4053 4054 /* First Column */ 4055 SSE_COPY_PS(XMM0,XMM6) 4056 SSE_SHUFFLE(XMM0,XMM0,0x00) 4057 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4058 SSE_SUB_PS(XMM7,XMM0) 4059 4060 /* Second Column */ 4061 SSE_COPY_PS(XMM1,XMM6) 4062 SSE_SHUFFLE(XMM1,XMM1,0x55) 4063 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4064 SSE_SUB_PS(XMM7,XMM1) 4065 4066 
SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4067 4068 /* Third Column */ 4069 SSE_COPY_PS(XMM2,XMM6) 4070 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4071 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4072 SSE_SUB_PS(XMM7,XMM2) 4073 4074 /* Fourth Column */ 4075 SSE_COPY_PS(XMM3,XMM6) 4076 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4077 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4078 SSE_SUB_PS(XMM7,XMM3) 4079 SSE_INLINE_END_2 4080 v += 16; 4081 } 4082 v = aa + ai16; 4083 ai16 = 16*diag[--i]; /* NOTE(review): on the last pass i becomes -1 here, so this reads diag[-1]; the value is only used for prefetch addresses and for a v that is not dereferenced again */ 4084 PREFETCH_NTA(aa+ai16+16); 4085 /* 4086 Scale the result by the diagonal 4x4 block, 4087 which was inverted as part of the factorization 4088 */ 4089 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 4090 /* First Column */ 4091 SSE_COPY_PS(XMM0,XMM7) 4092 SSE_SHUFFLE(XMM0,XMM0,0x00) 4093 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4094 4095 /* Second Column */ 4096 SSE_COPY_PS(XMM1,XMM7) 4097 SSE_SHUFFLE(XMM1,XMM1,0x55) 4098 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4099 SSE_ADD_PS(XMM0,XMM1) 4100 4101 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4102 4103 /* Third Column */ 4104 SSE_COPY_PS(XMM2,XMM7) 4105 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4106 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4107 SSE_ADD_PS(XMM0,XMM2) 4108 4109 /* Fourth Column */ 4110 SSE_COPY_PS(XMM3,XMM7) 4111 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4112 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4113 SSE_ADD_PS(XMM0,XMM3) 4114 4115 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4116 SSE_INLINE_END_3 4117 4118 v = aa + ai16 + 16; 4119 idt -= 4; 4120 } 4121 4122 /* Convert t from single precision back to double precision (inplace)*/ 4123 idt = 4*(n-1); 4124 for (i=n-1;i>=0;i--) { 4125 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4126 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. 
*/ /* entries are converted from index 3 down to 0 and blocks from last to first, since t aliases x */ 4127 PetscScalar *xtemp=&x[idt]; 4128 MatScalar *ttemp=&t[idt]; 4129 xtemp[3] = (PetscScalar)ttemp[3]; 4130 xtemp[2] = (PetscScalar)ttemp[2]; 4131 xtemp[1] = (PetscScalar)ttemp[1]; 4132 xtemp[0] = (PetscScalar)ttemp[0]; 4133 idt -= 4; 4134 } 4135 4136 } /* End of artificial scope. 
*/ 4137 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4138 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4139 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4140 SSE_SCOPE_END; 4141 PetscFunctionReturn(0); 4142 } 4143 4144 #endif 4145 4146 #undef __FUNCT__ 4147 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 4148 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 4149 { 4150 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4151 IS iscol=a->col,isrow=a->row; 4152 PetscErrorCode ierr; 4153 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 4154 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4155 const MatScalar *aa=a->a,*v; 4156 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4157 const PetscScalar *b; 4158 4159 PetscFunctionBegin; 4160 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4161 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4162 t = a->solve_work; 4163 4164 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4165 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4166 4167 /* forward solve the lower triangular */ 4168 idx = 3*(*r++); 4169 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4170 for (i=1; i<n; i++) { 4171 v = aa + 9*ai[i]; 4172 vi = aj + ai[i]; 4173 nz = diag[i] - ai[i]; 4174 idx = 3*(*r++); 4175 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4176 while (nz--) { 4177 idx = 3*(*vi++); 4178 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4179 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4180 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4181 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4182 v += 9; 4183 } 4184 idx = 3*i; 4185 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4186 } 4187 /* backward solve the upper triangular */ 4188 for (i=n-1; i>=0; i--){ 4189 v = aa + 9*diag[i] + 9; 4190 vi = aj + diag[i] + 1; 4191 nz = ai[i+1] - diag[i] - 1; 4192 idt = 3*i; 4193 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4194 while (nz--) { 4195 idx = 3*(*vi++); 4196 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4197 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4198 s2 -= v[1]*x1 + v[4]*x2 + 
v[7]*x3; 4199 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4200 v += 9; 4201 } 4202 idc = 3*(*c--); 4203 v = aa + 9*diag[i]; 4204 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4205 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4206 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4207 } 4208 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4209 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4210 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4211 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4212 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4213 PetscFunctionReturn(0); 4214 } 4215 4216 #if defined(OLD_ROUTINE_TO_BE_REPLACED) 4217 #undef __FUNCT__ 4218 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct" 4219 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx) 4220 { 4221 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4222 IS iscol=a->col,isrow=a->row; 4223 PetscErrorCode ierr; 4224 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 4225 const PetscInt *r,*c,*rout,*cout; 4226 const MatScalar *aa=a->a,*v; 4227 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4228 const PetscScalar *b; 4229 4230 PetscFunctionBegin; 4231 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4232 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4233 t = a->solve_work; 4234 4235 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4236 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4237 4238 /* forward solve the lower triangular */ 4239 idx = 3*r[0]; 4240 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4241 for (i=1; i<n; i++) { 4242 v = aa + 9*ai[i]; 4243 vi = aj + ai[i]; 4244 nz = ai[i+1] - ai[i]; 4245 idx = 3*r[i]; 4246 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4247 for(m=0;m<nz;m++){ 4248 idx = 3*vi[m]; 4249 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4250 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4251 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4252 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4253 v += 9; 4254 } 4255 idx = 3*i; 4256 t[idx] = s1; 
t[1+idx] = s2; t[2+idx] = s3; 4257 } 4258 /* backward solve the upper triangular */ 4259 for (i=n-1; i>=0; i--){ 4260 k = 2*n-i; 4261 v = aa + 9*ai[k]; 4262 vi = aj + ai[k]; 4263 nz = ai[k +1] - ai[k] - 1; 4264 idt = 3*i; 4265 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4266 for(m=0;m<nz;m++){ 4267 idx = 3*vi[m]; 4268 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4269 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4270 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4271 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4272 v += 9; 4273 } 4274 idc = 3*c[i]; 4275 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4276 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4277 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4278 } 4279 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4280 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4281 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4282 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4283 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4284 PetscFunctionReturn(0); 4285 } 4286 #endif 4287 4288 #undef __FUNCT__ 4289 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct" 4290 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx) 4291 { 4292 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4293 IS iscol=a->col,isrow=a->row; 4294 PetscErrorCode ierr; 4295 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 4296 const PetscInt *r,*c,*rout,*cout; 4297 const MatScalar *aa=a->a,*v; 4298 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4299 const PetscScalar *b; 4300 4301 PetscFunctionBegin; 4302 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4303 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4304 t = a->solve_work; 4305 4306 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4307 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4308 4309 /* forward solve the lower triangular */ 4310 idx = 3*r[0]; 4311 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4312 for (i=1; i<n; i++) { 4313 v = aa + 
9*ai[i]; 4314 vi = aj + ai[i]; 4315 nz = ai[i+1] - ai[i]; 4316 idx = 3*r[i]; 4317 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4318 for(m=0;m<nz;m++){ 4319 idx = 3*vi[m]; 4320 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4321 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4322 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4323 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4324 v += 9; 4325 } 4326 idx = 3*i; 4327 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4328 } 4329 /* backward solve the upper triangular */ 4330 for (i=n-1; i>=0; i--){ 4331 v = aa + 9*(adiag[i+1]+1); 4332 vi = aj + adiag[i+1]+1; 4333 nz = adiag[i] - adiag[i+1] - 1; 4334 idt = 3*i; 4335 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4336 for(m=0;m<nz;m++){ 4337 idx = 3*vi[m]; 4338 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4339 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4340 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4341 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4342 v += 9; 4343 } 4344 idc = 3*c[i]; 4345 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4346 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4347 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4348 } 4349 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4350 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4351 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4352 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4353 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4354 PetscFunctionReturn(0); 4355 } 4356 4357 /* 4358 Special case where the matrix was ILU(0) factored in the natural 4359 ordering. This eliminates the need for the column and row permutation. 
4360 */ 4361 #undef __FUNCT__ 4362 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 4363 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 4364 { 4365 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4366 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4367 PetscErrorCode ierr; 4368 PetscInt *diag = a->diag; 4369 const MatScalar *aa=a->a,*v; 4370 PetscScalar *x,s1,s2,s3,x1,x2,x3; 4371 const PetscScalar *b; 4372 PetscInt jdx,idt,idx,nz,*vi,i; 4373 4374 PetscFunctionBegin; 4375 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4376 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4377 4378 /* forward solve the lower triangular */ 4379 idx = 0; 4380 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 4381 for (i=1; i<n; i++) { 4382 v = aa + 9*ai[i]; 4383 vi = aj + ai[i]; 4384 nz = diag[i] - ai[i]; 4385 idx += 3; 4386 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4387 while (nz--) { 4388 jdx = 3*(*vi++); 4389 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4390 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4391 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4392 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4393 v += 9; 4394 } 4395 x[idx] = s1; 4396 x[1+idx] = s2; 4397 x[2+idx] = s3; 4398 } 4399 /* backward solve the upper triangular */ 4400 for (i=n-1; i>=0; i--){ 4401 v = aa + 9*diag[i] + 9; 4402 vi = aj + diag[i] + 1; 4403 nz = ai[i+1] - diag[i] - 1; 4404 idt = 3*i; 4405 s1 = x[idt]; s2 = x[1+idt]; 4406 s3 = x[2+idt]; 4407 while (nz--) { 4408 idx = 3*(*vi++); 4409 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 4410 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4411 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4412 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4413 v += 9; 4414 } 4415 v = aa + 9*diag[i]; 4416 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4417 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4418 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4419 } 4420 4421 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4422 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4423 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4424 
PetscFunctionReturn(0); 4425 } 4426 4427 #if defined(OLD_ROUTINE_TO_BE_REPLACED) 4428 #undef __FUNCT__ 4429 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct" 4430 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4431 { 4432 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4433 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 4434 PetscErrorCode ierr; 4435 PetscInt idx,jdx,idt; 4436 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4437 const MatScalar *aa=a->a,*v; 4438 PetscScalar *x; 4439 const PetscScalar *b; 4440 PetscScalar s1,s2,s3,x1,x2,x3; 4441 4442 PetscFunctionBegin; 4443 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4444 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4445 /* forward solve the lower triangular */ 4446 idx = 0; 4447 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4448 for (i=1; i<n; i++) { 4449 v = aa + bs2*ai[i]; 4450 vi = aj + ai[i]; 4451 nz = ai[i+1] - ai[i]; 4452 idx = bs*i; 4453 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4454 for(k=0;k<nz;k++){ 4455 jdx = bs*vi[k]; 4456 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4457 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4458 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4459 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4460 4461 v += bs2; 4462 } 4463 4464 x[idx] = s1; 4465 x[1+idx] = s2; 4466 x[2+idx] = s3; 4467 } 4468 4469 /* backward solve the upper triangular */ 4470 for (i=n-1; i>=0; i--){ 4471 v = aa + bs2*ai[2*n-i]; 4472 vi = aj + ai[2*n-i]; 4473 nz = ai[2*n-i +1] - ai[2*n-i]-1; 4474 idt = bs*i; 4475 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4476 4477 for(k=0;k<nz;k++){ 4478 idx = bs*vi[k]; 4479 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4480 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4481 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4482 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4483 4484 v += bs2; 4485 } 4486 /* x = inv_diagonal*x */ 4487 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4488 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4489 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4490 4491 } 4492 4493 ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4494 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4495 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4496 PetscFunctionReturn(0); 4497 } 4498 #endif 4499 4500 #undef __FUNCT__ 4501 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct" 4502 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4503 { 4504 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4505 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 4506 PetscErrorCode ierr; 4507 PetscInt idx,jdx,idt; 4508 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4509 const MatScalar *aa=a->a,*v; 4510 PetscScalar *x; 4511 const PetscScalar *b; 4512 PetscScalar s1,s2,s3,x1,x2,x3; 4513 4514 PetscFunctionBegin; 4515 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4516 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4517 /* forward solve the lower triangular */ 4518 idx = 0; 4519 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4520 for (i=1; i<n; i++) { 4521 v = aa + bs2*ai[i]; 4522 vi = aj + ai[i]; 4523 nz = ai[i+1] - ai[i]; 4524 idx = bs*i; 4525 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4526 for(k=0;k<nz;k++){ 4527 jdx = bs*vi[k]; 4528 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4529 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4530 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4531 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4532 4533 v += bs2; 4534 } 4535 4536 x[idx] = s1; 4537 x[1+idx] = s2; 4538 x[2+idx] = s3; 4539 } 4540 4541 /* backward solve the upper triangular */ 4542 for (i=n-1; i>=0; i--){ 4543 v = aa + bs2*(adiag[i+1]+1); 4544 vi = aj + adiag[i+1]+1; 4545 nz = adiag[i] - adiag[i+1]-1; 4546 idt = bs*i; 4547 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4548 4549 for(k=0;k<nz;k++){ 4550 idx = bs*vi[k]; 4551 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4552 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4553 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4554 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4555 4556 v += bs2; 4557 } 4558 /* x = inv_diagonal*x */ 4559 
x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4560 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4561 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4562 4563 } 4564 4565 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4566 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4567 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4568 PetscFunctionReturn(0); 4569 } 4570 4571 #undef __FUNCT__ 4572 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 4573 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 4574 { 4575 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4576 IS iscol=a->col,isrow=a->row; 4577 PetscErrorCode ierr; 4578 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 4579 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4580 const MatScalar *aa=a->a,*v; 4581 PetscScalar *x,s1,s2,x1,x2,*t; 4582 const PetscScalar *b; 4583 4584 PetscFunctionBegin; 4585 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4586 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4587 t = a->solve_work; 4588 4589 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4590 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4591 4592 /* forward solve the lower triangular */ 4593 idx = 2*(*r++); 4594 t[0] = b[idx]; t[1] = b[1+idx]; 4595 for (i=1; i<n; i++) { 4596 v = aa + 4*ai[i]; 4597 vi = aj + ai[i]; 4598 nz = diag[i] - ai[i]; 4599 idx = 2*(*r++); 4600 s1 = b[idx]; s2 = b[1+idx]; 4601 while (nz--) { 4602 idx = 2*(*vi++); 4603 x1 = t[idx]; x2 = t[1+idx]; 4604 s1 -= v[0]*x1 + v[2]*x2; 4605 s2 -= v[1]*x1 + v[3]*x2; 4606 v += 4; 4607 } 4608 idx = 2*i; 4609 t[idx] = s1; t[1+idx] = s2; 4610 } 4611 /* backward solve the upper triangular */ 4612 for (i=n-1; i>=0; i--){ 4613 v = aa + 4*diag[i] + 4; 4614 vi = aj + diag[i] + 1; 4615 nz = ai[i+1] - diag[i] - 1; 4616 idt = 2*i; 4617 s1 = t[idt]; s2 = t[1+idt]; 4618 while (nz--) { 4619 idx = 2*(*vi++); 4620 x1 = t[idx]; x2 = t[1+idx]; 4621 s1 -= v[0]*x1 + v[2]*x2; 4622 s2 -= v[1]*x1 + v[3]*x2; 4623 v += 4; 4624 } 4625 idc = 2*(*c--); 4626 v 
= aa + 4*diag[i]; 4627 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4628 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 4629 } 4630 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4631 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4632 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4633 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4634 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4635 PetscFunctionReturn(0); 4636 } 4637 4638 #if defined(OLD_ROUTINE_TO_BE_REPLACED) 4639 #undef __FUNCT__ 4640 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct" 4641 PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx) 4642 { 4643 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4644 IS iscol=a->col,isrow=a->row; 4645 PetscErrorCode ierr; 4646 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,jdx,idt,idc,k,m; 4647 const PetscInt *r,*c,*rout,*cout; 4648 const MatScalar *aa=a->a,*v; 4649 PetscScalar *x,s1,s2,x1,x2,*t; 4650 const PetscScalar *b; 4651 4652 PetscFunctionBegin; 4653 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4654 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4655 t = a->solve_work; 4656 4657 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4658 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4659 4660 /* forward solve the lower triangular */ 4661 idx = 2*r[0]; 4662 t[0] = b[idx]; t[1] = b[1+idx]; 4663 for (i=1; i<n; i++) { 4664 v = aa + 4*ai[i]; 4665 vi = aj + ai[i]; 4666 nz = ai[i+1] - ai[i]; 4667 idx = 2*r[i]; 4668 s1 = b[idx]; s2 = b[1+idx]; 4669 for(m=0;m<nz;m++){ 4670 jdx = 2*vi[m]; 4671 x1 = t[jdx]; x2 = t[1+jdx]; 4672 s1 -= v[0]*x1 + v[2]*x2; 4673 s2 -= v[1]*x1 + v[3]*x2; 4674 v += 4; 4675 } 4676 idx = 2*i; 4677 t[idx] = s1; t[1+idx] = s2; 4678 } 4679 /* backward solve the upper triangular */ 4680 for (i=n-1; i>=0; i--){ 4681 k = 2*n-i; 4682 v = aa + 4*ai[k]; 4683 vi = aj + ai[k]; 4684 nz = ai[k +1] - ai[k] - 1; 4685 idt = 2*i; 4686 s1 = t[idt]; s2 = t[1+idt]; 4687 for(m=0;m<nz;m++){ 4688 idx = 2*vi[m]; 
4689 x1 = t[idx]; x2 = t[1+idx]; 4690 s1 -= v[0]*x1 + v[2]*x2; 4691 s2 -= v[1]*x1 + v[3]*x2; 4692 v += 4; 4693 } 4694 idc = 2*c[i]; 4695 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4696 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 4697 } 4698 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4699 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4700 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4701 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4702 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4703 PetscFunctionReturn(0); 4704 }
#endif

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct"
/*
   MatSolve_SeqBAIJ_2_newdatastruct - block size 2 solve with row and column
   index sets, for the factor layout in which a->i delimits the lower
   triangular rows only and a->diag delimits the upper triangular rows.

   For block row i the strictly upper blocks start at position diag[i+1]+1,
   there are diag[i]-diag[i+1]-1 of them, and the block that immediately
   follows them is applied as the diagonal multiplier.

   Input Parameters:
+  A  - the factored matrix
-  bb - the right-hand side

   Output Parameter:
.  xx - the solution
*/
PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ *)A->data;
  IS                colperm = fact->col,rowperm = fact->row;
  PetscErrorCode    ierr;
  PetscInt          nblk = fact->mbs,*rowptr = fact->i,*colind = fact->j;
  PetscInt          *dpos = fact->diag;
  PetscInt          row,k,cnt,*col,off,coff,xoff;
  const PetscInt    *rout,*cout;
  const MatScalar   *blocks = fact->a,*blk;
  PetscScalar       *x,*work,r1,r2,y1,y2;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  work = fact->solve_work;

  ierr = ISGetIndices(rowperm,&rout);CHKERRQ(ierr);
  ierr = ISGetIndices(colperm,&cout);CHKERRQ(ierr);

  /* forward solve the lower triangular; block row 0 is only gathered */
  off     = 2*rout[0];
  work[0] = b[off];
  work[1] = b[1+off];
  for (row=1; row<nblk; row++) {
    blk = blocks + 4*rowptr[row];
    col = colind + rowptr[row];
    cnt = rowptr[row+1] - rowptr[row];
    off = 2*rout[row];
    r1  = b[off];
    r2  = b[1+off];
    for (k=0; k<cnt; k++) {
      coff = 2*col[k];
      y1   = work[coff];
      y2   = work[1+coff];
      r1  -= blk[0]*y1 + blk[2]*y2;
      r2  -= blk[1]*y1 + blk[3]*y2;
      blk += 4;
    }
    off         = 2*row;
    work[off]   = r1;
    work[1+off] = r2;
  }

  /* backward solve the upper triangular */
  for (row=nblk-1; row>=0; row--) {
    blk = blocks + 4*(dpos[row+1]+1);
    col = colind + dpos[row+1]+1;
    cnt = dpos[row] - dpos[row+1] - 1;
    off = 2*row;
    r1  = work[off];
    r2  = work[1+off];
    for (k=0; k<cnt; k++) {
      coff = 2*col[k];
      y1   = work[coff];
      y2   = work[1+coff];
      r1  -= blk[0]*y1 + blk[2]*y2;
      r2  -= blk[1]*y1 + blk[3]*y2;
      blk += 4;
    }
    /* blk has advanced past the off-diagonal blocks and now addresses the diagonal block */
    xoff        = 2*cout[row];
    work[off]   = blk[0]*r1 + blk[2]*r2;
    x[xoff]     = work[off];
    work[1+off] = blk[1]*r1 + blk[3]*r2;
    x[1+xoff]   = work[1+off];
  }

  ierr = ISRestoreIndices(rowperm,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(colperm,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*4*(fact->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
      Special case where the matrix was ILU(0) factored in the natural
   ordering, so that neither the row nor the column permutation has to be
   applied.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
/*
   MatSolve_SeqBAIJ_2_NaturalOrdering - block size 2 solve performed directly
   in the array of xx, with no index sets and no separate work array.

   Input Parameters:
+  A  - the factored matrix; diag[i] locates the diagonal block of block row i,
        which is applied as a multiplier
-  bb - the right-hand side

   Output Parameter:
.  xx - the solution
*/
PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ *)A->data;
  PetscErrorCode    ierr;
  PetscInt          nblk = fact->mbs,*rowptr = fact->i,*colind = fact->j;
  PetscInt          *dpos = fact->diag;
  PetscInt          row,cnt,*col,off,coff;
  const MatScalar   *blocks = fact->a,*blk;
  PetscScalar       *x,r1,r2,y1,y2;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the lower triangular; block row 0 is copied straight from b */
  x[0] = b[0];
  x[1] = b[1];
  for (row=1; row<nblk; row++) {
    off = 2*row;
    blk = blocks + 4*rowptr[row];
    col = colind + rowptr[row];
    r1  = b[off];
    r2  = b[1+off];
    for (cnt=dpos[row]-rowptr[row]; cnt; cnt--) {
      coff = 2*(*col++);
      y1   = x[coff];
      y2   = x[1+coff];
      r1  -= blk[0]*y1 + blk[2]*y2;
      r2  -= blk[1]*y1 + blk[3]*y2;
      blk += 4;
    }
    x[off]   = r1;
    x[1+off] = r2;
  }

  /* backward solve the upper triangular */
  for (row=nblk-1; row>=0; row--) {
    off = 2*row;
    blk = blocks + 4*dpos[row] + 4;
    col = colind + dpos[row] + 1;
    r1  = x[off];
    r2  = x[1+off];
    for (cnt=rowptr[row+1]-dpos[row]-1; cnt; cnt--) {
      coff = 2*(*col++);
      y1   = x[coff];
      y2   = x[1+coff];
      r1  -= blk[0]*y1 + blk[2]*y2;
      r2  -= blk[1]*y1 + blk[3]*y2;
      blk += 4;
    }
    /* apply the stored diagonal block */
    blk      = blocks + 4*dpos[row];
    x[off]   = blk[0]*r1 + blk[2]*r2;
    x[1+off] = blk[1]*r1 + blk[3]*r2;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*4*(fact->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#if defined(OLD_ROUTINE_TO_BE_REPLACED) 4839 #undef __FUNCT__ 4840 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct" 4841 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4842 { 4843 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4844 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 4845 PetscErrorCode ierr; 4846 PetscInt jdx; 4847 const MatScalar *aa=a->a,*v; 4848 PetscScalar *x,s1,s2,x1,x2; 4849 const PetscScalar *b; 4850 4851 PetscFunctionBegin; 4852 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4853 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4854 /* forward solve the lower triangular */ 4855 idx = 0; 4856 x[0] = b[idx]; x[1] = b[1+idx]; 4857 for (i=1; i<n; i++) { 4858 v = aa + 4*ai[i]; 4859 vi = aj + ai[i]; 4860 nz = ai[i+1] - ai[i]; 4861 idx = 2*i; 4862 s1 = b[idx];s2 = b[1+idx]; 4863 for(k=0;k<nz;k++){ 4864 jdx = 2*vi[k]; 4865 x1 = x[jdx];x2 = x[1+jdx]; 4866 s1 -= v[0]*x1 + v[2]*x2; 4867 s2 -= v[1]*x1 + v[3]*x2; 4868 v += 4; 4869 } 4870 x[idx] = s1; 4871 x[1+idx] = s2; 4872 } 4873 4874 /* backward solve the upper triangular */ 4875 for (i=n-1; i>=0; i--){ 4876 v = aa + 4*ai[2*n-i]; 4877 vi = aj + ai[2*n-i]; 4878 nz = ai[2*n-i +1] - ai[2*n-i]-1; 4879 idt = 2*i; 4880 s1 = x[idt]; s2 = x[1+idt]; 4881 for(k=0;k<nz;k++){ 4882 idx = 2*vi[k]; 4883 x1 = x[idx]; x2 = x[1+idx]; 4884 s1 -= v[0]*x1 + v[2]*x2; 4885 s2 -= v[1]*x1 + v[3]*x2; 4886 v += 4; 4887 }
4888 /* x = inv_diagonal*x */ 4889 x[idt] = v[0]*s1 + v[2]*s2; 4890 x[1+idt] = v[1]*s1 + v[3]*s2; 4891 } 4892 4893 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4894 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4895 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4896 PetscFunctionReturn(0); 4897 } 4898 #endif 4899 4900 #undef __FUNCT__ 4901 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct" 4902 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4903 { 4904 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4905 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 4906 PetscErrorCode ierr; 4907 PetscInt jdx; 4908 const MatScalar *aa=a->a,*v; 4909 PetscScalar *x,s1,s2,x1,x2; 4910 const PetscScalar *b; 4911 4912 PetscFunctionBegin; 4913 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4914 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4915 /* forward solve the lower triangular */ 4916 idx = 0; 4917 x[0] = b[idx]; x[1] = b[1+idx]; 4918 for (i=1; i<n; i++) { 4919 v = aa + 4*ai[i]; 4920 vi = aj + ai[i]; 4921 nz = ai[i+1] - ai[i]; 4922 idx = 2*i; 4923 s1 = b[idx];s2 = b[1+idx]; 4924 for(k=0;k<nz;k++){ 4925 jdx = 2*vi[k]; 4926 x1 = x[jdx];x2 = x[1+jdx]; 4927 s1 -= v[0]*x1 + v[2]*x2; 4928 s2 -= v[1]*x1 + v[3]*x2; 4929 v += 4; 4930 } 4931 x[idx] = s1; 4932 x[1+idx] = s2; 4933 } 4934 4935 /* backward solve the upper triangular */ 4936 for (i=n-1; i>=0; i--){ 4937 v = aa + 4*(adiag[i+1]+1); 4938 vi = aj + adiag[i+1]+1; 4939 nz = adiag[i] - adiag[i+1]-1; 4940 idt = 2*i; 4941 s1 = x[idt]; s2 = x[1+idt]; 4942 for(k=0;k<nz;k++){ 4943 idx = 2*vi[k]; 4944 x1 = x[idx]; x2 = x[1+idx]; 4945 s1 -= v[0]*x1 + v[2]*x2; 4946 s2 -= v[1]*x1 + v[3]*x2; 4947 v += 4; 4948 } 4949 /* x = inv_diagonal*x */ 4950 x[idt] = v[0]*s1 + v[2]*s2; 4951 x[1+idt] = v[1]*s1 + v[3]*s2; 4952 } 4953 4954 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4955 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4956 
ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4957 PetscFunctionReturn(0); 4958 } 4959 4960 #undef __FUNCT__ 4961 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 4962 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 4963 { 4964 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4965 IS iscol=a->col,isrow=a->row; 4966 PetscErrorCode ierr; 4967 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 4968 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4969 MatScalar *aa=a->a,*v; 4970 PetscScalar *x,*b,s1,*t; 4971 4972 PetscFunctionBegin; 4973 if (!n) PetscFunctionReturn(0); 4974 4975 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4976 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4977 t = a->solve_work; 4978 4979 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4980 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4981 4982 /* forward solve the lower triangular */ 4983 t[0] = b[*r++]; 4984 for (i=1; i<n; i++) { 4985 v = aa + ai[i]; 4986 vi = aj + ai[i]; 4987 nz = diag[i] - ai[i]; 4988 s1 = b[*r++]; 4989 while (nz--) { 4990 s1 -= (*v++)*t[*vi++]; 4991 } 4992 t[i] = s1; 4993 } 4994 /* backward solve the upper triangular */ 4995 for (i=n-1; i>=0; i--){ 4996 v = aa + diag[i] + 1; 4997 vi = aj + diag[i] + 1; 4998 nz = ai[i+1] - diag[i] - 1; 4999 s1 = t[i]; 5000 while (nz--) { 5001 s1 -= (*v++)*t[*vi++]; 5002 } 5003 x[*c--] = t[i] = aa[diag[i]]*s1; 5004 } 5005 5006 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5007 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5008 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5009 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5010 ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 5011 PetscFunctionReturn(0); 5012 } 5013 /* 5014 Special case where the matrix was ILU(0) factored in the natural 5015 ordering. This eliminates the need for the column and row permutation. 
5016 */ 5017 #undef __FUNCT__ 5018 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 5019 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 5020 { 5021 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5022 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 5023 PetscErrorCode ierr; 5024 PetscInt *diag = a->diag; 5025 MatScalar *aa=a->a; 5026 PetscScalar *x,*b; 5027 PetscScalar s1,x1; 5028 MatScalar *v; 5029 PetscInt jdx,idt,idx,nz,*vi,i; 5030 5031 PetscFunctionBegin; 5032 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5033 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5034 5035 /* forward solve the lower triangular */ 5036 idx = 0; 5037 x[0] = b[0]; 5038 for (i=1; i<n; i++) { 5039 v = aa + ai[i]; 5040 vi = aj + ai[i]; 5041 nz = diag[i] - ai[i]; 5042 idx += 1; 5043 s1 = b[idx]; 5044 while (nz--) { 5045 jdx = *vi++; 5046 x1 = x[jdx]; 5047 s1 -= v[0]*x1; 5048 v += 1; 5049 } 5050 x[idx] = s1; 5051 } 5052 /* backward solve the upper triangular */ 5053 for (i=n-1; i>=0; i--){ 5054 v = aa + diag[i] + 1; 5055 vi = aj + diag[i] + 1; 5056 nz = ai[i+1] - diag[i] - 1; 5057 idt = i; 5058 s1 = x[idt]; 5059 while (nz--) { 5060 idx = *vi++; 5061 x1 = x[idx]; 5062 s1 -= v[0]*x1; 5063 v += 1; 5064 } 5065 v = aa + diag[i]; 5066 x[idt] = v[0]*s1; 5067 } 5068 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5069 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5070 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 5071 PetscFunctionReturn(0); 5072 } 5073 5074 /* ----------------------------------------------------------------*/ 5075 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 5076 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 5077 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization_newdatastruct(Mat,PetscTruth); 5078 5079 #undef __FUNCT__ 5080 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 5081 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 5082 { 
5083 Mat C=B; 5084 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 5085 IS isrow = b->row,isicol = b->icol; 5086 PetscErrorCode ierr; 5087 const PetscInt *r,*ic,*ics; 5088 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 5089 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 5090 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 5091 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 5092 MatScalar *v_work; 5093 PetscTruth col_identity,row_identity,both_identity; 5094 5095 PetscFunctionBegin; 5096 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5097 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5098 5099 ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 5100 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 5101 ics = ic; 5102 5103 /* generate work space needed by dense LU factorization */ 5104 ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 5105 5106 for (i=0; i<n; i++){ 5107 /* zero rtmp */ 5108 /* L part */ 5109 nz = bi[i+1] - bi[i]; 5110 bjtmp = bj + bi[i]; 5111 for (j=0; j<nz; j++){ 5112 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5113 } 5114 5115 /* U part */ 5116 nz = bdiag[i] - bdiag[i+1]; 5117 bjtmp = bj + bdiag[i+1]+1; 5118 for (j=0; j<nz; j++){ 5119 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5120 } 5121 5122 /* load in initial (unfactored row) */ 5123 nz = ai[r[i]+1] - ai[r[i]]; 5124 ajtmp = aj + ai[r[i]]; 5125 v = aa + bs2*ai[r[i]]; 5126 for (j=0; j<nz; j++) { 5127 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 5128 } 5129 5130 /* elimination */ 5131 bjtmp = bj + bi[i]; 5132 nzL = bi[i+1] - bi[i]; 5133 for(k=0;k < nzL;k++) { 5134 row = bjtmp[k]; 5135 pc = rtmp + bs2*row; 5136 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 5137 if (flg) { 5138 pv = b->a + bs2*bdiag[row]; 5139 Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = 
*pc * (*pv); */ 5140 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 5141 pv = b->a + bs2*(bdiag[row+1]+1); 5142 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 5143 for (j=0; j<nz; j++) { 5144 Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 5145 } 5146 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 5147 } 5148 } 5149 5150 /* finished row so stick it into b->a */ 5151 /* L part */ 5152 pv = b->a + bs2*bi[i] ; 5153 pj = b->j + bi[i] ; 5154 nz = bi[i+1] - bi[i]; 5155 for (j=0; j<nz; j++) { 5156 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5157 } 5158 5159 /* Mark diagonal and invert diagonal for simplier triangular solves */ 5160 pv = b->a + bs2*bdiag[i]; 5161 pj = b->j + bdiag[i]; 5162 /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 5163 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5164 ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 5165 5166 /* U part */ 5167 pv = b->a + bs2*(bdiag[i+1]+1); 5168 pj = b->j + bdiag[i+1]+1; 5169 nz = bdiag[i] - bdiag[i+1] - 1; 5170 for (j=0; j<nz; j++){ 5171 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5172 } 5173 } 5174 5175 ierr = PetscFree(rtmp);CHKERRQ(ierr); 5176 ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 5177 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5178 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5179 5180 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5181 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 5182 both_identity = (PetscTruth) (row_identity && col_identity); 5183 if (both_identity){ 5184 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 5185 } else { 5186 C->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct; 5187 } 5188 5189 C->assembled = PETSC_TRUE; 5190 ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); 
/* from inverting diagonal blocks */ 5191 PetscFunctionReturn(0); 5192 } 5193 5194 /* 5195 ilu(0) with natural ordering under new data structure. 5196 See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description 5197 because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct(). 5198 */ 5199 5200 #undef __FUNCT__ 5201 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 5202 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5203 { 5204 5205 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5206 PetscErrorCode ierr; 5207 PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 5208 PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 5209 5210 PetscFunctionBegin; 5211 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 5212 b = (Mat_SeqBAIJ*)(fact)->data; 5213 5214 /* allocate matrix arrays for new data structure */ 5215 ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 5216 ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5217 b->singlemalloc = PETSC_TRUE; 5218 if (!b->diag){ 5219 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 5220 ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5221 } 5222 bdiag = b->diag; 5223 5224 if (n > 0) { 5225 ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 5226 } 5227 5228 /* set bi and bj with new data structure */ 5229 bi = b->i; 5230 bj = b->j; 5231 5232 /* L part */ 5233 bi[0] = 0; 5234 for (i=0; i<n; i++){ 5235 nz = adiag[i] - ai[i]; 5236 bi[i+1] = bi[i] + nz; 5237 aj = a->j + ai[i]; 5238 for (j=0; j<nz; j++){ 5239 *bj = aj[j]; bj++; 5240 } 5241 } 5242 5243 /* U part */ 5244 bi_temp = bi[n]; 5245 bdiag[n] = bi[n]-1; 5246 for (i=n-1; i>=0; i--){ 5247 nz = ai[i+1] - adiag[i] - 1; 5248 
bi_temp = bi_temp + nz + 1; 5249 aj = a->j + adiag[i] + 1; 5250 for (j=0; j<nz; j++){ 5251 *bj = aj[j]; bj++; 5252 } 5253 /* diag[i] */ 5254 *bj = i; bj++; 5255 bdiag[i] = bi_temp - 1; 5256 } 5257 PetscFunctionReturn(0); 5258 } 5259 5260 #undef __FUNCT__ 5261 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct" 5262 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5263 { 5264 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5265 IS isicol; 5266 PetscErrorCode ierr; 5267 const PetscInt *r,*ic; 5268 PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 5269 PetscInt *bi,*cols,nnz,*cols_lvl; 5270 PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 5271 PetscInt i,levels,diagonal_fill; 5272 PetscTruth col_identity,row_identity,both_identity; 5273 PetscReal f; 5274 PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 5275 PetscBT lnkbt; 5276 PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 5277 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 5278 PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 5279 PetscTruth missing; 5280 PetscInt bs=A->rmap->bs,bs2=a->bs2; 5281 5282 PetscFunctionBegin; 5283 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 5284 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 5285 if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 5286 5287 f = info->fill; 5288 levels = (PetscInt)info->levels; 5289 diagonal_fill = (PetscInt)info->diagonal_fill; 5290 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 5291 5292 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5293 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5294 both_identity = (PetscTruth) (row_identity && col_identity); 5295 5296 if (!levels && both_identity) { 5297 /* special case: ilu(0) with natural ordering */ 5298 ierr = 
MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5299 ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr); 5300 5301 fact->factor = MAT_FACTOR_ILU; 5302 (fact)->info.factor_mallocs = 0; 5303 (fact)->info.fill_ratio_given = info->fill; 5304 (fact)->info.fill_ratio_needed = 1.0; 5305 b = (Mat_SeqBAIJ*)(fact)->data; 5306 b->row = isrow; 5307 b->col = iscol; 5308 b->icol = isicol; 5309 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5310 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5311 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5312 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5313 PetscFunctionReturn(0); 5314 } 5315 5316 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5317 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5318 5319 /* get new row pointers */ 5320 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 5321 bi[0] = 0; 5322 /* bdiag is location of diagonal in factor */ 5323 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 5324 bdiag[0] = 0; 5325 5326 ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 5327 5328 /* create a linked list for storing column indices of the active row */ 5329 nlnk = n + 1; 5330 ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5331 5332 /* initial FreeSpace size is f*(ai[n]+1) */ 5333 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 5334 current_space = free_space; 5335 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 5336 current_space_lvl = free_space_lvl; 5337 5338 for (i=0; i<n; i++) { 5339 nzi = 0; 5340 /* copy current row into linked list */ 5341 nnz = ai[r[i]+1] - ai[r[i]]; 5342 if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 5343 cols = aj + 
ai[r[i]]; 5344 lnk[i] = -1; /* marker to indicate if diagonal exists */ 5345 ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5346 nzi += nlnk; 5347 5348 /* make sure diagonal entry is included */ 5349 if (diagonal_fill && lnk[i] == -1) { 5350 fm = n; 5351 while (lnk[fm] < i) fm = lnk[fm]; 5352 lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 5353 lnk[fm] = i; 5354 lnk_lvl[i] = 0; 5355 nzi++; dcount++; 5356 } 5357 5358 /* add pivot rows into the active row */ 5359 nzbd = 0; 5360 prow = lnk[n]; 5361 while (prow < i) { 5362 nnz = bdiag[prow]; 5363 cols = bj_ptr[prow] + nnz + 1; 5364 cols_lvl = bjlvl_ptr[prow] + nnz + 1; 5365 nnz = bi[prow+1] - bi[prow] - nnz - 1; 5366 ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 5367 nzi += nlnk; 5368 prow = lnk[prow]; 5369 nzbd++; 5370 } 5371 bdiag[i] = nzbd; 5372 bi[i+1] = bi[i] + nzi; 5373 5374 /* if free space is not available, make more free space */ 5375 if (current_space->local_remaining<nzi) { 5376 nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 5377 ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 5378 ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 5379 reallocs++; 5380 } 5381 5382 /* copy data into free_space and free_space_lvl, then initialize lnk */ 5383 ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 5384 bj_ptr[i] = current_space->array; 5385 bjlvl_ptr[i] = current_space_lvl->array; 5386 5387 /* make sure the active row i has diagonal entry */ 5388 if (*(bj_ptr[i]+bdiag[i]) != i) { 5389 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 5390 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 5391 } 5392 5393 current_space->array += nzi; 5394 current_space->local_used += nzi; 5395 current_space->local_remaining -= nzi; 5396 current_space_lvl->array += nzi; 
5397 current_space_lvl->local_used += nzi; 5398 current_space_lvl->local_remaining -= nzi; 5399 } 5400 5401 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5402 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5403 5404 /* destroy list of free space and other temporary arrays */ 5405 ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 5406 5407 /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 5408 ierr = PetscFreeSpaceContiguous_LU_v2(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 5409 5410 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 5411 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 5412 ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 5413 5414 #if defined(PETSC_USE_INFO) 5415 { 5416 PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 5417 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 5418 ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5419 ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 5420 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5421 if (diagonal_fill) { 5422 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 5423 } 5424 } 5425 #endif 5426 5427 /* put together the new matrix */ 5428 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5429 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5430 b = (Mat_SeqBAIJ*)(fact)->data; 5431 b->free_a = PETSC_TRUE; 5432 b->free_ij = PETSC_TRUE; 5433 b->singlemalloc = PETSC_FALSE; 5434 ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 5435 b->j = bj; 5436 b->i = bi; 5437 b->diag = bdiag; 5438 b->free_diag = PETSC_TRUE; 5439 b->ilen = 0; 5440 b->imax = 0; 5441 b->row = isrow; 5442 b->col = iscol; 5443 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5444 ierr = 
PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5445 b->icol = isicol; 5446 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5447 /* In b structure: Free imax, ilen, old a, old j. 5448 Allocate bdiag, solve_work, new a, new j */ 5449 ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 5450 b->maxnz = b->nz = bdiag[0]+1; 5451 fact->info.factor_mallocs = reallocs; 5452 fact->info.fill_ratio_given = f; 5453 fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 5454 ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr); 5455 PetscFunctionReturn(0); 5456 } 5457 5458 5459 /* 5460 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 5461 except that the data structure of Mat_SeqAIJ is slightly different. 5462 Not a good example of code reuse. 5463 */ 5464 #undef __FUNCT__ 5465 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 5466 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5467 { 5468 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5469 IS isicol; 5470 PetscErrorCode ierr; 5471 const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 5472 PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 5473 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 5474 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 5475 PetscTruth col_identity,row_identity,both_identity,flg; 5476 PetscReal f; 5477 PetscTruth newdatastruct = PETSC_FALSE; 5478 5479 PetscFunctionBegin; 5480 ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 5481 if (newdatastruct){ 5482 ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5483 PetscFunctionReturn(0); 5484 } 5485 5486 ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 5487 if (flg) 
SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 5488 5489 f = info->fill; 5490 levels = (PetscInt)info->levels; 5491 diagonal_fill = (PetscInt)info->diagonal_fill; 5492 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 5493 5494 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5495 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5496 both_identity = (PetscTruth) (row_identity && col_identity); 5497 5498 if (!levels && both_identity) { /* special case copy the nonzero structure */ 5499 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 5500 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5501 5502 fact->factor = MAT_FACTOR_ILU; 5503 b = (Mat_SeqBAIJ*)fact->data; 5504 b->row = isrow; 5505 b->col = iscol; 5506 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5507 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5508 b->icol = isicol; 5509 b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 5510 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5511 PetscFunctionReturn(0); 5512 } 5513 5514 /* general case perform the symbolic factorization */ 5515 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5516 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5517 5518 /* get new row pointers */ 5519 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 5520 ainew[0] = 0; 5521 /* don't know how many column pointers are needed so estimate */ 5522 jmax = (PetscInt)(f*ai[n] + 1); 5523 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 5524 /* ajfill is level of fill for each fill entry */ 5525 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 5526 /* fill is a linked list of nonzeros in active row */ 5527 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 5528 /* im is level for each filled value */ 5529 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 5530 /* dloc is location of diagonal in factor */ 5531 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 5532 dloc[0] = 0; 5533 for (prow=0; prow<n; prow++) { 5534 5535 /* copy prow into linked list */ 5536 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 5537 if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 5538 xi = aj + ai[r[prow]]; 5539 fill[n] = n; 5540 fill[prow] = -1; /* marker for diagonal entry */ 5541 while (nz--) { 5542 fm = n; 5543 idx = ic[*xi++]; 5544 do { 5545 m = fm; 5546 fm = fill[m]; 5547 } while (fm < idx); 5548 fill[m] = idx; 5549 fill[idx] = fm; 5550 im[idx] = 0; 5551 } 5552 5553 /* make sure diagonal entry is included */ 5554 if (diagonal_fill && fill[prow] == -1) { 5555 fm = n; 5556 while (fill[fm] < prow) fm = fill[fm]; 5557 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 5558 fill[fm] = prow; 5559 im[prow] = 0; 5560 nzf++; 5561 dcount++; 5562 } 5563 5564 nzi = 0; 5565 row = 
fill[n]; 5566 while (row < prow) { 5567 incrlev = im[row] + 1; 5568 nz = dloc[row]; 5569 xi = ajnew + ainew[row] + nz + 1; 5570 flev = ajfill + ainew[row] + nz + 1; 5571 nnz = ainew[row+1] - ainew[row] - nz - 1; 5572 fm = row; 5573 while (nnz-- > 0) { 5574 idx = *xi++; 5575 if (*flev + incrlev > levels) { 5576 flev++; 5577 continue; 5578 } 5579 do { 5580 m = fm; 5581 fm = fill[m]; 5582 } while (fm < idx); 5583 if (fm != idx) { 5584 im[idx] = *flev + incrlev; 5585 fill[m] = idx; 5586 fill[idx] = fm; 5587 fm = idx; 5588 nzf++; 5589 } else { 5590 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 5591 } 5592 flev++; 5593 } 5594 row = fill[row]; 5595 nzi++; 5596 } 5597 /* copy new filled row into permanent storage */ 5598 ainew[prow+1] = ainew[prow] + nzf; 5599 if (ainew[prow+1] > jmax) { 5600 5601 /* estimate how much additional space we will need */ 5602 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 5603 /* just double the memory each time */ 5604 PetscInt maxadd = jmax; 5605 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 5606 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 5607 jmax += maxadd; 5608 5609 /* allocate a longer ajnew and ajfill */ 5610 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 5611 ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5612 ierr = PetscFree(ajnew);CHKERRQ(ierr); 5613 ajnew = xitmp; 5614 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 5615 ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5616 ierr = PetscFree(ajfill);CHKERRQ(ierr); 5617 ajfill = xitmp; 5618 reallocate++; /* count how many reallocations are needed */ 5619 } 5620 xitmp = ajnew + ainew[prow]; 5621 flev = ajfill + ainew[prow]; 5622 dloc[prow] = nzi; 5623 fm = fill[n]; 5624 while (nzf--) { 5625 *xitmp++ = fm; 5626 *flev++ = im[fm]; 5627 fm = fill[fm]; 5628 } 5629 /* make sure row has diagonal entry */ 5630 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 5631 
SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 5632 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 5633 } 5634 } 5635 ierr = PetscFree(ajfill);CHKERRQ(ierr); 5636 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5637 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5638 ierr = PetscFree(fill);CHKERRQ(ierr); 5639 ierr = PetscFree(im);CHKERRQ(ierr); 5640 5641 #if defined(PETSC_USE_INFO) 5642 { 5643 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 5644 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 5645 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5646 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 5647 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5648 if (diagonal_fill) { 5649 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 5650 } 5651 } 5652 #endif 5653 5654 /* put together the new matrix */ 5655 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5656 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5657 b = (Mat_SeqBAIJ*)fact->data; 5658 b->free_a = PETSC_TRUE; 5659 b->free_ij = PETSC_TRUE; 5660 b->singlemalloc = PETSC_FALSE; 5661 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 5662 b->j = ajnew; 5663 b->i = ainew; 5664 for (i=0; i<n; i++) dloc[i] += ainew[i]; 5665 b->diag = dloc; 5666 b->free_diag = PETSC_TRUE; 5667 b->ilen = 0; 5668 b->imax = 0; 5669 b->row = isrow; 5670 b->col = iscol; 5671 b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 5672 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5673 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5674 b->icol = isicol; 5675 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5676 /* In b structure: Free imax, ilen, old a, old j. 5677 Allocate dloc, solve_work, new a, new j */ 5678 ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 5679 b->maxnz = b->nz = ainew[n]; 5680 5681 fact->info.factor_mallocs = reallocate; 5682 fact->info.fill_ratio_given = f; 5683 fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 5684 5685 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5686 PetscFunctionReturn(0); 5687 } 5688 5689 #undef __FUNCT__ 5690 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 5691 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 5692 { 5693 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 5694 /* int i,*AJ=a->j,nz=a->nz; */ 5695 PetscFunctionBegin; 5696 /* Undo Column scaling */ 5697 /* while (nz--) { */ 5698 /* AJ[i] = AJ[i]/4; */ 5699 /* } */ 5700 /* This should really invoke a push/pop logic, but we don't have that yet. */ 5701 A->ops->setunfactored = PETSC_NULL; 5702 PetscFunctionReturn(0); 5703 } 5704 5705 #undef __FUNCT__ 5706 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 5707 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 5708 { 5709 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5710 PetscInt *AJ=a->j,nz=a->nz; 5711 unsigned short *aj=(unsigned short *)AJ; 5712 PetscFunctionBegin; 5713 /* Is this really necessary? */ 5714 while (nz--) { 5715 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 5716 } 5717 A->ops->setunfactored = PETSC_NULL; 5718 PetscFunctionReturn(0); 5719 } 5720 5721 5722