1 #define PETSCMAT_DLL 2 3 /* 4 Factorization code for BAIJ format. 5 */ 6 7 #include "../src/mat/impls/baij/seq/baij.h" 8 #include "../src/mat/blockinvert.h" 9 10 #undef __FUNCT__ 11 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 12 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 13 { 14 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 15 PetscErrorCode ierr; 16 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 17 PetscInt *diag = a->diag; 18 MatScalar *aa=a->a,*v; 19 PetscScalar s1,*x,*b; 20 21 PetscFunctionBegin; 22 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 23 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 24 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 25 26 /* forward solve the U^T */ 27 for (i=0; i<n; i++) { 28 29 v = aa + diag[i]; 30 /* multiply by the inverse of the block diagonal */ 31 s1 = (*v++)*x[i]; 32 vi = aj + diag[i] + 1; 33 nz = ai[i+1] - diag[i] - 1; 34 while (nz--) { 35 x[*vi++] -= (*v++)*s1; 36 } 37 x[i] = s1; 38 } 39 /* backward solve the L^T */ 40 for (i=n-1; i>=0; i--){ 41 v = aa + diag[i] - 1; 42 vi = aj + diag[i] - 1; 43 nz = diag[i] - ai[i]; 44 s1 = x[i]; 45 while (nz--) { 46 x[*vi--] -= (*v--)*s1; 47 } 48 } 49 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 50 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 51 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 52 PetscFunctionReturn(0); 53 } 54 55 #undef __FUNCT__ 56 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 57 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 58 { 59 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 60 PetscErrorCode ierr; 61 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 62 PetscInt *diag = a->diag,oidx; 63 MatScalar *aa=a->a,*v; 64 PetscScalar s1,s2,x1,x2; 65 PetscScalar *x,*b; 66 67 PetscFunctionBegin; 68 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 69 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 70 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 71 72 /* forward solve the U^T */ 73 idx = 0; 74 for (i=0; i<n; i++) { 75 76 v = aa + 4*diag[i]; 77 /* multiply by the inverse of the block diagonal */ 78 x1 = x[idx]; x2 = x[1+idx]; 79 s1 = v[0]*x1 + v[1]*x2; 80 s2 = v[2]*x1 + v[3]*x2; 81 v += 4; 82 83 vi = aj + diag[i] + 1; 84 nz = ai[i+1] - diag[i] - 1; 85 while (nz--) { 86 oidx = 2*(*vi++); 87 x[oidx] -= v[0]*s1 + v[1]*s2; 88 x[oidx+1] -= v[2]*s1 + v[3]*s2; 89 v += 4; 90 } 91 x[idx] = s1;x[1+idx] = s2; 92 idx += 2; 93 } 94 /* backward solve the L^T */ 95 for (i=n-1; i>=0; i--){ 96 v = aa + 4*diag[i] - 4; 97 vi = aj + diag[i] - 1; 98 nz = diag[i] - ai[i]; 99 idt = 2*i; 100 s1 = x[idt]; s2 = x[1+idt]; 101 while (nz--) { 102 idx = 2*(*vi--); 103 x[idx] -= v[0]*s1 + v[1]*s2; 104 x[idx+1] -= v[2]*s1 + v[3]*s2; 105 v -= 4; 106 } 107 } 108 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 109 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 110 ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 111 PetscFunctionReturn(0); 112 } 113 114 #undef __FUNCT__ 115 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 116 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 117 { 118 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 119 PetscErrorCode ierr; 120 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 121 PetscInt *diag = a->diag,oidx; 122 MatScalar *aa=a->a,*v; 123 PetscScalar s1,s2,s3,x1,x2,x3; 124 PetscScalar *x,*b; 125 126 PetscFunctionBegin; 127 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 128 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 129 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 130 131 /* forward solve the U^T */ 132 idx = 0; 133 for (i=0; i<n; i++) { 134 135 v = aa + 9*diag[i]; 136 /* multiply by the inverse of the block diagonal */ 137 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 138 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 139 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 140 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 141 v += 9; 142 143 vi = aj + diag[i] + 1; 144 nz = ai[i+1] - diag[i] - 1; 145 while (nz--) { 146 oidx = 3*(*vi++); 147 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 148 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 149 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 150 v += 9; 151 } 152 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 153 idx += 3; 154 } 155 /* backward solve the L^T */ 156 for (i=n-1; i>=0; i--){ 157 v = aa + 9*diag[i] - 9; 158 vi = aj + diag[i] - 1; 159 nz = diag[i] - ai[i]; 160 idt = 3*i; 161 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 162 while (nz--) { 163 idx = 3*(*vi--); 164 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 165 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 166 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 167 v -= 9; 168 } 169 } 170 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 171 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 172 ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 173 PetscFunctionReturn(0); 174 } 175 176 #undef __FUNCT__ 177 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 178 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 179 { 180 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 181 PetscErrorCode ierr; 182 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 183 PetscInt *diag = a->diag,oidx; 184 MatScalar *aa=a->a,*v; 185 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 186 PetscScalar *x,*b; 187 188 PetscFunctionBegin; 189 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 190 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 191 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 192 193 /* forward solve the U^T */ 194 idx = 0; 195 for (i=0; i<n; i++) { 196 197 v = aa + 16*diag[i]; 198 /* multiply by the inverse of the block diagonal */ 199 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 200 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 201 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 202 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 203 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 204 v += 16; 205 206 vi = aj + diag[i] + 1; 207 nz = ai[i+1] - diag[i] - 1; 208 while (nz--) { 209 oidx = 4*(*vi++); 210 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 211 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 212 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 213 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 214 v += 16; 215 } 216 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 217 idx += 4; 218 } 219 /* backward solve the L^T */ 220 for (i=n-1; i>=0; i--){ 221 v = aa + 16*diag[i] - 16; 222 vi = aj + diag[i] - 1; 223 nz = diag[i] - ai[i]; 224 idt = 4*i; 225 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 226 while (nz--) { 227 idx = 4*(*vi--); 228 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 229 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 230 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 231 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 232 v -= 16; 233 } 234 } 235 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 236 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 237 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 238 PetscFunctionReturn(0); 239 } 240 241 #undef __FUNCT__ 242 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 243 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 244 { 245 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 246 PetscErrorCode ierr; 247 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 248 PetscInt *diag = a->diag,oidx; 249 MatScalar *aa=a->a,*v; 250 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 251 PetscScalar *x,*b; 252 253 PetscFunctionBegin; 254 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 255 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 256 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 257 258 /* forward solve the U^T */ 259 idx = 0; 260 for (i=0; i<n; i++) { 261 262 v = aa + 25*diag[i]; 263 /* multiply by the inverse of the block diagonal */ 264 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 265 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 266 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 267 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 268 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 269 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 270 v += 25; 271 272 vi = aj + diag[i] + 1; 273 nz = ai[i+1] - diag[i] - 1; 274 while (nz--) { 275 oidx = 5*(*vi++); 276 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 277 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 278 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 279 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 280 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 281 v += 25; 282 } 283 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 284 idx += 5; 285 } 286 /* backward solve the L^T */ 287 for (i=n-1; i>=0; i--){ 288 v = aa + 25*diag[i] - 25; 289 vi = aj + diag[i] - 1; 290 nz = diag[i] - ai[i]; 291 idt = 5*i; 292 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 293 while (nz--) { 294 idx = 5*(*vi--); 295 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 296 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 297 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 298 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 299 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 300 v -= 25; 301 } 302 } 303 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 304 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 305 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 306 PetscFunctionReturn(0); 307 } 308 309 #undef __FUNCT__ 310 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 311 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 312 { 313 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 314 PetscErrorCode ierr; 315 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 316 PetscInt *diag = a->diag,oidx; 317 MatScalar *aa=a->a,*v; 318 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 319 PetscScalar *x,*b; 320 321 PetscFunctionBegin; 322 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 323 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 324 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 325 326 /* forward solve the U^T */ 327 idx = 0; 328 for (i=0; i<n; i++) { 329 330 v = aa + 36*diag[i]; 331 /* multiply by the inverse of the block diagonal */ 332 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 333 x6 = x[5+idx]; 334 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 335 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 336 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 337 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 338 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 339 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 340 v += 36; 341 342 vi = aj + diag[i] + 1; 343 nz = ai[i+1] - diag[i] - 1; 344 while (nz--) { 345 oidx = 6*(*vi++); 346 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 347 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 348 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 349 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 350 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 351 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 352 v += 36; 353 } 354 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 355 x[5+idx] = s6; 356 idx += 6; 357 } 358 /* backward solve the L^T */ 359 for (i=n-1; i>=0; i--){ 360 v = aa + 36*diag[i] - 36; 361 vi = aj + diag[i] - 1; 362 nz = diag[i] - ai[i]; 363 idt = 6*i; 364 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 365 s6 = x[5+idt]; 366 while (nz--) { 367 idx = 6*(*vi--); 368 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 369 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 370 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 371 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 372 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 373 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 374 v -= 36; 375 } 376 } 377 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 378 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 379 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 380 PetscFunctionReturn(0); 381 } 382 383 #undef __FUNCT__ 384 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 385 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 386 { 387 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 388 PetscErrorCode ierr; 389 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 390 PetscInt *diag = a->diag,oidx; 391 MatScalar *aa=a->a,*v; 392 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 393 PetscScalar *x,*b; 394 395 PetscFunctionBegin; 396 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 397 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 398 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 399 400 /* forward solve the U^T */ 401 idx = 0; 402 for (i=0; i<n; i++) { 403 404 v = aa + 49*diag[i]; 405 /* multiply by the inverse of the block diagonal */ 406 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 407 x6 = x[5+idx]; x7 = x[6+idx]; 408 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 409 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 410 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 411 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 412 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 413 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 414 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 415 v += 49; 416 417 vi = aj + diag[i] + 1; 418 nz = ai[i+1] - diag[i] - 1; 419 while (nz--) { 420 oidx = 7*(*vi++); 421 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 422 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 423 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 424 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 425 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 426 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 427 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 428 v += 49; 429 } 430 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 431 x[5+idx] = s6;x[6+idx] = s7; 432 idx += 7; 433 } 434 /* backward solve the L^T */ 435 for (i=n-1; i>=0; i--){ 436 v = aa + 49*diag[i] - 49; 437 vi = aj + diag[i] - 1; 438 nz = diag[i] - ai[i]; 439 idt = 7*i; 440 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 441 s6 = x[5+idt];s7 = x[6+idt]; 442 while (nz--) { 443 idx = 7*(*vi--); 444 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 445 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 446 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 447 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 448 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 449 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 450 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 451 v -= 49; 452 } 453 } 454 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 455 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 456 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 457 PetscFunctionReturn(0); 458 } 459 460 /*---------------------------------------------------------------------------------------------*/ 461 #undef __FUNCT__ 462 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 463 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 464 { 465 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 466 IS iscol=a->col,isrow=a->row; 467 PetscErrorCode ierr; 468 const PetscInt *r,*c,*rout,*cout; 469 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 470 PetscInt *diag = a->diag; 471 MatScalar *aa=a->a,*v; 472 PetscScalar s1,*x,*b,*t; 473 474 PetscFunctionBegin; 475 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 476 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 477 t = a->solve_work; 478 479 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 480 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 481 482 /* copy the b into temp work space according to permutation */ 483 for (i=0; i<n; i++) { 484 t[i] = b[c[i]]; 485 } 486 487 /* forward solve the U^T */ 488 for (i=0; i<n; i++) { 489 490 v = aa + diag[i]; 491 /* multiply by the inverse of the block diagonal */ 492 s1 = (*v++)*t[i]; 493 vi = aj + diag[i] + 1; 494 nz = ai[i+1] - diag[i] - 1; 495 while (nz--) { 496 t[*vi++] -= (*v++)*s1; 497 } 498 t[i] = s1; 499 } 500 /* backward solve the L^T */ 501 for (i=n-1; i>=0; i--){ 502 v = aa + diag[i] - 1; 503 vi = aj + diag[i] - 1; 504 nz = diag[i] - ai[i]; 505 s1 = t[i]; 506 while (nz--) { 507 t[*vi--] -= (*v--)*s1; 508 } 509 } 510 511 /* copy t into x according to permutation */ 512 for (i=0; i<n; i++) { 513 x[r[i]] = t[i]; 514 } 515 516 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 517 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 518 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 519 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 520 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 521 PetscFunctionReturn(0); 522 } 523 524 #undef __FUNCT__ 525 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 526 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 527 { 528 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 529 IS iscol=a->col,isrow=a->row; 530 PetscErrorCode ierr; 531 const PetscInt *r,*c,*rout,*cout; 532 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 533 PetscInt *diag = a->diag,ii,ic,ir,oidx; 534 MatScalar *aa=a->a,*v; 535 PetscScalar s1,s2,x1,x2; 536 PetscScalar *x,*b,*t; 537 538 PetscFunctionBegin; 539 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 540 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 541 t = a->solve_work; 542 543 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 544 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 545 546 /* copy the b into temp work space according to permutation */ 547 ii = 0; 548 for (i=0; i<n; i++) { 549 ic = 2*c[i]; 550 t[ii] = b[ic]; 551 t[ii+1] = b[ic+1]; 552 ii += 2; 553 } 554 555 /* forward solve the U^T */ 556 idx = 0; 557 for (i=0; i<n; i++) { 558 559 v = aa + 4*diag[i]; 560 /* multiply by the inverse of the block diagonal */ 561 x1 = t[idx]; x2 = t[1+idx]; 562 s1 = v[0]*x1 + v[1]*x2; 563 s2 = v[2]*x1 + v[3]*x2; 564 v += 4; 565 566 vi = aj + diag[i] + 1; 567 nz = ai[i+1] - diag[i] - 1; 568 while (nz--) { 569 oidx = 2*(*vi++); 570 t[oidx] -= v[0]*s1 + v[1]*s2; 571 t[oidx+1] -= v[2]*s1 + v[3]*s2; 572 v += 4; 573 } 574 t[idx] = s1;t[1+idx] = s2; 575 idx += 2; 576 } 577 /* backward solve the L^T */ 578 for (i=n-1; i>=0; i--){ 579 v = aa + 4*diag[i] - 4; 580 vi = aj + diag[i] - 1; 581 nz = diag[i] - ai[i]; 582 idt = 2*i; 583 s1 = t[idt]; s2 = t[1+idt]; 584 while (nz--) { 585 idx = 2*(*vi--); 586 t[idx] -= v[0]*s1 + v[1]*s2; 587 t[idx+1] -= v[2]*s1 + v[3]*s2; 588 v -= 4; 589 } 590 } 591 592 /* copy t into x according to permutation */ 593 ii = 0; 594 for (i=0; i<n; i++) { 595 ir = 2*r[i]; 596 x[ir] = t[ii]; 597 x[ir+1] = t[ii+1]; 598 ii += 2; 599 } 600 601 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 602 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 603 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 604 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 605 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 606 PetscFunctionReturn(0); 607 } 608 609 #undef __FUNCT__ 610 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 611 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 612 { 613 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 614 IS iscol=a->col,isrow=a->row; 615 PetscErrorCode ierr; 616 const PetscInt *r,*c,*rout,*cout; 617 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 618 PetscInt *diag = a->diag,ii,ic,ir,oidx; 619 MatScalar *aa=a->a,*v; 620 PetscScalar s1,s2,s3,x1,x2,x3; 621 PetscScalar *x,*b,*t; 622 623 PetscFunctionBegin; 624 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 625 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 626 t = a->solve_work; 627 628 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 629 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 630 631 /* copy the b into temp work space according to permutation */ 632 ii = 0; 633 for (i=0; i<n; i++) { 634 ic = 3*c[i]; 635 t[ii] = b[ic]; 636 t[ii+1] = b[ic+1]; 637 t[ii+2] = b[ic+2]; 638 ii += 3; 639 } 640 641 /* forward solve the U^T */ 642 idx = 0; 643 for (i=0; i<n; i++) { 644 645 v = aa + 9*diag[i]; 646 /* multiply by the inverse of the block diagonal */ 647 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 648 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 649 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 650 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 651 v += 9; 652 653 vi = aj + diag[i] + 1; 654 nz = ai[i+1] - diag[i] - 1; 655 while (nz--) { 656 oidx = 3*(*vi++); 657 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 658 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 659 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 660 v += 9; 661 } 662 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 663 idx += 3; 664 } 665 /* backward solve the L^T */ 666 for (i=n-1; i>=0; i--){ 667 v = aa + 9*diag[i] - 9; 668 vi = aj + diag[i] - 1; 669 nz = diag[i] - ai[i]; 670 idt = 3*i; 671 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 672 while (nz--) { 673 idx = 3*(*vi--); 674 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 675 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 676 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 677 v -= 9; 678 } 679 } 680 681 /* copy t into x according to permutation */ 682 ii = 0; 683 for (i=0; i<n; i++) { 684 ir = 3*r[i]; 685 x[ir] = t[ii]; 686 x[ir+1] = t[ii+1]; 687 x[ir+2] = t[ii+2]; 688 ii += 3; 689 } 690 691 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 692 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 693 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 694 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 695 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 696 PetscFunctionReturn(0); 697 } 698 699 #undef __FUNCT__ 700 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 701 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 702 { 703 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 704 IS iscol=a->col,isrow=a->row; 705 PetscErrorCode ierr; 706 const PetscInt *r,*c,*rout,*cout; 707 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 708 PetscInt *diag = a->diag,ii,ic,ir,oidx; 709 MatScalar *aa=a->a,*v; 710 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 711 PetscScalar *x,*b,*t; 712 713 PetscFunctionBegin; 714 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 715 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 716 t = a->solve_work; 717 718 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 719 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 720 721 /* copy the b into temp work space according to permutation */ 722 ii = 0; 723 for (i=0; i<n; i++) { 724 ic = 4*c[i]; 725 t[ii] = b[ic]; 726 t[ii+1] = b[ic+1]; 727 t[ii+2] = b[ic+2]; 728 t[ii+3] = b[ic+3]; 729 ii += 4; 730 } 731 732 /* forward solve the U^T */ 733 idx = 0; 734 for (i=0; i<n; i++) { 735 736 v = aa + 16*diag[i]; 737 /* multiply by the inverse of the block diagonal */ 738 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 739 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 740 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 741 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 742 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 743 v += 16; 744 745 vi = aj + diag[i] + 1; 746 nz = ai[i+1] - diag[i] - 1; 747 while (nz--) { 748 oidx = 4*(*vi++); 749 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 750 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 751 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 752 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 753 v += 16; 754 } 755 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 756 idx += 4; 757 } 758 /* backward solve the L^T */ 759 for (i=n-1; i>=0; i--){ 760 v = aa + 16*diag[i] - 16; 761 vi = aj + diag[i] - 1; 762 nz = diag[i] - ai[i]; 763 idt = 4*i; 764 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 765 while (nz--) { 766 idx = 4*(*vi--); 767 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 768 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 769 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 770 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 771 v -= 16; 772 } 773 } 774 775 /* copy t into x according to permutation */ 776 ii = 0; 777 for (i=0; i<n; i++) { 778 ir = 4*r[i]; 779 x[ir] = t[ii]; 780 x[ir+1] = t[ii+1]; 781 x[ir+2] = t[ii+2]; 782 x[ir+3] = t[ii+3]; 783 ii += 4; 784 } 785 786 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 787 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 788 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 789 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 790 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 791 PetscFunctionReturn(0); 792 } 793 794 #undef __FUNCT__ 795 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 796 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 797 { 798 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 799 IS iscol=a->col,isrow=a->row; 800 PetscErrorCode ierr; 801 const PetscInt *r,*c,*rout,*cout; 802 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 803 PetscInt *diag = a->diag,ii,ic,ir,oidx; 804 MatScalar *aa=a->a,*v; 805 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 806 PetscScalar *x,*b,*t; 807 808 PetscFunctionBegin; 809 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 810 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 811 t = a->solve_work; 812 813 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 814 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 815 816 /* copy the b into temp work space according to permutation */ 817 ii = 0; 818 for (i=0; i<n; i++) { 819 ic = 5*c[i]; 820 t[ii] = b[ic]; 821 t[ii+1] = b[ic+1]; 822 t[ii+2] = b[ic+2]; 823 t[ii+3] = b[ic+3]; 824 t[ii+4] = b[ic+4]; 825 ii += 5; 826 } 827 828 /* forward solve the U^T */ 829 idx = 0; 830 for (i=0; i<n; i++) { 831 832 v = aa + 25*diag[i]; 833 /* multiply by the inverse of the block diagonal */ 834 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 835 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 836 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 837 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 838 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 839 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 840 v += 25; 841 842 vi = aj + diag[i] + 1; 843 nz = ai[i+1] - diag[i] - 1; 844 while (nz--) { 845 oidx = 5*(*vi++); 846 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 847 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 848 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 849 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 850 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 851 v += 25; 852 } 853 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 854 idx += 5; 855 } 856 /* backward solve the L^T */ 857 for (i=n-1; i>=0; i--){ 858 v = aa + 25*diag[i] - 25; 859 vi = aj + diag[i] - 1; 860 nz = diag[i] - ai[i]; 861 idt = 5*i; 862 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 863 while (nz--) { 864 idx = 5*(*vi--); 865 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 866 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 867 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 868 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 869 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 870 v -= 25; 871 } 872 } 873 874 /* copy t into x according to permutation */ 875 ii = 0; 876 for (i=0; i<n; i++) { 877 ir = 5*r[i]; 878 x[ir] = t[ii]; 879 x[ir+1] = t[ii+1]; 880 x[ir+2] = t[ii+2]; 881 x[ir+3] = t[ii+3]; 882 x[ir+4] = t[ii+4]; 883 ii += 5; 884 } 885 886 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 887 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 888 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 889 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 890 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 891 PetscFunctionReturn(0); 892 } 893 894 #undef __FUNCT__ 895 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 896 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 897 { 898 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 899 IS iscol=a->col,isrow=a->row; 900 PetscErrorCode ierr; 901 const PetscInt *r,*c,*rout,*cout; 902 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 903 PetscInt *diag = a->diag,ii,ic,ir,oidx; 904 MatScalar *aa=a->a,*v; 905 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 906 PetscScalar *x,*b,*t; 907 908 PetscFunctionBegin; 909 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 910 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 911 t = a->solve_work; 912 913 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 914 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 915 916 /* copy the b into temp work space according to permutation */ 917 ii = 0; 918 for (i=0; i<n; i++) { 919 ic = 6*c[i]; 920 t[ii] = b[ic]; 921 t[ii+1] = b[ic+1]; 922 t[ii+2] = b[ic+2]; 923 t[ii+3] = b[ic+3]; 924 t[ii+4] = b[ic+4]; 925 t[ii+5] = b[ic+5]; 926 ii += 6; 927 } 928 929 /* forward solve the U^T */ 930 idx = 0; 931 for (i=0; i<n; i++) { 932 933 v = aa + 36*diag[i]; 934 /* multiply by the inverse of the block diagonal */ 935 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 936 x6 = t[5+idx]; 937 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 938 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 939 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 940 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 941 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 942 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 943 v += 36; 944 945 vi = aj + diag[i] + 1; 946 nz = ai[i+1] - diag[i] - 1; 947 while (nz--) { 948 oidx = 6*(*vi++); 949 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 950 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 951 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 952 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 953 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 954 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 955 v += 36; 956 } 957 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 958 t[5+idx] = s6; 959 idx += 6; 960 } 961 /* backward solve the L^T */ 962 for (i=n-1; i>=0; i--){ 963 v = aa + 36*diag[i] - 36; 964 vi = aj + diag[i] - 1; 965 nz = diag[i] - ai[i]; 966 idt = 6*i; 967 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 968 s6 = t[5+idt]; 969 while (nz--) { 970 idx = 6*(*vi--); 971 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 972 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 973 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 974 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 975 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 976 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 977 v -= 36; 978 } 979 } 980 981 /* copy t into x according to permutation */ 982 ii = 0; 983 for (i=0; i<n; i++) { 984 ir = 6*r[i]; 985 x[ir] = t[ii]; 986 x[ir+1] = t[ii+1]; 987 x[ir+2] = t[ii+2]; 988 x[ir+3] = t[ii+3]; 989 x[ir+4] = t[ii+4]; 990 x[ir+5] = t[ii+5]; 991 ii += 6; 992 } 993 994 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 995 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 996 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 997 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 998 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 999 PetscFunctionReturn(0); 1000 } 1001 1002 #undef __FUNCT__ 1003 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1004 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1005 { 1006 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1007 IS iscol=a->col,isrow=a->row; 1008 PetscErrorCode ierr; 1009 const PetscInt *r,*c,*rout,*cout; 1010 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1011 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1012 MatScalar *aa=a->a,*v; 1013 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1014 PetscScalar *x,*b,*t; 1015 1016 PetscFunctionBegin; 1017 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1018 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1019 t = a->solve_work; 1020 1021 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1022 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1023 1024 /* copy the b into temp work space according to permutation */ 1025 ii = 0; 1026 for (i=0; i<n; i++) { 1027 ic = 7*c[i]; 1028 t[ii] = b[ic]; 1029 t[ii+1] = b[ic+1]; 1030 t[ii+2] = b[ic+2]; 1031 t[ii+3] = b[ic+3]; 1032 t[ii+4] = b[ic+4]; 1033 t[ii+5] = b[ic+5]; 1034 t[ii+6] = b[ic+6]; 1035 ii += 7; 1036 } 1037 1038 /* forward solve the U^T */ 1039 idx = 0; 1040 for (i=0; i<n; i++) { 1041 1042 v = aa + 49*diag[i]; 1043 /* multiply by the inverse of the block diagonal */ 1044 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1045 x6 = t[5+idx]; x7 = t[6+idx]; 1046 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1047 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1048 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1049 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1050 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1051 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1052 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1053 v += 49; 1054 1055 vi = aj + diag[i] + 1; 1056 nz = ai[i+1] - diag[i] - 1; 1057 while (nz--) { 1058 oidx = 7*(*vi++); 1059 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1060 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1061 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1062 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1063 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1064 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1065 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1066 v += 49; 1067 } 1068 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1069 t[5+idx] = s6;t[6+idx] = s7; 1070 idx += 7; 1071 } 1072 /* backward solve the L^T */ 1073 for (i=n-1; i>=0; i--){ 1074 v = aa + 49*diag[i] - 49; 1075 vi = aj + diag[i] - 1; 1076 nz = diag[i] - ai[i]; 1077 idt = 7*i; 1078 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1079 s6 = t[5+idt];s7 = t[6+idt]; 1080 while (nz--) { 1081 idx = 7*(*vi--); 1082 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1083 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1084 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1085 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1086 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1087 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1088 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1089 v -= 49; 1090 } 1091 } 1092 1093 /* copy t into x according to permutation */ 1094 ii = 0; 1095 for (i=0; i<n; i++) { 1096 ir = 7*r[i]; 1097 x[ir] = t[ii]; 1098 x[ir+1] = t[ii+1]; 1099 x[ir+2] = t[ii+2]; 1100 x[ir+3] = t[ii+3]; 1101 x[ir+4] = t[ii+4]; 1102 x[ir+5] = t[ii+5]; 1103 x[ir+6] = t[ii+6]; 1104 ii += 7; 1105 } 1106 1107 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1108 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1109 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1110 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1111 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1112 PetscFunctionReturn(0); 1113 } 1114 1115 /* ----------------------------------------------------------- */ 1116 #undef __FUNCT__ 1117 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1118 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1119 { 1120 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1121 IS iscol=a->col,isrow=a->row; 1122 PetscErrorCode ierr; 1123 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 1124 PetscInt i,n=a->mbs; 1125 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 1126 MatScalar *aa=a->a,*v; 1127 PetscScalar *x,*b,*s,*t,*ls; 1128 1129 PetscFunctionBegin; 1130 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1131 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1132 t = a->solve_work; 1133 1134 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1135 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1136 1137 /* forward solve the lower triangular */ 1138 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1139 for (i=1; i<n; i++) { 1140 v = aa + bs2*ai[i]; 1141 vi = aj + ai[i]; 1142 nz = a->diag[i] - ai[i]; 1143 s = t + bs*i; 1144 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1145 while (nz--) { 1146 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1147 v += bs2; 1148 } 1149 } 1150 /* backward solve the upper triangular */ 1151 ls = a->solve_work + A->cmap->n; 1152 for (i=n-1; i>=0; i--){ 1153 v = aa + bs2*(a->diag[i] + 1); 1154 vi = aj + a->diag[i] + 1; 1155 nz = ai[i+1] - a->diag[i] - 1; 1156 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1157 while (nz--) { 1158 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1159 v += bs2; 1160 } 1161 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1162 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1163 } 1164 1165 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1166 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1167 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1168 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1169 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 1170 PetscFunctionReturn(0); 1171 } 1172 1173 #undef __FUNCT__ 1174 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1175 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1176 { 1177 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1178 IS iscol=a->col,isrow=a->row; 1179 PetscErrorCode ierr; 1180 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 1181 PetscInt i,n=a->mbs,nz,idx,idt,idc; 1182 MatScalar *aa=a->a,*v; 1183 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1184 PetscScalar *x,*b,*t; 1185 1186 PetscFunctionBegin; 1187 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1188 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1189 t = a->solve_work; 1190 1191 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1192 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1193 1194 /* forward solve the lower triangular */ 1195 idx = 7*(*r++); 1196 t[0] = b[idx]; t[1] = b[1+idx]; 1197 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1198 t[5] = b[5+idx]; t[6] = b[6+idx]; 1199 1200 for (i=1; i<n; i++) { 1201 v = aa + 49*ai[i]; 1202 vi = aj + ai[i]; 1203 nz = diag[i] - ai[i]; 1204 idx = 7*(*r++); 1205 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1206 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1207 while (nz--) { 1208 idx = 7*(*vi++); 1209 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1210 x4 = t[3+idx];x5 = t[4+idx]; 1211 x6 = t[5+idx];x7 = t[6+idx]; 1212 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1213 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1214 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1215 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1216 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1217 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1218 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1219 v += 49; 1220 } 1221 idx = 7*i; 1222 t[idx] = s1;t[1+idx] = s2; 1223 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1224 t[5+idx] = s6;t[6+idx] = s7; 1225 } 1226 /* backward solve the upper triangular */ 1227 for (i=n-1; i>=0; i--){ 1228 v = aa + 49*diag[i] + 49; 1229 vi = aj + diag[i] + 1; 1230 nz = ai[i+1] - diag[i] - 1; 1231 idt = 7*i; 1232 s1 = t[idt]; s2 = t[1+idt]; 1233 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1234 s6 = t[5+idt];s7 = t[6+idt]; 1235 while (nz--) { 1236 idx = 7*(*vi++); 1237 x1 = t[idx]; x2 = t[1+idx]; 1238 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1239 x6 = t[5+idx]; x7 = t[6+idx]; 1240 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1241 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1242 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1243 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1244 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1245 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1246 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1247 v += 49; 1248 } 1249 idc = 7*(*c--); 1250 v = aa + 49*diag[i]; 1251 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1252 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1253 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1254 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1255 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1256 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1257 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1258 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1259 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1260 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1261 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1262 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1263 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1264 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1265 } 1266 1267 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1268 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1269 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1270 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1271 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1272 PetscFunctionReturn(0); 1273 } 1274 1275 #undef __FUNCT__ 1276 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1277 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1278 { 1279 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1280 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1281 PetscErrorCode ierr; 1282 PetscInt *diag = a->diag,jdx; 1283 const MatScalar *aa=a->a,*v; 1284 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1285 const PetscScalar *b; 1286 1287 PetscFunctionBegin; 1288 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1289 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1290 /* forward solve the lower triangular */ 1291 idx = 0; 1292 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1293 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1294 x[6] = b[6+idx]; 1295 for (i=1; i<n; i++) { 1296 v = aa + 49*ai[i]; 1297 vi = aj + ai[i]; 1298 nz = diag[i] - ai[i]; 1299 idx = 7*i; 1300 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1301 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1302 s7 = b[6+idx]; 1303 while (nz--) { 1304 jdx = 7*(*vi++); 1305 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1306 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1307 x7 = x[6+jdx]; 1308 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1309 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1310 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1311 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1312 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1313 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1314 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1315 v += 49; 1316 } 1317 x[idx] = s1; 1318 x[1+idx] = s2; 1319 x[2+idx] = s3; 1320 x[3+idx] = s4; 1321 x[4+idx] = s5; 1322 x[5+idx] = s6; 1323 x[6+idx] = s7; 1324 } 1325 /* backward solve the upper triangular */ 1326 for (i=n-1; i>=0; i--){ 1327 v = aa + 49*diag[i] + 49; 1328 vi = aj + diag[i] + 1; 1329 nz = ai[i+1] - diag[i] - 1; 1330 idt = 7*i; 1331 s1 = x[idt]; s2 = x[1+idt]; 1332 s3 = x[2+idt]; s4 = x[3+idt]; 1333 s5 = x[4+idt]; s6 = x[5+idt]; 1334 s7 = x[6+idt]; 1335 while (nz--) { 1336 idx = 7*(*vi++); 1337 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1338 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1339 x7 = x[6+idx]; 1340 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1341 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1342 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1343 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1344 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1345 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1346 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1347 v += 49; 1348 } 1349 v = aa + 49*diag[i]; 1350 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1351 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1352 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1353 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1354 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1355 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1356 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1357 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1358 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1359 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1360 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1361 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1362 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1363 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1364 } 1365 1366 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1367 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1368 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1369 PetscFunctionReturn(0); 1370 } 1371 1372 #undef __FUNCT__ 1373 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1374 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1375 { 1376 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1377 IS iscol=a->col,isrow=a->row; 1378 PetscErrorCode ierr; 1379 const PetscInt *r,*c,*rout,*cout; 1380 PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1381 const MatScalar *aa=a->a,*v; 1382 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1383 const PetscScalar *b; 1384 PetscFunctionBegin; 1385 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1386 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1387 t = a->solve_work; 1388 1389 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1390 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1391 1392 /* forward solve the lower triangular */ 1393 idx = 6*(*r++); 1394 t[0] = b[idx]; t[1] = b[1+idx]; 1395 t[2] = b[2+idx]; t[3] = b[3+idx]; 1396 t[4] = b[4+idx]; t[5] = b[5+idx]; 1397 for (i=1; i<n; i++) { 1398 v = aa + 36*ai[i]; 1399 vi = aj + ai[i]; 1400 nz = diag[i] - ai[i]; 1401 idx = 6*(*r++); 1402 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1403 s5 = b[4+idx]; s6 = b[5+idx]; 1404 while (nz--) { 1405 idx = 6*(*vi++); 1406 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1407 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1408 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1409 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1410 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1411 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1412 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1413 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1414 v += 36; 1415 } 1416 idx = 6*i; 1417 t[idx] = s1;t[1+idx] = s2; 1418 t[2+idx] = s3;t[3+idx] = s4; 1419 t[4+idx] = s5;t[5+idx] = s6; 1420 } 1421 /* backward solve the upper triangular */ 1422 for (i=n-1; i>=0; i--){ 1423 v = aa + 36*diag[i] + 36; 1424 vi = aj + diag[i] + 1; 1425 nz = ai[i+1] - diag[i] - 1; 1426 idt = 6*i; 1427 s1 = t[idt]; s2 = t[1+idt]; 1428 s3 = t[2+idt];s4 = t[3+idt]; 1429 s5 = t[4+idt];s6 = t[5+idt]; 1430 while (nz--) { 1431 idx = 6*(*vi++); 1432 x1 = t[idx]; x2 = t[1+idx]; 1433 x3 = t[2+idx]; x4 = t[3+idx]; 1434 x5 = t[4+idx]; x6 = t[5+idx]; 1435 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1436 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1437 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1438 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1439 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1440 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1441 v += 36; 1442 } 1443 idc = 6*(*c--); 1444 v = aa + 36*diag[i]; 1445 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1446 v[18]*s4+v[24]*s5+v[30]*s6; 1447 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1448 v[19]*s4+v[25]*s5+v[31]*s6; 1449 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1450 v[20]*s4+v[26]*s5+v[32]*s6; 1451 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1452 v[21]*s4+v[27]*s5+v[33]*s6; 1453 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1454 v[22]*s4+v[28]*s5+v[34]*s6; 1455 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1456 v[23]*s4+v[29]*s5+v[35]*s6; 1457 } 1458 1459 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1460 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1461 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1462 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1463 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1464 PetscFunctionReturn(0); 1465 } 1466 1467 #undef __FUNCT__ 1468 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1469 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1470 { 1471 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1472 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1473 PetscErrorCode ierr; 1474 PetscInt *diag = a->diag,jdx; 1475 const MatScalar *aa=a->a,*v; 1476 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1477 const PetscScalar *b; 1478 1479 PetscFunctionBegin; 1480 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1481 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1482 /* forward solve the lower triangular */ 1483 idx = 0; 1484 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1485 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1486 for (i=1; i<n; i++) { 1487 v = aa + 36*ai[i]; 1488 vi = aj + ai[i]; 1489 nz = diag[i] - ai[i]; 1490 idx = 6*i; 1491 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1492 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1493 while (nz--) { 1494 jdx = 6*(*vi++); 1495 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1496 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1497 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1498 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1499 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1500 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1501 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1502 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1503 v += 36; 1504 } 1505 x[idx] = s1; 1506 x[1+idx] = s2; 1507 x[2+idx] = s3; 1508 x[3+idx] = s4; 1509 x[4+idx] = s5; 1510 x[5+idx] = s6; 1511 } 1512 /* backward solve the upper triangular */ 1513 for (i=n-1; i>=0; i--){ 1514 v = aa + 36*diag[i] + 36; 1515 vi = aj + diag[i] + 1; 1516 nz = ai[i+1] - diag[i] - 1; 1517 idt = 6*i; 1518 s1 = x[idt]; s2 = x[1+idt]; 1519 s3 = x[2+idt]; s4 = x[3+idt]; 1520 s5 = x[4+idt]; s6 = x[5+idt]; 1521 while (nz--) { 1522 idx = 6*(*vi++); 1523 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1524 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1525 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1526 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1527 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1528 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1529 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1530 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1531 v += 36; 1532 } 1533 v = aa + 36*diag[i]; 1534 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1535 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1536 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1537 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1538 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1539 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1540 } 1541 1542 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1543 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1544 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1545 PetscFunctionReturn(0); 1546 } 1547 1548 #undef __FUNCT__ 1549 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1550 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1551 { 1552 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1553 IS iscol=a->col,isrow=a->row; 1554 PetscErrorCode ierr; 1555 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 1556 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1557 const MatScalar *aa=a->a,*v; 1558 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1559 const PetscScalar *b; 1560 1561 PetscFunctionBegin; 1562 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1563 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1564 t = a->solve_work; 1565 1566 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1567 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1568 1569 /* forward solve the lower triangular */ 1570 idx = 5*(*r++); 1571 t[0] = b[idx]; t[1] = b[1+idx]; 1572 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1573 for (i=1; i<n; i++) { 1574 v = aa + 25*ai[i]; 1575 vi = aj + ai[i]; 1576 nz = diag[i] - ai[i]; 1577 idx = 5*(*r++); 1578 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1579 s5 = b[4+idx]; 1580 while (nz--) { 1581 idx = 5*(*vi++); 1582 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1583 x4 = t[3+idx];x5 = t[4+idx]; 1584 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1585 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1586 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1587 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1588 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1589 v += 25; 1590 } 1591 idx = 5*i; 1592 t[idx] = s1;t[1+idx] = s2; 1593 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1594 } 1595 /* backward solve the upper triangular */ 1596 for (i=n-1; i>=0; i--){ 1597 v = aa + 25*diag[i] + 25; 1598 vi = aj + diag[i] + 1; 1599 nz = ai[i+1] - diag[i] - 1; 1600 idt = 5*i; 1601 s1 = t[idt]; s2 = t[1+idt]; 1602 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1603 while (nz--) { 1604 idx = 5*(*vi++); 1605 x1 = t[idx]; x2 = t[1+idx]; 1606 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1607 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1608 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1609 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1610 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1611 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1612 v += 25; 1613 } 1614 idc = 5*(*c--); 1615 v = aa + 25*diag[i]; 1616 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1617 v[15]*s4+v[20]*s5; 1618 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1619 v[16]*s4+v[21]*s5; 1620 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1621 v[17]*s4+v[22]*s5; 1622 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1623 v[18]*s4+v[23]*s5; 1624 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1625 v[19]*s4+v[24]*s5; 1626 } 1627 1628 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1629 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1630 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1631 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1632 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1633 PetscFunctionReturn(0); 1634 } 1635 1636 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1637 { 1638 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1639 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1640 PetscErrorCode ierr; 1641 PetscInt jdx; 1642 const MatScalar *aa=a->a,*v; 1643 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1644 const PetscScalar *b; 1645 1646 PetscFunctionBegin; 1647 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1648 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1649 /* forward solve the lower triangular */ 1650 idx = 0; 1651 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1652 for (i=1; i<n; i++) { 1653 v = aa + 25*ai[i]; 1654 vi = aj + ai[i]; 1655 nz = ai[i+1] - ai[i]; 1656 idx = 5*i; 1657 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1658 while (nz--) { 1659 jdx = 5*(*vi++); 1660 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1661 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1662 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1663 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1664 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1665 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1666 v += 25; 1667 } 1668 x[idx] = s1; 1669 x[1+idx] = s2; 1670 x[2+idx] = s3; 1671 x[3+idx] = s4; 1672 x[4+idx] = s5; 1673 } 1674 1675 /* backward solve the upper triangular */ 1676 for (i=n-1; i>=0; i--){ 1677 v = aa + 25*ai[2*n-i]; 1678 vi = aj + ai[2*n-i]; 1679 nz = ai[2*n-i +1] - ai[2*n-i]-1; 1680 idt = 5*i; 1681 s1 = x[idt]; s2 = x[1+idt]; 1682 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1683 while (nz--) { 1684 idx = 5*(*vi++); 1685 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1686 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1687 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1688 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1689 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1690 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1691 v += 25; 1692 } 1693 /* x = inv_diagonal*x */ 1694 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1695 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1696 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1697 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1698 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1699 } 1700 1701 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1702 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1703 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1704 PetscFunctionReturn(0); 1705 } 1706 1707 #undef __FUNCT__ 1708 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1709 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 1710 { 1711 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1712 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1713 PetscErrorCode ierr; 1714 PetscInt *diag = a->diag,jdx; 1715 const MatScalar *aa=a->a,*v; 1716 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1717 const PetscScalar *b; 1718 1719 PetscFunctionBegin; 1720 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1721 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1722 /* forward solve the lower triangular */ 1723 idx = 0; 1724 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1725 for (i=1; i<n; i++) { 1726 v = aa + 25*ai[i]; 1727 vi = aj + ai[i]; 1728 nz = diag[i] - ai[i]; 1729 idx = 5*i; 1730 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1731 while (nz--) { 1732 jdx = 5*(*vi++); 1733 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1734 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1735 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1736 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1737 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1738 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1739 v += 25; 1740 } 1741 x[idx] = s1; 1742 x[1+idx] = s2; 1743 x[2+idx] = s3; 1744 x[3+idx] = s4; 1745 x[4+idx] = s5; 1746 } 1747 /* backward solve the upper triangular */ 1748 for (i=n-1; i>=0; i--){ 1749 v = aa + 25*diag[i] + 25; 1750 vi = aj + diag[i] + 1; 1751 nz = ai[i+1] - diag[i] - 1; 1752 idt = 5*i; 1753 s1 = x[idt]; s2 = x[1+idt]; 1754 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1755 while (nz--) { 1756 idx = 5*(*vi++); 1757 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1758 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1759 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1760 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1761 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1762 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1763 v += 25; 1764 } 1765 v = aa + 25*diag[i]; 1766 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1767 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1768 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1769 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1770 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1771 } 1772 1773 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1774 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1775 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1776 PetscFunctionReturn(0); 1777 } 1778 1779 #undef __FUNCT__ 1780 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1781 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1782 { 1783 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1784 IS iscol=a->col,isrow=a->row; 1785 PetscErrorCode ierr; 1786 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1787 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1788 const MatScalar *aa=a->a,*v; 1789 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1790 const PetscScalar *b; 1791 1792 PetscFunctionBegin; 1793 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1794 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1795 t = a->solve_work; 1796 1797 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1798 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1799 1800 /* forward solve the lower triangular */ 1801 idx = 4*(*r++); 1802 t[0] = b[idx]; t[1] = b[1+idx]; 1803 t[2] = b[2+idx]; t[3] = b[3+idx]; 1804 for (i=1; i<n; i++) { 1805 v = aa + 16*ai[i]; 1806 vi = aj + ai[i]; 1807 nz = diag[i] - ai[i]; 1808 idx = 4*(*r++); 1809 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1810 while (nz--) { 1811 idx = 4*(*vi++); 1812 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1813 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1814 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1815 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1816 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1817 v += 16; 1818 } 1819 idx = 4*i; 1820 t[idx] = s1;t[1+idx] = s2; 1821 t[2+idx] = s3;t[3+idx] = s4; 1822 } 1823 /* backward solve the upper triangular */ 1824 for (i=n-1; i>=0; i--){ 1825 v = aa + 16*diag[i] + 16; 1826 vi = aj + diag[i] + 1; 1827 nz = ai[i+1] - diag[i] - 1; 1828 idt = 4*i; 1829 s1 = t[idt]; s2 = t[1+idt]; 1830 s3 = t[2+idt];s4 = t[3+idt]; 1831 while (nz--) { 1832 idx = 4*(*vi++); 1833 x1 = t[idx]; x2 = t[1+idx]; 1834 x3 = t[2+idx]; x4 = t[3+idx]; 1835 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1836 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1837 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1838 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1839 v += 16; 1840 } 1841 idc = 4*(*c--); 1842 v = aa + 16*diag[i]; 1843 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1844 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1845 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1846 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1847 } 1848 1849 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1850 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1851 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1852 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1853 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 1854 PetscFunctionReturn(0); 1855 } 1856 1857 #undef __FUNCT__ 1858 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 1859 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 1860 { 1861 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1862 IS iscol=a->col,isrow=a->row; 1863 PetscErrorCode ierr; 1864 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1865 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1866 const MatScalar *aa=a->a,*v; 1867 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 1868 PetscScalar *x; 1869 const PetscScalar *b; 1870 1871 PetscFunctionBegin; 1872 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1873 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1874 t = (MatScalar *)a->solve_work; 1875 1876 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1877 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1878 1879 /* forward solve the lower triangular */ 1880 idx = 4*(*r++); 1881 t[0] = (MatScalar)b[idx]; 1882 t[1] = (MatScalar)b[1+idx]; 1883 t[2] = (MatScalar)b[2+idx]; 1884 t[3] = (MatScalar)b[3+idx]; 1885 for (i=1; i<n; i++) { 1886 v = aa + 16*ai[i]; 1887 vi = aj + ai[i]; 1888 nz = diag[i] - ai[i]; 1889 idx = 4*(*r++); 1890 s1 = (MatScalar)b[idx]; 1891 s2 = (MatScalar)b[1+idx]; 1892 s3 = (MatScalar)b[2+idx]; 1893 s4 = (MatScalar)b[3+idx]; 1894 while (nz--) { 1895 idx = 4*(*vi++); 1896 x1 = t[idx]; 1897 x2 = t[1+idx]; 1898 x3 = t[2+idx]; 1899 x4 = t[3+idx]; 1900 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1901 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1902 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1903 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1904 v += 16; 1905 } 1906 idx = 4*i; 1907 t[idx] = s1; 1908 t[1+idx] = s2; 1909 t[2+idx] = s3; 1910 t[3+idx] = s4; 1911 } 1912 /* backward solve the upper triangular */ 1913 for (i=n-1; i>=0; i--){ 1914 v = aa + 16*diag[i] + 16; 1915 vi = aj + diag[i] + 1; 1916 nz = ai[i+1] - diag[i] - 1; 1917 idt = 4*i; 1918 s1 = t[idt]; 1919 s2 = t[1+idt]; 1920 s3 = t[2+idt]; 1921 s4 = t[3+idt]; 1922 while (nz--) { 1923 idx = 4*(*vi++); 1924 x1 = t[idx]; 1925 x2 = t[1+idx]; 1926 x3 = t[2+idx]; 1927 x4 = t[3+idx]; 1928 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1929 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1930 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1931 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1932 v += 16; 1933 } 1934 idc = 4*(*c--); 1935 v = aa + 16*diag[i]; 1936 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1937 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1938 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1939 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1940 x[idc] = (PetscScalar)t[idt]; 1941 x[1+idc] = (PetscScalar)t[1+idt]; 1942 x[2+idc] = (PetscScalar)t[2+idt]; 1943 x[3+idc] = (PetscScalar)t[3+idt]; 1944 } 1945 1946 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1947 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1948 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1949 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1950 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 1951 PetscFunctionReturn(0); 1952 } 1953 1954 #if defined (PETSC_HAVE_SSE) 1955 1956 #include PETSC_HAVE_SSE 1957 1958 #undef __FUNCT__ 1959 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 1960 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 1961 { 1962 /* 1963 Note: This code uses demotion of double 1964 to float when performing the mixed-mode computation. 1965 This may not be numerically reasonable for all applications. 1966 */ 1967 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1968 IS iscol=a->col,isrow=a->row; 1969 PetscErrorCode ierr; 1970 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 1971 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1972 MatScalar *aa=a->a,*v; 1973 PetscScalar *x,*b,*t; 1974 1975 /* Make space in temp stack for 16 Byte Aligned arrays */ 1976 float ssealignedspace[11],*tmps,*tmpx; 1977 unsigned long offset; 1978 1979 PetscFunctionBegin; 1980 SSE_SCOPE_BEGIN; 1981 1982 offset = (unsigned long)ssealignedspace % 16; 1983 if (offset) offset = (16 - offset)/4; 1984 tmps = &ssealignedspace[offset]; 1985 tmpx = &ssealignedspace[offset+4]; 1986 PREFETCH_NTA(aa+16*ai[1]); 1987 1988 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1989 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1990 t = a->solve_work; 1991 1992 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1993 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1994 1995 /* forward solve the lower triangular */ 1996 idx = 4*(*r++); 1997 t[0] = b[idx]; t[1] = b[1+idx]; 1998 t[2] = b[2+idx]; t[3] = b[3+idx]; 1999 v = aa + 16*ai[1]; 2000 2001 for (i=1; i<n;) { 2002 PREFETCH_NTA(&v[8]); 2003 vi = aj + ai[i]; 2004 nz = diag[i] - ai[i]; 2005 idx = 4*(*r++); 2006 2007 /* Demote sum from double to float */ 2008 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 2009 LOAD_PS(tmps,XMM7); 2010 2011 while (nz--) { 2012 PREFETCH_NTA(&v[16]); 2013 idx = 4*(*vi++); 2014 2015 /* Demote solution (so far) from double to float */ 2016 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 2017 2018 /* 4x4 Matrix-Vector product with negative accumulation: */ 2019 SSE_INLINE_BEGIN_2(tmpx,v) 2020 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2021 2022 /* First Column */ 2023 SSE_COPY_PS(XMM0,XMM6) 2024 SSE_SHUFFLE(XMM0,XMM0,0x00) 2025 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2026 SSE_SUB_PS(XMM7,XMM0) 2027 2028 /* Second Column */ 2029 SSE_COPY_PS(XMM1,XMM6) 2030 SSE_SHUFFLE(XMM1,XMM1,0x55) 2031 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2032 SSE_SUB_PS(XMM7,XMM1) 2033 2034 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2035 2036 /* Third Column */ 2037 SSE_COPY_PS(XMM2,XMM6) 2038 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2039 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2040 SSE_SUB_PS(XMM7,XMM2) 2041 2042 /* Fourth Column */ 2043 SSE_COPY_PS(XMM3,XMM6) 2044 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2045 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2046 SSE_SUB_PS(XMM7,XMM3) 2047 SSE_INLINE_END_2 2048 2049 v += 16; 2050 } 2051 idx = 4*i; 2052 v = aa + 16*ai[++i]; 2053 PREFETCH_NTA(v); 2054 STORE_PS(tmps,XMM7); 2055 2056 /* Promote result from float to double */ 2057 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 2058 } 2059 /* backward solve the upper triangular */ 2060 idt = 4*(n-1); 2061 ai16 = 16*diag[n-1]; 2062 v = aa + ai16 + 16; 2063 for (i=n-1; i>=0;){ 2064 PREFETCH_NTA(&v[8]); 2065 vi = aj + diag[i] + 1; 2066 nz = ai[i+1] - diag[i] - 1; 2067 2068 /* Demote accumulator from double to float */ 2069 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 2070 LOAD_PS(tmps,XMM7); 2071 2072 while (nz--) { 2073 PREFETCH_NTA(&v[16]); 2074 idx = 4*(*vi++); 2075 2076 /* Demote solution (so far) from double to float */ 2077 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 2078 2079 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2080 SSE_INLINE_BEGIN_2(tmpx,v) 2081 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2082 2083 /* First Column */ 2084 SSE_COPY_PS(XMM0,XMM6) 2085 SSE_SHUFFLE(XMM0,XMM0,0x00) 2086 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2087 SSE_SUB_PS(XMM7,XMM0) 2088 2089 /* Second Column */ 2090 SSE_COPY_PS(XMM1,XMM6) 2091 SSE_SHUFFLE(XMM1,XMM1,0x55) 2092 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2093 SSE_SUB_PS(XMM7,XMM1) 2094 2095 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2096 2097 /* Third Column */ 2098 SSE_COPY_PS(XMM2,XMM6) 2099 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2100 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2101 SSE_SUB_PS(XMM7,XMM2) 2102 2103 /* Fourth Column */ 2104 SSE_COPY_PS(XMM3,XMM6) 2105 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2106 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2107 SSE_SUB_PS(XMM7,XMM3) 2108 SSE_INLINE_END_2 2109 v += 16; 2110 } 2111 v = aa + ai16; 2112 ai16 = 16*diag[--i]; 2113 PREFETCH_NTA(aa+ai16+16); 2114 /* 2115 Scale the result by the diagonal 4x4 block, 2116 which was inverted as part of the factorization 2117 */ 2118 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 2119 /* First Column */ 2120 SSE_COPY_PS(XMM0,XMM7) 2121 SSE_SHUFFLE(XMM0,XMM0,0x00) 2122 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2123 2124 /* Second Column */ 2125 SSE_COPY_PS(XMM1,XMM7) 2126 SSE_SHUFFLE(XMM1,XMM1,0x55) 2127 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2128 SSE_ADD_PS(XMM0,XMM1) 2129 2130 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2131 2132 /* Third Column */ 2133 SSE_COPY_PS(XMM2,XMM7) 2134 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2135 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2136 SSE_ADD_PS(XMM0,XMM2) 2137 2138 /* Fourth Column */ 2139 SSE_COPY_PS(XMM3,XMM7) 2140 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2141 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2142 SSE_ADD_PS(XMM0,XMM3) 2143 2144 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2145 SSE_INLINE_END_3 2146 2147 /* Promote solution from float to double */ 2148 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 2149 2150 /* Apply reordering to t and stream into x. */ 2151 /* This way, x doesn't pollute the cache. */ 2152 /* Be careful with size: 2 doubles = 4 floats! */ 2153 idc = 4*(*c--); 2154 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 2155 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 2156 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 2157 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 2158 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 2159 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 2160 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 2161 SSE_INLINE_END_2 2162 v = aa + ai16 + 16; 2163 idt -= 4; 2164 } 2165 2166 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2167 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2168 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2169 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2170 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2171 SSE_SCOPE_END; 2172 PetscFunctionReturn(0); 2173 } 2174 2175 #endif 2176 2177 2178 /* 2179 Special case where the matrix was ILU(0) factored in the natural 2180 ordering. This eliminates the need for the column and row permutation. 2181 */ 2182 #undef __FUNCT__ 2183 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2184 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 2185 { 2186 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2187 PetscInt n=a->mbs; 2188 const PetscInt *ai=a->i,*aj=a->j; 2189 PetscErrorCode ierr; 2190 const PetscInt *diag = a->diag; 2191 const MatScalar *aa=a->a; 2192 PetscScalar *x; 2193 const PetscScalar *b; 2194 2195 PetscFunctionBegin; 2196 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2197 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2198 2199 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 2200 { 2201 static PetscScalar w[2000]; /* very BAD need to fix */ 2202 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 2203 } 2204 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 2205 { 2206 static PetscScalar w[2000]; /* very BAD need to fix */ 2207 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 2208 } 2209 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 2210 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2211 #else 2212 { 2213 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2214 const MatScalar *v; 2215 PetscInt jdx,idt,idx,nz,i,ai16; 2216 const PetscInt *vi; 2217 2218 /* forward solve the lower triangular */ 2219 idx = 0; 2220 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 2221 for (i=1; i<n; i++) { 2222 v = aa + 16*ai[i]; 2223 vi = aj + ai[i]; 2224 nz = diag[i] - ai[i]; 2225 idx += 4; 2226 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2227 while (nz--) { 2228 jdx = 4*(*vi++); 2229 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2230 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2231 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2232 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2233 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2234 v += 16; 2235 } 2236 x[idx] = s1; 2237 x[1+idx] = s2; 2238 x[2+idx] = s3; 2239 x[3+idx] = s4; 2240 } 2241 /* backward solve the upper triangular */ 2242 idt = 4*(n-1); 2243 for (i=n-1; i>=0; i--){ 2244 ai16 = 16*diag[i]; 2245 v = aa + ai16 + 16; 2246 vi = aj + diag[i] + 1; 2247 nz = ai[i+1] - diag[i] - 1; 2248 s1 = x[idt]; s2 = x[1+idt]; 2249 s3 = x[2+idt];s4 = x[3+idt]; 2250 while (nz--) { 2251 idx = 4*(*vi++); 2252 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2253 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2254 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2255 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2256 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2257 v += 16; 2258 } 2259 v = aa + ai16; 2260 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2261 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2262 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2263 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2264 idt -= 4; 2265 } 2266 } 2267 #endif 2268 2269 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2270 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2271 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2272 PetscFunctionReturn(0); 2273 } 2274 2275 #undef __FUNCT__ 2276 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2277 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2278 { 2279 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2280 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2281 PetscErrorCode ierr; 2282 PetscInt *diag = a->diag; 2283 MatScalar *aa=a->a; 2284 PetscScalar *x,*b; 2285 2286 PetscFunctionBegin; 2287 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2288 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2289 2290 { 2291 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2292 MatScalar *v,*t=(MatScalar *)x; 2293 PetscInt jdx,idt,idx,nz,*vi,i,ai16; 2294 2295 /* forward solve the lower triangular */ 2296 idx = 0; 2297 t[0] = (MatScalar)b[0]; 2298 t[1] = (MatScalar)b[1]; 2299 t[2] = (MatScalar)b[2]; 2300 t[3] = (MatScalar)b[3]; 2301 for (i=1; i<n; i++) { 2302 v = aa + 16*ai[i]; 2303 vi = aj + ai[i]; 2304 nz = diag[i] - ai[i]; 2305 idx += 4; 2306 s1 = (MatScalar)b[idx]; 2307 s2 = (MatScalar)b[1+idx]; 2308 s3 = (MatScalar)b[2+idx]; 2309 s4 = (MatScalar)b[3+idx]; 2310 while (nz--) { 2311 jdx = 4*(*vi++); 2312 x1 = t[jdx]; 2313 x2 = t[1+jdx]; 2314 x3 = t[2+jdx]; 2315 x4 = t[3+jdx]; 2316 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2317 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2318 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2319 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2320 v += 16; 2321 } 2322 t[idx] = s1; 2323 t[1+idx] = s2; 2324 t[2+idx] = s3; 2325 t[3+idx] = s4; 2326 } 2327 /* backward solve the upper triangular */ 2328 idt = 4*(n-1); 2329 for (i=n-1; i>=0; i--){ 2330 ai16 = 16*diag[i]; 2331 v = aa + ai16 + 16; 2332 vi = aj + diag[i] + 1; 2333 nz = ai[i+1] - diag[i] - 1; 2334 s1 = t[idt]; 2335 s2 = t[1+idt]; 2336 s3 = t[2+idt]; 2337 s4 = t[3+idt]; 2338 while (nz--) { 2339 idx = 4*(*vi++); 2340 x1 = (MatScalar)x[idx]; 2341 x2 = (MatScalar)x[1+idx]; 2342 x3 = (MatScalar)x[2+idx]; 2343 x4 = (MatScalar)x[3+idx]; 2344 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2345 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2346 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2347 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2348 v += 16; 2349 } 2350 v = aa + ai16; 2351 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2352 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2353 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2354 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2355 idt -= 4; 2356 } 2357 } 2358 2359 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2360 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2361 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2362 PetscFunctionReturn(0); 2363 } 2364 2365 #if defined (PETSC_HAVE_SSE) 2366 2367 #include PETSC_HAVE_SSE 2368 #undef __FUNCT__ 2369 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 2370 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 2371 { 2372 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2373 unsigned short *aj=(unsigned short *)a->j; 2374 PetscErrorCode ierr; 2375 int *ai=a->i,n=a->mbs,*diag = a->diag; 2376 MatScalar *aa=a->a; 2377 PetscScalar *x,*b; 2378 2379 PetscFunctionBegin; 2380 SSE_SCOPE_BEGIN; 2381 /* 2382 Note: This code currently uses demotion of double 2383 to float when performing the mixed-mode computation. 2384 This may not be numerically reasonable for all applications. 2385 */ 2386 PREFETCH_NTA(aa+16*ai[1]); 2387 2388 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2389 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2390 { 2391 /* x will first be computed in single precision then promoted inplace to double */ 2392 MatScalar *v,*t=(MatScalar *)x; 2393 int nz,i,idt,ai16; 2394 unsigned int jdx,idx; 2395 unsigned short *vi; 2396 /* Forward solve the lower triangular factor. */ 2397 2398 /* First block is the identity. */ 2399 idx = 0; 2400 CONVERT_DOUBLE4_FLOAT4(t,b); 2401 v = aa + 16*((unsigned int)ai[1]); 2402 2403 for (i=1; i<n;) { 2404 PREFETCH_NTA(&v[8]); 2405 vi = aj + ai[i]; 2406 nz = diag[i] - ai[i]; 2407 idx += 4; 2408 2409 /* Demote RHS from double to float. */ 2410 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2411 LOAD_PS(&t[idx],XMM7); 2412 2413 while (nz--) { 2414 PREFETCH_NTA(&v[16]); 2415 jdx = 4*((unsigned int)(*vi++)); 2416 2417 /* 4x4 Matrix-Vector product with negative accumulation: */ 2418 SSE_INLINE_BEGIN_2(&t[jdx],v) 2419 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2420 2421 /* First Column */ 2422 SSE_COPY_PS(XMM0,XMM6) 2423 SSE_SHUFFLE(XMM0,XMM0,0x00) 2424 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2425 SSE_SUB_PS(XMM7,XMM0) 2426 2427 /* Second Column */ 2428 SSE_COPY_PS(XMM1,XMM6) 2429 SSE_SHUFFLE(XMM1,XMM1,0x55) 2430 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2431 SSE_SUB_PS(XMM7,XMM1) 2432 2433 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2434 2435 /* Third Column */ 2436 SSE_COPY_PS(XMM2,XMM6) 2437 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2438 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2439 SSE_SUB_PS(XMM7,XMM2) 2440 2441 /* Fourth Column */ 2442 SSE_COPY_PS(XMM3,XMM6) 2443 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2444 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2445 SSE_SUB_PS(XMM7,XMM3) 2446 SSE_INLINE_END_2 2447 2448 v += 16; 2449 } 2450 v = aa + 16*ai[++i]; 2451 PREFETCH_NTA(v); 2452 STORE_PS(&t[idx],XMM7); 2453 } 2454 2455 /* Backward solve the upper triangular factor.*/ 2456 2457 idt = 4*(n-1); 2458 ai16 = 16*diag[n-1]; 2459 v = aa + ai16 + 16; 2460 for (i=n-1; i>=0;){ 2461 PREFETCH_NTA(&v[8]); 2462 vi = aj + diag[i] + 1; 2463 nz = ai[i+1] - diag[i] - 1; 2464 2465 LOAD_PS(&t[idt],XMM7); 2466 2467 while (nz--) { 2468 PREFETCH_NTA(&v[16]); 2469 idx = 4*((unsigned int)(*vi++)); 2470 2471 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2472 SSE_INLINE_BEGIN_2(&t[idx],v) 2473 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2474 2475 /* First Column */ 2476 SSE_COPY_PS(XMM0,XMM6) 2477 SSE_SHUFFLE(XMM0,XMM0,0x00) 2478 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2479 SSE_SUB_PS(XMM7,XMM0) 2480 2481 /* Second Column */ 2482 SSE_COPY_PS(XMM1,XMM6) 2483 SSE_SHUFFLE(XMM1,XMM1,0x55) 2484 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2485 SSE_SUB_PS(XMM7,XMM1) 2486 2487 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2488 2489 /* Third Column */ 2490 SSE_COPY_PS(XMM2,XMM6) 2491 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2492 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2493 SSE_SUB_PS(XMM7,XMM2) 2494 2495 /* Fourth Column */ 2496 SSE_COPY_PS(XMM3,XMM6) 2497 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2498 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2499 SSE_SUB_PS(XMM7,XMM3) 2500 SSE_INLINE_END_2 2501 v += 16; 2502 } 2503 v = aa + ai16; 2504 ai16 = 16*diag[--i]; 2505 PREFETCH_NTA(aa+ai16+16); 2506 /* 2507 Scale the result by the diagonal 4x4 block, 2508 which was inverted as part of the factorization 2509 */ 2510 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2511 /* First Column */ 2512 SSE_COPY_PS(XMM0,XMM7) 2513 SSE_SHUFFLE(XMM0,XMM0,0x00) 2514 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2515 2516 /* Second Column */ 2517 SSE_COPY_PS(XMM1,XMM7) 2518 SSE_SHUFFLE(XMM1,XMM1,0x55) 2519 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2520 SSE_ADD_PS(XMM0,XMM1) 2521 2522 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2523 2524 /* Third Column */ 2525 SSE_COPY_PS(XMM2,XMM7) 2526 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2527 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2528 SSE_ADD_PS(XMM0,XMM2) 2529 2530 /* Fourth Column */ 2531 SSE_COPY_PS(XMM3,XMM7) 2532 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2533 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2534 SSE_ADD_PS(XMM0,XMM3) 2535 2536 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2537 SSE_INLINE_END_3 2538 2539 v = aa + ai16 + 16; 2540 idt -= 4; 2541 } 2542 2543 /* Convert t from single precision back to double precision (inplace)*/ 2544 idt = 4*(n-1); 2545 for (i=n-1;i>=0;i--) { 2546 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2547 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2548 PetscScalar *xtemp=&x[idt]; 2549 MatScalar *ttemp=&t[idt]; 2550 xtemp[3] = (PetscScalar)ttemp[3]; 2551 xtemp[2] = (PetscScalar)ttemp[2]; 2552 xtemp[1] = (PetscScalar)ttemp[1]; 2553 xtemp[0] = (PetscScalar)ttemp[0]; 2554 idt -= 4; 2555 } 2556 2557 } /* End of artificial scope. */ 2558 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2559 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2560 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2561 SSE_SCOPE_END; 2562 PetscFunctionReturn(0); 2563 } 2564 2565 #undef __FUNCT__ 2566 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2567 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 2568 { 2569 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2570 int *aj=a->j; 2571 PetscErrorCode ierr; 2572 int *ai=a->i,n=a->mbs,*diag = a->diag; 2573 MatScalar *aa=a->a; 2574 PetscScalar *x,*b; 2575 2576 PetscFunctionBegin; 2577 SSE_SCOPE_BEGIN; 2578 /* 2579 Note: This code currently uses demotion of double 2580 to float when performing the mixed-mode computation. 2581 This may not be numerically reasonable for all applications. 2582 */ 2583 PREFETCH_NTA(aa+16*ai[1]); 2584 2585 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2586 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2587 { 2588 /* x will first be computed in single precision then promoted inplace to double */ 2589 MatScalar *v,*t=(MatScalar *)x; 2590 int nz,i,idt,ai16; 2591 int jdx,idx; 2592 int *vi; 2593 /* Forward solve the lower triangular factor. */ 2594 2595 /* First block is the identity. */ 2596 idx = 0; 2597 CONVERT_DOUBLE4_FLOAT4(t,b); 2598 v = aa + 16*ai[1]; 2599 2600 for (i=1; i<n;) { 2601 PREFETCH_NTA(&v[8]); 2602 vi = aj + ai[i]; 2603 nz = diag[i] - ai[i]; 2604 idx += 4; 2605 2606 /* Demote RHS from double to float. */ 2607 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2608 LOAD_PS(&t[idx],XMM7); 2609 2610 while (nz--) { 2611 PREFETCH_NTA(&v[16]); 2612 jdx = 4*(*vi++); 2613 /* jdx = *vi++; */ 2614 2615 /* 4x4 Matrix-Vector product with negative accumulation: */ 2616 SSE_INLINE_BEGIN_2(&t[jdx],v) 2617 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2618 2619 /* First Column */ 2620 SSE_COPY_PS(XMM0,XMM6) 2621 SSE_SHUFFLE(XMM0,XMM0,0x00) 2622 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2623 SSE_SUB_PS(XMM7,XMM0) 2624 2625 /* Second Column */ 2626 SSE_COPY_PS(XMM1,XMM6) 2627 SSE_SHUFFLE(XMM1,XMM1,0x55) 2628 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2629 SSE_SUB_PS(XMM7,XMM1) 2630 2631 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2632 2633 /* Third Column */ 2634 SSE_COPY_PS(XMM2,XMM6) 2635 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2636 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2637 SSE_SUB_PS(XMM7,XMM2) 2638 2639 /* Fourth Column */ 2640 SSE_COPY_PS(XMM3,XMM6) 2641 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2642 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2643 SSE_SUB_PS(XMM7,XMM3) 2644 SSE_INLINE_END_2 2645 2646 v += 16; 2647 } 2648 v = aa + 16*ai[++i]; 2649 PREFETCH_NTA(v); 2650 STORE_PS(&t[idx],XMM7); 2651 } 2652 2653 /* Backward solve the upper triangular factor.*/ 2654 2655 idt = 4*(n-1); 2656 ai16 = 16*diag[n-1]; 2657 v = aa + ai16 + 16; 2658 for (i=n-1; i>=0;){ 2659 PREFETCH_NTA(&v[8]); 2660 vi = aj + diag[i] + 1; 2661 nz = ai[i+1] - diag[i] - 1; 2662 2663 LOAD_PS(&t[idt],XMM7); 2664 2665 while (nz--) { 2666 PREFETCH_NTA(&v[16]); 2667 idx = 4*(*vi++); 2668 /* idx = *vi++; */ 2669 2670 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2671 SSE_INLINE_BEGIN_2(&t[idx],v) 2672 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2673 2674 /* First Column */ 2675 SSE_COPY_PS(XMM0,XMM6) 2676 SSE_SHUFFLE(XMM0,XMM0,0x00) 2677 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2678 SSE_SUB_PS(XMM7,XMM0) 2679 2680 /* Second Column */ 2681 SSE_COPY_PS(XMM1,XMM6) 2682 SSE_SHUFFLE(XMM1,XMM1,0x55) 2683 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2684 SSE_SUB_PS(XMM7,XMM1) 2685 2686 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2687 2688 /* Third Column */ 2689 SSE_COPY_PS(XMM2,XMM6) 2690 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2691 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2692 SSE_SUB_PS(XMM7,XMM2) 2693 2694 /* Fourth Column */ 2695 SSE_COPY_PS(XMM3,XMM6) 2696 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2697 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2698 SSE_SUB_PS(XMM7,XMM3) 2699 SSE_INLINE_END_2 2700 v += 16; 2701 } 2702 v = aa + ai16; 2703 ai16 = 16*diag[--i]; 2704 PREFETCH_NTA(aa+ai16+16); 2705 /* 2706 Scale the result by the diagonal 4x4 block, 2707 which was inverted as part of the factorization 2708 */ 2709 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2710 /* First Column */ 2711 SSE_COPY_PS(XMM0,XMM7) 2712 SSE_SHUFFLE(XMM0,XMM0,0x00) 2713 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2714 2715 /* Second Column */ 2716 SSE_COPY_PS(XMM1,XMM7) 2717 SSE_SHUFFLE(XMM1,XMM1,0x55) 2718 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2719 SSE_ADD_PS(XMM0,XMM1) 2720 2721 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2722 2723 /* Third Column */ 2724 SSE_COPY_PS(XMM2,XMM7) 2725 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2726 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2727 SSE_ADD_PS(XMM0,XMM2) 2728 2729 /* Fourth Column */ 2730 SSE_COPY_PS(XMM3,XMM7) 2731 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2732 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2733 SSE_ADD_PS(XMM0,XMM3) 2734 2735 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2736 SSE_INLINE_END_3 2737 2738 v = aa + ai16 + 16; 2739 idt -= 4; 2740 } 2741 2742 /* Convert t from single precision back to double precision (inplace)*/ 2743 idt = 4*(n-1); 2744 for (i=n-1;i>=0;i--) { 2745 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2746 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2747 PetscScalar *xtemp=&x[idt]; 2748 MatScalar *ttemp=&t[idt]; 2749 xtemp[3] = (PetscScalar)ttemp[3]; 2750 xtemp[2] = (PetscScalar)ttemp[2]; 2751 xtemp[1] = (PetscScalar)ttemp[1]; 2752 xtemp[0] = (PetscScalar)ttemp[0]; 2753 idt -= 4; 2754 } 2755 2756 } /* End of artificial scope. */ 2757 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2758 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2759 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2760 SSE_SCOPE_END; 2761 PetscFunctionReturn(0); 2762 } 2763 2764 #endif 2765 #undef __FUNCT__ 2766 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 2767 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 2768 { 2769 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2770 IS iscol=a->col,isrow=a->row; 2771 PetscErrorCode ierr; 2772 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2773 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2774 const MatScalar *aa=a->a,*v; 2775 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 2776 const PetscScalar *b; 2777 2778 PetscFunctionBegin; 2779 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2780 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2781 t = a->solve_work; 2782 2783 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2784 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2785 2786 /* forward solve the lower triangular */ 2787 idx = 3*(*r++); 2788 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 2789 for (i=1; i<n; i++) { 2790 v = aa + 9*ai[i]; 2791 vi = aj + ai[i]; 2792 nz = diag[i] - ai[i]; 2793 idx = 3*(*r++); 2794 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2795 while (nz--) { 2796 idx = 3*(*vi++); 2797 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2798 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2799 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2800 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2801 v += 9; 2802 } 2803 idx = 3*i; 2804 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 2805 } 2806 /* backward solve the upper triangular */ 2807 for (i=n-1; i>=0; i--){ 2808 v = aa + 9*diag[i] + 9; 2809 vi = aj + diag[i] + 1; 2810 nz = ai[i+1] - diag[i] - 1; 2811 idt = 3*i; 2812 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 2813 while (nz--) { 2814 idx = 3*(*vi++); 2815 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2816 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2817 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2818 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2819 v += 9; 2820 } 2821 idc = 3*(*c--); 2822 v = aa + 9*diag[i]; 2823 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2824 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2825 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2826 } 2827 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2828 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2829 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2830 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2831 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 2832 PetscFunctionReturn(0); 2833 } 2834 2835 /* 2836 Special case where the matrix was ILU(0) factored in the natural 2837 ordering. This eliminates the need for the column and row permutation. 2838 */ 2839 #undef __FUNCT__ 2840 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 2841 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 2842 { 2843 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2844 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2845 PetscErrorCode ierr; 2846 PetscInt *diag = a->diag; 2847 const MatScalar *aa=a->a,*v; 2848 PetscScalar *x,s1,s2,s3,x1,x2,x3; 2849 const PetscScalar *b; 2850 PetscInt jdx,idt,idx,nz,*vi,i; 2851 2852 PetscFunctionBegin; 2853 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2854 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2855 2856 /* forward solve the lower triangular */ 2857 idx = 0; 2858 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 2859 for (i=1; i<n; i++) { 2860 v = aa + 9*ai[i]; 2861 vi = aj + ai[i]; 2862 nz = diag[i] - ai[i]; 2863 idx += 3; 2864 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 2865 while (nz--) { 2866 jdx = 3*(*vi++); 2867 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2868 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2869 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2870 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2871 v += 9; 2872 } 2873 x[idx] = s1; 2874 x[1+idx] = s2; 2875 x[2+idx] = s3; 2876 } 2877 /* backward solve the upper triangular */ 2878 for (i=n-1; i>=0; i--){ 2879 v = aa + 9*diag[i] + 9; 2880 vi = aj + diag[i] + 1; 2881 nz = ai[i+1] - diag[i] - 1; 2882 idt = 3*i; 2883 s1 = x[idt]; s2 = x[1+idt]; 2884 s3 = x[2+idt]; 2885 while (nz--) { 2886 idx = 3*(*vi++); 2887 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2888 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2889 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2890 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2891 v += 9; 2892 } 2893 v = aa + 9*diag[i]; 2894 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2895 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2896 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2897 } 2898 2899 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2900 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2901 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 2902 PetscFunctionReturn(0); 2903 } 2904 2905 #undef __FUNCT__ 2906 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 2907 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 2908 { 2909 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2910 IS iscol=a->col,isrow=a->row; 2911 PetscErrorCode ierr; 2912 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2913 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2914 const MatScalar *aa=a->a,*v; 2915 PetscScalar *x,s1,s2,x1,x2,*t; 2916 const PetscScalar *b; 2917 2918 PetscFunctionBegin; 2919 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2920 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2921 t = a->solve_work; 2922 2923 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2924 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2925 2926 /* forward solve the lower triangular */ 2927 idx = 2*(*r++); 2928 t[0] = b[idx]; t[1] = b[1+idx]; 2929 for (i=1; i<n; i++) { 2930 v = aa + 4*ai[i]; 2931 vi = aj + ai[i]; 2932 nz = diag[i] - ai[i]; 2933 idx = 2*(*r++); 2934 s1 = b[idx]; s2 = b[1+idx]; 2935 while (nz--) { 2936 idx = 2*(*vi++); 2937 x1 = t[idx]; x2 = t[1+idx]; 2938 s1 -= v[0]*x1 + v[2]*x2; 2939 s2 -= v[1]*x1 + v[3]*x2; 2940 v += 4; 2941 } 2942 idx = 2*i; 2943 t[idx] = s1; t[1+idx] = s2; 2944 } 2945 /* backward solve the upper triangular */ 2946 for (i=n-1; i>=0; i--){ 2947 v = aa + 4*diag[i] + 4; 2948 vi = aj + diag[i] + 1; 2949 nz = ai[i+1] - diag[i] - 1; 2950 idt = 2*i; 2951 s1 = t[idt]; s2 = t[1+idt]; 2952 while (nz--) { 2953 idx = 2*(*vi++); 2954 x1 = t[idx]; x2 = t[1+idx]; 2955 s1 -= v[0]*x1 + v[2]*x2; 2956 s2 -= v[1]*x1 + v[3]*x2; 2957 v += 4; 2958 } 2959 idc = 2*(*c--); 2960 v = aa + 4*diag[i]; 2961 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2962 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 2963 } 2964 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2965 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2966 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2967 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2968 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 2969 PetscFunctionReturn(0); 2970 } 2971 2972 /* 2973 Special case where the matrix was ILU(0) factored in the natural 2974 ordering. This eliminates the need for the column and row permutation. 2975 */ 2976 #undef __FUNCT__ 2977 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 2978 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 2979 { 2980 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2981 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2982 PetscErrorCode ierr; 2983 PetscInt *diag = a->diag; 2984 const MatScalar *aa=a->a,*v; 2985 PetscScalar *x,s1,s2,x1,x2; 2986 const PetscScalar *b; 2987 PetscInt jdx,idt,idx,nz,*vi,i; 2988 2989 PetscFunctionBegin; 2990 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2991 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2992 2993 /* forward solve the lower triangular */ 2994 idx = 0; 2995 x[0] = b[0]; x[1] = b[1]; 2996 for (i=1; i<n; i++) { 2997 v = aa + 4*ai[i]; 2998 vi = aj + ai[i]; 2999 nz = diag[i] - ai[i]; 3000 idx += 2; 3001 s1 = b[idx];s2 = b[1+idx]; 3002 while (nz--) { 3003 jdx = 2*(*vi++); 3004 x1 = x[jdx];x2 = x[1+jdx]; 3005 s1 -= v[0]*x1 + v[2]*x2; 3006 s2 -= v[1]*x1 + v[3]*x2; 3007 v += 4; 3008 } 3009 x[idx] = s1; 3010 x[1+idx] = s2; 3011 } 3012 /* backward solve the upper triangular */ 3013 for (i=n-1; i>=0; i--){ 3014 v = aa + 4*diag[i] + 4; 3015 vi = aj + diag[i] + 1; 3016 nz = ai[i+1] - diag[i] - 1; 3017 idt = 2*i; 3018 s1 = x[idt]; s2 = x[1+idt]; 3019 while (nz--) { 3020 idx = 2*(*vi++); 3021 x1 = x[idx]; x2 = x[1+idx]; 3022 s1 -= v[0]*x1 + v[2]*x2; 3023 s2 -= v[1]*x1 + v[3]*x2; 3024 v += 4; 3025 } 3026 v = aa + 4*diag[i]; 3027 x[idt] = v[0]*s1 + v[2]*s2; 3028 x[1+idt] = v[1]*s1 + v[3]*s2; 3029 } 3030 3031 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3032 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3033 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 3034 PetscFunctionReturn(0); 3035 } 3036 3037 #undef __FUNCT__ 3038 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 3039 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 3040 { 3041 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3042 IS iscol=a->col,isrow=a->row; 3043 PetscErrorCode ierr; 3044 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 3045 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3046 MatScalar *aa=a->a,*v; 3047 PetscScalar *x,*b,s1,*t; 3048 3049 PetscFunctionBegin; 3050 if (!n) PetscFunctionReturn(0); 3051 3052 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3053 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3054 t = a->solve_work; 3055 3056 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3057 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3058 3059 /* forward solve the lower triangular */ 3060 t[0] = b[*r++]; 3061 for (i=1; i<n; i++) { 3062 v = aa + ai[i]; 3063 vi = aj + ai[i]; 3064 nz = diag[i] - ai[i]; 3065 s1 = b[*r++]; 3066 while (nz--) { 3067 s1 -= (*v++)*t[*vi++]; 3068 } 3069 t[i] = s1; 3070 } 3071 /* backward solve the upper triangular */ 3072 for (i=n-1; i>=0; i--){ 3073 v = aa + diag[i] + 1; 3074 vi = aj + diag[i] + 1; 3075 nz = ai[i+1] - diag[i] - 1; 3076 s1 = t[i]; 3077 while (nz--) { 3078 s1 -= (*v++)*t[*vi++]; 3079 } 3080 x[*c--] = t[i] = aa[diag[i]]*s1; 3081 } 3082 3083 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3084 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3085 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3086 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3087 ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 3088 PetscFunctionReturn(0); 3089 } 3090 /* 3091 Special case where the matrix was ILU(0) factored in the natural 3092 ordering. This eliminates the need for the column and row permutation. 3093 */ 3094 #undef __FUNCT__ 3095 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 3096 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 3097 { 3098 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3099 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3100 PetscErrorCode ierr; 3101 PetscInt *diag = a->diag; 3102 MatScalar *aa=a->a; 3103 PetscScalar *x,*b; 3104 PetscScalar s1,x1; 3105 MatScalar *v; 3106 PetscInt jdx,idt,idx,nz,*vi,i; 3107 3108 PetscFunctionBegin; 3109 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3110 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3111 3112 /* forward solve the lower triangular */ 3113 idx = 0; 3114 x[0] = b[0]; 3115 for (i=1; i<n; i++) { 3116 v = aa + ai[i]; 3117 vi = aj + ai[i]; 3118 nz = diag[i] - ai[i]; 3119 idx += 1; 3120 s1 = b[idx]; 3121 while (nz--) { 3122 jdx = *vi++; 3123 x1 = x[jdx]; 3124 s1 -= v[0]*x1; 3125 v += 1; 3126 } 3127 x[idx] = s1; 3128 } 3129 /* backward solve the upper triangular */ 3130 for (i=n-1; i>=0; i--){ 3131 v = aa + diag[i] + 1; 3132 vi = aj + diag[i] + 1; 3133 nz = ai[i+1] - diag[i] - 1; 3134 idt = i; 3135 s1 = x[idt]; 3136 while (nz--) { 3137 idx = *vi++; 3138 x1 = x[idx]; 3139 s1 -= v[0]*x1; 3140 v += 1; 3141 } 3142 v = aa + diag[i]; 3143 x[idt] = v[0]*s1; 3144 } 3145 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3146 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3147 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 3148 PetscFunctionReturn(0); 3149 } 3150 3151 /* ----------------------------------------------------------------*/ 3152 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption); 3153 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 3154 3155 extern PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat,Vec,Vec); 3156 #undef __FUNCT__ 3157 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 3158 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 3159 { 3160 Mat C=B; 3161 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 3162 IS isrow = b->row,isicol = b->icol; 3163 PetscErrorCode ierr; 3164 const PetscInt *r,*ic,*ics; 3165 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 3166 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 3167 MatScalar *rtmp,*pc,*multiplier,*v,*pv,*aa=a->a; 3168 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 3169 MatScalar *v_work; 3170 3171 PetscFunctionBegin; 3172 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3173 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 3174 ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 3175 ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr); 3176 ics = ic; 3177 3178 /* generate work space needed by dense LU factorization */ 3179 ierr = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr); 3180 multiplier = v_work + bs; 3181 v_pivots = (PetscInt*)(multiplier + bs2); 3182 3183 for (i=0; i<n; i++){ 3184 /* zero rtmp */ 3185 /* L part */ 3186 nz = bi[i+1] - bi[i]; 3187 bjtmp = bj + bi[i]; 3188 for (j=0; j<nz; j++){ 3189 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3190 } 3191 3192 /* U part */ 3193 nz = bi[2*n-i+1] - bi[2*n-i]; 3194 bjtmp = bj + bi[2*n-i]; 3195 for (j=0; j<nz; j++){ 3196 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3197 } 3198 3199 /* load in initial (unfactored row) */ 3200 nz = ai[r[i]+1] - ai[r[i]]; 3201 ajtmp = aj + ai[r[i]]; 3202 v = aa + bs2*ai[r[i]]; 3203 for (j=0; j<nz; j++) { 3204 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 3205 } 3206 3207 /* elimination */ 3208 bjtmp = bj + bi[i]; 3209 row = *bjtmp++; 3210 nzL = bi[i+1] - bi[i]; 3211 k = 0; 3212 while (k < nzL) { 3213 pc = rtmp + bs2*row; 3214 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 3215 if (flg) { 3216 pv = b->a + bs2*bdiag[row]; 3217 Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); /* *pc = *pc * (*pv); */ 3218 pj = b->j + bi[2*n-row]; /* begining of U(row,:) */ 3219 pv = b->a + bs2*bi[2*n-row]; 3220 nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */ 3221 for (j=0; j<nz; j++) { 3222 Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 3223 } 3224 ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr); 3225 } 3226 row = *bjtmp++; k++; 3227 } 3228 3229 /* finished row so stick it into b->a */ 3230 /* L part */ 3231 pv = b->a + bs2*bi[i] ; 3232 pj = b->j + bi[i] ; 3233 nz = bi[i+1] - bi[i]; 3234 for (j=0; j<nz; j++) { 3235 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3236 } 3237 3238 /* Mark diagonal and invert diagonal for simplier triangular solves */ 3239 pv = b->a + bs2*bdiag[i]; 3240 pj = b->j + bdiag[i]; 3241 /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 3242 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3243 ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 3244 3245 /* U part */ 3246 pv = b->a + bs2*bi[2*n-i]; 3247 pj = b->j + bi[2*n-i]; 3248 nz = bi[2*n-i+1] - bi[2*n-i] - 1; 3249 for (j=0; j<nz; j++){ 3250 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3251 } 3252 } 3253 3254 ierr = PetscFree(rtmp);CHKERRQ(ierr); 3255 ierr = PetscFree(v_work);CHKERRQ(ierr); 3256 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3257 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 3258 if (bs == 5){ 3259 C->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct; 3260 } else { 3261 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 3262 } 3263 C->assembled = PETSC_TRUE; 3264 ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 3265 PetscFunctionReturn(0); 3266 } 3267 3268 /* 3269 ilu(0) with natural ordering under new data structure. 3270 Factored arrays bj and ba are stored as 3271 L(0,:), L(1,:), ...,L(n-1,:), U(n-1,:),...,U(i,:),U(i-1,:),...,U(0,:) 3272 3273 bi=fact->i is an array of size 2n+2, in which 3274 bi+ 3275 bi[i] -> 1st entry of L(i,:),i=0,...,i-1 3276 bi[n] -> end of L(n-1,:)+1 3277 bi[n+1] -> 1st entry of U(n-1,:) 3278 bi[2n-i] -> 1st entry of U(i,:) 3279 bi[2n-i+1] -> end of U(i,:)+1, the 1st entry of U(i-1,:) 3280 bi[2n] -> end of U(0,:)+1 3281 3282 U(i,:) contains diag[i] as its last entry, i.e., 3283 U(i,:) = (u[i,i+1],...,u[i,n-1],diag[i]) 3284 */ 3285 #undef __FUNCT__ 3286 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 3287 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3288 { 3289 3290 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 3291 PetscErrorCode ierr; 3292 PetscInt mbs=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 3293 PetscInt i,j,nz=a->nz,*bi,*bj,*bdiag; 3294 3295 PetscFunctionBegin; 3296 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr); 3297 b = (Mat_SeqBAIJ*)(fact)->data; 3298 bdiag = b->diag; 3299 3300 /* replace matrix arrays with single allocations, then reset values */ 3301 ierr = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr); 3302 3303 ierr = PetscMalloc((2*mbs+2)*sizeof(PetscInt),&b->i);CHKERRQ(ierr); 3304 ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&b->j);CHKERRQ(ierr); 3305 ierr = PetscMalloc((bs2*nz+1)*sizeof(PetscScalar),&b->a);CHKERRQ(ierr); 3306 b->singlemalloc = PETSC_FALSE; 3307 if (mbs > 0) { 3308 ierr = PetscMemzero(b->a,bs2*nz*sizeof(MatScalar));CHKERRQ(ierr); 3309 } 3310 3311 /* set bi and bj with new data structure */ 3312 bi = b->i; 3313 bj = b->j; 3314 3315 /* L part */ 3316 bi[0] = 0; 3317 for (i=0; i<mbs; i++){ 3318 nz = adiag[i] - ai[i]; 3319 bi[i+1] = bi[i] + nz; 3320 aj = a->j + ai[i]; 3321 for (j=0; j<nz; j++){ 3322 *bj = aj[j]; bj++; 3323 } 3324 } 3325 3326 /* U part */ 3327 bi[mbs+1] = bi[mbs]; 3328 for (i=mbs-1; i>=0; i--){ 3329 nz = ai[i+1] - adiag[i] - 1; 3330 if (nz < 0) SETERRQ2(0,"row %d Unz %d",i,nz); 3331 bi[2*mbs-i+1] = bi[2*mbs-i] + nz + 1; 3332 aj = a->j + adiag[i] + 1; 3333 for (j=0; j<nz; j++){ 3334 *bj = aj[j]; bj++; 3335 } 3336 /* diag[i] */ 3337 *bj = i; bj++; 3338 bdiag[i] = bi[2*mbs-i+1]-1; 3339 } 3340 PetscFunctionReturn(0); 3341 } 3342 3343 /* 3344 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 3345 except that the data structure of Mat_SeqAIJ is slightly different. 3346 Not a good example of code reuse. 3347 */ 3348 3349 #undef __FUNCT__ 3350 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 3351 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3352 { 3353 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 3354 IS isicol; 3355 PetscErrorCode ierr; 3356 const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 3357 PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 3358 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 3359 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 3360 PetscTruth col_identity,row_identity,both_identity,flg; 3361 PetscReal f; 3362 3363 PetscFunctionBegin; 3364 ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 3365 if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 3366 3367 f = info->fill; 3368 levels = (PetscInt)info->levels; 3369 diagonal_fill = (PetscInt)info->diagonal_fill; 3370 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 3371 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3372 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 3373 both_identity = (PetscTruth) (row_identity && col_identity); 3374 3375 if (!levels && both_identity) { /* special case copy the nonzero structure */ 3376 3377 PetscTruth newdatastruct=PETSC_FALSE; 3378 ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 3379 if (newdatastruct){ 3380 ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 3381 (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 3382 } else { 3383 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr); 3384 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 3385 } 3386 3387 fact->factor = MAT_FACTOR_ILU; 3388 b = (Mat_SeqBAIJ*)(fact)->data; 3389 b->row = isrow; 3390 b->col = iscol; 3391 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3392 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3393 b->icol = isicol; 3394 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3395 ierr = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3396 PetscFunctionReturn(0); 3397 } 3398 3399 /* general case perform the symbolic factorization */ 3400 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3401 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 3402 3403 /* get new row pointers */ 3404 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 3405 ainew[0] = 0; 3406 /* don't know how many column pointers are needed so estimate */ 3407 jmax = (PetscInt)(f*ai[n] + 1); 3408 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 3409 /* ajfill is level of fill for each fill entry */ 3410 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 3411 /* fill is a linked list of nonzeros in active row */ 3412 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 3413 /* im is level for each filled value */ 3414 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 3415 /* dloc is location of diagonal in factor */ 3416 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 3417 dloc[0] = 0; 3418 for (prow=0; prow<n; prow++) { 3419 3420 /* copy prow into linked list */ 3421 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 3422 if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 3423 xi = aj + ai[r[prow]]; 3424 fill[n] = n; 3425 fill[prow] = -1; /* marker for diagonal entry */ 3426 while (nz--) { 3427 fm = n; 3428 idx = ic[*xi++]; 3429 do { 3430 m = fm; 3431 fm = fill[m]; 3432 } while (fm < idx); 3433 fill[m] = idx; 3434 fill[idx] = fm; 3435 im[idx] = 0; 3436 } 3437 3438 /* make sure diagonal entry is included */ 3439 if (diagonal_fill && fill[prow] == -1) { 3440 fm = n; 3441 while (fill[fm] < prow) fm = fill[fm]; 3442 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 3443 fill[fm] = prow; 3444 im[prow] = 0; 3445 nzf++; 3446 dcount++; 3447 } 3448 3449 nzi = 0; 3450 row = fill[n]; 3451 while (row < prow) { 3452 incrlev = im[row] + 1; 3453 nz = dloc[row]; 3454 xi = ajnew + ainew[row] + nz + 1; 3455 flev = ajfill + ainew[row] + nz + 1; 3456 nnz = ainew[row+1] - ainew[row] - nz - 1; 3457 fm = row; 3458 while (nnz-- > 0) { 3459 idx = *xi++; 3460 if (*flev + incrlev > levels) { 3461 flev++; 3462 continue; 3463 } 3464 do { 3465 m = fm; 3466 fm = fill[m]; 3467 } while (fm < idx); 3468 if (fm != idx) { 3469 im[idx] = *flev + incrlev; 3470 fill[m] = idx; 3471 fill[idx] = fm; 3472 fm = idx; 3473 nzf++; 3474 } else { 3475 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 3476 } 3477 flev++; 3478 } 3479 row = fill[row]; 3480 nzi++; 3481 } 3482 /* copy new filled row into permanent storage */ 3483 ainew[prow+1] = ainew[prow] + nzf; 3484 if (ainew[prow+1] > jmax) { 3485 3486 /* estimate how much additional space we will need */ 3487 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 3488 /* just double the memory each time */ 3489 PetscInt maxadd = jmax; 3490 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 3491 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 3492 jmax += maxadd; 3493 3494 /* allocate a longer ajnew and ajfill */ 3495 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 3496 ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3497 ierr = PetscFree(ajnew);CHKERRQ(ierr); 3498 ajnew = xitmp; 3499 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 3500 ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3501 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3502 ajfill = xitmp; 3503 reallocate++; /* count how many reallocations are needed */ 3504 } 3505 xitmp = ajnew + ainew[prow]; 3506 flev = ajfill + ainew[prow]; 3507 dloc[prow] = nzi; 3508 fm = fill[n]; 3509 while (nzf--) { 3510 *xitmp++ = fm; 3511 *flev++ = im[fm]; 3512 fm = fill[fm]; 3513 } 3514 /* make sure row has diagonal entry */ 3515 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 3516 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 3517 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 3518 } 3519 } 3520 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3521 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 3522 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3523 ierr = PetscFree(fill);CHKERRQ(ierr); 3524 ierr = PetscFree(im);CHKERRQ(ierr); 3525 3526 #if defined(PETSC_USE_INFO) 3527 { 3528 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 3529 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 3530 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 3531 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 3532 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 3533 if (diagonal_fill) { 3534 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 3535 } 3536 } 3537 #endif 3538 3539 /* put together the new matrix */ 3540 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 3541 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 3542 b = (Mat_SeqBAIJ*)(fact)->data; 3543 b->free_a = PETSC_TRUE; 3544 b->free_ij = PETSC_TRUE; 3545 b->singlemalloc = PETSC_FALSE; 3546 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 3547 b->j = ajnew; 3548 b->i = ainew; 3549 for (i=0; i<n; i++) dloc[i] += ainew[i]; 3550 b->diag = dloc; 3551 b->ilen = 0; 3552 b->imax = 0; 3553 b->row = isrow; 3554 b->col = iscol; 3555 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3556 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3557 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3558 b->icol = isicol; 3559 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3560 /* In b structure: Free imax, ilen, old a, old j. 3561 Allocate dloc, solve_work, new a, new j */ 3562 ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 3563 b->maxnz = b->nz = ainew[n]; 3564 3565 (fact)->info.factor_mallocs = reallocate; 3566 (fact)->info.fill_ratio_given = f; 3567 (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 3568 3569 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 3570 PetscFunctionReturn(0); 3571 } 3572 3573 #undef __FUNCT__ 3574 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 3575 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 3576 { 3577 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 3578 /* int i,*AJ=a->j,nz=a->nz; */ 3579 PetscFunctionBegin; 3580 /* Undo Column scaling */ 3581 /* while (nz--) { */ 3582 /* AJ[i] = AJ[i]/4; */ 3583 /* } */ 3584 /* This should really invoke a push/pop logic, but we don't have that yet. */ 3585 A->ops->setunfactored = PETSC_NULL; 3586 PetscFunctionReturn(0); 3587 } 3588 3589 #undef __FUNCT__ 3590 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 3591 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 3592 { 3593 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3594 PetscInt *AJ=a->j,nz=a->nz; 3595 unsigned short *aj=(unsigned short *)AJ; 3596 PetscFunctionBegin; 3597 /* Is this really necessary? */ 3598 while (nz--) { 3599 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 3600 } 3601 A->ops->setunfactored = PETSC_NULL; 3602 PetscFunctionReturn(0); 3603 } 3604 3605 3606