1 #define PETSCMAT_DLL 2 3 4 /* 5 Factorization code for BAIJ format. 6 */ 7 8 #include "../src/mat/impls/baij/seq/baij.h" 9 #include "../src/mat/blockinvert.h" 10 11 #undef __FUNCT__ 12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 13 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 14 { 15 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 16 PetscErrorCode ierr; 17 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 18 PetscInt *diag = a->diag; 19 MatScalar *aa=a->a,*v; 20 PetscScalar s1,*x,*b; 21 22 PetscFunctionBegin; 23 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 24 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 25 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 26 27 /* forward solve the U^T */ 28 for (i=0; i<n; i++) { 29 30 v = aa + diag[i]; 31 /* multiply by the inverse of the block diagonal */ 32 s1 = (*v++)*x[i]; 33 vi = aj + diag[i] + 1; 34 nz = ai[i+1] - diag[i] - 1; 35 while (nz--) { 36 x[*vi++] -= (*v++)*s1; 37 } 38 x[i] = s1; 39 } 40 /* backward solve the L^T */ 41 for (i=n-1; i>=0; i--){ 42 v = aa + diag[i] - 1; 43 vi = aj + diag[i] - 1; 44 nz = diag[i] - ai[i]; 45 s1 = x[i]; 46 while (nz--) { 47 x[*vi--] -= (*v--)*s1; 48 } 49 } 50 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 51 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 52 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 53 PetscFunctionReturn(0); 54 } 55 56 #undef __FUNCT__ 57 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 58 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 59 { 60 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 61 PetscErrorCode ierr; 62 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 63 PetscInt *diag = a->diag,oidx; 64 MatScalar *aa=a->a,*v; 65 PetscScalar s1,s2,x1,x2; 66 PetscScalar *x,*b; 67 68 PetscFunctionBegin; 69 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 70 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 71 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 72 73 /* forward solve the U^T */ 74 idx = 0; 75 for (i=0; i<n; i++) { 76 77 v = aa + 4*diag[i]; 78 /* multiply by the inverse of the block diagonal */ 79 x1 = x[idx]; x2 = x[1+idx]; 80 s1 = v[0]*x1 + v[1]*x2; 81 s2 = v[2]*x1 + v[3]*x2; 82 v += 4; 83 84 vi = aj + diag[i] + 1; 85 nz = ai[i+1] - diag[i] - 1; 86 while (nz--) { 87 oidx = 2*(*vi++); 88 x[oidx] -= v[0]*s1 + v[1]*s2; 89 x[oidx+1] -= v[2]*s1 + v[3]*s2; 90 v += 4; 91 } 92 x[idx] = s1;x[1+idx] = s2; 93 idx += 2; 94 } 95 /* backward solve the L^T */ 96 for (i=n-1; i>=0; i--){ 97 v = aa + 4*diag[i] - 4; 98 vi = aj + diag[i] - 1; 99 nz = diag[i] - ai[i]; 100 idt = 2*i; 101 s1 = x[idt]; s2 = x[1+idt]; 102 while (nz--) { 103 idx = 2*(*vi--); 104 x[idx] -= v[0]*s1 + v[1]*s2; 105 x[idx+1] -= v[2]*s1 + v[3]*s2; 106 v -= 4; 107 } 108 } 109 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 110 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 111 ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 112 PetscFunctionReturn(0); 113 } 114 115 #undef __FUNCT__ 116 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 117 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 118 { 119 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 120 PetscErrorCode ierr; 121 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 122 PetscInt *diag = a->diag,oidx; 123 MatScalar *aa=a->a,*v; 124 PetscScalar s1,s2,s3,x1,x2,x3; 125 PetscScalar *x,*b; 126 127 PetscFunctionBegin; 128 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 129 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 130 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 131 132 /* forward solve the U^T */ 133 idx = 0; 134 for (i=0; i<n; i++) { 135 136 v = aa + 9*diag[i]; 137 /* multiply by the inverse of the block diagonal */ 138 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 139 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 140 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 141 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 142 v += 9; 143 144 vi = aj + diag[i] + 1; 145 nz = ai[i+1] - diag[i] - 1; 146 while (nz--) { 147 oidx = 3*(*vi++); 148 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 149 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 150 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 151 v += 9; 152 } 153 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 154 idx += 3; 155 } 156 /* backward solve the L^T */ 157 for (i=n-1; i>=0; i--){ 158 v = aa + 9*diag[i] - 9; 159 vi = aj + diag[i] - 1; 160 nz = diag[i] - ai[i]; 161 idt = 3*i; 162 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 163 while (nz--) { 164 idx = 3*(*vi--); 165 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 166 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 167 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 168 v -= 9; 169 } 170 } 171 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 172 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 173 ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 174 PetscFunctionReturn(0); 175 } 176 177 #undef __FUNCT__ 178 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 179 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 180 { 181 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 182 PetscErrorCode ierr; 183 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 184 PetscInt *diag = a->diag,oidx; 185 MatScalar *aa=a->a,*v; 186 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 187 PetscScalar *x,*b; 188 189 PetscFunctionBegin; 190 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 191 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 192 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 193 194 /* forward solve the U^T */ 195 idx = 0; 196 for (i=0; i<n; i++) { 197 198 v = aa + 16*diag[i]; 199 /* multiply by the inverse of the block diagonal */ 200 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 201 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 202 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 203 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 204 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 205 v += 16; 206 207 vi = aj + diag[i] + 1; 208 nz = ai[i+1] - diag[i] - 1; 209 while (nz--) { 210 oidx = 4*(*vi++); 211 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 212 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 213 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 214 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 215 v += 16; 216 } 217 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 218 idx += 4; 219 } 220 /* backward solve the L^T */ 221 for (i=n-1; i>=0; i--){ 222 v = aa + 16*diag[i] - 16; 223 vi = aj + diag[i] - 1; 224 nz = diag[i] - ai[i]; 225 idt = 4*i; 226 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 227 while (nz--) { 228 idx = 4*(*vi--); 229 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 230 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 231 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 232 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 233 v -= 16; 234 } 235 } 236 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 237 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 238 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 239 PetscFunctionReturn(0); 240 } 241 242 #undef __FUNCT__ 243 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 244 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 245 { 246 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 247 PetscErrorCode ierr; 248 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 249 PetscInt *diag = a->diag,oidx; 250 MatScalar *aa=a->a,*v; 251 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 252 PetscScalar *x,*b; 253 254 PetscFunctionBegin; 255 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 256 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 257 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 258 259 /* forward solve the U^T */ 260 idx = 0; 261 for (i=0; i<n; i++) { 262 263 v = aa + 25*diag[i]; 264 /* multiply by the inverse of the block diagonal */ 265 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 266 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 267 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 268 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 269 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 270 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 271 v += 25; 272 273 vi = aj + diag[i] + 1; 274 nz = ai[i+1] - diag[i] - 1; 275 while (nz--) { 276 oidx = 5*(*vi++); 277 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 278 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 279 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 280 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 281 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 282 v += 25; 283 } 284 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 285 idx += 5; 286 } 287 /* backward solve the L^T */ 288 for (i=n-1; i>=0; i--){ 289 v = aa + 25*diag[i] - 25; 290 vi = aj + diag[i] - 1; 291 nz = diag[i] - ai[i]; 292 idt = 5*i; 293 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 294 while (nz--) { 295 idx = 5*(*vi--); 296 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 297 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 298 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 299 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 300 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 301 v -= 25; 302 } 303 } 304 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 305 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 306 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 307 PetscFunctionReturn(0); 308 } 309 310 #undef __FUNCT__ 311 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 312 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 313 { 314 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 315 PetscErrorCode ierr; 316 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 317 PetscInt *diag = a->diag,oidx; 318 MatScalar *aa=a->a,*v; 319 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 320 PetscScalar *x,*b; 321 322 PetscFunctionBegin; 323 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 324 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 325 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 326 327 /* forward solve the U^T */ 328 idx = 0; 329 for (i=0; i<n; i++) { 330 331 v = aa + 36*diag[i]; 332 /* multiply by the inverse of the block diagonal */ 333 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 334 x6 = x[5+idx]; 335 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 336 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 337 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 338 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 339 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 340 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 341 v += 36; 342 343 vi = aj + diag[i] + 1; 344 nz = ai[i+1] - diag[i] - 1; 345 while (nz--) { 346 oidx = 6*(*vi++); 347 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 348 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 349 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 350 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 351 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 352 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 353 v += 36; 354 } 355 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 356 x[5+idx] = s6; 357 idx += 6; 358 } 359 /* backward solve the L^T */ 360 for (i=n-1; i>=0; i--){ 361 v = aa + 36*diag[i] - 36; 362 vi = aj + diag[i] - 1; 363 nz = diag[i] - ai[i]; 364 idt = 6*i; 365 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 366 s6 = x[5+idt]; 367 while (nz--) { 368 idx = 6*(*vi--); 369 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 370 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 371 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 372 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 373 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 374 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 375 v -= 36; 376 } 377 } 378 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 379 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 380 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 381 PetscFunctionReturn(0); 382 } 383 384 #undef __FUNCT__ 385 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 386 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 387 { 388 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 389 PetscErrorCode ierr; 390 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 391 PetscInt *diag = a->diag,oidx; 392 MatScalar *aa=a->a,*v; 393 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 394 PetscScalar *x,*b; 395 396 PetscFunctionBegin; 397 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 398 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 399 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 400 401 /* forward solve the U^T */ 402 idx = 0; 403 for (i=0; i<n; i++) { 404 405 v = aa + 49*diag[i]; 406 /* multiply by the inverse of the block diagonal */ 407 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 408 x6 = x[5+idx]; x7 = x[6+idx]; 409 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 410 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 411 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 412 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 413 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 414 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 415 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 416 v += 49; 417 418 vi = aj + diag[i] + 1; 419 nz = ai[i+1] - diag[i] - 1; 420 while (nz--) { 421 oidx = 7*(*vi++); 422 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 423 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 424 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 425 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 426 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 427 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 428 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 429 v += 49; 430 } 431 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 432 x[5+idx] = s6;x[6+idx] = s7; 433 idx += 7; 434 } 435 /* backward solve the L^T */ 436 for (i=n-1; i>=0; i--){ 437 v = aa + 49*diag[i] - 49; 438 vi = aj + diag[i] - 1; 439 nz = diag[i] - ai[i]; 440 idt = 7*i; 441 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 442 s6 = x[5+idt];s7 = x[6+idt]; 443 while (nz--) { 444 idx = 7*(*vi--); 445 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 446 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 447 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 448 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 449 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 450 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 451 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 452 v -= 49; 453 } 454 } 455 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 456 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 457 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 458 PetscFunctionReturn(0); 459 } 460 461 /*---------------------------------------------------------------------------------------------*/ 462 #undef __FUNCT__ 463 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 464 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 465 { 466 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 467 IS iscol=a->col,isrow=a->row; 468 PetscErrorCode ierr; 469 const PetscInt *r,*c,*rout,*cout; 470 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 471 PetscInt *diag = a->diag; 472 MatScalar *aa=a->a,*v; 473 PetscScalar s1,*x,*b,*t; 474 475 PetscFunctionBegin; 476 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 477 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 478 t = a->solve_work; 479 480 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 481 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 482 483 /* copy the b into temp work space according to permutation */ 484 for (i=0; i<n; i++) { 485 t[i] = b[c[i]]; 486 } 487 488 /* forward solve the U^T */ 489 for (i=0; i<n; i++) { 490 491 v = aa + diag[i]; 492 /* multiply by the inverse of the block diagonal */ 493 s1 = (*v++)*t[i]; 494 vi = aj + diag[i] + 1; 495 nz = ai[i+1] - diag[i] - 1; 496 while (nz--) { 497 t[*vi++] -= (*v++)*s1; 498 } 499 t[i] = s1; 500 } 501 /* backward solve the L^T */ 502 for (i=n-1; i>=0; i--){ 503 v = aa + diag[i] - 1; 504 vi = aj + diag[i] - 1; 505 nz = diag[i] - ai[i]; 506 s1 = t[i]; 507 while (nz--) { 508 t[*vi--] -= (*v--)*s1; 509 } 510 } 511 512 /* copy t into x according to permutation */ 513 for (i=0; i<n; i++) { 514 x[r[i]] = t[i]; 515 } 516 517 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 518 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 519 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 520 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 521 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 522 PetscFunctionReturn(0); 523 } 524 525 #undef __FUNCT__ 526 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 527 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 528 { 529 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 530 IS iscol=a->col,isrow=a->row; 531 PetscErrorCode ierr; 532 const PetscInt *r,*c,*rout,*cout; 533 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 534 PetscInt *diag = a->diag,ii,ic,ir,oidx; 535 MatScalar *aa=a->a,*v; 536 PetscScalar s1,s2,x1,x2; 537 PetscScalar *x,*b,*t; 538 539 PetscFunctionBegin; 540 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 541 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 542 t = a->solve_work; 543 544 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 545 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 546 547 /* copy the b into temp work space according to permutation */ 548 ii = 0; 549 for (i=0; i<n; i++) { 550 ic = 2*c[i]; 551 t[ii] = b[ic]; 552 t[ii+1] = b[ic+1]; 553 ii += 2; 554 } 555 556 /* forward solve the U^T */ 557 idx = 0; 558 for (i=0; i<n; i++) { 559 560 v = aa + 4*diag[i]; 561 /* multiply by the inverse of the block diagonal */ 562 x1 = t[idx]; x2 = t[1+idx]; 563 s1 = v[0]*x1 + v[1]*x2; 564 s2 = v[2]*x1 + v[3]*x2; 565 v += 4; 566 567 vi = aj + diag[i] + 1; 568 nz = ai[i+1] - diag[i] - 1; 569 while (nz--) { 570 oidx = 2*(*vi++); 571 t[oidx] -= v[0]*s1 + v[1]*s2; 572 t[oidx+1] -= v[2]*s1 + v[3]*s2; 573 v += 4; 574 } 575 t[idx] = s1;t[1+idx] = s2; 576 idx += 2; 577 } 578 /* backward solve the L^T */ 579 for (i=n-1; i>=0; i--){ 580 v = aa + 4*diag[i] - 4; 581 vi = aj + diag[i] - 1; 582 nz = diag[i] - ai[i]; 583 idt = 2*i; 584 s1 = t[idt]; s2 = t[1+idt]; 585 while (nz--) { 586 idx = 2*(*vi--); 587 t[idx] -= v[0]*s1 + v[1]*s2; 588 t[idx+1] -= v[2]*s1 + v[3]*s2; 589 v -= 4; 590 } 591 } 592 593 /* copy t into x according to permutation */ 594 ii = 0; 595 for (i=0; i<n; i++) { 596 ir = 2*r[i]; 597 x[ir] = t[ii]; 598 x[ir+1] = t[ii+1]; 599 ii += 2; 600 } 601 602 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 603 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 604 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 605 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 606 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 607 PetscFunctionReturn(0); 608 } 609 610 #undef __FUNCT__ 611 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 612 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 613 { 614 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 615 IS iscol=a->col,isrow=a->row; 616 PetscErrorCode ierr; 617 const PetscInt *r,*c,*rout,*cout; 618 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 619 PetscInt *diag = a->diag,ii,ic,ir,oidx; 620 MatScalar *aa=a->a,*v; 621 PetscScalar s1,s2,s3,x1,x2,x3; 622 PetscScalar *x,*b,*t; 623 624 PetscFunctionBegin; 625 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 626 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 627 t = a->solve_work; 628 629 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 630 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 631 632 /* copy the b into temp work space according to permutation */ 633 ii = 0; 634 for (i=0; i<n; i++) { 635 ic = 3*c[i]; 636 t[ii] = b[ic]; 637 t[ii+1] = b[ic+1]; 638 t[ii+2] = b[ic+2]; 639 ii += 3; 640 } 641 642 /* forward solve the U^T */ 643 idx = 0; 644 for (i=0; i<n; i++) { 645 646 v = aa + 9*diag[i]; 647 /* multiply by the inverse of the block diagonal */ 648 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 649 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 650 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 651 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 652 v += 9; 653 654 vi = aj + diag[i] + 1; 655 nz = ai[i+1] - diag[i] - 1; 656 while (nz--) { 657 oidx = 3*(*vi++); 658 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 659 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 660 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 661 v += 9; 662 } 663 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 664 idx += 3; 665 } 666 /* backward solve the L^T */ 667 for (i=n-1; i>=0; i--){ 668 v = aa + 9*diag[i] - 9; 669 vi = aj + diag[i] - 1; 670 nz = diag[i] - ai[i]; 671 idt = 3*i; 672 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 673 while (nz--) { 674 idx = 3*(*vi--); 675 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 676 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 677 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 678 v -= 9; 679 } 680 } 681 682 /* copy t into x according to permutation */ 683 ii = 0; 684 for (i=0; i<n; i++) { 685 ir = 3*r[i]; 686 x[ir] = t[ii]; 687 x[ir+1] = t[ii+1]; 688 x[ir+2] = t[ii+2]; 689 ii += 3; 690 } 691 692 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 693 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 694 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 695 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 696 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 697 PetscFunctionReturn(0); 698 } 699 700 #undef __FUNCT__ 701 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 702 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 703 { 704 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 705 IS iscol=a->col,isrow=a->row; 706 PetscErrorCode ierr; 707 const PetscInt *r,*c,*rout,*cout; 708 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 709 PetscInt *diag = a->diag,ii,ic,ir,oidx; 710 MatScalar *aa=a->a,*v; 711 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 712 PetscScalar *x,*b,*t; 713 714 PetscFunctionBegin; 715 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 716 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 717 t = a->solve_work; 718 719 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 720 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 721 722 /* copy the b into temp work space according to permutation */ 723 ii = 0; 724 for (i=0; i<n; i++) { 725 ic = 4*c[i]; 726 t[ii] = b[ic]; 727 t[ii+1] = b[ic+1]; 728 t[ii+2] = b[ic+2]; 729 t[ii+3] = b[ic+3]; 730 ii += 4; 731 } 732 733 /* forward solve the U^T */ 734 idx = 0; 735 for (i=0; i<n; i++) { 736 737 v = aa + 16*diag[i]; 738 /* multiply by the inverse of the block diagonal */ 739 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 740 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 741 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 742 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 743 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 744 v += 16; 745 746 vi = aj + diag[i] + 1; 747 nz = ai[i+1] - diag[i] - 1; 748 while (nz--) { 749 oidx = 4*(*vi++); 750 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 751 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 752 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 753 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 754 v += 16; 755 } 756 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 757 idx += 4; 758 } 759 /* backward solve the L^T */ 760 for (i=n-1; i>=0; i--){ 761 v = aa + 16*diag[i] - 16; 762 vi = aj + diag[i] - 1; 763 nz = diag[i] - ai[i]; 764 idt = 4*i; 765 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 766 while (nz--) { 767 idx = 4*(*vi--); 768 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 769 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 770 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 771 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 772 v -= 16; 773 } 774 } 775 776 /* copy t into x according to permutation */ 777 ii = 0; 778 for (i=0; i<n; i++) { 779 ir = 4*r[i]; 780 x[ir] = t[ii]; 781 x[ir+1] = t[ii+1]; 782 x[ir+2] = t[ii+2]; 783 x[ir+3] = t[ii+3]; 784 ii += 4; 785 } 786 787 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 788 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 789 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 790 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 791 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 792 PetscFunctionReturn(0); 793 } 794 795 #undef __FUNCT__ 796 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 797 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 798 { 799 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 800 IS iscol=a->col,isrow=a->row; 801 PetscErrorCode ierr; 802 const PetscInt *r,*c,*rout,*cout; 803 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 804 PetscInt *diag = a->diag,ii,ic,ir,oidx; 805 MatScalar *aa=a->a,*v; 806 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 807 PetscScalar *x,*b,*t; 808 809 PetscFunctionBegin; 810 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 811 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 812 t = a->solve_work; 813 814 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 815 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 816 817 /* copy the b into temp work space according to permutation */ 818 ii = 0; 819 for (i=0; i<n; i++) { 820 ic = 5*c[i]; 821 t[ii] = b[ic]; 822 t[ii+1] = b[ic+1]; 823 t[ii+2] = b[ic+2]; 824 t[ii+3] = b[ic+3]; 825 t[ii+4] = b[ic+4]; 826 ii += 5; 827 } 828 829 /* forward solve the U^T */ 830 idx = 0; 831 for (i=0; i<n; i++) { 832 833 v = aa + 25*diag[i]; 834 /* multiply by the inverse of the block diagonal */ 835 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 836 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 837 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 838 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 839 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 840 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 841 v += 25; 842 843 vi = aj + diag[i] + 1; 844 nz = ai[i+1] - diag[i] - 1; 845 while (nz--) { 846 oidx = 5*(*vi++); 847 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 848 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 849 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 850 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 851 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 852 v += 25; 853 } 854 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 855 idx += 5; 856 } 857 /* backward solve the L^T */ 858 for (i=n-1; i>=0; i--){ 859 v = aa + 25*diag[i] - 25; 860 vi = aj + diag[i] - 1; 861 nz = diag[i] - ai[i]; 862 idt = 5*i; 863 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 864 while (nz--) { 865 idx = 5*(*vi--); 866 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 867 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 868 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 869 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 870 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 871 v -= 25; 872 } 873 } 874 875 /* copy t into x according to permutation */ 876 ii = 0; 877 for (i=0; i<n; i++) { 878 ir = 5*r[i]; 879 x[ir] = t[ii]; 880 x[ir+1] = t[ii+1]; 881 x[ir+2] = t[ii+2]; 882 x[ir+3] = t[ii+3]; 883 x[ir+4] = t[ii+4]; 884 ii += 5; 885 } 886 887 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 888 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 889 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 890 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 891 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 892 PetscFunctionReturn(0); 893 } 894 895 #undef __FUNCT__ 896 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 897 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 898 { 899 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 900 IS iscol=a->col,isrow=a->row; 901 PetscErrorCode ierr; 902 const PetscInt *r,*c,*rout,*cout; 903 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 904 PetscInt *diag = a->diag,ii,ic,ir,oidx; 905 MatScalar *aa=a->a,*v; 906 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 907 PetscScalar *x,*b,*t; 908 909 PetscFunctionBegin; 910 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 911 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 912 t = a->solve_work; 913 914 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 915 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 916 917 /* copy the b into temp work space according to permutation */ 918 ii = 0; 919 for (i=0; i<n; i++) { 920 ic = 6*c[i]; 921 t[ii] = b[ic]; 922 t[ii+1] = b[ic+1]; 923 t[ii+2] = b[ic+2]; 924 t[ii+3] = b[ic+3]; 925 t[ii+4] = b[ic+4]; 926 t[ii+5] = b[ic+5]; 927 ii += 6; 928 } 929 930 /* forward solve the U^T */ 931 idx = 0; 932 for (i=0; i<n; i++) { 933 934 v = aa + 36*diag[i]; 935 /* multiply by the inverse of the block diagonal */ 936 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 937 x6 = t[5+idx]; 938 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 939 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 940 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 941 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 942 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 943 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 944 v += 36; 945 946 vi = aj + diag[i] + 1; 947 nz = ai[i+1] - diag[i] - 1; 948 while (nz--) { 949 oidx = 6*(*vi++); 950 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 951 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 952 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 953 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 954 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 955 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 956 v += 36; 957 } 958 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 959 t[5+idx] = s6; 960 idx += 6; 961 } 962 /* backward solve the L^T */ 963 for (i=n-1; i>=0; i--){ 964 v = aa + 36*diag[i] - 36; 965 vi = aj + diag[i] - 1; 966 nz = diag[i] - ai[i]; 967 idt = 6*i; 968 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 969 s6 = t[5+idt]; 970 while (nz--) { 971 idx = 6*(*vi--); 972 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 973 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 974 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 975 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 976 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 977 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 978 v -= 36; 979 } 980 } 981 982 /* copy t into x according to permutation */ 983 ii = 0; 984 for (i=0; i<n; i++) { 985 ir = 6*r[i]; 986 x[ir] = t[ii]; 987 x[ir+1] = t[ii+1]; 988 x[ir+2] = t[ii+2]; 989 x[ir+3] = t[ii+3]; 990 x[ir+4] = t[ii+4]; 991 x[ir+5] = t[ii+5]; 992 ii += 6; 993 } 994 995 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 996 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 997 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 998 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 999 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1000 PetscFunctionReturn(0); 1001 } 1002 1003 #undef __FUNCT__ 1004 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1005 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1006 { 1007 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1008 IS iscol=a->col,isrow=a->row; 1009 PetscErrorCode ierr; 1010 const PetscInt *r,*c,*rout,*cout; 1011 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1012 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1013 MatScalar *aa=a->a,*v; 1014 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1015 PetscScalar *x,*b,*t; 1016 1017 PetscFunctionBegin; 1018 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1019 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1020 t = a->solve_work; 1021 1022 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1023 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1024 1025 /* copy the b into temp work space according to permutation */ 1026 ii = 0; 1027 for (i=0; i<n; i++) { 1028 ic = 7*c[i]; 1029 t[ii] = b[ic]; 1030 t[ii+1] = b[ic+1]; 1031 t[ii+2] = b[ic+2]; 1032 t[ii+3] = b[ic+3]; 1033 t[ii+4] = b[ic+4]; 1034 t[ii+5] = b[ic+5]; 1035 t[ii+6] = b[ic+6]; 1036 ii += 7; 1037 } 1038 1039 /* forward solve the U^T */ 1040 idx = 0; 1041 for (i=0; i<n; i++) { 1042 1043 v = aa + 49*diag[i]; 1044 /* multiply by the inverse of the block diagonal */ 1045 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1046 x6 = t[5+idx]; x7 = t[6+idx]; 1047 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1048 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1049 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1050 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1051 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1052 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1053 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1054 v += 49; 1055 1056 vi = aj + diag[i] + 1; 1057 nz = ai[i+1] - diag[i] - 1; 1058 while (nz--) { 1059 oidx = 7*(*vi++); 1060 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1061 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1062 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1063 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1064 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1065 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1066 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1067 v += 49; 1068 } 1069 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1070 t[5+idx] = s6;t[6+idx] = s7; 1071 idx += 7; 1072 } 1073 /* backward solve the L^T */ 1074 for (i=n-1; i>=0; i--){ 1075 v = aa + 49*diag[i] - 49; 1076 vi = aj + diag[i] - 1; 1077 nz = diag[i] - ai[i]; 1078 idt = 7*i; 1079 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1080 s6 = t[5+idt];s7 = t[6+idt]; 1081 while (nz--) { 1082 idx = 7*(*vi--); 1083 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1084 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1085 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1086 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1087 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1088 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1089 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1090 v -= 49; 1091 } 1092 } 1093 1094 /* copy t into x according to permutation */ 1095 ii = 0; 1096 for (i=0; i<n; i++) { 1097 ir = 7*r[i]; 1098 x[ir] = t[ii]; 1099 x[ir+1] = t[ii+1]; 1100 x[ir+2] = t[ii+2]; 1101 x[ir+3] = t[ii+3]; 1102 x[ir+4] = t[ii+4]; 1103 x[ir+5] = t[ii+5]; 1104 x[ir+6] = t[ii+6]; 1105 ii += 7; 1106 } 1107 1108 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1109 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1110 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1111 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1112 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1113 PetscFunctionReturn(0); 1114 } 1115 1116 /* ----------------------------------------------------------- */ 1117 #undef __FUNCT__ 1118 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1119 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1120 { 1121 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1122 IS iscol=a->col,isrow=a->row; 1123 PetscErrorCode ierr; 1124 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 1125 PetscInt i,n=a->mbs; 1126 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 1127 MatScalar *aa=a->a,*v; 1128 PetscScalar *x,*b,*s,*t,*ls; 1129 1130 PetscFunctionBegin; 1131 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1132 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1133 t = a->solve_work; 1134 1135 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1136 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1137 1138 /* forward solve the lower triangular */ 1139 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1140 for (i=1; i<n; i++) { 1141 v = aa + bs2*ai[i]; 1142 vi = aj + ai[i]; 1143 nz = a->diag[i] - ai[i]; 1144 s = t + bs*i; 1145 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1146 while (nz--) { 1147 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1148 v += bs2; 1149 } 1150 } 1151 /* backward solve the upper triangular */ 1152 ls = a->solve_work + A->cmap->n; 1153 for (i=n-1; i>=0; i--){ 1154 v = aa + bs2*(a->diag[i] + 1); 1155 vi = aj + a->diag[i] + 1; 1156 nz = ai[i+1] - a->diag[i] - 1; 1157 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1158 while (nz--) { 1159 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1160 v += bs2; 1161 } 1162 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1163 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1164 } 1165 1166 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1167 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1168 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1169 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1170 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 1171 PetscFunctionReturn(0); 1172 } 1173 1174 #undef __FUNCT__ 1175 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1176 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1177 { 1178 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1179 IS iscol=a->col,isrow=a->row; 1180 PetscErrorCode ierr; 1181 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 1182 PetscInt i,n=a->mbs,nz,idx,idt,idc; 1183 MatScalar *aa=a->a,*v; 1184 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1185 PetscScalar *x,*b,*t; 1186 1187 PetscFunctionBegin; 1188 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1189 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1190 t = a->solve_work; 1191 1192 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1193 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1194 1195 /* forward solve the lower triangular */ 1196 idx = 7*(*r++); 1197 t[0] = b[idx]; t[1] = b[1+idx]; 1198 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1199 t[5] = b[5+idx]; t[6] = b[6+idx]; 1200 1201 for (i=1; i<n; i++) { 1202 v = aa + 49*ai[i]; 1203 vi = aj + ai[i]; 1204 nz = diag[i] - ai[i]; 1205 idx = 7*(*r++); 1206 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1207 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1208 while (nz--) { 1209 idx = 7*(*vi++); 1210 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1211 x4 = t[3+idx];x5 = t[4+idx]; 1212 x6 = t[5+idx];x7 = t[6+idx]; 1213 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1214 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1215 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1216 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1217 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1218 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1219 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1220 v += 49; 1221 } 1222 idx = 7*i; 1223 t[idx] = s1;t[1+idx] = s2; 1224 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1225 t[5+idx] = s6;t[6+idx] = s7; 1226 } 1227 /* backward solve the upper triangular */ 1228 for (i=n-1; i>=0; i--){ 1229 v = aa + 49*diag[i] + 49; 1230 vi = aj + diag[i] + 1; 1231 nz = ai[i+1] - diag[i] - 1; 1232 idt = 7*i; 1233 s1 = t[idt]; s2 = t[1+idt]; 1234 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1235 s6 = t[5+idt];s7 = t[6+idt]; 1236 while (nz--) { 1237 idx = 7*(*vi++); 1238 x1 = t[idx]; x2 = t[1+idx]; 1239 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1240 x6 = t[5+idx]; x7 = t[6+idx]; 1241 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1242 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1243 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1244 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1245 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1246 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1247 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1248 v += 49; 1249 } 1250 idc = 7*(*c--); 1251 v = aa + 49*diag[i]; 1252 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1253 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1254 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1255 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1256 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1257 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1258 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1259 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1260 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1261 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1262 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1263 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1264 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1265 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1266 } 1267 1268 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1269 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1270 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1271 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1272 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1273 PetscFunctionReturn(0); 1274 } 1275 1276 #undef __FUNCT__ 1277 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1278 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1279 { 1280 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1281 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1282 PetscErrorCode ierr; 1283 PetscInt *diag = a->diag,jdx; 1284 const MatScalar *aa=a->a,*v; 1285 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1286 const PetscScalar *b; 1287 1288 PetscFunctionBegin; 1289 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1290 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1291 /* forward solve the lower triangular */ 1292 idx = 0; 1293 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1294 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1295 x[6] = b[6+idx]; 1296 for (i=1; i<n; i++) { 1297 v = aa + 49*ai[i]; 1298 vi = aj + ai[i]; 1299 nz = diag[i] - ai[i]; 1300 idx = 7*i; 1301 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1302 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1303 s7 = b[6+idx]; 1304 while (nz--) { 1305 jdx = 7*(*vi++); 1306 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1307 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1308 x7 = x[6+jdx]; 1309 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1310 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1311 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1312 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1313 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1314 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1315 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1316 v += 49; 1317 } 1318 x[idx] = s1; 1319 x[1+idx] = s2; 1320 x[2+idx] = s3; 1321 x[3+idx] = s4; 1322 x[4+idx] = s5; 1323 x[5+idx] = s6; 1324 x[6+idx] = s7; 1325 } 1326 /* backward solve the upper triangular */ 1327 for (i=n-1; i>=0; i--){ 1328 v = aa + 49*diag[i] + 49; 1329 vi = aj + diag[i] + 1; 1330 nz = ai[i+1] - diag[i] - 1; 1331 idt = 7*i; 1332 s1 = x[idt]; s2 = x[1+idt]; 1333 s3 = x[2+idt]; s4 = x[3+idt]; 1334 s5 = x[4+idt]; s6 = x[5+idt]; 1335 s7 = x[6+idt]; 1336 while (nz--) { 1337 idx = 7*(*vi++); 1338 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1339 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1340 x7 = x[6+idx]; 1341 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1342 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1343 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1344 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1345 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1346 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1347 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1348 v += 49; 1349 } 1350 v = aa + 49*diag[i]; 1351 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1352 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1353 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1354 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1355 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1356 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1357 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1358 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1359 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1360 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1361 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1362 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1363 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1364 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1365 } 1366 1367 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1368 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1369 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1370 PetscFunctionReturn(0); 1371 } 1372 1373 #undef __FUNCT__ 1374 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct" 1375 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1376 { 1377 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1378 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 1379 PetscErrorCode ierr; 1380 PetscInt idx,jdx,idt; 1381 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1382 const MatScalar *aa=a->a,*v; 1383 PetscScalar *x; 1384 const PetscScalar *b; 1385 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1386 1387 PetscFunctionBegin; 1388 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1389 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1390 /* forward solve the lower triangular */ 1391 idx = 0; 1392 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1393 x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 1394 for (i=1; i<n; i++) { 1395 v = aa + bs2*ai[i]; 1396 vi = aj + ai[i]; 1397 nz = ai[i+1] - ai[i]; 1398 idx = bs*i; 1399 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1400 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1401 while (nz--) { 1402 jdx = bs*(*vi++); 1403 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1404 x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 1405 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1406 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1407 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1408 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1409 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1410 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1411 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1412 v += bs2; 1413 } 1414 1415 x[idx] = s1; 1416 x[1+idx] = s2; 1417 x[2+idx] = s3; 1418 x[3+idx] = s4; 1419 x[4+idx] = s5; 1420 x[5+idx] = s6; 1421 x[6+idx] = s7; 1422 } 1423 1424 /* backward solve the upper triangular */ 1425 for (i=n-1; i>=0; i--){ 1426 v = aa + bs2*ai[2*n-i]; 1427 vi = aj + ai[2*n-i]; 1428 nz = ai[2*n-i +1] - ai[2*n-i]-1; 1429 idt = bs*i; 1430 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1431 s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 1432 while (nz--) { 1433 idx = bs*(*vi++); 1434 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1435 x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 1436 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1437 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1438 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1439 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1440 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1441 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1442 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1443 v += bs2; 1444 } 1445 /* x = inv_diagonal*x */ 1446 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1447 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1448 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1449 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1450 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1451 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1452 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1453 } 1454 1455 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1456 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1457 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1458 PetscFunctionReturn(0); 1459 } 1460 1461 #undef __FUNCT__ 1462 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1463 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1464 { 1465 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1466 IS iscol=a->col,isrow=a->row; 1467 PetscErrorCode ierr; 1468 const PetscInt *r,*c,*rout,*cout; 1469 PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1470 const MatScalar *aa=a->a,*v; 1471 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1472 const PetscScalar *b; 1473 PetscFunctionBegin; 1474 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1475 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1476 t = a->solve_work; 1477 1478 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1479 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1480 1481 /* forward solve the lower triangular */ 1482 idx = 6*(*r++); 1483 t[0] = b[idx]; t[1] = b[1+idx]; 1484 t[2] = b[2+idx]; t[3] = b[3+idx]; 1485 t[4] = b[4+idx]; t[5] = b[5+idx]; 1486 for (i=1; i<n; i++) { 1487 v = aa + 36*ai[i]; 1488 vi = aj + ai[i]; 1489 nz = diag[i] - ai[i]; 1490 idx = 6*(*r++); 1491 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1492 s5 = b[4+idx]; s6 = b[5+idx]; 1493 while (nz--) { 1494 idx = 6*(*vi++); 1495 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1496 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1497 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1498 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1499 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1500 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1501 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1502 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1503 v += 36; 1504 } 1505 idx = 6*i; 1506 t[idx] = s1;t[1+idx] = s2; 1507 t[2+idx] = s3;t[3+idx] = s4; 1508 t[4+idx] = s5;t[5+idx] = s6; 1509 } 1510 /* backward solve the upper triangular */ 1511 for (i=n-1; i>=0; i--){ 1512 v = aa + 36*diag[i] + 36; 1513 vi = aj + diag[i] + 1; 1514 nz = ai[i+1] - diag[i] - 1; 1515 idt = 6*i; 1516 s1 = t[idt]; s2 = t[1+idt]; 1517 s3 = t[2+idt];s4 = t[3+idt]; 1518 s5 = t[4+idt];s6 = t[5+idt]; 1519 while (nz--) { 1520 idx = 6*(*vi++); 1521 x1 = t[idx]; x2 = t[1+idx]; 1522 x3 = t[2+idx]; x4 = t[3+idx]; 1523 x5 = t[4+idx]; x6 = t[5+idx]; 1524 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1525 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1526 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1527 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1528 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1529 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1530 v += 36; 1531 } 1532 idc = 6*(*c--); 1533 v = aa + 36*diag[i]; 1534 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1535 v[18]*s4+v[24]*s5+v[30]*s6; 1536 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1537 v[19]*s4+v[25]*s5+v[31]*s6; 1538 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1539 v[20]*s4+v[26]*s5+v[32]*s6; 1540 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1541 v[21]*s4+v[27]*s5+v[33]*s6; 1542 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1543 v[22]*s4+v[28]*s5+v[34]*s6; 1544 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1545 v[23]*s4+v[29]*s5+v[35]*s6; 1546 } 1547 1548 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1549 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1550 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1551 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1552 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1553 PetscFunctionReturn(0); 1554 } 1555 1556 #undef __FUNCT__ 1557 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1558 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1559 { 1560 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1561 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1562 PetscErrorCode ierr; 1563 PetscInt *diag = a->diag,jdx; 1564 const MatScalar *aa=a->a,*v; 1565 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1566 const PetscScalar *b; 1567 1568 PetscFunctionBegin; 1569 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1570 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1571 /* forward solve the lower triangular */ 1572 idx = 0; 1573 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1574 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1575 for (i=1; i<n; i++) { 1576 v = aa + 36*ai[i]; 1577 vi = aj + ai[i]; 1578 nz = diag[i] - ai[i]; 1579 idx = 6*i; 1580 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1581 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1582 while (nz--) { 1583 jdx = 6*(*vi++); 1584 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1585 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1586 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1587 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1588 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1589 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1590 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1591 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1592 v += 36; 1593 } 1594 x[idx] = s1; 1595 x[1+idx] = s2; 1596 x[2+idx] = s3; 1597 x[3+idx] = s4; 1598 x[4+idx] = s5; 1599 x[5+idx] = s6; 1600 } 1601 /* backward solve the upper triangular */ 1602 for (i=n-1; i>=0; i--){ 1603 v = aa + 36*diag[i] + 36; 1604 vi = aj + diag[i] + 1; 1605 nz = ai[i+1] - diag[i] - 1; 1606 idt = 6*i; 1607 s1 = x[idt]; s2 = x[1+idt]; 1608 s3 = x[2+idt]; s4 = x[3+idt]; 1609 s5 = x[4+idt]; s6 = x[5+idt]; 1610 while (nz--) { 1611 idx = 6*(*vi++); 1612 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1613 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1614 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1615 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1616 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1617 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1618 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1619 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1620 v += 36; 1621 } 1622 v = aa + 36*diag[i]; 1623 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1624 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1625 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1626 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1627 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1628 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1629 } 1630 1631 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1632 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1633 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1634 PetscFunctionReturn(0); 1635 } 1636 1637 #undef __FUNCT__ 1638 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct" 1639 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1640 { 1641 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1642 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 1643 PetscErrorCode ierr; 1644 PetscInt idx,jdx,idt; 1645 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1646 const MatScalar *aa=a->a,*v; 1647 PetscScalar *x; 1648 const PetscScalar *b; 1649 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1650 1651 PetscFunctionBegin; 1652 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1653 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1654 /* forward solve the lower triangular */ 1655 idx = 0; 1656 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1657 x[4] = b[4+idx];x[5] = b[5+idx]; 1658 for (i=1; i<n; i++) { 1659 v = aa + bs2*ai[i]; 1660 vi = aj + ai[i]; 1661 nz = ai[i+1] - ai[i]; 1662 idx = bs*i; 1663 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1664 s5 = b[4+idx];s6 = b[5+idx]; 1665 while (nz--) { 1666 jdx = bs*(*vi++); 1667 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1668 x5 = x[4+jdx]; x6 = x[5+jdx]; 1669 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1670 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 1671 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1672 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1673 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1674 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1675 v += bs2; 1676 } 1677 1678 x[idx] = s1; 1679 x[1+idx] = s2; 1680 x[2+idx] = s3; 1681 x[3+idx] = s4; 1682 x[4+idx] = s5; 1683 x[5+idx] = s6; 1684 } 1685 1686 /* backward solve the upper triangular */ 1687 for (i=n-1; i>=0; i--){ 1688 v = aa + bs2*ai[2*n-i]; 1689 vi = aj + ai[2*n-i]; 1690 nz = ai[2*n-i +1] - ai[2*n-i]-1; 1691 idt = bs*i; 1692 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1693 s5 = x[4+idt];s6 = x[5+idt]; 1694 while (nz--) { 1695 idx = bs*(*vi++); 1696 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1697 x5 = x[4+idx];x6 = x[5+idx]; 1698 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1699 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 1700 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1701 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1702 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1703 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1704 v += bs2; 1705 } 1706 /* x = inv_diagonal*x */ 1707 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1708 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1709 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1710 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1711 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1712 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1713 } 1714 1715 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1716 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1717 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1718 PetscFunctionReturn(0); 1719 } 1720 #undef __FUNCT__ 1721 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1722 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1723 { 1724 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1725 IS iscol=a->col,isrow=a->row; 1726 PetscErrorCode ierr; 1727 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 1728 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1729 const MatScalar *aa=a->a,*v; 1730 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1731 const PetscScalar *b; 1732 1733 PetscFunctionBegin; 1734 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1735 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1736 t = a->solve_work; 1737 1738 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1739 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1740 1741 /* forward solve the lower triangular */ 1742 idx = 5*(*r++); 1743 t[0] = b[idx]; t[1] = b[1+idx]; 1744 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1745 for (i=1; i<n; i++) { 1746 v = aa + 25*ai[i]; 1747 vi = aj + ai[i]; 1748 nz = diag[i] - ai[i]; 1749 idx = 5*(*r++); 1750 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1751 s5 = b[4+idx]; 1752 while (nz--) { 1753 idx = 5*(*vi++); 1754 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1755 x4 = t[3+idx];x5 = t[4+idx]; 1756 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1757 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1758 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1759 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1760 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1761 v += 25; 1762 } 1763 idx = 5*i; 1764 t[idx] = s1;t[1+idx] = s2; 1765 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1766 } 1767 /* backward solve the upper triangular */ 1768 for (i=n-1; i>=0; i--){ 1769 v = aa + 25*diag[i] + 25; 1770 vi = aj + diag[i] + 1; 1771 nz = ai[i+1] - diag[i] - 1; 1772 idt = 5*i; 1773 s1 = t[idt]; s2 = t[1+idt]; 1774 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1775 while (nz--) { 1776 idx = 5*(*vi++); 1777 x1 = t[idx]; x2 = t[1+idx]; 1778 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1779 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1780 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1781 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1782 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1783 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1784 v += 25; 1785 } 1786 idc = 5*(*c--); 1787 v = aa + 25*diag[i]; 1788 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1789 v[15]*s4+v[20]*s5; 1790 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1791 v[16]*s4+v[21]*s5; 1792 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1793 v[17]*s4+v[22]*s5; 1794 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1795 v[18]*s4+v[23]*s5; 1796 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1797 v[19]*s4+v[24]*s5; 1798 } 1799 1800 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1801 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1802 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1803 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1804 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1805 PetscFunctionReturn(0); 1806 } 1807 1808 #undef __FUNCT__ 1809 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1810 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 1811 { 1812 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1813 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1814 PetscErrorCode ierr; 1815 PetscInt *diag = a->diag,jdx; 1816 const MatScalar *aa=a->a,*v; 1817 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1818 const PetscScalar *b; 1819 1820 PetscFunctionBegin; 1821 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1822 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1823 /* forward solve the lower triangular */ 1824 idx = 0; 1825 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1826 for (i=1; i<n; i++) { 1827 v = aa + 25*ai[i]; 1828 vi = aj + ai[i]; 1829 nz = diag[i] - ai[i]; 1830 idx = 5*i; 1831 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1832 while (nz--) { 1833 jdx = 5*(*vi++); 1834 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1835 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1836 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1837 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1838 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1839 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1840 v += 25; 1841 } 1842 x[idx] = s1; 1843 x[1+idx] = s2; 1844 x[2+idx] = s3; 1845 x[3+idx] = s4; 1846 x[4+idx] = s5; 1847 } 1848 /* backward solve the upper triangular */ 1849 for (i=n-1; i>=0; i--){ 1850 v = aa + 25*diag[i] + 25; 1851 vi = aj + diag[i] + 1; 1852 nz = ai[i+1] - diag[i] - 1; 1853 idt = 5*i; 1854 s1 = x[idt]; s2 = x[1+idt]; 1855 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1856 while (nz--) { 1857 idx = 5*(*vi++); 1858 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1859 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1860 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1861 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1862 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1863 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1864 v += 25; 1865 } 1866 v = aa + 25*diag[i]; 1867 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1868 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1869 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1870 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1871 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1872 } 1873 1874 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1875 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1876 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1877 PetscFunctionReturn(0); 1878 } 1879 1880 #undef __FUNCT__ 1881 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct" 1882 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1883 { 1884 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1885 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1886 PetscErrorCode ierr; 1887 PetscInt jdx; 1888 const MatScalar *aa=a->a,*v; 1889 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1890 const PetscScalar *b; 1891 1892 PetscFunctionBegin; 1893 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1894 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1895 /* forward solve the lower triangular */ 1896 idx = 0; 1897 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1898 for (i=1; i<n; i++) { 1899 v = aa + 25*ai[i]; 1900 vi = aj + ai[i]; 1901 nz = ai[i+1] - ai[i]; 1902 idx = 5*i; 1903 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1904 while (nz--) { 1905 jdx = 5*(*vi++); 1906 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1907 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1908 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1909 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1910 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1911 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1912 v += 25; 1913 } 1914 x[idx] = s1; 1915 x[1+idx] = s2; 1916 x[2+idx] = s3; 1917 x[3+idx] = s4; 1918 x[4+idx] = s5; 1919 } 1920 1921 /* backward solve the upper triangular */ 1922 for (i=n-1; i>=0; i--){ 1923 v = aa + 25*ai[2*n-i]; 1924 vi = aj + ai[2*n-i]; 1925 nz = ai[2*n-i +1] - ai[2*n-i]-1; 1926 idt = 5*i; 1927 s1 = x[idt]; s2 = x[1+idt]; 1928 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1929 while (nz--) { 1930 idx = 5*(*vi++); 1931 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1932 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1933 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1934 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1935 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1936 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1937 v += 25; 1938 } 1939 /* x = inv_diagonal*x */ 1940 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1941 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1942 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1943 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1944 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1945 } 1946 1947 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1948 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1949 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1950 PetscFunctionReturn(0); 1951 } 1952 1953 #undef __FUNCT__ 1954 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1955 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1956 { 1957 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1958 IS iscol=a->col,isrow=a->row; 1959 PetscErrorCode ierr; 1960 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1961 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1962 const MatScalar *aa=a->a,*v; 1963 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1964 const PetscScalar *b; 1965 1966 PetscFunctionBegin; 1967 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1968 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1969 t = a->solve_work; 1970 1971 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1972 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1973 1974 /* forward solve the lower triangular */ 1975 idx = 4*(*r++); 1976 t[0] = b[idx]; t[1] = b[1+idx]; 1977 t[2] = b[2+idx]; t[3] = b[3+idx]; 1978 for (i=1; i<n; i++) { 1979 v = aa + 16*ai[i]; 1980 vi = aj + ai[i]; 1981 nz = diag[i] - ai[i]; 1982 idx = 4*(*r++); 1983 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1984 while (nz--) { 1985 idx = 4*(*vi++); 1986 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1987 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1988 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1989 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1990 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1991 v += 16; 1992 } 1993 idx = 4*i; 1994 t[idx] = s1;t[1+idx] = s2; 1995 t[2+idx] = s3;t[3+idx] = s4; 1996 } 1997 /* backward solve the upper triangular */ 1998 for (i=n-1; i>=0; i--){ 1999 v = aa + 16*diag[i] + 16; 2000 vi = aj + diag[i] + 1; 2001 nz = ai[i+1] - diag[i] - 1; 2002 idt = 4*i; 2003 s1 = t[idt]; s2 = t[1+idt]; 2004 s3 = t[2+idt];s4 = t[3+idt]; 2005 while (nz--) { 2006 idx = 4*(*vi++); 2007 x1 = t[idx]; x2 = t[1+idx]; 2008 x3 = t[2+idx]; x4 = t[3+idx]; 2009 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2010 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2011 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2012 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2013 v += 16; 2014 } 2015 idc = 4*(*c--); 2016 v = aa + 16*diag[i]; 2017 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2018 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2019 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2020 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2021 } 2022 2023 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2024 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2025 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2026 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2027 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2028 PetscFunctionReturn(0); 2029 } 2030 2031 #undef __FUNCT__ 2032 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 2033 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 2034 { 2035 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2036 IS iscol=a->col,isrow=a->row; 2037 PetscErrorCode ierr; 2038 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2039 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2040 const MatScalar *aa=a->a,*v; 2041 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 2042 PetscScalar *x; 2043 const PetscScalar *b; 2044 2045 PetscFunctionBegin; 2046 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2047 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2048 t = (MatScalar *)a->solve_work; 2049 2050 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2051 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2052 2053 /* forward solve the lower triangular */ 2054 idx = 4*(*r++); 2055 t[0] = (MatScalar)b[idx]; 2056 t[1] = (MatScalar)b[1+idx]; 2057 t[2] = (MatScalar)b[2+idx]; 2058 t[3] = (MatScalar)b[3+idx]; 2059 for (i=1; i<n; i++) { 2060 v = aa + 16*ai[i]; 2061 vi = aj + ai[i]; 2062 nz = diag[i] - ai[i]; 2063 idx = 4*(*r++); 2064 s1 = (MatScalar)b[idx]; 2065 s2 = (MatScalar)b[1+idx]; 2066 s3 = (MatScalar)b[2+idx]; 2067 s4 = (MatScalar)b[3+idx]; 2068 while (nz--) { 2069 idx = 4*(*vi++); 2070 x1 = t[idx]; 2071 x2 = t[1+idx]; 2072 x3 = t[2+idx]; 2073 x4 = t[3+idx]; 2074 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2075 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2076 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2077 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2078 v += 16; 2079 } 2080 idx = 4*i; 2081 t[idx] = s1; 2082 t[1+idx] = s2; 2083 t[2+idx] = s3; 2084 t[3+idx] = s4; 2085 } 2086 /* backward solve the upper triangular */ 2087 for (i=n-1; i>=0; i--){ 2088 v = aa + 16*diag[i] + 16; 2089 vi = aj + diag[i] + 1; 2090 nz = ai[i+1] - diag[i] - 1; 2091 idt = 4*i; 2092 s1 = t[idt]; 2093 s2 = t[1+idt]; 2094 s3 = t[2+idt]; 2095 s4 = t[3+idt]; 2096 while (nz--) { 2097 idx = 4*(*vi++); 2098 x1 = t[idx]; 2099 x2 = t[1+idx]; 2100 x3 = t[2+idx]; 2101 x4 = t[3+idx]; 2102 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2103 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2104 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2105 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2106 v += 16; 2107 } 2108 idc = 4*(*c--); 2109 v = aa + 16*diag[i]; 2110 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2111 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2112 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2113 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2114 x[idc] = (PetscScalar)t[idt]; 2115 x[1+idc] = (PetscScalar)t[1+idt]; 2116 x[2+idc] = (PetscScalar)t[2+idt]; 2117 x[3+idc] = (PetscScalar)t[3+idt]; 2118 } 2119 2120 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2121 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2122 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2123 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2124 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2125 PetscFunctionReturn(0); 2126 } 2127 2128 #if defined (PETSC_HAVE_SSE) 2129 2130 #include PETSC_HAVE_SSE 2131 2132 #undef __FUNCT__ 2133 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 2134 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 2135 { 2136 /* 2137 Note: This code uses demotion of double 2138 to float when performing the mixed-mode computation. 2139 This may not be numerically reasonable for all applications. 2140 */ 2141 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2142 IS iscol=a->col,isrow=a->row; 2143 PetscErrorCode ierr; 2144 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 2145 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2146 MatScalar *aa=a->a,*v; 2147 PetscScalar *x,*b,*t; 2148 2149 /* Make space in temp stack for 16 Byte Aligned arrays */ 2150 float ssealignedspace[11],*tmps,*tmpx; 2151 unsigned long offset; 2152 2153 PetscFunctionBegin; 2154 SSE_SCOPE_BEGIN; 2155 2156 offset = (unsigned long)ssealignedspace % 16; 2157 if (offset) offset = (16 - offset)/4; 2158 tmps = &ssealignedspace[offset]; 2159 tmpx = &ssealignedspace[offset+4]; 2160 PREFETCH_NTA(aa+16*ai[1]); 2161 2162 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2163 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2164 t = a->solve_work; 2165 2166 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2167 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2168 2169 /* forward solve the lower triangular */ 2170 idx = 4*(*r++); 2171 t[0] = b[idx]; t[1] = b[1+idx]; 2172 t[2] = b[2+idx]; t[3] = b[3+idx]; 2173 v = aa + 16*ai[1]; 2174 2175 for (i=1; i<n;) { 2176 PREFETCH_NTA(&v[8]); 2177 vi = aj + ai[i]; 2178 nz = diag[i] - ai[i]; 2179 idx = 4*(*r++); 2180 2181 /* Demote sum from double to float */ 2182 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 2183 LOAD_PS(tmps,XMM7); 2184 2185 while (nz--) { 2186 PREFETCH_NTA(&v[16]); 2187 idx = 4*(*vi++); 2188 2189 /* Demote solution (so far) from double to float */ 2190 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 2191 2192 /* 4x4 Matrix-Vector product with negative accumulation: */ 2193 SSE_INLINE_BEGIN_2(tmpx,v) 2194 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2195 2196 /* First Column */ 2197 SSE_COPY_PS(XMM0,XMM6) 2198 SSE_SHUFFLE(XMM0,XMM0,0x00) 2199 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2200 SSE_SUB_PS(XMM7,XMM0) 2201 2202 /* Second Column */ 2203 SSE_COPY_PS(XMM1,XMM6) 2204 SSE_SHUFFLE(XMM1,XMM1,0x55) 2205 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2206 SSE_SUB_PS(XMM7,XMM1) 2207 2208 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2209 2210 /* Third Column */ 2211 SSE_COPY_PS(XMM2,XMM6) 2212 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2213 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2214 SSE_SUB_PS(XMM7,XMM2) 2215 2216 /* Fourth Column */ 2217 SSE_COPY_PS(XMM3,XMM6) 2218 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2219 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2220 SSE_SUB_PS(XMM7,XMM3) 2221 SSE_INLINE_END_2 2222 2223 v += 16; 2224 } 2225 idx = 4*i; 2226 v = aa + 16*ai[++i]; 2227 PREFETCH_NTA(v); 2228 STORE_PS(tmps,XMM7); 2229 2230 /* Promote result from float to double */ 2231 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 2232 } 2233 /* backward solve the upper triangular */ 2234 idt = 4*(n-1); 2235 ai16 = 16*diag[n-1]; 2236 v = aa + ai16 + 16; 2237 for (i=n-1; i>=0;){ 2238 PREFETCH_NTA(&v[8]); 2239 vi = aj + diag[i] + 1; 2240 nz = ai[i+1] - diag[i] - 1; 2241 2242 /* Demote accumulator from double to float */ 2243 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 2244 LOAD_PS(tmps,XMM7); 2245 2246 while (nz--) { 2247 PREFETCH_NTA(&v[16]); 2248 idx = 4*(*vi++); 2249 2250 /* Demote solution (so far) from double to float */ 2251 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 2252 2253 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2254 SSE_INLINE_BEGIN_2(tmpx,v) 2255 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2256 2257 /* First Column */ 2258 SSE_COPY_PS(XMM0,XMM6) 2259 SSE_SHUFFLE(XMM0,XMM0,0x00) 2260 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2261 SSE_SUB_PS(XMM7,XMM0) 2262 2263 /* Second Column */ 2264 SSE_COPY_PS(XMM1,XMM6) 2265 SSE_SHUFFLE(XMM1,XMM1,0x55) 2266 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2267 SSE_SUB_PS(XMM7,XMM1) 2268 2269 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2270 2271 /* Third Column */ 2272 SSE_COPY_PS(XMM2,XMM6) 2273 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2274 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2275 SSE_SUB_PS(XMM7,XMM2) 2276 2277 /* Fourth Column */ 2278 SSE_COPY_PS(XMM3,XMM6) 2279 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2280 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2281 SSE_SUB_PS(XMM7,XMM3) 2282 SSE_INLINE_END_2 2283 v += 16; 2284 } 2285 v = aa + ai16; 2286 ai16 = 16*diag[--i]; 2287 PREFETCH_NTA(aa+ai16+16); 2288 /* 2289 Scale the result by the diagonal 4x4 block, 2290 which was inverted as part of the factorization 2291 */ 2292 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 2293 /* First Column */ 2294 SSE_COPY_PS(XMM0,XMM7) 2295 SSE_SHUFFLE(XMM0,XMM0,0x00) 2296 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2297 2298 /* Second Column */ 2299 SSE_COPY_PS(XMM1,XMM7) 2300 SSE_SHUFFLE(XMM1,XMM1,0x55) 2301 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2302 SSE_ADD_PS(XMM0,XMM1) 2303 2304 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2305 2306 /* Third Column */ 2307 SSE_COPY_PS(XMM2,XMM7) 2308 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2309 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2310 SSE_ADD_PS(XMM0,XMM2) 2311 2312 /* Fourth Column */ 2313 SSE_COPY_PS(XMM3,XMM7) 2314 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2315 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2316 SSE_ADD_PS(XMM0,XMM3) 2317 2318 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2319 SSE_INLINE_END_3 2320 2321 /* Promote solution from float to double */ 2322 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 2323 2324 /* Apply reordering to t and stream into x. */ 2325 /* This way, x doesn't pollute the cache. */ 2326 /* Be careful with size: 2 doubles = 4 floats! */ 2327 idc = 4*(*c--); 2328 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 2329 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 2330 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 2331 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 2332 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 2333 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 2334 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 2335 SSE_INLINE_END_2 2336 v = aa + ai16 + 16; 2337 idt -= 4; 2338 } 2339 2340 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2341 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2342 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2343 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2344 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2345 SSE_SCOPE_END; 2346 PetscFunctionReturn(0); 2347 } 2348 2349 #endif 2350 2351 2352 /* 2353 Special case where the matrix was ILU(0) factored in the natural 2354 ordering. This eliminates the need for the column and row permutation. 2355 */ 2356 #undef __FUNCT__ 2357 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2358 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 2359 { 2360 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2361 PetscInt n=a->mbs; 2362 const PetscInt *ai=a->i,*aj=a->j; 2363 PetscErrorCode ierr; 2364 const PetscInt *diag = a->diag; 2365 const MatScalar *aa=a->a; 2366 PetscScalar *x; 2367 const PetscScalar *b; 2368 2369 PetscFunctionBegin; 2370 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2371 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2372 2373 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 2374 { 2375 static PetscScalar w[2000]; /* very BAD need to fix */ 2376 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 2377 } 2378 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 2379 { 2380 static PetscScalar w[2000]; /* very BAD need to fix */ 2381 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 2382 } 2383 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 2384 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2385 #else 2386 { 2387 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2388 const MatScalar *v; 2389 PetscInt jdx,idt,idx,nz,i,ai16; 2390 const PetscInt *vi; 2391 2392 /* forward solve the lower triangular */ 2393 idx = 0; 2394 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 2395 for (i=1; i<n; i++) { 2396 v = aa + 16*ai[i]; 2397 vi = aj + ai[i]; 2398 nz = diag[i] - ai[i]; 2399 idx += 4; 2400 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2401 while (nz--) { 2402 jdx = 4*(*vi++); 2403 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2404 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2405 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2406 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2407 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2408 v += 16; 2409 } 2410 x[idx] = s1; 2411 x[1+idx] = s2; 2412 x[2+idx] = s3; 2413 x[3+idx] = s4; 2414 } 2415 /* backward solve the upper triangular */ 2416 idt = 4*(n-1); 2417 for (i=n-1; i>=0; i--){ 2418 ai16 = 16*diag[i]; 2419 v = aa + ai16 + 16; 2420 vi = aj + diag[i] + 1; 2421 nz = ai[i+1] - diag[i] - 1; 2422 s1 = x[idt]; s2 = x[1+idt]; 2423 s3 = x[2+idt];s4 = x[3+idt]; 2424 while (nz--) { 2425 idx = 4*(*vi++); 2426 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2427 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2428 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2429 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2430 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2431 v += 16; 2432 } 2433 v = aa + ai16; 2434 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2435 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2436 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2437 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2438 idt -= 4; 2439 } 2440 } 2441 #endif 2442 2443 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2444 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2445 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2446 PetscFunctionReturn(0); 2447 } 2448 2449 #undef __FUNCT__ 2450 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct" 2451 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2452 { 2453 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2454 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 2455 PetscErrorCode ierr; 2456 PetscInt idx,jdx,idt; 2457 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2458 const MatScalar *aa=a->a,*v; 2459 PetscScalar *x; 2460 const PetscScalar *b; 2461 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2462 2463 PetscFunctionBegin; 2464 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2465 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2466 /* forward solve the lower triangular */ 2467 idx = 0; 2468 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2469 for (i=1; i<n; i++) { 2470 v = aa + bs2*ai[i]; 2471 vi = aj + ai[i]; 2472 nz = ai[i+1] - ai[i]; 2473 idx = bs*i; 2474 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2475 while (nz--) { 2476 jdx = bs*(*vi++); 2477 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2478 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2479 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2480 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2481 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2482 2483 v += bs2; 2484 } 2485 2486 x[idx] = s1; 2487 x[1+idx] = s2; 2488 x[2+idx] = s3; 2489 x[3+idx] = s4; 2490 } 2491 2492 /* backward solve the upper triangular */ 2493 for (i=n-1; i>=0; i--){ 2494 v = aa + bs2*ai[2*n-i]; 2495 vi = aj + ai[2*n-i]; 2496 nz = ai[2*n-i +1] - ai[2*n-i]-1; 2497 idt = bs*i; 2498 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2499 2500 while (nz--) { 2501 idx = bs*(*vi++); 2502 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2503 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2504 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2505 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2506 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2507 2508 v += bs2; 2509 } 2510 /* x = inv_diagonal*x */ 2511 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2512 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 2513 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2514 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2515 2516 } 2517 2518 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2519 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2520 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2521 PetscFunctionReturn(0); 2522 } 2523 2524 2525 2526 #undef __FUNCT__ 2527 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2528 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2529 { 2530 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2531 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2532 PetscErrorCode ierr; 2533 PetscInt *diag = a->diag; 2534 MatScalar *aa=a->a; 2535 PetscScalar *x,*b; 2536 2537 PetscFunctionBegin; 2538 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2539 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2540 2541 { 2542 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2543 MatScalar *v,*t=(MatScalar *)x; 2544 PetscInt jdx,idt,idx,nz,*vi,i,ai16; 2545 2546 /* forward solve the lower triangular */ 2547 idx = 0; 2548 t[0] = (MatScalar)b[0]; 2549 t[1] = (MatScalar)b[1]; 2550 t[2] = (MatScalar)b[2]; 2551 t[3] = (MatScalar)b[3]; 2552 for (i=1; i<n; i++) { 2553 v = aa + 16*ai[i]; 2554 vi = aj + ai[i]; 2555 nz = diag[i] - ai[i]; 2556 idx += 4; 2557 s1 = (MatScalar)b[idx]; 2558 s2 = (MatScalar)b[1+idx]; 2559 s3 = (MatScalar)b[2+idx]; 2560 s4 = (MatScalar)b[3+idx]; 2561 while (nz--) { 2562 jdx = 4*(*vi++); 2563 x1 = t[jdx]; 2564 x2 = t[1+jdx]; 2565 x3 = t[2+jdx]; 2566 x4 = t[3+jdx]; 2567 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2568 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2569 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2570 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2571 v += 16; 2572 } 2573 t[idx] = s1; 2574 t[1+idx] = s2; 2575 t[2+idx] = s3; 2576 t[3+idx] = s4; 2577 } 2578 /* backward solve the upper triangular */ 2579 idt = 4*(n-1); 2580 for (i=n-1; i>=0; i--){ 2581 ai16 = 16*diag[i]; 2582 v = aa + ai16 + 16; 2583 vi = aj + diag[i] + 1; 2584 nz = ai[i+1] - diag[i] - 1; 2585 s1 = t[idt]; 2586 s2 = t[1+idt]; 2587 s3 = t[2+idt]; 2588 s4 = t[3+idt]; 2589 while (nz--) { 2590 idx = 4*(*vi++); 2591 x1 = (MatScalar)x[idx]; 2592 x2 = (MatScalar)x[1+idx]; 2593 x3 = (MatScalar)x[2+idx]; 2594 x4 = (MatScalar)x[3+idx]; 2595 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2596 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2597 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2598 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2599 v += 16; 2600 } 2601 v = aa + ai16; 2602 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2603 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2604 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2605 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2606 idt -= 4; 2607 } 2608 } 2609 2610 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2611 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2612 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2613 PetscFunctionReturn(0); 2614 } 2615 2616 #if defined (PETSC_HAVE_SSE) 2617 2618 #include PETSC_HAVE_SSE 2619 #undef __FUNCT__ 2620 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 2621 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 2622 { 2623 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2624 unsigned short *aj=(unsigned short *)a->j; 2625 PetscErrorCode ierr; 2626 int *ai=a->i,n=a->mbs,*diag = a->diag; 2627 MatScalar *aa=a->a; 2628 PetscScalar *x,*b; 2629 2630 PetscFunctionBegin; 2631 SSE_SCOPE_BEGIN; 2632 /* 2633 Note: This code currently uses demotion of double 2634 to float when performing the mixed-mode computation. 2635 This may not be numerically reasonable for all applications. 2636 */ 2637 PREFETCH_NTA(aa+16*ai[1]); 2638 2639 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2640 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2641 { 2642 /* x will first be computed in single precision then promoted inplace to double */ 2643 MatScalar *v,*t=(MatScalar *)x; 2644 int nz,i,idt,ai16; 2645 unsigned int jdx,idx; 2646 unsigned short *vi; 2647 /* Forward solve the lower triangular factor. */ 2648 2649 /* First block is the identity. */ 2650 idx = 0; 2651 CONVERT_DOUBLE4_FLOAT4(t,b); 2652 v = aa + 16*((unsigned int)ai[1]); 2653 2654 for (i=1; i<n;) { 2655 PREFETCH_NTA(&v[8]); 2656 vi = aj + ai[i]; 2657 nz = diag[i] - ai[i]; 2658 idx += 4; 2659 2660 /* Demote RHS from double to float. */ 2661 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2662 LOAD_PS(&t[idx],XMM7); 2663 2664 while (nz--) { 2665 PREFETCH_NTA(&v[16]); 2666 jdx = 4*((unsigned int)(*vi++)); 2667 2668 /* 4x4 Matrix-Vector product with negative accumulation: */ 2669 SSE_INLINE_BEGIN_2(&t[jdx],v) 2670 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2671 2672 /* First Column */ 2673 SSE_COPY_PS(XMM0,XMM6) 2674 SSE_SHUFFLE(XMM0,XMM0,0x00) 2675 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2676 SSE_SUB_PS(XMM7,XMM0) 2677 2678 /* Second Column */ 2679 SSE_COPY_PS(XMM1,XMM6) 2680 SSE_SHUFFLE(XMM1,XMM1,0x55) 2681 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2682 SSE_SUB_PS(XMM7,XMM1) 2683 2684 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2685 2686 /* Third Column */ 2687 SSE_COPY_PS(XMM2,XMM6) 2688 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2689 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2690 SSE_SUB_PS(XMM7,XMM2) 2691 2692 /* Fourth Column */ 2693 SSE_COPY_PS(XMM3,XMM6) 2694 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2695 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2696 SSE_SUB_PS(XMM7,XMM3) 2697 SSE_INLINE_END_2 2698 2699 v += 16; 2700 } 2701 v = aa + 16*ai[++i]; 2702 PREFETCH_NTA(v); 2703 STORE_PS(&t[idx],XMM7); 2704 } 2705 2706 /* Backward solve the upper triangular factor.*/ 2707 2708 idt = 4*(n-1); 2709 ai16 = 16*diag[n-1]; 2710 v = aa + ai16 + 16; 2711 for (i=n-1; i>=0;){ 2712 PREFETCH_NTA(&v[8]); 2713 vi = aj + diag[i] + 1; 2714 nz = ai[i+1] - diag[i] - 1; 2715 2716 LOAD_PS(&t[idt],XMM7); 2717 2718 while (nz--) { 2719 PREFETCH_NTA(&v[16]); 2720 idx = 4*((unsigned int)(*vi++)); 2721 2722 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2723 SSE_INLINE_BEGIN_2(&t[idx],v) 2724 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2725 2726 /* First Column */ 2727 SSE_COPY_PS(XMM0,XMM6) 2728 SSE_SHUFFLE(XMM0,XMM0,0x00) 2729 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2730 SSE_SUB_PS(XMM7,XMM0) 2731 2732 /* Second Column */ 2733 SSE_COPY_PS(XMM1,XMM6) 2734 SSE_SHUFFLE(XMM1,XMM1,0x55) 2735 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2736 SSE_SUB_PS(XMM7,XMM1) 2737 2738 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2739 2740 /* Third Column */ 2741 SSE_COPY_PS(XMM2,XMM6) 2742 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2743 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2744 SSE_SUB_PS(XMM7,XMM2) 2745 2746 /* Fourth Column */ 2747 SSE_COPY_PS(XMM3,XMM6) 2748 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2749 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2750 SSE_SUB_PS(XMM7,XMM3) 2751 SSE_INLINE_END_2 2752 v += 16; 2753 } 2754 v = aa + ai16; 2755 ai16 = 16*diag[--i]; 2756 PREFETCH_NTA(aa+ai16+16); 2757 /* 2758 Scale the result by the diagonal 4x4 block, 2759 which was inverted as part of the factorization 2760 */ 2761 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2762 /* First Column */ 2763 SSE_COPY_PS(XMM0,XMM7) 2764 SSE_SHUFFLE(XMM0,XMM0,0x00) 2765 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2766 2767 /* Second Column */ 2768 SSE_COPY_PS(XMM1,XMM7) 2769 SSE_SHUFFLE(XMM1,XMM1,0x55) 2770 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2771 SSE_ADD_PS(XMM0,XMM1) 2772 2773 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2774 2775 /* Third Column */ 2776 SSE_COPY_PS(XMM2,XMM7) 2777 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2778 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2779 SSE_ADD_PS(XMM0,XMM2) 2780 2781 /* Fourth Column */ 2782 SSE_COPY_PS(XMM3,XMM7) 2783 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2784 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2785 SSE_ADD_PS(XMM0,XMM3) 2786 2787 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2788 SSE_INLINE_END_3 2789 2790 v = aa + ai16 + 16; 2791 idt -= 4; 2792 } 2793 2794 /* Convert t from single precision back to double precision (inplace)*/ 2795 idt = 4*(n-1); 2796 for (i=n-1;i>=0;i--) { 2797 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2798 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2799 PetscScalar *xtemp=&x[idt]; 2800 MatScalar *ttemp=&t[idt]; 2801 xtemp[3] = (PetscScalar)ttemp[3]; 2802 xtemp[2] = (PetscScalar)ttemp[2]; 2803 xtemp[1] = (PetscScalar)ttemp[1]; 2804 xtemp[0] = (PetscScalar)ttemp[0]; 2805 idt -= 4; 2806 } 2807 2808 } /* End of artificial scope. */ 2809 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2810 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2811 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2812 SSE_SCOPE_END; 2813 PetscFunctionReturn(0); 2814 } 2815 2816 #undef __FUNCT__ 2817 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2818 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 2819 { 2820 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2821 int *aj=a->j; 2822 PetscErrorCode ierr; 2823 int *ai=a->i,n=a->mbs,*diag = a->diag; 2824 MatScalar *aa=a->a; 2825 PetscScalar *x,*b; 2826 2827 PetscFunctionBegin; 2828 SSE_SCOPE_BEGIN; 2829 /* 2830 Note: This code currently uses demotion of double 2831 to float when performing the mixed-mode computation. 2832 This may not be numerically reasonable for all applications. 2833 */ 2834 PREFETCH_NTA(aa+16*ai[1]); 2835 2836 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2837 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2838 { 2839 /* x will first be computed in single precision then promoted inplace to double */ 2840 MatScalar *v,*t=(MatScalar *)x; 2841 int nz,i,idt,ai16; 2842 int jdx,idx; 2843 int *vi; 2844 /* Forward solve the lower triangular factor. */ 2845 2846 /* First block is the identity. */ 2847 idx = 0; 2848 CONVERT_DOUBLE4_FLOAT4(t,b); 2849 v = aa + 16*ai[1]; 2850 2851 for (i=1; i<n;) { 2852 PREFETCH_NTA(&v[8]); 2853 vi = aj + ai[i]; 2854 nz = diag[i] - ai[i]; 2855 idx += 4; 2856 2857 /* Demote RHS from double to float. */ 2858 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2859 LOAD_PS(&t[idx],XMM7); 2860 2861 while (nz--) { 2862 PREFETCH_NTA(&v[16]); 2863 jdx = 4*(*vi++); 2864 /* jdx = *vi++; */ 2865 2866 /* 4x4 Matrix-Vector product with negative accumulation: */ 2867 SSE_INLINE_BEGIN_2(&t[jdx],v) 2868 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2869 2870 /* First Column */ 2871 SSE_COPY_PS(XMM0,XMM6) 2872 SSE_SHUFFLE(XMM0,XMM0,0x00) 2873 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2874 SSE_SUB_PS(XMM7,XMM0) 2875 2876 /* Second Column */ 2877 SSE_COPY_PS(XMM1,XMM6) 2878 SSE_SHUFFLE(XMM1,XMM1,0x55) 2879 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2880 SSE_SUB_PS(XMM7,XMM1) 2881 2882 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2883 2884 /* Third Column */ 2885 SSE_COPY_PS(XMM2,XMM6) 2886 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2887 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2888 SSE_SUB_PS(XMM7,XMM2) 2889 2890 /* Fourth Column */ 2891 SSE_COPY_PS(XMM3,XMM6) 2892 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2893 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2894 SSE_SUB_PS(XMM7,XMM3) 2895 SSE_INLINE_END_2 2896 2897 v += 16; 2898 } 2899 v = aa + 16*ai[++i]; 2900 PREFETCH_NTA(v); 2901 STORE_PS(&t[idx],XMM7); 2902 } 2903 2904 /* Backward solve the upper triangular factor.*/ 2905 2906 idt = 4*(n-1); 2907 ai16 = 16*diag[n-1]; 2908 v = aa + ai16 + 16; 2909 for (i=n-1; i>=0;){ 2910 PREFETCH_NTA(&v[8]); 2911 vi = aj + diag[i] + 1; 2912 nz = ai[i+1] - diag[i] - 1; 2913 2914 LOAD_PS(&t[idt],XMM7); 2915 2916 while (nz--) { 2917 PREFETCH_NTA(&v[16]); 2918 idx = 4*(*vi++); 2919 /* idx = *vi++; */ 2920 2921 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2922 SSE_INLINE_BEGIN_2(&t[idx],v) 2923 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2924 2925 /* First Column */ 2926 SSE_COPY_PS(XMM0,XMM6) 2927 SSE_SHUFFLE(XMM0,XMM0,0x00) 2928 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2929 SSE_SUB_PS(XMM7,XMM0) 2930 2931 /* Second Column */ 2932 SSE_COPY_PS(XMM1,XMM6) 2933 SSE_SHUFFLE(XMM1,XMM1,0x55) 2934 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2935 SSE_SUB_PS(XMM7,XMM1) 2936 2937 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2938 2939 /* Third Column */ 2940 SSE_COPY_PS(XMM2,XMM6) 2941 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2942 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2943 SSE_SUB_PS(XMM7,XMM2) 2944 2945 /* Fourth Column */ 2946 SSE_COPY_PS(XMM3,XMM6) 2947 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2948 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2949 SSE_SUB_PS(XMM7,XMM3) 2950 SSE_INLINE_END_2 2951 v += 16; 2952 } 2953 v = aa + ai16; 2954 ai16 = 16*diag[--i]; 2955 PREFETCH_NTA(aa+ai16+16); 2956 /* 2957 Scale the result by the diagonal 4x4 block, 2958 which was inverted as part of the factorization 2959 */ 2960 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2961 /* First Column */ 2962 SSE_COPY_PS(XMM0,XMM7) 2963 SSE_SHUFFLE(XMM0,XMM0,0x00) 2964 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2965 2966 /* Second Column */ 2967 SSE_COPY_PS(XMM1,XMM7) 2968 SSE_SHUFFLE(XMM1,XMM1,0x55) 2969 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2970 SSE_ADD_PS(XMM0,XMM1) 2971 2972 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2973 2974 /* Third Column */ 2975 SSE_COPY_PS(XMM2,XMM7) 2976 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2977 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2978 SSE_ADD_PS(XMM0,XMM2) 2979 2980 /* Fourth Column */ 2981 SSE_COPY_PS(XMM3,XMM7) 2982 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2983 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2984 SSE_ADD_PS(XMM0,XMM3) 2985 2986 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2987 SSE_INLINE_END_3 2988 2989 v = aa + ai16 + 16; 2990 idt -= 4; 2991 } 2992 2993 /* Convert t from single precision back to double precision (inplace)*/ 2994 idt = 4*(n-1); 2995 for (i=n-1;i>=0;i--) { 2996 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2997 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2998 PetscScalar *xtemp=&x[idt]; 2999 MatScalar *ttemp=&t[idt]; 3000 xtemp[3] = (PetscScalar)ttemp[3]; 3001 xtemp[2] = (PetscScalar)ttemp[2]; 3002 xtemp[1] = (PetscScalar)ttemp[1]; 3003 xtemp[0] = (PetscScalar)ttemp[0]; 3004 idt -= 4; 3005 } 3006 3007 } /* End of artificial scope. */ 3008 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3009 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3010 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3011 SSE_SCOPE_END; 3012 PetscFunctionReturn(0); 3013 } 3014 3015 #endif 3016 #undef __FUNCT__ 3017 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 3018 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 3019 { 3020 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3021 IS iscol=a->col,isrow=a->row; 3022 PetscErrorCode ierr; 3023 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3024 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3025 const MatScalar *aa=a->a,*v; 3026 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 3027 const PetscScalar *b; 3028 3029 PetscFunctionBegin; 3030 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3031 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3032 t = a->solve_work; 3033 3034 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3035 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3036 3037 /* forward solve the lower triangular */ 3038 idx = 3*(*r++); 3039 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 3040 for (i=1; i<n; i++) { 3041 v = aa + 9*ai[i]; 3042 vi = aj + ai[i]; 3043 nz = diag[i] - ai[i]; 3044 idx = 3*(*r++); 3045 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 3046 while (nz--) { 3047 idx = 3*(*vi++); 3048 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3049 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3050 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3051 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3052 v += 9; 3053 } 3054 idx = 3*i; 3055 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 3056 } 3057 /* backward solve the upper triangular */ 3058 for (i=n-1; i>=0; i--){ 3059 v = aa + 9*diag[i] + 9; 3060 vi = aj + diag[i] + 1; 3061 nz = ai[i+1] - diag[i] - 1; 3062 idt = 3*i; 3063 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 3064 while (nz--) { 3065 idx = 3*(*vi++); 3066 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3067 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3068 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3069 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3070 v += 9; 3071 } 3072 idc = 3*(*c--); 3073 v = aa + 9*diag[i]; 3074 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3075 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3076 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 3077 } 3078 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3079 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3080 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3081 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3082 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 3083 PetscFunctionReturn(0); 3084 } 3085 3086 /* 3087 Special case where the matrix was ILU(0) factored in the natural 3088 ordering. This eliminates the need for the column and row permutation. 3089 */ 3090 #undef __FUNCT__ 3091 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 3092 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 3093 { 3094 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3095 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3096 PetscErrorCode ierr; 3097 PetscInt *diag = a->diag; 3098 const MatScalar *aa=a->a,*v; 3099 PetscScalar *x,s1,s2,s3,x1,x2,x3; 3100 const PetscScalar *b; 3101 PetscInt jdx,idt,idx,nz,*vi,i; 3102 3103 PetscFunctionBegin; 3104 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3105 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3106 3107 /* forward solve the lower triangular */ 3108 idx = 0; 3109 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 3110 for (i=1; i<n; i++) { 3111 v = aa + 9*ai[i]; 3112 vi = aj + ai[i]; 3113 nz = diag[i] - ai[i]; 3114 idx += 3; 3115 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 3116 while (nz--) { 3117 jdx = 3*(*vi++); 3118 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 3119 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3120 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3121 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3122 v += 9; 3123 } 3124 x[idx] = s1; 3125 x[1+idx] = s2; 3126 x[2+idx] = s3; 3127 } 3128 /* backward solve the upper triangular */ 3129 for (i=n-1; i>=0; i--){ 3130 v = aa + 9*diag[i] + 9; 3131 vi = aj + diag[i] + 1; 3132 nz = ai[i+1] - diag[i] - 1; 3133 idt = 3*i; 3134 s1 = x[idt]; s2 = x[1+idt]; 3135 s3 = x[2+idt]; 3136 while (nz--) { 3137 idx = 3*(*vi++); 3138 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 3139 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3140 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3141 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3142 v += 9; 3143 } 3144 v = aa + 9*diag[i]; 3145 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3146 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3147 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 3148 } 3149 3150 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3151 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3152 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 3153 PetscFunctionReturn(0); 3154 } 3155 3156 #undef __FUNCT__ 3157 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct" 3158 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3159 { 3160 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3161 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 3162 PetscErrorCode ierr; 3163 PetscInt idx,jdx,idt; 3164 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3165 const MatScalar *aa=a->a,*v; 3166 PetscScalar *x; 3167 const PetscScalar *b; 3168 PetscScalar s1,s2,s3,x1,x2,x3; 3169 3170 PetscFunctionBegin; 3171 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3172 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3173 /* forward solve the lower triangular */ 3174 idx = 0; 3175 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 3176 for (i=1; i<n; i++) { 3177 v = aa + bs2*ai[i]; 3178 vi = aj + ai[i]; 3179 nz = ai[i+1] - ai[i]; 3180 idx = bs*i; 3181 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 3182 while (nz--) { 3183 jdx = bs*(*vi++); 3184 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 3185 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3186 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3187 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3188 3189 v += bs2; 3190 } 3191 3192 x[idx] = s1; 3193 x[1+idx] = s2; 3194 x[2+idx] = s3; 3195 } 3196 3197 /* backward solve the upper triangular */ 3198 for (i=n-1; i>=0; i--){ 3199 v = aa + bs2*ai[2*n-i]; 3200 vi = aj + ai[2*n-i]; 3201 nz = ai[2*n-i +1] - ai[2*n-i]-1; 3202 idt = bs*i; 3203 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 3204 3205 while (nz--) { 3206 idx = bs*(*vi++); 3207 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 3208 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3209 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3210 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3211 3212 v += bs2; 3213 } 3214 /* x = inv_diagonal*x */ 3215 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3216 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3217 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 3218 3219 } 3220 3221 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3222 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3223 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3224 PetscFunctionReturn(0); 3225 } 3226 3227 #undef __FUNCT__ 3228 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 3229 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 3230 { 3231 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3232 IS iscol=a->col,isrow=a->row; 3233 PetscErrorCode ierr; 3234 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3235 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3236 const MatScalar *aa=a->a,*v; 3237 PetscScalar *x,s1,s2,x1,x2,*t; 3238 const PetscScalar *b; 3239 3240 PetscFunctionBegin; 3241 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3242 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3243 t = a->solve_work; 3244 3245 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3246 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3247 3248 /* forward solve the lower triangular */ 3249 idx = 2*(*r++); 3250 t[0] = b[idx]; t[1] = b[1+idx]; 3251 for (i=1; i<n; i++) { 3252 v = aa + 4*ai[i]; 3253 vi = aj + ai[i]; 3254 nz = diag[i] - ai[i]; 3255 idx = 2*(*r++); 3256 s1 = b[idx]; s2 = b[1+idx]; 3257 while (nz--) { 3258 idx = 2*(*vi++); 3259 x1 = t[idx]; x2 = t[1+idx]; 3260 s1 -= v[0]*x1 + v[2]*x2; 3261 s2 -= v[1]*x1 + v[3]*x2; 3262 v += 4; 3263 } 3264 idx = 2*i; 3265 t[idx] = s1; t[1+idx] = s2; 3266 } 3267 /* backward solve the upper triangular */ 3268 for (i=n-1; i>=0; i--){ 3269 v = aa + 4*diag[i] + 4; 3270 vi = aj + diag[i] + 1; 3271 nz = ai[i+1] - diag[i] - 1; 3272 idt = 2*i; 3273 s1 = t[idt]; s2 = t[1+idt]; 3274 while (nz--) { 3275 idx = 2*(*vi++); 3276 x1 = t[idx]; x2 = t[1+idx]; 3277 s1 -= v[0]*x1 + v[2]*x2; 3278 s2 -= v[1]*x1 + v[3]*x2; 3279 v += 4; 3280 } 3281 idc = 2*(*c--); 3282 v = aa + 4*diag[i]; 3283 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 3284 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 3285 } 3286 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3287 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3288 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3289 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3290 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 3291 PetscFunctionReturn(0); 3292 } 3293 3294 /* 3295 Special case where the matrix was ILU(0) factored in the natural 3296 ordering. This eliminates the need for the column and row permutation. 3297 */ 3298 #undef __FUNCT__ 3299 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 3300 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 3301 { 3302 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3303 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3304 PetscErrorCode ierr; 3305 PetscInt *diag = a->diag; 3306 const MatScalar *aa=a->a,*v; 3307 PetscScalar *x,s1,s2,x1,x2; 3308 const PetscScalar *b; 3309 PetscInt jdx,idt,idx,nz,*vi,i; 3310 3311 PetscFunctionBegin; 3312 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3313 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3314 3315 /* forward solve the lower triangular */ 3316 idx = 0; 3317 x[0] = b[0]; x[1] = b[1]; 3318 for (i=1; i<n; i++) { 3319 v = aa + 4*ai[i]; 3320 vi = aj + ai[i]; 3321 nz = diag[i] - ai[i]; 3322 idx += 2; 3323 s1 = b[idx];s2 = b[1+idx]; 3324 while (nz--) { 3325 jdx = 2*(*vi++); 3326 x1 = x[jdx];x2 = x[1+jdx]; 3327 s1 -= v[0]*x1 + v[2]*x2; 3328 s2 -= v[1]*x1 + v[3]*x2; 3329 v += 4; 3330 } 3331 x[idx] = s1; 3332 x[1+idx] = s2; 3333 } 3334 /* backward solve the upper triangular */ 3335 for (i=n-1; i>=0; i--){ 3336 v = aa + 4*diag[i] + 4; 3337 vi = aj + diag[i] + 1; 3338 nz = ai[i+1] - diag[i] - 1; 3339 idt = 2*i; 3340 s1 = x[idt]; s2 = x[1+idt]; 3341 while (nz--) { 3342 idx = 2*(*vi++); 3343 x1 = x[idx]; x2 = x[1+idx]; 3344 s1 -= v[0]*x1 + v[2]*x2; 3345 s2 -= v[1]*x1 + v[3]*x2; 3346 v += 4; 3347 } 3348 v = aa + 4*diag[i]; 3349 x[idt] = v[0]*s1 + v[2]*s2; 3350 x[1+idt] = v[1]*s1 + v[3]*s2; 3351 } 3352 3353 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3354 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3355 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 3356 PetscFunctionReturn(0); 3357 } 3358 3359 #undef __FUNCT__ 3360 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct" 3361 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3362 { 3363 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3364 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 3365 PetscErrorCode ierr; 3366 PetscInt jdx; 3367 const MatScalar *aa=a->a,*v; 3368 PetscScalar *x,s1,s2,x1,x2; 3369 const PetscScalar *b; 3370 3371 PetscFunctionBegin; 3372 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3373 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3374 /* forward solve the lower triangular */ 3375 idx = 0; 3376 x[0] = b[idx]; x[1] = b[1+idx]; 3377 for (i=1; i<n; i++) { 3378 v = aa + 4*ai[i]; 3379 vi = aj + ai[i]; 3380 nz = ai[i+1] - ai[i]; 3381 idx = 2*i; 3382 s1 = b[idx];s2 = b[1+idx]; 3383 while (nz--) { 3384 jdx = 2*(*vi++); 3385 x1 = x[jdx];x2 = x[1+jdx]; 3386 s1 -= v[0]*x1 + v[2]*x2; 3387 s2 -= v[1]*x1 + v[3]*x2; 3388 v += 4; 3389 } 3390 x[idx] = s1; 3391 x[1+idx] = s2; 3392 } 3393 3394 /* backward solve the upper triangular */ 3395 for (i=n-1; i>=0; i--){ 3396 v = aa + 4*ai[2*n-i]; 3397 vi = aj + ai[2*n-i]; 3398 nz = ai[2*n-i +1] - ai[2*n-i]-1; 3399 idt = 2*i; 3400 s1 = x[idt]; s2 = x[1+idt]; 3401 while (nz--) { 3402 idx = 2*(*vi++); 3403 x1 = x[idx]; x2 = x[1+idx]; 3404 s1 -= v[0]*x1 + v[2]*x2; 3405 s2 -= v[1]*x1 + v[3]*x2; 3406 v += 4; 3407 } 3408 /* x = inv_diagonal*x */ 3409 x[idt] = v[0]*s1 + v[2]*s2; 3410 x[1+idt] = v[1]*s1 + v[3]*s2; 3411 } 3412 3413 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3414 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3415 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 3416 PetscFunctionReturn(0); 3417 } 3418 3419 #undef __FUNCT__ 3420 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 3421 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 3422 { 3423 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3424 IS iscol=a->col,isrow=a->row; 3425 PetscErrorCode ierr; 3426 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 3427 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3428 MatScalar *aa=a->a,*v; 3429 PetscScalar *x,*b,s1,*t; 3430 3431 PetscFunctionBegin; 3432 if (!n) PetscFunctionReturn(0); 3433 3434 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3435 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3436 t = a->solve_work; 3437 3438 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3439 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3440 3441 /* forward solve the lower triangular */ 3442 t[0] = b[*r++]; 3443 for (i=1; i<n; i++) { 3444 v = aa + ai[i]; 3445 vi = aj + ai[i]; 3446 nz = diag[i] - ai[i]; 3447 s1 = b[*r++]; 3448 while (nz--) { 3449 s1 -= (*v++)*t[*vi++]; 3450 } 3451 t[i] = s1; 3452 } 3453 /* backward solve the upper triangular */ 3454 for (i=n-1; i>=0; i--){ 3455 v = aa + diag[i] + 1; 3456 vi = aj + diag[i] + 1; 3457 nz = ai[i+1] - diag[i] - 1; 3458 s1 = t[i]; 3459 while (nz--) { 3460 s1 -= (*v++)*t[*vi++]; 3461 } 3462 x[*c--] = t[i] = aa[diag[i]]*s1; 3463 } 3464 3465 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3466 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3467 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3468 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3469 ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 3470 PetscFunctionReturn(0); 3471 } 3472 /* 3473 Special case where the matrix was ILU(0) factored in the natural 3474 ordering. This eliminates the need for the column and row permutation. 3475 */ 3476 #undef __FUNCT__ 3477 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 3478 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 3479 { 3480 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3481 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3482 PetscErrorCode ierr; 3483 PetscInt *diag = a->diag; 3484 MatScalar *aa=a->a; 3485 PetscScalar *x,*b; 3486 PetscScalar s1,x1; 3487 MatScalar *v; 3488 PetscInt jdx,idt,idx,nz,*vi,i; 3489 3490 PetscFunctionBegin; 3491 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3492 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3493 3494 /* forward solve the lower triangular */ 3495 idx = 0; 3496 x[0] = b[0]; 3497 for (i=1; i<n; i++) { 3498 v = aa + ai[i]; 3499 vi = aj + ai[i]; 3500 nz = diag[i] - ai[i]; 3501 idx += 1; 3502 s1 = b[idx]; 3503 while (nz--) { 3504 jdx = *vi++; 3505 x1 = x[jdx]; 3506 s1 -= v[0]*x1; 3507 v += 1; 3508 } 3509 x[idx] = s1; 3510 } 3511 /* backward solve the upper triangular */ 3512 for (i=n-1; i>=0; i--){ 3513 v = aa + diag[i] + 1; 3514 vi = aj + diag[i] + 1; 3515 nz = ai[i+1] - diag[i] - 1; 3516 idt = i; 3517 s1 = x[idt]; 3518 while (nz--) { 3519 idx = *vi++; 3520 x1 = x[idx]; 3521 s1 -= v[0]*x1; 3522 v += 1; 3523 } 3524 v = aa + diag[i]; 3525 x[idt] = v[0]*s1; 3526 } 3527 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3528 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3529 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 3530 PetscFunctionReturn(0); 3531 } 3532 3533 /* ----------------------------------------------------------------*/ 3534 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption); 3535 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 3536 3537 extern PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat,Vec,Vec); 3538 #undef __FUNCT__ 3539 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 3540 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 3541 { 3542 Mat C=B; 3543 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 3544 IS isrow = b->row,isicol = b->icol; 3545 PetscErrorCode ierr; 3546 const PetscInt *r,*ic,*ics; 3547 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 3548 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 3549 MatScalar *rtmp,*pc,*multiplier,*v,*pv,*aa=a->a; 3550 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 3551 MatScalar *v_work; 3552 3553 PetscFunctionBegin; 3554 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3555 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 3556 ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 3557 ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr); 3558 ics = ic; 3559 3560 /* generate work space needed by dense LU factorization */ 3561 ierr = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr); 3562 multiplier = v_work + bs; 3563 v_pivots = (PetscInt*)(multiplier + bs2); 3564 3565 for (i=0; i<n; i++){ 3566 /* zero rtmp */ 3567 /* L part */ 3568 nz = bi[i+1] - bi[i]; 3569 bjtmp = bj + bi[i]; 3570 for (j=0; j<nz; j++){ 3571 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3572 } 3573 3574 /* U part */ 3575 nz = bi[2*n-i+1] - bi[2*n-i]; 3576 bjtmp = bj + bi[2*n-i]; 3577 for (j=0; j<nz; j++){ 3578 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3579 } 3580 3581 /* load in initial (unfactored row) */ 3582 nz = ai[r[i]+1] - ai[r[i]]; 3583 ajtmp = aj + ai[r[i]]; 3584 v = aa + bs2*ai[r[i]]; 3585 for (j=0; j<nz; j++) { 3586 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 3587 } 3588 3589 /* elimination */ 3590 bjtmp = bj + bi[i]; 3591 row = *bjtmp++; 3592 nzL = bi[i+1] - bi[i]; 3593 k = 0; 3594 while (k < nzL) { 3595 pc = rtmp + bs2*row; 3596 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 3597 if (flg) { 3598 pv = b->a + bs2*bdiag[row]; 3599 Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); /* *pc = *pc * (*pv); */ 3600 pj = b->j + bi[2*n-row]; /* begining of U(row,:) */ 3601 pv = b->a + bs2*bi[2*n-row]; 3602 nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */ 3603 for (j=0; j<nz; j++) { 3604 Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 3605 } 3606 ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr); 3607 } 3608 row = *bjtmp++; k++; 3609 } 3610 3611 /* finished row so stick it into b->a */ 3612 /* L part */ 3613 pv = b->a + bs2*bi[i] ; 3614 pj = b->j + bi[i] ; 3615 nz = bi[i+1] - bi[i]; 3616 for (j=0; j<nz; j++) { 3617 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3618 } 3619 3620 /* Mark diagonal and invert diagonal for simplier triangular solves */ 3621 pv = b->a + bs2*bdiag[i]; 3622 pj = b->j + bdiag[i]; 3623 /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 3624 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3625 ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 3626 3627 /* U part */ 3628 pv = b->a + bs2*bi[2*n-i]; 3629 pj = b->j + bi[2*n-i]; 3630 nz = bi[2*n-i+1] - bi[2*n-i] - 1; 3631 for (j=0; j<nz; j++){ 3632 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3633 } 3634 } 3635 3636 ierr = PetscFree(rtmp);CHKERRQ(ierr); 3637 ierr = PetscFree(v_work);CHKERRQ(ierr); 3638 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3639 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 3640 3641 switch (A->rmap->bs){ 3642 case 2: 3643 C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct; 3644 break; 3645 case 3: 3646 C->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct; 3647 break; 3648 case 4: 3649 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct; 3650 break; 3651 case 5: 3652 C->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct; 3653 break; 3654 case 6: 3655 C->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct; 3656 break; 3657 case 7: 3658 C->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct; 3659 break; 3660 default: 3661 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 3662 break; 3663 } 3664 C->assembled = PETSC_TRUE; 3665 ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 3666 PetscFunctionReturn(0); 3667 } 3668 3669 /* 3670 ilu(0) with natural ordering under new data structure. 3671 Factored arrays bj and ba are stored as 3672 L(0,:), L(1,:), ...,L(n-1,:), U(n-1,:),...,U(i,:),U(i-1,:),...,U(0,:) 3673 3674 bi=fact->i is an array of size 2n+2, in which 3675 bi+ 3676 bi[i] -> 1st entry of L(i,:),i=0,...,i-1 3677 bi[n] -> end of L(n-1,:)+1 3678 bi[n+1] -> 1st entry of U(n-1,:) 3679 bi[2n-i] -> 1st entry of U(i,:) 3680 bi[2n-i+1] -> end of U(i,:)+1, the 1st entry of U(i-1,:) 3681 bi[2n] -> end of U(0,:)+1 3682 3683 U(i,:) contains diag[i] as its last entry, i.e., 3684 U(i,:) = (u[i,i+1],...,u[i,n-1],diag[i]) 3685 */ 3686 #undef __FUNCT__ 3687 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 3688 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3689 { 3690 3691 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 3692 PetscErrorCode ierr; 3693 PetscInt mbs=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 3694 PetscInt i,j,nz=a->nz,*bi,*bj,*bdiag; 3695 3696 PetscFunctionBegin; 3697 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr); 3698 b = (Mat_SeqBAIJ*)(fact)->data; 3699 bdiag = b->diag; 3700 3701 /* replace matrix arrays with single allocations, then reset values */ 3702 ierr = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr); 3703 3704 ierr = PetscMalloc((2*mbs+2)*sizeof(PetscInt),&b->i);CHKERRQ(ierr); 3705 ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&b->j);CHKERRQ(ierr); 3706 ierr = PetscMalloc((bs2*nz+1)*sizeof(PetscScalar),&b->a);CHKERRQ(ierr); 3707 b->singlemalloc = PETSC_FALSE; 3708 if (mbs > 0) { 3709 ierr = PetscMemzero(b->a,bs2*nz*sizeof(MatScalar));CHKERRQ(ierr); 3710 } 3711 3712 /* set bi and bj with new data structure */ 3713 bi = b->i; 3714 bj = b->j; 3715 3716 /* L part */ 3717 bi[0] = 0; 3718 for (i=0; i<mbs; i++){ 3719 nz = adiag[i] - ai[i]; 3720 bi[i+1] = bi[i] + nz; 3721 aj = a->j + ai[i]; 3722 for (j=0; j<nz; j++){ 3723 *bj = aj[j]; bj++; 3724 } 3725 } 3726 3727 /* U part */ 3728 bi[mbs+1] = bi[mbs]; 3729 for (i=mbs-1; i>=0; i--){ 3730 nz = ai[i+1] - adiag[i] - 1; 3731 if (nz < 0) SETERRQ2(0,"row %d Unz %d",i,nz); 3732 bi[2*mbs-i+1] = bi[2*mbs-i] + nz + 1; 3733 aj = a->j + adiag[i] + 1; 3734 for (j=0; j<nz; j++){ 3735 *bj = aj[j]; bj++; 3736 } 3737 /* diag[i] */ 3738 *bj = i; bj++; 3739 bdiag[i] = bi[2*mbs-i+1]-1; 3740 } 3741 PetscFunctionReturn(0); 3742 } 3743 3744 /* 3745 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 3746 except that the data structure of Mat_SeqAIJ is slightly different. 3747 Not a good example of code reuse. 3748 */ 3749 3750 #undef __FUNCT__ 3751 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 3752 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3753 { 3754 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 3755 IS isicol; 3756 PetscErrorCode ierr; 3757 const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 3758 PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 3759 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 3760 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 3761 PetscTruth col_identity,row_identity,both_identity,flg; 3762 PetscReal f; 3763 3764 PetscFunctionBegin; 3765 ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 3766 if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 3767 3768 f = info->fill; 3769 levels = (PetscInt)info->levels; 3770 diagonal_fill = (PetscInt)info->diagonal_fill; 3771 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 3772 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3773 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 3774 both_identity = (PetscTruth) (row_identity && col_identity); 3775 3776 if (!levels && both_identity) { /* special case copy the nonzero structure */ 3777 3778 PetscTruth newdatastruct=PETSC_FALSE; 3779 ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 3780 if (newdatastruct){ 3781 ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 3782 (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 3783 } else { 3784 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr); 3785 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 3786 } 3787 3788 fact->factor = MAT_FACTOR_ILU; 3789 b = (Mat_SeqBAIJ*)(fact)->data; 3790 b->row = isrow; 3791 b->col = iscol; 3792 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3793 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3794 b->icol = isicol; 3795 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3796 ierr = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3797 PetscFunctionReturn(0); 3798 } 3799 3800 /* general case perform the symbolic factorization */ 3801 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3802 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 3803 3804 /* get new row pointers */ 3805 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 3806 ainew[0] = 0; 3807 /* don't know how many column pointers are needed so estimate */ 3808 jmax = (PetscInt)(f*ai[n] + 1); 3809 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 3810 /* ajfill is level of fill for each fill entry */ 3811 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 3812 /* fill is a linked list of nonzeros in active row */ 3813 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 3814 /* im is level for each filled value */ 3815 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 3816 /* dloc is location of diagonal in factor */ 3817 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 3818 dloc[0] = 0; 3819 for (prow=0; prow<n; prow++) { 3820 3821 /* copy prow into linked list */ 3822 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 3823 if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 3824 xi = aj + ai[r[prow]]; 3825 fill[n] = n; 3826 fill[prow] = -1; /* marker for diagonal entry */ 3827 while (nz--) { 3828 fm = n; 3829 idx = ic[*xi++]; 3830 do { 3831 m = fm; 3832 fm = fill[m]; 3833 } while (fm < idx); 3834 fill[m] = idx; 3835 fill[idx] = fm; 3836 im[idx] = 0; 3837 } 3838 3839 /* make sure diagonal entry is included */ 3840 if (diagonal_fill && fill[prow] == -1) { 3841 fm = n; 3842 while (fill[fm] < prow) fm = fill[fm]; 3843 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 3844 fill[fm] = prow; 3845 im[prow] = 0; 3846 nzf++; 3847 dcount++; 3848 } 3849 3850 nzi = 0; 3851 row = fill[n]; 3852 while (row < prow) { 3853 incrlev = im[row] + 1; 3854 nz = dloc[row]; 3855 xi = ajnew + ainew[row] + nz + 1; 3856 flev = ajfill + ainew[row] + nz + 1; 3857 nnz = ainew[row+1] - ainew[row] - nz - 1; 3858 fm = row; 3859 while (nnz-- > 0) { 3860 idx = *xi++; 3861 if (*flev + incrlev > levels) { 3862 flev++; 3863 continue; 3864 } 3865 do { 3866 m = fm; 3867 fm = fill[m]; 3868 } while (fm < idx); 3869 if (fm != idx) { 3870 im[idx] = *flev + incrlev; 3871 fill[m] = idx; 3872 fill[idx] = fm; 3873 fm = idx; 3874 nzf++; 3875 } else { 3876 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 3877 } 3878 flev++; 3879 } 3880 row = fill[row]; 3881 nzi++; 3882 } 3883 /* copy new filled row into permanent storage */ 3884 ainew[prow+1] = ainew[prow] + nzf; 3885 if (ainew[prow+1] > jmax) { 3886 3887 /* estimate how much additional space we will need */ 3888 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 3889 /* just double the memory each time */ 3890 PetscInt maxadd = jmax; 3891 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 3892 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 3893 jmax += maxadd; 3894 3895 /* allocate a longer ajnew and ajfill */ 3896 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 3897 ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3898 ierr = PetscFree(ajnew);CHKERRQ(ierr); 3899 ajnew = xitmp; 3900 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 3901 ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3902 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3903 ajfill = xitmp; 3904 reallocate++; /* count how many reallocations are needed */ 3905 } 3906 xitmp = ajnew + ainew[prow]; 3907 flev = ajfill + ainew[prow]; 3908 dloc[prow] = nzi; 3909 fm = fill[n]; 3910 while (nzf--) { 3911 *xitmp++ = fm; 3912 *flev++ = im[fm]; 3913 fm = fill[fm]; 3914 } 3915 /* make sure row has diagonal entry */ 3916 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 3917 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 3918 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 3919 } 3920 } 3921 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3922 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 3923 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3924 ierr = PetscFree(fill);CHKERRQ(ierr); 3925 ierr = PetscFree(im);CHKERRQ(ierr); 3926 3927 #if defined(PETSC_USE_INFO) 3928 { 3929 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 3930 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 3931 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 3932 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 3933 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 3934 if (diagonal_fill) { 3935 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 3936 } 3937 } 3938 #endif 3939 3940 /* put together the new matrix */ 3941 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 3942 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 3943 b = (Mat_SeqBAIJ*)(fact)->data; 3944 b->free_a = PETSC_TRUE; 3945 b->free_ij = PETSC_TRUE; 3946 b->singlemalloc = PETSC_FALSE; 3947 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 3948 b->j = ajnew; 3949 b->i = ainew; 3950 for (i=0; i<n; i++) dloc[i] += ainew[i]; 3951 b->diag = dloc; 3952 b->ilen = 0; 3953 b->imax = 0; 3954 b->row = isrow; 3955 b->col = iscol; 3956 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3957 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3958 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3959 b->icol = isicol; 3960 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3961 /* In b structure: Free imax, ilen, old a, old j. 3962 Allocate dloc, solve_work, new a, new j */ 3963 ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 3964 b->maxnz = b->nz = ainew[n]; 3965 3966 (fact)->info.factor_mallocs = reallocate; 3967 (fact)->info.fill_ratio_given = f; 3968 (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 3969 3970 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 3971 PetscFunctionReturn(0); 3972 } 3973 3974 #undef __FUNCT__ 3975 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 3976 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 3977 { 3978 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 3979 /* int i,*AJ=a->j,nz=a->nz; */ 3980 PetscFunctionBegin; 3981 /* Undo Column scaling */ 3982 /* while (nz--) { */ 3983 /* AJ[i] = AJ[i]/4; */ 3984 /* } */ 3985 /* This should really invoke a push/pop logic, but we don't have that yet. */ 3986 A->ops->setunfactored = PETSC_NULL; 3987 PetscFunctionReturn(0); 3988 } 3989 3990 #undef __FUNCT__ 3991 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 3992 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 3993 { 3994 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3995 PetscInt *AJ=a->j,nz=a->nz; 3996 unsigned short *aj=(unsigned short *)AJ; 3997 PetscFunctionBegin; 3998 /* Is this really necessary? */ 3999 while (nz--) { 4000 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 4001 } 4002 A->ops->setunfactored = PETSC_NULL; 4003 PetscFunctionReturn(0); 4004 } 4005 4006 4007