1 #define PETSCMAT_DLL 2 3 4 /* 5 Factorization code for BAIJ format. 6 */ 7 8 #include "../src/mat/impls/baij/seq/baij.h" 9 #include "../src/mat/blockinvert.h" 10 #include "petscbt.h" 11 #include "../src/mat/utils/freespace.h" 12 13 #undef __FUNCT__ 14 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 15 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 16 { 17 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 18 PetscErrorCode ierr; 19 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 20 PetscInt *diag = a->diag; 21 MatScalar *aa=a->a,*v; 22 PetscScalar s1,*x,*b; 23 24 PetscFunctionBegin; 25 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 26 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 27 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28 29 /* forward solve the U^T */ 30 for (i=0; i<n; i++) { 31 32 v = aa + diag[i]; 33 /* multiply by the inverse of the block diagonal */ 34 s1 = (*v++)*x[i]; 35 vi = aj + diag[i] + 1; 36 nz = ai[i+1] - diag[i] - 1; 37 while (nz--) { 38 x[*vi++] -= (*v++)*s1; 39 } 40 x[i] = s1; 41 } 42 /* backward solve the L^T */ 43 for (i=n-1; i>=0; i--){ 44 v = aa + diag[i] - 1; 45 vi = aj + diag[i] - 1; 46 nz = diag[i] - ai[i]; 47 s1 = x[i]; 48 while (nz--) { 49 x[*vi--] -= (*v--)*s1; 50 } 51 } 52 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 53 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 54 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 55 PetscFunctionReturn(0); 56 } 57 58 #undef __FUNCT__ 59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 61 { 62 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 63 PetscErrorCode ierr; 64 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 65 PetscInt *diag = a->diag,oidx; 66 MatScalar *aa=a->a,*v; 67 PetscScalar s1,s2,x1,x2; 68 PetscScalar *x,*b; 69 70 PetscFunctionBegin; 71 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 72 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 73 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 74 75 /* forward solve the U^T */ 76 idx = 0; 77 for (i=0; i<n; i++) { 78 79 v = aa + 4*diag[i]; 80 /* multiply by the inverse of the block diagonal */ 81 x1 = x[idx]; x2 = x[1+idx]; 82 s1 = v[0]*x1 + v[1]*x2; 83 s2 = v[2]*x1 + v[3]*x2; 84 v += 4; 85 86 vi = aj + diag[i] + 1; 87 nz = ai[i+1] - diag[i] - 1; 88 while (nz--) { 89 oidx = 2*(*vi++); 90 x[oidx] -= v[0]*s1 + v[1]*s2; 91 x[oidx+1] -= v[2]*s1 + v[3]*s2; 92 v += 4; 93 } 94 x[idx] = s1;x[1+idx] = s2; 95 idx += 2; 96 } 97 /* backward solve the L^T */ 98 for (i=n-1; i>=0; i--){ 99 v = aa + 4*diag[i] - 4; 100 vi = aj + diag[i] - 1; 101 nz = diag[i] - ai[i]; 102 idt = 2*i; 103 s1 = x[idt]; s2 = x[1+idt]; 104 while (nz--) { 105 idx = 2*(*vi--); 106 x[idx] -= v[0]*s1 + v[1]*s2; 107 x[idx+1] -= v[2]*s1 + v[3]*s2; 108 v -= 4; 109 } 110 } 111 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 112 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 113 ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 114 PetscFunctionReturn(0); 115 } 116 117 #undef __FUNCT__ 118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 119 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 120 { 121 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 122 PetscErrorCode ierr; 123 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 124 PetscInt *diag = a->diag,oidx; 125 MatScalar *aa=a->a,*v; 126 PetscScalar s1,s2,s3,x1,x2,x3; 127 PetscScalar *x,*b; 128 129 PetscFunctionBegin; 130 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 131 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 132 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 133 134 /* forward solve the U^T */ 135 idx = 0; 136 for (i=0; i<n; i++) { 137 138 v = aa + 9*diag[i]; 139 /* multiply by the inverse of the block diagonal */ 140 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 141 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 142 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 143 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 144 v += 9; 145 146 vi = aj + diag[i] + 1; 147 nz = ai[i+1] - diag[i] - 1; 148 while (nz--) { 149 oidx = 3*(*vi++); 150 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 151 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 152 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 153 v += 9; 154 } 155 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 156 idx += 3; 157 } 158 /* backward solve the L^T */ 159 for (i=n-1; i>=0; i--){ 160 v = aa + 9*diag[i] - 9; 161 vi = aj + diag[i] - 1; 162 nz = diag[i] - ai[i]; 163 idt = 3*i; 164 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 165 while (nz--) { 166 idx = 3*(*vi--); 167 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 168 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 169 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 170 v -= 9; 171 } 172 } 173 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 174 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 175 ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 176 PetscFunctionReturn(0); 177 } 178 179 #undef __FUNCT__ 180 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 181 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 182 { 183 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 184 PetscErrorCode ierr; 185 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 186 PetscInt *diag = a->diag,oidx; 187 MatScalar *aa=a->a,*v; 188 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 189 PetscScalar *x,*b; 190 191 PetscFunctionBegin; 192 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 193 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 194 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 195 196 /* forward solve the U^T */ 197 idx = 0; 198 for (i=0; i<n; i++) { 199 200 v = aa + 16*diag[i]; 201 /* multiply by the inverse of the block diagonal */ 202 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 203 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 204 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 205 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 206 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 207 v += 16; 208 209 vi = aj + diag[i] + 1; 210 nz = ai[i+1] - diag[i] - 1; 211 while (nz--) { 212 oidx = 4*(*vi++); 213 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 214 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 215 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 216 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 217 v += 16; 218 } 219 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 220 idx += 4; 221 } 222 /* backward solve the L^T */ 223 for (i=n-1; i>=0; i--){ 224 v = aa + 16*diag[i] - 16; 225 vi = aj + diag[i] - 1; 226 nz = diag[i] - ai[i]; 227 idt = 4*i; 228 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 229 while (nz--) { 230 idx = 4*(*vi--); 231 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 232 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 233 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 234 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 235 v -= 16; 236 } 237 } 238 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 239 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 240 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 241 PetscFunctionReturn(0); 242 } 243 244 #undef __FUNCT__ 245 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 246 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 247 { 248 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 249 PetscErrorCode ierr; 250 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 251 PetscInt *diag = a->diag,oidx; 252 MatScalar *aa=a->a,*v; 253 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 254 PetscScalar *x,*b; 255 256 PetscFunctionBegin; 257 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 258 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 259 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 260 261 /* forward solve the U^T */ 262 idx = 0; 263 for (i=0; i<n; i++) { 264 265 v = aa + 25*diag[i]; 266 /* multiply by the inverse of the block diagonal */ 267 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 268 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 269 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 270 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 271 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 272 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 273 v += 25; 274 275 vi = aj + diag[i] + 1; 276 nz = ai[i+1] - diag[i] - 1; 277 while (nz--) { 278 oidx = 5*(*vi++); 279 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 280 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 281 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 282 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 283 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 284 v += 25; 285 } 286 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 287 idx += 5; 288 } 289 /* backward solve the L^T */ 290 for (i=n-1; i>=0; i--){ 291 v = aa + 25*diag[i] - 25; 292 vi = aj + diag[i] - 1; 293 nz = diag[i] - ai[i]; 294 idt = 5*i; 295 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 296 while (nz--) { 297 idx = 5*(*vi--); 298 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 299 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 300 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 301 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 302 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 303 v -= 25; 304 } 305 } 306 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 307 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 308 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 309 PetscFunctionReturn(0); 310 } 311 312 #undef __FUNCT__ 313 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 314 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 315 { 316 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 317 PetscErrorCode ierr; 318 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 319 PetscInt *diag = a->diag,oidx; 320 MatScalar *aa=a->a,*v; 321 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 322 PetscScalar *x,*b; 323 324 PetscFunctionBegin; 325 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 326 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 327 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 328 329 /* forward solve the U^T */ 330 idx = 0; 331 for (i=0; i<n; i++) { 332 333 v = aa + 36*diag[i]; 334 /* multiply by the inverse of the block diagonal */ 335 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 336 x6 = x[5+idx]; 337 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 338 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 339 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 340 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 341 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 342 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 343 v += 36; 344 345 vi = aj + diag[i] + 1; 346 nz = ai[i+1] - diag[i] - 1; 347 while (nz--) { 348 oidx = 6*(*vi++); 349 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 350 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 351 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 352 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 353 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 354 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 355 v += 36; 356 } 357 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 358 x[5+idx] = s6; 359 idx += 6; 360 } 361 /* backward solve the L^T */ 362 for (i=n-1; i>=0; i--){ 363 v = aa + 36*diag[i] - 36; 364 vi = aj + diag[i] - 1; 365 nz = diag[i] - ai[i]; 366 idt = 6*i; 367 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 368 s6 = x[5+idt]; 369 while (nz--) { 370 idx = 6*(*vi--); 371 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 372 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 373 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 374 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 375 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 376 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 377 v -= 36; 378 } 379 } 380 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 381 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 382 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 383 PetscFunctionReturn(0); 384 } 385 386 #undef __FUNCT__ 387 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 388 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 389 { 390 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 391 PetscErrorCode ierr; 392 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 393 PetscInt *diag = a->diag,oidx; 394 MatScalar *aa=a->a,*v; 395 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 396 PetscScalar *x,*b; 397 398 PetscFunctionBegin; 399 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 400 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 401 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 402 403 /* forward solve the U^T */ 404 idx = 0; 405 for (i=0; i<n; i++) { 406 407 v = aa + 49*diag[i]; 408 /* multiply by the inverse of the block diagonal */ 409 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 410 x6 = x[5+idx]; x7 = x[6+idx]; 411 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 412 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 413 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 414 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 415 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 416 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 417 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 418 v += 49; 419 420 vi = aj + diag[i] + 1; 421 nz = ai[i+1] - diag[i] - 1; 422 while (nz--) { 423 oidx = 7*(*vi++); 424 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 425 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 426 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 427 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 428 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 429 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 430 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 431 v += 49; 432 } 433 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 434 x[5+idx] = s6;x[6+idx] = s7; 435 idx += 7; 436 } 437 /* backward solve the L^T */ 438 for (i=n-1; i>=0; i--){ 439 v = aa + 49*diag[i] - 49; 440 vi = aj + diag[i] - 1; 441 nz = diag[i] - ai[i]; 442 idt = 7*i; 443 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 444 s6 = x[5+idt];s7 = x[6+idt]; 445 while (nz--) { 446 idx = 7*(*vi--); 447 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 448 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 449 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 450 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 451 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 452 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 453 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 454 v -= 49; 455 } 456 } 457 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 458 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 459 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 460 PetscFunctionReturn(0); 461 } 462 463 /*---------------------------------------------------------------------------------------------*/ 464 #undef __FUNCT__ 465 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 466 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 467 { 468 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 469 IS iscol=a->col,isrow=a->row; 470 PetscErrorCode ierr; 471 const PetscInt *r,*c,*rout,*cout; 472 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 473 PetscInt *diag = a->diag; 474 MatScalar *aa=a->a,*v; 475 PetscScalar s1,*x,*b,*t; 476 477 PetscFunctionBegin; 478 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 479 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 480 t = a->solve_work; 481 482 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 483 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 484 485 /* copy the b into temp work space according to permutation */ 486 for (i=0; i<n; i++) { 487 t[i] = b[c[i]]; 488 } 489 490 /* forward solve the U^T */ 491 for (i=0; i<n; i++) { 492 493 v = aa + diag[i]; 494 /* multiply by the inverse of the block diagonal */ 495 s1 = (*v++)*t[i]; 496 vi = aj + diag[i] + 1; 497 nz = ai[i+1] - diag[i] - 1; 498 while (nz--) { 499 t[*vi++] -= (*v++)*s1; 500 } 501 t[i] = s1; 502 } 503 /* backward solve the L^T */ 504 for (i=n-1; i>=0; i--){ 505 v = aa + diag[i] - 1; 506 vi = aj + diag[i] - 1; 507 nz = diag[i] - ai[i]; 508 s1 = t[i]; 509 while (nz--) { 510 t[*vi--] -= (*v--)*s1; 511 } 512 } 513 514 /* copy t into x according to permutation */ 515 for (i=0; i<n; i++) { 516 x[r[i]] = t[i]; 517 } 518 519 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 520 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 521 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 522 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 523 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 524 PetscFunctionReturn(0); 525 } 526 527 #undef __FUNCT__ 528 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 529 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 530 { 531 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 532 IS iscol=a->col,isrow=a->row; 533 PetscErrorCode ierr; 534 const PetscInt *r,*c,*rout,*cout; 535 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 536 PetscInt *diag = a->diag,ii,ic,ir,oidx; 537 MatScalar *aa=a->a,*v; 538 PetscScalar s1,s2,x1,x2; 539 PetscScalar *x,*b,*t; 540 541 PetscFunctionBegin; 542 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 543 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 544 t = a->solve_work; 545 546 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 547 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 548 549 /* copy the b into temp work space according to permutation */ 550 ii = 0; 551 for (i=0; i<n; i++) { 552 ic = 2*c[i]; 553 t[ii] = b[ic]; 554 t[ii+1] = b[ic+1]; 555 ii += 2; 556 } 557 558 /* forward solve the U^T */ 559 idx = 0; 560 for (i=0; i<n; i++) { 561 562 v = aa + 4*diag[i]; 563 /* multiply by the inverse of the block diagonal */ 564 x1 = t[idx]; x2 = t[1+idx]; 565 s1 = v[0]*x1 + v[1]*x2; 566 s2 = v[2]*x1 + v[3]*x2; 567 v += 4; 568 569 vi = aj + diag[i] + 1; 570 nz = ai[i+1] - diag[i] - 1; 571 while (nz--) { 572 oidx = 2*(*vi++); 573 t[oidx] -= v[0]*s1 + v[1]*s2; 574 t[oidx+1] -= v[2]*s1 + v[3]*s2; 575 v += 4; 576 } 577 t[idx] = s1;t[1+idx] = s2; 578 idx += 2; 579 } 580 /* backward solve the L^T */ 581 for (i=n-1; i>=0; i--){ 582 v = aa + 4*diag[i] - 4; 583 vi = aj + diag[i] - 1; 584 nz = diag[i] - ai[i]; 585 idt = 2*i; 586 s1 = t[idt]; s2 = t[1+idt]; 587 while (nz--) { 588 idx = 2*(*vi--); 589 t[idx] -= v[0]*s1 + v[1]*s2; 590 t[idx+1] -= v[2]*s1 + v[3]*s2; 591 v -= 4; 592 } 593 } 594 595 /* copy t into x according to permutation */ 596 ii = 0; 597 for (i=0; i<n; i++) { 598 ir = 2*r[i]; 599 x[ir] = t[ii]; 600 x[ir+1] = t[ii+1]; 601 ii += 2; 602 } 603 604 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 605 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 606 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 607 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 608 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 609 PetscFunctionReturn(0); 610 } 611 612 #undef __FUNCT__ 613 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 614 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 615 { 616 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 617 IS iscol=a->col,isrow=a->row; 618 PetscErrorCode ierr; 619 const PetscInt *r,*c,*rout,*cout; 620 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 621 PetscInt *diag = a->diag,ii,ic,ir,oidx; 622 MatScalar *aa=a->a,*v; 623 PetscScalar s1,s2,s3,x1,x2,x3; 624 PetscScalar *x,*b,*t; 625 626 PetscFunctionBegin; 627 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 628 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 629 t = a->solve_work; 630 631 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 632 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 633 634 /* copy the b into temp work space according to permutation */ 635 ii = 0; 636 for (i=0; i<n; i++) { 637 ic = 3*c[i]; 638 t[ii] = b[ic]; 639 t[ii+1] = b[ic+1]; 640 t[ii+2] = b[ic+2]; 641 ii += 3; 642 } 643 644 /* forward solve the U^T */ 645 idx = 0; 646 for (i=0; i<n; i++) { 647 648 v = aa + 9*diag[i]; 649 /* multiply by the inverse of the block diagonal */ 650 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 651 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 652 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 653 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 654 v += 9; 655 656 vi = aj + diag[i] + 1; 657 nz = ai[i+1] - diag[i] - 1; 658 while (nz--) { 659 oidx = 3*(*vi++); 660 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 661 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 662 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 663 v += 9; 664 } 665 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 666 idx += 3; 667 } 668 /* backward solve the L^T */ 669 for (i=n-1; i>=0; i--){ 670 v = aa + 9*diag[i] - 9; 671 vi = aj + diag[i] - 1; 672 nz = diag[i] - ai[i]; 673 idt = 3*i; 674 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 675 while (nz--) { 676 idx = 3*(*vi--); 677 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 678 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 679 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 680 v -= 9; 681 } 682 } 683 684 /* copy t into x according to permutation */ 685 ii = 0; 686 for (i=0; i<n; i++) { 687 ir = 3*r[i]; 688 x[ir] = t[ii]; 689 x[ir+1] = t[ii+1]; 690 x[ir+2] = t[ii+2]; 691 ii += 3; 692 } 693 694 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 695 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 696 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 697 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 698 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 699 PetscFunctionReturn(0); 700 } 701 702 #undef __FUNCT__ 703 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 704 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 705 { 706 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 707 IS iscol=a->col,isrow=a->row; 708 PetscErrorCode ierr; 709 const PetscInt *r,*c,*rout,*cout; 710 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 711 PetscInt *diag = a->diag,ii,ic,ir,oidx; 712 MatScalar *aa=a->a,*v; 713 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 714 PetscScalar *x,*b,*t; 715 716 PetscFunctionBegin; 717 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 718 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 719 t = a->solve_work; 720 721 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 722 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 723 724 /* copy the b into temp work space according to permutation */ 725 ii = 0; 726 for (i=0; i<n; i++) { 727 ic = 4*c[i]; 728 t[ii] = b[ic]; 729 t[ii+1] = b[ic+1]; 730 t[ii+2] = b[ic+2]; 731 t[ii+3] = b[ic+3]; 732 ii += 4; 733 } 734 735 /* forward solve the U^T */ 736 idx = 0; 737 for (i=0; i<n; i++) { 738 739 v = aa + 16*diag[i]; 740 /* multiply by the inverse of the block diagonal */ 741 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 742 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 743 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 744 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 745 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 746 v += 16; 747 748 vi = aj + diag[i] + 1; 749 nz = ai[i+1] - diag[i] - 1; 750 while (nz--) { 751 oidx = 4*(*vi++); 752 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 753 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 754 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 755 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 756 v += 16; 757 } 758 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 759 idx += 4; 760 } 761 /* backward solve the L^T */ 762 for (i=n-1; i>=0; i--){ 763 v = aa + 16*diag[i] - 16; 764 vi = aj + diag[i] - 1; 765 nz = diag[i] - ai[i]; 766 idt = 4*i; 767 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 768 while (nz--) { 769 idx = 4*(*vi--); 770 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 771 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 772 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 773 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 774 v -= 16; 775 } 776 } 777 778 /* copy t into x according to permutation */ 779 ii = 0; 780 for (i=0; i<n; i++) { 781 ir = 4*r[i]; 782 x[ir] = t[ii]; 783 x[ir+1] = t[ii+1]; 784 x[ir+2] = t[ii+2]; 785 x[ir+3] = t[ii+3]; 786 ii += 4; 787 } 788 789 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 790 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 791 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 792 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 793 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 794 PetscFunctionReturn(0); 795 } 796 797 #undef __FUNCT__ 798 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 799 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 800 { 801 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 802 IS iscol=a->col,isrow=a->row; 803 PetscErrorCode ierr; 804 const PetscInt *r,*c,*rout,*cout; 805 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 806 PetscInt *diag = a->diag,ii,ic,ir,oidx; 807 MatScalar *aa=a->a,*v; 808 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 809 PetscScalar *x,*b,*t; 810 811 PetscFunctionBegin; 812 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 813 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 814 t = a->solve_work; 815 816 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 817 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 818 819 /* copy the b into temp work space according to permutation */ 820 ii = 0; 821 for (i=0; i<n; i++) { 822 ic = 5*c[i]; 823 t[ii] = b[ic]; 824 t[ii+1] = b[ic+1]; 825 t[ii+2] = b[ic+2]; 826 t[ii+3] = b[ic+3]; 827 t[ii+4] = b[ic+4]; 828 ii += 5; 829 } 830 831 /* forward solve the U^T */ 832 idx = 0; 833 for (i=0; i<n; i++) { 834 835 v = aa + 25*diag[i]; 836 /* multiply by the inverse of the block diagonal */ 837 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 838 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 839 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 840 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 841 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 842 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 843 v += 25; 844 845 vi = aj + diag[i] + 1; 846 nz = ai[i+1] - diag[i] - 1; 847 while (nz--) { 848 oidx = 5*(*vi++); 849 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 850 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 851 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 852 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 853 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 854 v += 25; 855 } 856 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 857 idx += 5; 858 } 859 /* backward solve the L^T */ 860 for (i=n-1; i>=0; i--){ 861 v = aa + 25*diag[i] - 25; 862 vi = aj + diag[i] - 1; 863 nz = diag[i] - ai[i]; 864 idt = 5*i; 865 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 866 while (nz--) { 867 idx = 5*(*vi--); 868 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 869 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 870 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 871 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 872 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 873 v -= 25; 874 } 875 } 876 877 /* copy t into x according to permutation */ 878 ii = 0; 879 for (i=0; i<n; i++) { 880 ir = 5*r[i]; 881 x[ir] = t[ii]; 882 x[ir+1] = t[ii+1]; 883 x[ir+2] = t[ii+2]; 884 x[ir+3] = t[ii+3]; 885 x[ir+4] = t[ii+4]; 886 ii += 5; 887 } 888 889 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 890 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 891 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 892 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 893 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 894 PetscFunctionReturn(0); 895 } 896 897 #undef __FUNCT__ 898 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 899 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 900 { 901 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 902 IS iscol=a->col,isrow=a->row; 903 PetscErrorCode ierr; 904 const PetscInt *r,*c,*rout,*cout; 905 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 906 PetscInt *diag = a->diag,ii,ic,ir,oidx; 907 MatScalar *aa=a->a,*v; 908 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 909 PetscScalar *x,*b,*t; 910 911 PetscFunctionBegin; 912 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 913 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 914 t = a->solve_work; 915 916 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 917 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 918 919 /* copy the b into temp work space according to permutation */ 920 ii = 0; 921 for (i=0; i<n; i++) { 922 ic = 6*c[i]; 923 t[ii] = b[ic]; 924 t[ii+1] = b[ic+1]; 925 t[ii+2] = b[ic+2]; 926 t[ii+3] = b[ic+3]; 927 t[ii+4] = b[ic+4]; 928 t[ii+5] = b[ic+5]; 929 ii += 6; 930 } 931 932 /* forward solve the U^T */ 933 idx = 0; 934 for (i=0; i<n; i++) { 935 936 v = aa + 36*diag[i]; 937 /* multiply by the inverse of the block diagonal */ 938 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 939 x6 = t[5+idx]; 940 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 941 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 942 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 943 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 944 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 945 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 946 v += 36; 947 948 vi = aj + diag[i] + 1; 949 nz = ai[i+1] - diag[i] - 1; 950 while (nz--) { 951 oidx = 6*(*vi++); 952 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 953 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 954 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 955 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 956 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 957 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 958 v += 36; 959 } 960 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 961 t[5+idx] = s6; 962 idx += 6; 963 } 964 /* backward solve the L^T */ 965 for (i=n-1; i>=0; i--){ 966 v = aa + 36*diag[i] - 36; 967 vi = aj + diag[i] - 1; 968 nz = diag[i] - ai[i]; 969 idt = 6*i; 970 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 971 s6 = t[5+idt]; 972 while (nz--) { 973 idx = 6*(*vi--); 974 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 975 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 976 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 977 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 978 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 979 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 980 v -= 36; 981 } 982 } 983 984 /* copy t into x according to permutation */ 985 ii = 0; 986 for (i=0; i<n; i++) { 987 ir = 6*r[i]; 988 x[ir] = t[ii]; 989 x[ir+1] = t[ii+1]; 990 x[ir+2] = t[ii+2]; 991 x[ir+3] = t[ii+3]; 992 x[ir+4] = t[ii+4]; 993 x[ir+5] = t[ii+5]; 994 ii += 6; 995 } 996 997 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 998 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 999 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1000 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1001 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1002 PetscFunctionReturn(0); 1003 } 1004 1005 #undef __FUNCT__ 1006 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1007 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1008 { 1009 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1010 IS iscol=a->col,isrow=a->row; 1011 PetscErrorCode ierr; 1012 const PetscInt *r,*c,*rout,*cout; 1013 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1014 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1015 MatScalar *aa=a->a,*v; 1016 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1017 PetscScalar *x,*b,*t; 1018 1019 PetscFunctionBegin; 1020 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1021 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1022 t = a->solve_work; 1023 1024 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1025 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1026 1027 /* copy the b into temp work space according to permutation */ 1028 ii = 0; 1029 for (i=0; i<n; i++) { 1030 ic = 7*c[i]; 1031 t[ii] = b[ic]; 1032 t[ii+1] = b[ic+1]; 1033 t[ii+2] = b[ic+2]; 1034 t[ii+3] = b[ic+3]; 1035 t[ii+4] = b[ic+4]; 1036 t[ii+5] = b[ic+5]; 1037 t[ii+6] = b[ic+6]; 1038 ii += 7; 1039 } 1040 1041 /* forward solve the U^T */ 1042 idx = 0; 1043 for (i=0; i<n; i++) { 1044 1045 v = aa + 49*diag[i]; 1046 /* multiply by the inverse of the block diagonal */ 1047 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1048 x6 = t[5+idx]; x7 = t[6+idx]; 1049 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1050 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1051 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1052 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1053 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1054 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1055 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1056 v += 49; 1057 1058 vi = aj + diag[i] + 1; 1059 nz = ai[i+1] - diag[i] - 1; 1060 while (nz--) { 1061 oidx = 7*(*vi++); 1062 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1063 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1064 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1065 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1066 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1067 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1068 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1069 v += 49; 1070 } 1071 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1072 t[5+idx] = s6;t[6+idx] = s7; 1073 idx += 7; 1074 } 1075 /* backward solve the L^T */ 1076 for (i=n-1; i>=0; i--){ 1077 v = aa + 49*diag[i] - 49; 1078 vi = aj + diag[i] - 1; 1079 nz = diag[i] - ai[i]; 1080 idt = 7*i; 1081 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1082 s6 = t[5+idt];s7 = t[6+idt]; 1083 while (nz--) { 1084 idx = 7*(*vi--); 1085 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1086 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1087 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1088 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1089 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1090 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1091 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1092 v -= 49; 1093 } 1094 } 1095 1096 /* copy t into x according to permutation */ 1097 ii = 0; 1098 for (i=0; i<n; i++) { 1099 ir = 7*r[i]; 1100 x[ir] = t[ii]; 1101 x[ir+1] = t[ii+1]; 1102 x[ir+2] = t[ii+2]; 1103 x[ir+3] = t[ii+3]; 1104 x[ir+4] = t[ii+4]; 1105 x[ir+5] = t[ii+5]; 1106 x[ir+6] = t[ii+6]; 1107 ii += 7; 1108 } 1109 1110 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1111 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1112 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1113 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1114 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1115 PetscFunctionReturn(0); 1116 } 1117 1118 /* ----------------------------------------------------------- */ 1119 #undef __FUNCT__ 1120 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1121 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1122 { 1123 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1124 IS iscol=a->col,isrow=a->row; 1125 PetscErrorCode ierr; 1126 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 1127 PetscInt i,n=a->mbs; 1128 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 1129 MatScalar *aa=a->a,*v; 1130 PetscScalar *x,*b,*s,*t,*ls; 1131 1132 PetscFunctionBegin; 1133 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1134 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1135 t = a->solve_work; 1136 1137 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1138 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1139 1140 /* forward solve the lower triangular */ 1141 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1142 for (i=1; i<n; i++) { 1143 v = aa + bs2*ai[i]; 1144 vi = aj + ai[i]; 1145 nz = a->diag[i] - ai[i]; 1146 s = t + bs*i; 1147 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1148 while (nz--) { 1149 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1150 v += bs2; 1151 } 1152 } 1153 /* backward solve the upper triangular */ 1154 ls = a->solve_work + A->cmap->n; 1155 for (i=n-1; i>=0; i--){ 1156 v = aa + bs2*(a->diag[i] + 1); 1157 vi = aj + a->diag[i] + 1; 1158 nz = ai[i+1] - a->diag[i] - 1; 1159 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1160 while (nz--) { 1161 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1162 v += bs2; 1163 } 1164 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1165 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1166 } 1167 1168 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1169 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1170 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1171 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1172 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 1173 PetscFunctionReturn(0); 1174 } 1175 1176 #undef __FUNCT__ 1177 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1178 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1179 { 1180 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1181 IS iscol=a->col,isrow=a->row; 1182 PetscErrorCode ierr; 1183 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 1184 PetscInt i,n=a->mbs,nz,idx,idt,idc; 1185 MatScalar *aa=a->a,*v; 1186 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1187 PetscScalar *x,*b,*t; 1188 1189 PetscFunctionBegin; 1190 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1191 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1192 t = a->solve_work; 1193 1194 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1195 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1196 1197 /* forward solve the lower triangular */ 1198 idx = 7*(*r++); 1199 t[0] = b[idx]; t[1] = b[1+idx]; 1200 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1201 t[5] = b[5+idx]; t[6] = b[6+idx]; 1202 1203 for (i=1; i<n; i++) { 1204 v = aa + 49*ai[i]; 1205 vi = aj + ai[i]; 1206 nz = diag[i] - ai[i]; 1207 idx = 7*(*r++); 1208 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1209 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1210 while (nz--) { 1211 idx = 7*(*vi++); 1212 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1213 x4 = t[3+idx];x5 = t[4+idx]; 1214 x6 = t[5+idx];x7 = t[6+idx]; 1215 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1216 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1217 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1218 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1219 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1220 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1221 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1222 v += 49; 1223 } 1224 idx = 7*i; 1225 t[idx] = s1;t[1+idx] = s2; 1226 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1227 t[5+idx] = s6;t[6+idx] = s7; 1228 } 1229 /* backward solve the upper triangular */ 1230 for (i=n-1; i>=0; i--){ 1231 v = aa + 49*diag[i] + 49; 1232 vi = aj + diag[i] + 1; 1233 nz = ai[i+1] - diag[i] - 1; 1234 idt = 7*i; 1235 s1 = t[idt]; s2 = t[1+idt]; 1236 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1237 s6 = t[5+idt];s7 = t[6+idt]; 1238 while (nz--) { 1239 idx = 7*(*vi++); 1240 x1 = t[idx]; x2 = t[1+idx]; 1241 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1242 x6 = t[5+idx]; x7 = t[6+idx]; 1243 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1244 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1245 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1246 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1247 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1248 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1249 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1250 v += 49; 1251 } 1252 idc = 7*(*c--); 1253 v = aa + 49*diag[i]; 1254 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1255 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1256 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1257 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1258 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1259 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1260 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1261 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1262 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1263 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1264 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1265 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1266 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1267 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1268 } 1269 1270 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1271 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1272 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1273 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1274 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1275 PetscFunctionReturn(0); 1276 } 1277 1278 #undef __FUNCT__ 1279 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1280 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1281 { 1282 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1283 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1284 PetscErrorCode ierr; 1285 PetscInt *diag = a->diag,jdx; 1286 const MatScalar *aa=a->a,*v; 1287 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1288 const PetscScalar *b; 1289 1290 PetscFunctionBegin; 1291 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1292 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1293 /* forward solve the lower triangular */ 1294 idx = 0; 1295 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1296 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1297 x[6] = b[6+idx]; 1298 for (i=1; i<n; i++) { 1299 v = aa + 49*ai[i]; 1300 vi = aj + ai[i]; 1301 nz = diag[i] - ai[i]; 1302 idx = 7*i; 1303 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1304 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1305 s7 = b[6+idx]; 1306 while (nz--) { 1307 jdx = 7*(*vi++); 1308 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1309 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1310 x7 = x[6+jdx]; 1311 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1312 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1313 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1314 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1315 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1316 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1317 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1318 v += 49; 1319 } 1320 x[idx] = s1; 1321 x[1+idx] = s2; 1322 x[2+idx] = s3; 1323 x[3+idx] = s4; 1324 x[4+idx] = s5; 1325 x[5+idx] = s6; 1326 x[6+idx] = s7; 1327 } 1328 /* backward solve the upper triangular */ 1329 for (i=n-1; i>=0; i--){ 1330 v = aa + 49*diag[i] + 49; 1331 vi = aj + diag[i] + 1; 1332 nz = ai[i+1] - diag[i] - 1; 1333 idt = 7*i; 1334 s1 = x[idt]; s2 = x[1+idt]; 1335 s3 = x[2+idt]; s4 = x[3+idt]; 1336 s5 = x[4+idt]; s6 = x[5+idt]; 1337 s7 = x[6+idt]; 1338 while (nz--) { 1339 idx = 7*(*vi++); 1340 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1341 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1342 x7 = x[6+idx]; 1343 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1344 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1345 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1346 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1347 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1348 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1349 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1350 v += 49; 1351 } 1352 v = aa + 49*diag[i]; 1353 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1354 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1355 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1356 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1357 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1358 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1359 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1360 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1361 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1362 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1363 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1364 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1365 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1366 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1367 } 1368 1369 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1370 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1371 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1372 PetscFunctionReturn(0); 1373 } 1374 1375 #undef __FUNCT__ 1376 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct" 1377 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1378 { 1379 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1380 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 1381 PetscErrorCode ierr; 1382 PetscInt idx,jdx,idt; 1383 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1384 const MatScalar *aa=a->a,*v; 1385 PetscScalar *x; 1386 const PetscScalar *b; 1387 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1388 1389 PetscFunctionBegin; 1390 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1391 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1392 /* forward solve the lower triangular */ 1393 idx = 0; 1394 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1395 x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 1396 for (i=1; i<n; i++) { 1397 v = aa + bs2*ai[i]; 1398 vi = aj + ai[i]; 1399 nz = ai[i+1] - ai[i]; 1400 idx = bs*i; 1401 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1402 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1403 while (nz--) { 1404 jdx = bs*(*vi++); 1405 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1406 x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 1407 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1408 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1409 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1410 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1411 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1412 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1413 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1414 v += bs2; 1415 } 1416 1417 x[idx] = s1; 1418 x[1+idx] = s2; 1419 x[2+idx] = s3; 1420 x[3+idx] = s4; 1421 x[4+idx] = s5; 1422 x[5+idx] = s6; 1423 x[6+idx] = s7; 1424 } 1425 1426 /* backward solve the upper triangular */ 1427 for (i=n-1; i>=0; i--){ 1428 v = aa + bs2*ai[2*n-i]; 1429 vi = aj + ai[2*n-i]; 1430 nz = ai[2*n-i +1] - ai[2*n-i]-1; 1431 idt = bs*i; 1432 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1433 s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 1434 while (nz--) { 1435 idx = bs*(*vi++); 1436 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1437 x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 1438 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1439 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1440 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1441 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1442 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1443 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1444 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1445 v += bs2; 1446 } 1447 /* x = inv_diagonal*x */ 1448 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1449 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1450 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1451 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1452 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1453 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1454 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1455 } 1456 1457 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1458 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1459 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1460 PetscFunctionReturn(0); 1461 } 1462 1463 #undef __FUNCT__ 1464 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1465 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1466 { 1467 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1468 IS iscol=a->col,isrow=a->row; 1469 PetscErrorCode ierr; 1470 const PetscInt *r,*c,*rout,*cout; 1471 PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1472 const MatScalar *aa=a->a,*v; 1473 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1474 const PetscScalar *b; 1475 PetscFunctionBegin; 1476 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1477 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1478 t = a->solve_work; 1479 1480 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1481 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1482 1483 /* forward solve the lower triangular */ 1484 idx = 6*(*r++); 1485 t[0] = b[idx]; t[1] = b[1+idx]; 1486 t[2] = b[2+idx]; t[3] = b[3+idx]; 1487 t[4] = b[4+idx]; t[5] = b[5+idx]; 1488 for (i=1; i<n; i++) { 1489 v = aa + 36*ai[i]; 1490 vi = aj + ai[i]; 1491 nz = diag[i] - ai[i]; 1492 idx = 6*(*r++); 1493 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1494 s5 = b[4+idx]; s6 = b[5+idx]; 1495 while (nz--) { 1496 idx = 6*(*vi++); 1497 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1498 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1499 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1500 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1501 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1502 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1503 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1504 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1505 v += 36; 1506 } 1507 idx = 6*i; 1508 t[idx] = s1;t[1+idx] = s2; 1509 t[2+idx] = s3;t[3+idx] = s4; 1510 t[4+idx] = s5;t[5+idx] = s6; 1511 } 1512 /* backward solve the upper triangular */ 1513 for (i=n-1; i>=0; i--){ 1514 v = aa + 36*diag[i] + 36; 1515 vi = aj + diag[i] + 1; 1516 nz = ai[i+1] - diag[i] - 1; 1517 idt = 6*i; 1518 s1 = t[idt]; s2 = t[1+idt]; 1519 s3 = t[2+idt];s4 = t[3+idt]; 1520 s5 = t[4+idt];s6 = t[5+idt]; 1521 while (nz--) { 1522 idx = 6*(*vi++); 1523 x1 = t[idx]; x2 = t[1+idx]; 1524 x3 = t[2+idx]; x4 = t[3+idx]; 1525 x5 = t[4+idx]; x6 = t[5+idx]; 1526 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1527 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1528 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1529 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1530 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1531 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1532 v += 36; 1533 } 1534 idc = 6*(*c--); 1535 v = aa + 36*diag[i]; 1536 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1537 v[18]*s4+v[24]*s5+v[30]*s6; 1538 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1539 v[19]*s4+v[25]*s5+v[31]*s6; 1540 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1541 v[20]*s4+v[26]*s5+v[32]*s6; 1542 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1543 v[21]*s4+v[27]*s5+v[33]*s6; 1544 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1545 v[22]*s4+v[28]*s5+v[34]*s6; 1546 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1547 v[23]*s4+v[29]*s5+v[35]*s6; 1548 } 1549 1550 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1551 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1552 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1553 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1554 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1555 PetscFunctionReturn(0); 1556 } 1557 1558 #undef __FUNCT__ 1559 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1560 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1561 { 1562 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1563 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1564 PetscErrorCode ierr; 1565 PetscInt *diag = a->diag,jdx; 1566 const MatScalar *aa=a->a,*v; 1567 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1568 const PetscScalar *b; 1569 1570 PetscFunctionBegin; 1571 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1572 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1573 /* forward solve the lower triangular */ 1574 idx = 0; 1575 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1576 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1577 for (i=1; i<n; i++) { 1578 v = aa + 36*ai[i]; 1579 vi = aj + ai[i]; 1580 nz = diag[i] - ai[i]; 1581 idx = 6*i; 1582 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1583 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1584 while (nz--) { 1585 jdx = 6*(*vi++); 1586 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1587 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1588 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1589 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1590 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1591 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1592 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1593 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1594 v += 36; 1595 } 1596 x[idx] = s1; 1597 x[1+idx] = s2; 1598 x[2+idx] = s3; 1599 x[3+idx] = s4; 1600 x[4+idx] = s5; 1601 x[5+idx] = s6; 1602 } 1603 /* backward solve the upper triangular */ 1604 for (i=n-1; i>=0; i--){ 1605 v = aa + 36*diag[i] + 36; 1606 vi = aj + diag[i] + 1; 1607 nz = ai[i+1] - diag[i] - 1; 1608 idt = 6*i; 1609 s1 = x[idt]; s2 = x[1+idt]; 1610 s3 = x[2+idt]; s4 = x[3+idt]; 1611 s5 = x[4+idt]; s6 = x[5+idt]; 1612 while (nz--) { 1613 idx = 6*(*vi++); 1614 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1615 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1616 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1617 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1618 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1619 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1620 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1621 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1622 v += 36; 1623 } 1624 v = aa + 36*diag[i]; 1625 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1626 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1627 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1628 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1629 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1630 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1631 } 1632 1633 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1634 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1635 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1636 PetscFunctionReturn(0); 1637 } 1638 1639 #undef __FUNCT__ 1640 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct" 1641 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1642 { 1643 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1644 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 1645 PetscErrorCode ierr; 1646 PetscInt idx,jdx,idt; 1647 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1648 const MatScalar *aa=a->a,*v; 1649 PetscScalar *x; 1650 const PetscScalar *b; 1651 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1652 1653 PetscFunctionBegin; 1654 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1655 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1656 /* forward solve the lower triangular */ 1657 idx = 0; 1658 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1659 x[4] = b[4+idx];x[5] = b[5+idx]; 1660 for (i=1; i<n; i++) { 1661 v = aa + bs2*ai[i]; 1662 vi = aj + ai[i]; 1663 nz = ai[i+1] - ai[i]; 1664 idx = bs*i; 1665 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1666 s5 = b[4+idx];s6 = b[5+idx]; 1667 while (nz--) { 1668 jdx = bs*(*vi++); 1669 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1670 x5 = x[4+jdx]; x6 = x[5+jdx]; 1671 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1672 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 1673 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1674 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1675 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1676 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1677 v += bs2; 1678 } 1679 1680 x[idx] = s1; 1681 x[1+idx] = s2; 1682 x[2+idx] = s3; 1683 x[3+idx] = s4; 1684 x[4+idx] = s5; 1685 x[5+idx] = s6; 1686 } 1687 1688 /* backward solve the upper triangular */ 1689 for (i=n-1; i>=0; i--){ 1690 v = aa + bs2*ai[2*n-i]; 1691 vi = aj + ai[2*n-i]; 1692 nz = ai[2*n-i +1] - ai[2*n-i]-1; 1693 idt = bs*i; 1694 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1695 s5 = x[4+idt];s6 = x[5+idt]; 1696 while (nz--) { 1697 idx = bs*(*vi++); 1698 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1699 x5 = x[4+idx];x6 = x[5+idx]; 1700 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1701 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 1702 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1703 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1704 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1705 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1706 v += bs2; 1707 } 1708 /* x = inv_diagonal*x */ 1709 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1710 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1711 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1712 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1713 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1714 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1715 } 1716 1717 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1718 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1719 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1720 PetscFunctionReturn(0); 1721 } 1722 #undef __FUNCT__ 1723 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1724 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1725 { 1726 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1727 IS iscol=a->col,isrow=a->row; 1728 PetscErrorCode ierr; 1729 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 1730 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1731 const MatScalar *aa=a->a,*v; 1732 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1733 const PetscScalar *b; 1734 1735 PetscFunctionBegin; 1736 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1737 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1738 t = a->solve_work; 1739 1740 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1741 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1742 1743 /* forward solve the lower triangular */ 1744 idx = 5*(*r++); 1745 t[0] = b[idx]; t[1] = b[1+idx]; 1746 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1747 for (i=1; i<n; i++) { 1748 v = aa + 25*ai[i]; 1749 vi = aj + ai[i]; 1750 nz = diag[i] - ai[i]; 1751 idx = 5*(*r++); 1752 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1753 s5 = b[4+idx]; 1754 while (nz--) { 1755 idx = 5*(*vi++); 1756 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1757 x4 = t[3+idx];x5 = t[4+idx]; 1758 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1759 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1760 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1761 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1762 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1763 v += 25; 1764 } 1765 idx = 5*i; 1766 t[idx] = s1;t[1+idx] = s2; 1767 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1768 } 1769 /* backward solve the upper triangular */ 1770 for (i=n-1; i>=0; i--){ 1771 v = aa + 25*diag[i] + 25; 1772 vi = aj + diag[i] + 1; 1773 nz = ai[i+1] - diag[i] - 1; 1774 idt = 5*i; 1775 s1 = t[idt]; s2 = t[1+idt]; 1776 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1777 while (nz--) { 1778 idx = 5*(*vi++); 1779 x1 = t[idx]; x2 = t[1+idx]; 1780 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1781 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1782 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1783 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1784 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1785 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1786 v += 25; 1787 } 1788 idc = 5*(*c--); 1789 v = aa + 25*diag[i]; 1790 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1791 v[15]*s4+v[20]*s5; 1792 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1793 v[16]*s4+v[21]*s5; 1794 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1795 v[17]*s4+v[22]*s5; 1796 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1797 v[18]*s4+v[23]*s5; 1798 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1799 v[19]*s4+v[24]*s5; 1800 } 1801 1802 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1803 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1804 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1805 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1806 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1807 PetscFunctionReturn(0); 1808 } 1809 1810 #undef __FUNCT__ 1811 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1812 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 1813 { 1814 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1815 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1816 PetscErrorCode ierr; 1817 PetscInt *diag = a->diag,jdx; 1818 const MatScalar *aa=a->a,*v; 1819 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1820 const PetscScalar *b; 1821 1822 PetscFunctionBegin; 1823 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1824 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1825 /* forward solve the lower triangular */ 1826 idx = 0; 1827 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1828 for (i=1; i<n; i++) { 1829 v = aa + 25*ai[i]; 1830 vi = aj + ai[i]; 1831 nz = diag[i] - ai[i]; 1832 idx = 5*i; 1833 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1834 while (nz--) { 1835 jdx = 5*(*vi++); 1836 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1837 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1838 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1839 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1840 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1841 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1842 v += 25; 1843 } 1844 x[idx] = s1; 1845 x[1+idx] = s2; 1846 x[2+idx] = s3; 1847 x[3+idx] = s4; 1848 x[4+idx] = s5; 1849 } 1850 /* backward solve the upper triangular */ 1851 for (i=n-1; i>=0; i--){ 1852 v = aa + 25*diag[i] + 25; 1853 vi = aj + diag[i] + 1; 1854 nz = ai[i+1] - diag[i] - 1; 1855 idt = 5*i; 1856 s1 = x[idt]; s2 = x[1+idt]; 1857 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1858 while (nz--) { 1859 idx = 5*(*vi++); 1860 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1861 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1862 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1863 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1864 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1865 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1866 v += 25; 1867 } 1868 v = aa + 25*diag[i]; 1869 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1870 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1871 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1872 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1873 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1874 } 1875 1876 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1877 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1878 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1879 PetscFunctionReturn(0); 1880 } 1881 1882 #undef __FUNCT__ 1883 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct" 1884 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1885 { 1886 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1887 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1888 PetscErrorCode ierr; 1889 PetscInt jdx; 1890 const MatScalar *aa=a->a,*v; 1891 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1892 const PetscScalar *b; 1893 1894 PetscFunctionBegin; 1895 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1896 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1897 /* forward solve the lower triangular */ 1898 idx = 0; 1899 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1900 for (i=1; i<n; i++) { 1901 v = aa + 25*ai[i]; 1902 vi = aj + ai[i]; 1903 nz = ai[i+1] - ai[i]; 1904 idx = 5*i; 1905 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1906 while (nz--) { 1907 jdx = 5*(*vi++); 1908 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1909 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1910 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1911 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1912 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1913 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1914 v += 25; 1915 } 1916 x[idx] = s1; 1917 x[1+idx] = s2; 1918 x[2+idx] = s3; 1919 x[3+idx] = s4; 1920 x[4+idx] = s5; 1921 } 1922 1923 /* backward solve the upper triangular */ 1924 for (i=n-1; i>=0; i--){ 1925 v = aa + 25*ai[2*n-i]; 1926 vi = aj + ai[2*n-i]; 1927 nz = ai[2*n-i +1] - ai[2*n-i]-1; 1928 idt = 5*i; 1929 s1 = x[idt]; s2 = x[1+idt]; 1930 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1931 while (nz--) { 1932 idx = 5*(*vi++); 1933 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1934 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1935 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1936 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1937 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1938 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1939 v += 25; 1940 } 1941 /* x = inv_diagonal*x */ 1942 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1943 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1944 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1945 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1946 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1947 } 1948 1949 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1950 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1951 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1952 PetscFunctionReturn(0); 1953 } 1954 1955 #undef __FUNCT__ 1956 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1957 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1958 { 1959 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1960 IS iscol=a->col,isrow=a->row; 1961 PetscErrorCode ierr; 1962 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1963 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1964 const MatScalar *aa=a->a,*v; 1965 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1966 const PetscScalar *b; 1967 1968 PetscFunctionBegin; 1969 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1970 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1971 t = a->solve_work; 1972 1973 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1974 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1975 1976 /* forward solve the lower triangular */ 1977 idx = 4*(*r++); 1978 t[0] = b[idx]; t[1] = b[1+idx]; 1979 t[2] = b[2+idx]; t[3] = b[3+idx]; 1980 for (i=1; i<n; i++) { 1981 v = aa + 16*ai[i]; 1982 vi = aj + ai[i]; 1983 nz = diag[i] - ai[i]; 1984 idx = 4*(*r++); 1985 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1986 while (nz--) { 1987 idx = 4*(*vi++); 1988 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1989 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1990 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1991 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1992 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1993 v += 16; 1994 } 1995 idx = 4*i; 1996 t[idx] = s1;t[1+idx] = s2; 1997 t[2+idx] = s3;t[3+idx] = s4; 1998 } 1999 /* backward solve the upper triangular */ 2000 for (i=n-1; i>=0; i--){ 2001 v = aa + 16*diag[i] + 16; 2002 vi = aj + diag[i] + 1; 2003 nz = ai[i+1] - diag[i] - 1; 2004 idt = 4*i; 2005 s1 = t[idt]; s2 = t[1+idt]; 2006 s3 = t[2+idt];s4 = t[3+idt]; 2007 while (nz--) { 2008 idx = 4*(*vi++); 2009 x1 = t[idx]; x2 = t[1+idx]; 2010 x3 = t[2+idx]; x4 = t[3+idx]; 2011 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2012 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2013 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2014 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2015 v += 16; 2016 } 2017 idc = 4*(*c--); 2018 v = aa + 16*diag[i]; 2019 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2020 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2021 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2022 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2023 } 2024 2025 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2026 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2027 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2028 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2029 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2030 PetscFunctionReturn(0); 2031 } 2032 2033 #undef __FUNCT__ 2034 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 2035 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 2036 { 2037 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2038 IS iscol=a->col,isrow=a->row; 2039 PetscErrorCode ierr; 2040 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2041 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2042 const MatScalar *aa=a->a,*v; 2043 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 2044 PetscScalar *x; 2045 const PetscScalar *b; 2046 2047 PetscFunctionBegin; 2048 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2049 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2050 t = (MatScalar *)a->solve_work; 2051 2052 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2053 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2054 2055 /* forward solve the lower triangular */ 2056 idx = 4*(*r++); 2057 t[0] = (MatScalar)b[idx]; 2058 t[1] = (MatScalar)b[1+idx]; 2059 t[2] = (MatScalar)b[2+idx]; 2060 t[3] = (MatScalar)b[3+idx]; 2061 for (i=1; i<n; i++) { 2062 v = aa + 16*ai[i]; 2063 vi = aj + ai[i]; 2064 nz = diag[i] - ai[i]; 2065 idx = 4*(*r++); 2066 s1 = (MatScalar)b[idx]; 2067 s2 = (MatScalar)b[1+idx]; 2068 s3 = (MatScalar)b[2+idx]; 2069 s4 = (MatScalar)b[3+idx]; 2070 while (nz--) { 2071 idx = 4*(*vi++); 2072 x1 = t[idx]; 2073 x2 = t[1+idx]; 2074 x3 = t[2+idx]; 2075 x4 = t[3+idx]; 2076 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2077 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2078 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2079 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2080 v += 16; 2081 } 2082 idx = 4*i; 2083 t[idx] = s1; 2084 t[1+idx] = s2; 2085 t[2+idx] = s3; 2086 t[3+idx] = s4; 2087 } 2088 /* backward solve the upper triangular */ 2089 for (i=n-1; i>=0; i--){ 2090 v = aa + 16*diag[i] + 16; 2091 vi = aj + diag[i] + 1; 2092 nz = ai[i+1] - diag[i] - 1; 2093 idt = 4*i; 2094 s1 = t[idt]; 2095 s2 = t[1+idt]; 2096 s3 = t[2+idt]; 2097 s4 = t[3+idt]; 2098 while (nz--) { 2099 idx = 4*(*vi++); 2100 x1 = t[idx]; 2101 x2 = t[1+idx]; 2102 x3 = t[2+idx]; 2103 x4 = t[3+idx]; 2104 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2105 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2106 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2107 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2108 v += 16; 2109 } 2110 idc = 4*(*c--); 2111 v = aa + 16*diag[i]; 2112 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2113 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2114 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2115 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2116 x[idc] = (PetscScalar)t[idt]; 2117 x[1+idc] = (PetscScalar)t[1+idt]; 2118 x[2+idc] = (PetscScalar)t[2+idt]; 2119 x[3+idc] = (PetscScalar)t[3+idt]; 2120 } 2121 2122 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2123 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2124 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2125 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2126 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2127 PetscFunctionReturn(0); 2128 } 2129 2130 #if defined (PETSC_HAVE_SSE) 2131 2132 #include PETSC_HAVE_SSE 2133 2134 #undef __FUNCT__ 2135 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 2136 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 2137 { 2138 /* 2139 Note: This code uses demotion of double 2140 to float when performing the mixed-mode computation. 2141 This may not be numerically reasonable for all applications. 2142 */ 2143 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2144 IS iscol=a->col,isrow=a->row; 2145 PetscErrorCode ierr; 2146 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 2147 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2148 MatScalar *aa=a->a,*v; 2149 PetscScalar *x,*b,*t; 2150 2151 /* Make space in temp stack for 16 Byte Aligned arrays */ 2152 float ssealignedspace[11],*tmps,*tmpx; 2153 unsigned long offset; 2154 2155 PetscFunctionBegin; 2156 SSE_SCOPE_BEGIN; 2157 2158 offset = (unsigned long)ssealignedspace % 16; 2159 if (offset) offset = (16 - offset)/4; 2160 tmps = &ssealignedspace[offset]; 2161 tmpx = &ssealignedspace[offset+4]; 2162 PREFETCH_NTA(aa+16*ai[1]); 2163 2164 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2165 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2166 t = a->solve_work; 2167 2168 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2169 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2170 2171 /* forward solve the lower triangular */ 2172 idx = 4*(*r++); 2173 t[0] = b[idx]; t[1] = b[1+idx]; 2174 t[2] = b[2+idx]; t[3] = b[3+idx]; 2175 v = aa + 16*ai[1]; 2176 2177 for (i=1; i<n;) { 2178 PREFETCH_NTA(&v[8]); 2179 vi = aj + ai[i]; 2180 nz = diag[i] - ai[i]; 2181 idx = 4*(*r++); 2182 2183 /* Demote sum from double to float */ 2184 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 2185 LOAD_PS(tmps,XMM7); 2186 2187 while (nz--) { 2188 PREFETCH_NTA(&v[16]); 2189 idx = 4*(*vi++); 2190 2191 /* Demote solution (so far) from double to float */ 2192 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 2193 2194 /* 4x4 Matrix-Vector product with negative accumulation: */ 2195 SSE_INLINE_BEGIN_2(tmpx,v) 2196 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2197 2198 /* First Column */ 2199 SSE_COPY_PS(XMM0,XMM6) 2200 SSE_SHUFFLE(XMM0,XMM0,0x00) 2201 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2202 SSE_SUB_PS(XMM7,XMM0) 2203 2204 /* Second Column */ 2205 SSE_COPY_PS(XMM1,XMM6) 2206 SSE_SHUFFLE(XMM1,XMM1,0x55) 2207 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2208 SSE_SUB_PS(XMM7,XMM1) 2209 2210 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2211 2212 /* Third Column */ 2213 SSE_COPY_PS(XMM2,XMM6) 2214 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2215 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2216 SSE_SUB_PS(XMM7,XMM2) 2217 2218 /* Fourth Column */ 2219 SSE_COPY_PS(XMM3,XMM6) 2220 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2221 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2222 SSE_SUB_PS(XMM7,XMM3) 2223 SSE_INLINE_END_2 2224 2225 v += 16; 2226 } 2227 idx = 4*i; 2228 v = aa + 16*ai[++i]; 2229 PREFETCH_NTA(v); 2230 STORE_PS(tmps,XMM7); 2231 2232 /* Promote result from float to double */ 2233 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 2234 } 2235 /* backward solve the upper triangular */ 2236 idt = 4*(n-1); 2237 ai16 = 16*diag[n-1]; 2238 v = aa + ai16 + 16; 2239 for (i=n-1; i>=0;){ 2240 PREFETCH_NTA(&v[8]); 2241 vi = aj + diag[i] + 1; 2242 nz = ai[i+1] - diag[i] - 1; 2243 2244 /* Demote accumulator from double to float */ 2245 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 2246 LOAD_PS(tmps,XMM7); 2247 2248 while (nz--) { 2249 PREFETCH_NTA(&v[16]); 2250 idx = 4*(*vi++); 2251 2252 /* Demote solution (so far) from double to float */ 2253 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 2254 2255 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2256 SSE_INLINE_BEGIN_2(tmpx,v) 2257 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2258 2259 /* First Column */ 2260 SSE_COPY_PS(XMM0,XMM6) 2261 SSE_SHUFFLE(XMM0,XMM0,0x00) 2262 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2263 SSE_SUB_PS(XMM7,XMM0) 2264 2265 /* Second Column */ 2266 SSE_COPY_PS(XMM1,XMM6) 2267 SSE_SHUFFLE(XMM1,XMM1,0x55) 2268 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2269 SSE_SUB_PS(XMM7,XMM1) 2270 2271 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2272 2273 /* Third Column */ 2274 SSE_COPY_PS(XMM2,XMM6) 2275 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2276 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2277 SSE_SUB_PS(XMM7,XMM2) 2278 2279 /* Fourth Column */ 2280 SSE_COPY_PS(XMM3,XMM6) 2281 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2282 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2283 SSE_SUB_PS(XMM7,XMM3) 2284 SSE_INLINE_END_2 2285 v += 16; 2286 } 2287 v = aa + ai16; 2288 ai16 = 16*diag[--i]; 2289 PREFETCH_NTA(aa+ai16+16); 2290 /* 2291 Scale the result by the diagonal 4x4 block, 2292 which was inverted as part of the factorization 2293 */ 2294 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 2295 /* First Column */ 2296 SSE_COPY_PS(XMM0,XMM7) 2297 SSE_SHUFFLE(XMM0,XMM0,0x00) 2298 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2299 2300 /* Second Column */ 2301 SSE_COPY_PS(XMM1,XMM7) 2302 SSE_SHUFFLE(XMM1,XMM1,0x55) 2303 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2304 SSE_ADD_PS(XMM0,XMM1) 2305 2306 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2307 2308 /* Third Column */ 2309 SSE_COPY_PS(XMM2,XMM7) 2310 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2311 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2312 SSE_ADD_PS(XMM0,XMM2) 2313 2314 /* Fourth Column */ 2315 SSE_COPY_PS(XMM3,XMM7) 2316 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2317 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2318 SSE_ADD_PS(XMM0,XMM3) 2319 2320 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2321 SSE_INLINE_END_3 2322 2323 /* Promote solution from float to double */ 2324 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 2325 2326 /* Apply reordering to t and stream into x. */ 2327 /* This way, x doesn't pollute the cache. */ 2328 /* Be careful with size: 2 doubles = 4 floats! */ 2329 idc = 4*(*c--); 2330 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 2331 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 2332 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 2333 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 2334 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 2335 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 2336 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 2337 SSE_INLINE_END_2 2338 v = aa + ai16 + 16; 2339 idt -= 4; 2340 } 2341 2342 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2343 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2344 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2345 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2346 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2347 SSE_SCOPE_END; 2348 PetscFunctionReturn(0); 2349 } 2350 2351 #endif 2352 2353 2354 /* 2355 Special case where the matrix was ILU(0) factored in the natural 2356 ordering. This eliminates the need for the column and row permutation. 2357 */ 2358 #undef __FUNCT__ 2359 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2360 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 2361 { 2362 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2363 PetscInt n=a->mbs; 2364 const PetscInt *ai=a->i,*aj=a->j; 2365 PetscErrorCode ierr; 2366 const PetscInt *diag = a->diag; 2367 const MatScalar *aa=a->a; 2368 PetscScalar *x; 2369 const PetscScalar *b; 2370 2371 PetscFunctionBegin; 2372 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2373 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2374 2375 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 2376 { 2377 static PetscScalar w[2000]; /* very BAD need to fix */ 2378 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 2379 } 2380 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 2381 { 2382 static PetscScalar w[2000]; /* very BAD need to fix */ 2383 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 2384 } 2385 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 2386 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2387 #else 2388 { 2389 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2390 const MatScalar *v; 2391 PetscInt jdx,idt,idx,nz,i,ai16; 2392 const PetscInt *vi; 2393 2394 /* forward solve the lower triangular */ 2395 idx = 0; 2396 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 2397 for (i=1; i<n; i++) { 2398 v = aa + 16*ai[i]; 2399 vi = aj + ai[i]; 2400 nz = diag[i] - ai[i]; 2401 idx += 4; 2402 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2403 while (nz--) { 2404 jdx = 4*(*vi++); 2405 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2406 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2407 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2408 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2409 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2410 v += 16; 2411 } 2412 x[idx] = s1; 2413 x[1+idx] = s2; 2414 x[2+idx] = s3; 2415 x[3+idx] = s4; 2416 } 2417 /* backward solve the upper triangular */ 2418 idt = 4*(n-1); 2419 for (i=n-1; i>=0; i--){ 2420 ai16 = 16*diag[i]; 2421 v = aa + ai16 + 16; 2422 vi = aj + diag[i] + 1; 2423 nz = ai[i+1] - diag[i] - 1; 2424 s1 = x[idt]; s2 = x[1+idt]; 2425 s3 = x[2+idt];s4 = x[3+idt]; 2426 while (nz--) { 2427 idx = 4*(*vi++); 2428 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2429 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2430 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2431 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2432 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2433 v += 16; 2434 } 2435 v = aa + ai16; 2436 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2437 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2438 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2439 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2440 idt -= 4; 2441 } 2442 } 2443 #endif 2444 2445 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2446 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2447 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2448 PetscFunctionReturn(0); 2449 } 2450 2451 #undef __FUNCT__ 2452 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct" 2453 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2454 { 2455 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2456 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 2457 PetscErrorCode ierr; 2458 PetscInt idx,jdx,idt; 2459 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2460 const MatScalar *aa=a->a,*v; 2461 PetscScalar *x; 2462 const PetscScalar *b; 2463 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2464 2465 PetscFunctionBegin; 2466 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2467 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2468 /* forward solve the lower triangular */ 2469 idx = 0; 2470 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2471 for (i=1; i<n; i++) { 2472 v = aa + bs2*ai[i]; 2473 vi = aj + ai[i]; 2474 nz = ai[i+1] - ai[i]; 2475 idx = bs*i; 2476 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2477 while (nz--) { 2478 jdx = bs*(*vi++); 2479 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2480 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2481 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2482 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2483 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2484 2485 v += bs2; 2486 } 2487 2488 x[idx] = s1; 2489 x[1+idx] = s2; 2490 x[2+idx] = s3; 2491 x[3+idx] = s4; 2492 } 2493 2494 /* backward solve the upper triangular */ 2495 for (i=n-1; i>=0; i--){ 2496 v = aa + bs2*ai[2*n-i]; 2497 vi = aj + ai[2*n-i]; 2498 nz = ai[2*n-i +1] - ai[2*n-i]-1; 2499 idt = bs*i; 2500 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2501 2502 while (nz--) { 2503 idx = bs*(*vi++); 2504 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2505 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2506 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2507 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2508 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2509 2510 v += bs2; 2511 } 2512 /* x = inv_diagonal*x */ 2513 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2514 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 2515 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2516 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2517 2518 } 2519 2520 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2521 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2522 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2523 PetscFunctionReturn(0); 2524 } 2525 2526 2527 2528 #undef __FUNCT__ 2529 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2530 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2531 { 2532 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2533 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2534 PetscErrorCode ierr; 2535 PetscInt *diag = a->diag; 2536 MatScalar *aa=a->a; 2537 PetscScalar *x,*b; 2538 2539 PetscFunctionBegin; 2540 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2541 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2542 2543 { 2544 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2545 MatScalar *v,*t=(MatScalar *)x; 2546 PetscInt jdx,idt,idx,nz,*vi,i,ai16; 2547 2548 /* forward solve the lower triangular */ 2549 idx = 0; 2550 t[0] = (MatScalar)b[0]; 2551 t[1] = (MatScalar)b[1]; 2552 t[2] = (MatScalar)b[2]; 2553 t[3] = (MatScalar)b[3]; 2554 for (i=1; i<n; i++) { 2555 v = aa + 16*ai[i]; 2556 vi = aj + ai[i]; 2557 nz = diag[i] - ai[i]; 2558 idx += 4; 2559 s1 = (MatScalar)b[idx]; 2560 s2 = (MatScalar)b[1+idx]; 2561 s3 = (MatScalar)b[2+idx]; 2562 s4 = (MatScalar)b[3+idx]; 2563 while (nz--) { 2564 jdx = 4*(*vi++); 2565 x1 = t[jdx]; 2566 x2 = t[1+jdx]; 2567 x3 = t[2+jdx]; 2568 x4 = t[3+jdx]; 2569 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2570 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2571 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2572 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2573 v += 16; 2574 } 2575 t[idx] = s1; 2576 t[1+idx] = s2; 2577 t[2+idx] = s3; 2578 t[3+idx] = s4; 2579 } 2580 /* backward solve the upper triangular */ 2581 idt = 4*(n-1); 2582 for (i=n-1; i>=0; i--){ 2583 ai16 = 16*diag[i]; 2584 v = aa + ai16 + 16; 2585 vi = aj + diag[i] + 1; 2586 nz = ai[i+1] - diag[i] - 1; 2587 s1 = t[idt]; 2588 s2 = t[1+idt]; 2589 s3 = t[2+idt]; 2590 s4 = t[3+idt]; 2591 while (nz--) { 2592 idx = 4*(*vi++); 2593 x1 = (MatScalar)x[idx]; 2594 x2 = (MatScalar)x[1+idx]; 2595 x3 = (MatScalar)x[2+idx]; 2596 x4 = (MatScalar)x[3+idx]; 2597 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2598 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2599 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2600 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2601 v += 16; 2602 } 2603 v = aa + ai16; 2604 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2605 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2606 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2607 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2608 idt -= 4; 2609 } 2610 } 2611 2612 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2613 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2614 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2615 PetscFunctionReturn(0); 2616 } 2617 2618 #if defined (PETSC_HAVE_SSE) 2619 2620 #include PETSC_HAVE_SSE 2621 #undef __FUNCT__ 2622 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 2623 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 2624 { 2625 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2626 unsigned short *aj=(unsigned short *)a->j; 2627 PetscErrorCode ierr; 2628 int *ai=a->i,n=a->mbs,*diag = a->diag; 2629 MatScalar *aa=a->a; 2630 PetscScalar *x,*b; 2631 2632 PetscFunctionBegin; 2633 SSE_SCOPE_BEGIN; 2634 /* 2635 Note: This code currently uses demotion of double 2636 to float when performing the mixed-mode computation. 2637 This may not be numerically reasonable for all applications. 2638 */ 2639 PREFETCH_NTA(aa+16*ai[1]); 2640 2641 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2642 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2643 { 2644 /* x will first be computed in single precision then promoted inplace to double */ 2645 MatScalar *v,*t=(MatScalar *)x; 2646 int nz,i,idt,ai16; 2647 unsigned int jdx,idx; 2648 unsigned short *vi; 2649 /* Forward solve the lower triangular factor. */ 2650 2651 /* First block is the identity. */ 2652 idx = 0; 2653 CONVERT_DOUBLE4_FLOAT4(t,b); 2654 v = aa + 16*((unsigned int)ai[1]); 2655 2656 for (i=1; i<n;) { 2657 PREFETCH_NTA(&v[8]); 2658 vi = aj + ai[i]; 2659 nz = diag[i] - ai[i]; 2660 idx += 4; 2661 2662 /* Demote RHS from double to float. */ 2663 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2664 LOAD_PS(&t[idx],XMM7); 2665 2666 while (nz--) { 2667 PREFETCH_NTA(&v[16]); 2668 jdx = 4*((unsigned int)(*vi++)); 2669 2670 /* 4x4 Matrix-Vector product with negative accumulation: */ 2671 SSE_INLINE_BEGIN_2(&t[jdx],v) 2672 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2673 2674 /* First Column */ 2675 SSE_COPY_PS(XMM0,XMM6) 2676 SSE_SHUFFLE(XMM0,XMM0,0x00) 2677 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2678 SSE_SUB_PS(XMM7,XMM0) 2679 2680 /* Second Column */ 2681 SSE_COPY_PS(XMM1,XMM6) 2682 SSE_SHUFFLE(XMM1,XMM1,0x55) 2683 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2684 SSE_SUB_PS(XMM7,XMM1) 2685 2686 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2687 2688 /* Third Column */ 2689 SSE_COPY_PS(XMM2,XMM6) 2690 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2691 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2692 SSE_SUB_PS(XMM7,XMM2) 2693 2694 /* Fourth Column */ 2695 SSE_COPY_PS(XMM3,XMM6) 2696 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2697 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2698 SSE_SUB_PS(XMM7,XMM3) 2699 SSE_INLINE_END_2 2700 2701 v += 16; 2702 } 2703 v = aa + 16*ai[++i]; 2704 PREFETCH_NTA(v); 2705 STORE_PS(&t[idx],XMM7); 2706 } 2707 2708 /* Backward solve the upper triangular factor.*/ 2709 2710 idt = 4*(n-1); 2711 ai16 = 16*diag[n-1]; 2712 v = aa + ai16 + 16; 2713 for (i=n-1; i>=0;){ 2714 PREFETCH_NTA(&v[8]); 2715 vi = aj + diag[i] + 1; 2716 nz = ai[i+1] - diag[i] - 1; 2717 2718 LOAD_PS(&t[idt],XMM7); 2719 2720 while (nz--) { 2721 PREFETCH_NTA(&v[16]); 2722 idx = 4*((unsigned int)(*vi++)); 2723 2724 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2725 SSE_INLINE_BEGIN_2(&t[idx],v) 2726 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2727 2728 /* First Column */ 2729 SSE_COPY_PS(XMM0,XMM6) 2730 SSE_SHUFFLE(XMM0,XMM0,0x00) 2731 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2732 SSE_SUB_PS(XMM7,XMM0) 2733 2734 /* Second Column */ 2735 SSE_COPY_PS(XMM1,XMM6) 2736 SSE_SHUFFLE(XMM1,XMM1,0x55) 2737 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2738 SSE_SUB_PS(XMM7,XMM1) 2739 2740 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2741 2742 /* Third Column */ 2743 SSE_COPY_PS(XMM2,XMM6) 2744 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2745 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2746 SSE_SUB_PS(XMM7,XMM2) 2747 2748 /* Fourth Column */ 2749 SSE_COPY_PS(XMM3,XMM6) 2750 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2751 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2752 SSE_SUB_PS(XMM7,XMM3) 2753 SSE_INLINE_END_2 2754 v += 16; 2755 } 2756 v = aa + ai16; 2757 ai16 = 16*diag[--i]; 2758 PREFETCH_NTA(aa+ai16+16); 2759 /* 2760 Scale the result by the diagonal 4x4 block, 2761 which was inverted as part of the factorization 2762 */ 2763 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2764 /* First Column */ 2765 SSE_COPY_PS(XMM0,XMM7) 2766 SSE_SHUFFLE(XMM0,XMM0,0x00) 2767 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2768 2769 /* Second Column */ 2770 SSE_COPY_PS(XMM1,XMM7) 2771 SSE_SHUFFLE(XMM1,XMM1,0x55) 2772 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2773 SSE_ADD_PS(XMM0,XMM1) 2774 2775 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2776 2777 /* Third Column */ 2778 SSE_COPY_PS(XMM2,XMM7) 2779 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2780 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2781 SSE_ADD_PS(XMM0,XMM2) 2782 2783 /* Fourth Column */ 2784 SSE_COPY_PS(XMM3,XMM7) 2785 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2786 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2787 SSE_ADD_PS(XMM0,XMM3) 2788 2789 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2790 SSE_INLINE_END_3 2791 2792 v = aa + ai16 + 16; 2793 idt -= 4; 2794 } 2795 2796 /* Convert t from single precision back to double precision (inplace)*/ 2797 idt = 4*(n-1); 2798 for (i=n-1;i>=0;i--) { 2799 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2800 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2801 PetscScalar *xtemp=&x[idt]; 2802 MatScalar *ttemp=&t[idt]; 2803 xtemp[3] = (PetscScalar)ttemp[3]; 2804 xtemp[2] = (PetscScalar)ttemp[2]; 2805 xtemp[1] = (PetscScalar)ttemp[1]; 2806 xtemp[0] = (PetscScalar)ttemp[0]; 2807 idt -= 4; 2808 } 2809 2810 } /* End of artificial scope. */ 2811 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2812 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2813 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2814 SSE_SCOPE_END; 2815 PetscFunctionReturn(0); 2816 } 2817 2818 #undef __FUNCT__ 2819 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2820 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 2821 { 2822 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2823 int *aj=a->j; 2824 PetscErrorCode ierr; 2825 int *ai=a->i,n=a->mbs,*diag = a->diag; 2826 MatScalar *aa=a->a; 2827 PetscScalar *x,*b; 2828 2829 PetscFunctionBegin; 2830 SSE_SCOPE_BEGIN; 2831 /* 2832 Note: This code currently uses demotion of double 2833 to float when performing the mixed-mode computation. 2834 This may not be numerically reasonable for all applications. 2835 */ 2836 PREFETCH_NTA(aa+16*ai[1]); 2837 2838 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2839 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2840 { 2841 /* x will first be computed in single precision then promoted inplace to double */ 2842 MatScalar *v,*t=(MatScalar *)x; 2843 int nz,i,idt,ai16; 2844 int jdx,idx; 2845 int *vi; 2846 /* Forward solve the lower triangular factor. */ 2847 2848 /* First block is the identity. */ 2849 idx = 0; 2850 CONVERT_DOUBLE4_FLOAT4(t,b); 2851 v = aa + 16*ai[1]; 2852 2853 for (i=1; i<n;) { 2854 PREFETCH_NTA(&v[8]); 2855 vi = aj + ai[i]; 2856 nz = diag[i] - ai[i]; 2857 idx += 4; 2858 2859 /* Demote RHS from double to float. */ 2860 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2861 LOAD_PS(&t[idx],XMM7); 2862 2863 while (nz--) { 2864 PREFETCH_NTA(&v[16]); 2865 jdx = 4*(*vi++); 2866 /* jdx = *vi++; */ 2867 2868 /* 4x4 Matrix-Vector product with negative accumulation: */ 2869 SSE_INLINE_BEGIN_2(&t[jdx],v) 2870 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2871 2872 /* First Column */ 2873 SSE_COPY_PS(XMM0,XMM6) 2874 SSE_SHUFFLE(XMM0,XMM0,0x00) 2875 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2876 SSE_SUB_PS(XMM7,XMM0) 2877 2878 /* Second Column */ 2879 SSE_COPY_PS(XMM1,XMM6) 2880 SSE_SHUFFLE(XMM1,XMM1,0x55) 2881 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2882 SSE_SUB_PS(XMM7,XMM1) 2883 2884 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2885 2886 /* Third Column */ 2887 SSE_COPY_PS(XMM2,XMM6) 2888 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2889 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2890 SSE_SUB_PS(XMM7,XMM2) 2891 2892 /* Fourth Column */ 2893 SSE_COPY_PS(XMM3,XMM6) 2894 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2895 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2896 SSE_SUB_PS(XMM7,XMM3) 2897 SSE_INLINE_END_2 2898 2899 v += 16; 2900 } 2901 v = aa + 16*ai[++i]; 2902 PREFETCH_NTA(v); 2903 STORE_PS(&t[idx],XMM7); 2904 } 2905 2906 /* Backward solve the upper triangular factor.*/ 2907 2908 idt = 4*(n-1); 2909 ai16 = 16*diag[n-1]; 2910 v = aa + ai16 + 16; 2911 for (i=n-1; i>=0;){ 2912 PREFETCH_NTA(&v[8]); 2913 vi = aj + diag[i] + 1; 2914 nz = ai[i+1] - diag[i] - 1; 2915 2916 LOAD_PS(&t[idt],XMM7); 2917 2918 while (nz--) { 2919 PREFETCH_NTA(&v[16]); 2920 idx = 4*(*vi++); 2921 /* idx = *vi++; */ 2922 2923 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2924 SSE_INLINE_BEGIN_2(&t[idx],v) 2925 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2926 2927 /* First Column */ 2928 SSE_COPY_PS(XMM0,XMM6) 2929 SSE_SHUFFLE(XMM0,XMM0,0x00) 2930 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2931 SSE_SUB_PS(XMM7,XMM0) 2932 2933 /* Second Column */ 2934 SSE_COPY_PS(XMM1,XMM6) 2935 SSE_SHUFFLE(XMM1,XMM1,0x55) 2936 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2937 SSE_SUB_PS(XMM7,XMM1) 2938 2939 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2940 2941 /* Third Column */ 2942 SSE_COPY_PS(XMM2,XMM6) 2943 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2944 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2945 SSE_SUB_PS(XMM7,XMM2) 2946 2947 /* Fourth Column */ 2948 SSE_COPY_PS(XMM3,XMM6) 2949 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2950 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2951 SSE_SUB_PS(XMM7,XMM3) 2952 SSE_INLINE_END_2 2953 v += 16; 2954 } 2955 v = aa + ai16; 2956 ai16 = 16*diag[--i]; 2957 PREFETCH_NTA(aa+ai16+16); 2958 /* 2959 Scale the result by the diagonal 4x4 block, 2960 which was inverted as part of the factorization 2961 */ 2962 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2963 /* First Column */ 2964 SSE_COPY_PS(XMM0,XMM7) 2965 SSE_SHUFFLE(XMM0,XMM0,0x00) 2966 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2967 2968 /* Second Column */ 2969 SSE_COPY_PS(XMM1,XMM7) 2970 SSE_SHUFFLE(XMM1,XMM1,0x55) 2971 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2972 SSE_ADD_PS(XMM0,XMM1) 2973 2974 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2975 2976 /* Third Column */ 2977 SSE_COPY_PS(XMM2,XMM7) 2978 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2979 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2980 SSE_ADD_PS(XMM0,XMM2) 2981 2982 /* Fourth Column */ 2983 SSE_COPY_PS(XMM3,XMM7) 2984 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2985 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2986 SSE_ADD_PS(XMM0,XMM3) 2987 2988 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2989 SSE_INLINE_END_3 2990 2991 v = aa + ai16 + 16; 2992 idt -= 4; 2993 } 2994 2995 /* Convert t from single precision back to double precision (inplace)*/ 2996 idt = 4*(n-1); 2997 for (i=n-1;i>=0;i--) { 2998 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2999 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 3000 PetscScalar *xtemp=&x[idt]; 3001 MatScalar *ttemp=&t[idt]; 3002 xtemp[3] = (PetscScalar)ttemp[3]; 3003 xtemp[2] = (PetscScalar)ttemp[2]; 3004 xtemp[1] = (PetscScalar)ttemp[1]; 3005 xtemp[0] = (PetscScalar)ttemp[0]; 3006 idt -= 4; 3007 } 3008 3009 } /* End of artificial scope. */ 3010 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3011 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3012 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3013 SSE_SCOPE_END; 3014 PetscFunctionReturn(0); 3015 } 3016 3017 #endif 3018 #undef __FUNCT__ 3019 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 3020 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 3021 { 3022 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3023 IS iscol=a->col,isrow=a->row; 3024 PetscErrorCode ierr; 3025 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3026 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3027 const MatScalar *aa=a->a,*v; 3028 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 3029 const PetscScalar *b; 3030 3031 PetscFunctionBegin; 3032 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3033 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3034 t = a->solve_work; 3035 3036 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3037 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3038 3039 /* forward solve the lower triangular */ 3040 idx = 3*(*r++); 3041 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 3042 for (i=1; i<n; i++) { 3043 v = aa + 9*ai[i]; 3044 vi = aj + ai[i]; 3045 nz = diag[i] - ai[i]; 3046 idx = 3*(*r++); 3047 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 3048 while (nz--) { 3049 idx = 3*(*vi++); 3050 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3051 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3052 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3053 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3054 v += 9; 3055 } 3056 idx = 3*i; 3057 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 3058 } 3059 /* backward solve the upper triangular */ 3060 for (i=n-1; i>=0; i--){ 3061 v = aa + 9*diag[i] + 9; 3062 vi = aj + diag[i] + 1; 3063 nz = ai[i+1] - diag[i] - 1; 3064 idt = 3*i; 3065 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 3066 while (nz--) { 3067 idx = 3*(*vi++); 3068 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3069 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3070 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3071 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3072 v += 9; 3073 } 3074 idc = 3*(*c--); 3075 v = aa + 9*diag[i]; 3076 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3077 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3078 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 3079 } 3080 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3081 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3082 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3083 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3084 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 3085 PetscFunctionReturn(0); 3086 } 3087 3088 /* 3089 Special case where the matrix was ILU(0) factored in the natural 3090 ordering. This eliminates the need for the column and row permutation. 3091 */ 3092 #undef __FUNCT__ 3093 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 3094 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 3095 { 3096 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3097 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3098 PetscErrorCode ierr; 3099 PetscInt *diag = a->diag; 3100 const MatScalar *aa=a->a,*v; 3101 PetscScalar *x,s1,s2,s3,x1,x2,x3; 3102 const PetscScalar *b; 3103 PetscInt jdx,idt,idx,nz,*vi,i; 3104 3105 PetscFunctionBegin; 3106 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3107 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3108 3109 /* forward solve the lower triangular */ 3110 idx = 0; 3111 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 3112 for (i=1; i<n; i++) { 3113 v = aa + 9*ai[i]; 3114 vi = aj + ai[i]; 3115 nz = diag[i] - ai[i]; 3116 idx += 3; 3117 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 3118 while (nz--) { 3119 jdx = 3*(*vi++); 3120 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 3121 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3122 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3123 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3124 v += 9; 3125 } 3126 x[idx] = s1; 3127 x[1+idx] = s2; 3128 x[2+idx] = s3; 3129 } 3130 /* backward solve the upper triangular */ 3131 for (i=n-1; i>=0; i--){ 3132 v = aa + 9*diag[i] + 9; 3133 vi = aj + diag[i] + 1; 3134 nz = ai[i+1] - diag[i] - 1; 3135 idt = 3*i; 3136 s1 = x[idt]; s2 = x[1+idt]; 3137 s3 = x[2+idt]; 3138 while (nz--) { 3139 idx = 3*(*vi++); 3140 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 3141 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3142 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3143 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3144 v += 9; 3145 } 3146 v = aa + 9*diag[i]; 3147 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3148 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3149 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 3150 } 3151 3152 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3153 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3154 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 3155 PetscFunctionReturn(0); 3156 } 3157 3158 #undef __FUNCT__ 3159 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct" 3160 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3161 { 3162 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3163 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 3164 PetscErrorCode ierr; 3165 PetscInt idx,jdx,idt; 3166 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3167 const MatScalar *aa=a->a,*v; 3168 PetscScalar *x; 3169 const PetscScalar *b; 3170 PetscScalar s1,s2,s3,x1,x2,x3; 3171 3172 PetscFunctionBegin; 3173 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3174 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3175 /* forward solve the lower triangular */ 3176 idx = 0; 3177 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 3178 for (i=1; i<n; i++) { 3179 v = aa + bs2*ai[i]; 3180 vi = aj + ai[i]; 3181 nz = ai[i+1] - ai[i]; 3182 idx = bs*i; 3183 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 3184 while (nz--) { 3185 jdx = bs*(*vi++); 3186 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 3187 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3188 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3189 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3190 3191 v += bs2; 3192 } 3193 3194 x[idx] = s1; 3195 x[1+idx] = s2; 3196 x[2+idx] = s3; 3197 } 3198 3199 /* backward solve the upper triangular */ 3200 for (i=n-1; i>=0; i--){ 3201 v = aa + bs2*ai[2*n-i]; 3202 vi = aj + ai[2*n-i]; 3203 nz = ai[2*n-i +1] - ai[2*n-i]-1; 3204 idt = bs*i; 3205 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 3206 3207 while (nz--) { 3208 idx = bs*(*vi++); 3209 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 3210 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3211 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3212 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3213 3214 v += bs2; 3215 } 3216 /* x = inv_diagonal*x */ 3217 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3218 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3219 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 3220 3221 } 3222 3223 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3224 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3225 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3226 PetscFunctionReturn(0); 3227 } 3228 3229 #undef __FUNCT__ 3230 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 3231 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 3232 { 3233 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3234 IS iscol=a->col,isrow=a->row; 3235 PetscErrorCode ierr; 3236 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3237 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3238 const MatScalar *aa=a->a,*v; 3239 PetscScalar *x,s1,s2,x1,x2,*t; 3240 const PetscScalar *b; 3241 3242 PetscFunctionBegin; 3243 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3244 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3245 t = a->solve_work; 3246 3247 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3248 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3249 3250 /* forward solve the lower triangular */ 3251 idx = 2*(*r++); 3252 t[0] = b[idx]; t[1] = b[1+idx]; 3253 for (i=1; i<n; i++) { 3254 v = aa + 4*ai[i]; 3255 vi = aj + ai[i]; 3256 nz = diag[i] - ai[i]; 3257 idx = 2*(*r++); 3258 s1 = b[idx]; s2 = b[1+idx]; 3259 while (nz--) { 3260 idx = 2*(*vi++); 3261 x1 = t[idx]; x2 = t[1+idx]; 3262 s1 -= v[0]*x1 + v[2]*x2; 3263 s2 -= v[1]*x1 + v[3]*x2; 3264 v += 4; 3265 } 3266 idx = 2*i; 3267 t[idx] = s1; t[1+idx] = s2; 3268 } 3269 /* backward solve the upper triangular */ 3270 for (i=n-1; i>=0; i--){ 3271 v = aa + 4*diag[i] + 4; 3272 vi = aj + diag[i] + 1; 3273 nz = ai[i+1] - diag[i] - 1; 3274 idt = 2*i; 3275 s1 = t[idt]; s2 = t[1+idt]; 3276 while (nz--) { 3277 idx = 2*(*vi++); 3278 x1 = t[idx]; x2 = t[1+idx]; 3279 s1 -= v[0]*x1 + v[2]*x2; 3280 s2 -= v[1]*x1 + v[3]*x2; 3281 v += 4; 3282 } 3283 idc = 2*(*c--); 3284 v = aa + 4*diag[i]; 3285 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 3286 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 3287 } 3288 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3289 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3290 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3291 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3292 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 3293 PetscFunctionReturn(0); 3294 } 3295 3296 /* 3297 Special case where the matrix was ILU(0) factored in the natural 3298 ordering. This eliminates the need for the column and row permutation. 3299 */ 3300 #undef __FUNCT__ 3301 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 3302 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 3303 { 3304 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3305 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3306 PetscErrorCode ierr; 3307 PetscInt *diag = a->diag; 3308 const MatScalar *aa=a->a,*v; 3309 PetscScalar *x,s1,s2,x1,x2; 3310 const PetscScalar *b; 3311 PetscInt jdx,idt,idx,nz,*vi,i; 3312 3313 PetscFunctionBegin; 3314 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3315 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3316 3317 /* forward solve the lower triangular */ 3318 idx = 0; 3319 x[0] = b[0]; x[1] = b[1]; 3320 for (i=1; i<n; i++) { 3321 v = aa + 4*ai[i]; 3322 vi = aj + ai[i]; 3323 nz = diag[i] - ai[i]; 3324 idx += 2; 3325 s1 = b[idx];s2 = b[1+idx]; 3326 while (nz--) { 3327 jdx = 2*(*vi++); 3328 x1 = x[jdx];x2 = x[1+jdx]; 3329 s1 -= v[0]*x1 + v[2]*x2; 3330 s2 -= v[1]*x1 + v[3]*x2; 3331 v += 4; 3332 } 3333 x[idx] = s1; 3334 x[1+idx] = s2; 3335 } 3336 /* backward solve the upper triangular */ 3337 for (i=n-1; i>=0; i--){ 3338 v = aa + 4*diag[i] + 4; 3339 vi = aj + diag[i] + 1; 3340 nz = ai[i+1] - diag[i] - 1; 3341 idt = 2*i; 3342 s1 = x[idt]; s2 = x[1+idt]; 3343 while (nz--) { 3344 idx = 2*(*vi++); 3345 x1 = x[idx]; x2 = x[1+idx]; 3346 s1 -= v[0]*x1 + v[2]*x2; 3347 s2 -= v[1]*x1 + v[3]*x2; 3348 v += 4; 3349 } 3350 v = aa + 4*diag[i]; 3351 x[idt] = v[0]*s1 + v[2]*s2; 3352 x[1+idt] = v[1]*s1 + v[3]*s2; 3353 } 3354 3355 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3356 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3357 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 3358 PetscFunctionReturn(0); 3359 } 3360 3361 #undef __FUNCT__ 3362 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct" 3363 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3364 { 3365 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3366 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 3367 PetscErrorCode ierr; 3368 PetscInt jdx; 3369 const MatScalar *aa=a->a,*v; 3370 PetscScalar *x,s1,s2,x1,x2; 3371 const PetscScalar *b; 3372 3373 PetscFunctionBegin; 3374 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3375 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3376 /* forward solve the lower triangular */ 3377 idx = 0; 3378 x[0] = b[idx]; x[1] = b[1+idx]; 3379 for (i=1; i<n; i++) { 3380 v = aa + 4*ai[i]; 3381 vi = aj + ai[i]; 3382 nz = ai[i+1] - ai[i]; 3383 idx = 2*i; 3384 s1 = b[idx];s2 = b[1+idx]; 3385 while (nz--) { 3386 jdx = 2*(*vi++); 3387 x1 = x[jdx];x2 = x[1+jdx]; 3388 s1 -= v[0]*x1 + v[2]*x2; 3389 s2 -= v[1]*x1 + v[3]*x2; 3390 v += 4; 3391 } 3392 x[idx] = s1; 3393 x[1+idx] = s2; 3394 } 3395 3396 /* backward solve the upper triangular */ 3397 for (i=n-1; i>=0; i--){ 3398 v = aa + 4*ai[2*n-i]; 3399 vi = aj + ai[2*n-i]; 3400 nz = ai[2*n-i +1] - ai[2*n-i]-1; 3401 idt = 2*i; 3402 s1 = x[idt]; s2 = x[1+idt]; 3403 while (nz--) { 3404 idx = 2*(*vi++); 3405 x1 = x[idx]; x2 = x[1+idx]; 3406 s1 -= v[0]*x1 + v[2]*x2; 3407 s2 -= v[1]*x1 + v[3]*x2; 3408 v += 4; 3409 } 3410 /* x = inv_diagonal*x */ 3411 x[idt] = v[0]*s1 + v[2]*s2; 3412 x[1+idt] = v[1]*s1 + v[3]*s2; 3413 } 3414 3415 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3416 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3417 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 3418 PetscFunctionReturn(0); 3419 } 3420 3421 #undef __FUNCT__ 3422 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 3423 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 3424 { 3425 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3426 IS iscol=a->col,isrow=a->row; 3427 PetscErrorCode ierr; 3428 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 3429 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3430 MatScalar *aa=a->a,*v; 3431 PetscScalar *x,*b,s1,*t; 3432 3433 PetscFunctionBegin; 3434 if (!n) PetscFunctionReturn(0); 3435 3436 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3437 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3438 t = a->solve_work; 3439 3440 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3441 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3442 3443 /* forward solve the lower triangular */ 3444 t[0] = b[*r++]; 3445 for (i=1; i<n; i++) { 3446 v = aa + ai[i]; 3447 vi = aj + ai[i]; 3448 nz = diag[i] - ai[i]; 3449 s1 = b[*r++]; 3450 while (nz--) { 3451 s1 -= (*v++)*t[*vi++]; 3452 } 3453 t[i] = s1; 3454 } 3455 /* backward solve the upper triangular */ 3456 for (i=n-1; i>=0; i--){ 3457 v = aa + diag[i] + 1; 3458 vi = aj + diag[i] + 1; 3459 nz = ai[i+1] - diag[i] - 1; 3460 s1 = t[i]; 3461 while (nz--) { 3462 s1 -= (*v++)*t[*vi++]; 3463 } 3464 x[*c--] = t[i] = aa[diag[i]]*s1; 3465 } 3466 3467 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3468 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3469 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3470 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3471 ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 3472 PetscFunctionReturn(0); 3473 } 3474 /* 3475 Special case where the matrix was ILU(0) factored in the natural 3476 ordering. This eliminates the need for the column and row permutation. 3477 */ 3478 #undef __FUNCT__ 3479 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 3480 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 3481 { 3482 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3483 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3484 PetscErrorCode ierr; 3485 PetscInt *diag = a->diag; 3486 MatScalar *aa=a->a; 3487 PetscScalar *x,*b; 3488 PetscScalar s1,x1; 3489 MatScalar *v; 3490 PetscInt jdx,idt,idx,nz,*vi,i; 3491 3492 PetscFunctionBegin; 3493 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3494 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3495 3496 /* forward solve the lower triangular */ 3497 idx = 0; 3498 x[0] = b[0]; 3499 for (i=1; i<n; i++) { 3500 v = aa + ai[i]; 3501 vi = aj + ai[i]; 3502 nz = diag[i] - ai[i]; 3503 idx += 1; 3504 s1 = b[idx]; 3505 while (nz--) { 3506 jdx = *vi++; 3507 x1 = x[jdx]; 3508 s1 -= v[0]*x1; 3509 v += 1; 3510 } 3511 x[idx] = s1; 3512 } 3513 /* backward solve the upper triangular */ 3514 for (i=n-1; i>=0; i--){ 3515 v = aa + diag[i] + 1; 3516 vi = aj + diag[i] + 1; 3517 nz = ai[i+1] - diag[i] - 1; 3518 idt = i; 3519 s1 = x[idt]; 3520 while (nz--) { 3521 idx = *vi++; 3522 x1 = x[idx]; 3523 s1 -= v[0]*x1; 3524 v += 1; 3525 } 3526 v = aa + diag[i]; 3527 x[idt] = v[0]*s1; 3528 } 3529 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3530 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3531 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 3532 PetscFunctionReturn(0); 3533 } 3534 3535 /* ----------------------------------------------------------------*/ 3536 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 3537 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 3538 3539 extern PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat,Vec,Vec); 3540 #undef __FUNCT__ 3541 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 3542 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 3543 { 3544 Mat C=B; 3545 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 3546 IS isrow = b->row,isicol = b->icol; 3547 PetscErrorCode ierr; 3548 const PetscInt *r,*ic,*ics; 3549 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 3550 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 3551 MatScalar *rtmp,*pc,*multiplier,*v,*pv,*aa=a->a; 3552 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 3553 MatScalar *v_work; 3554 3555 PetscFunctionBegin; 3556 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3557 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 3558 ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 3559 ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr); 3560 ics = ic; 3561 3562 /* generate work space needed by dense LU factorization */ 3563 ierr = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr); 3564 multiplier = v_work + bs; 3565 v_pivots = (PetscInt*)(multiplier + bs2); 3566 3567 for (i=0; i<n; i++){ 3568 /* zero rtmp */ 3569 /* L part */ 3570 nz = bi[i+1] - bi[i]; 3571 bjtmp = bj + bi[i]; 3572 for (j=0; j<nz; j++){ 3573 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3574 } 3575 3576 /* U part */ 3577 nz = bi[2*n-i+1] - bi[2*n-i]; 3578 bjtmp = bj + bi[2*n-i]; 3579 for (j=0; j<nz; j++){ 3580 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3581 } 3582 3583 /* load in initial (unfactored row) */ 3584 nz = ai[r[i]+1] - ai[r[i]]; 3585 ajtmp = aj + ai[r[i]]; 3586 v = aa + bs2*ai[r[i]]; 3587 for (j=0; j<nz; j++) { 3588 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 3589 } 3590 3591 /* elimination */ 3592 bjtmp = bj + bi[i]; 3593 row = *bjtmp++; 3594 nzL = bi[i+1] - bi[i]; 3595 k = 0; 3596 while (k < nzL) { 3597 pc = rtmp + bs2*row; 3598 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 3599 if (flg) { 3600 pv = b->a + bs2*bdiag[row]; 3601 Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); /* *pc = *pc * (*pv); */ 3602 pj = b->j + bi[2*n-row]; /* begining of U(row,:) */ 3603 pv = b->a + bs2*bi[2*n-row]; 3604 nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */ 3605 for (j=0; j<nz; j++) { 3606 Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 3607 } 3608 ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr); 3609 } 3610 row = *bjtmp++; k++; 3611 } 3612 3613 /* finished row so stick it into b->a */ 3614 /* L part */ 3615 pv = b->a + bs2*bi[i] ; 3616 pj = b->j + bi[i] ; 3617 nz = bi[i+1] - bi[i]; 3618 for (j=0; j<nz; j++) { 3619 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3620 } 3621 3622 /* Mark diagonal and invert diagonal for simplier triangular solves */ 3623 pv = b->a + bs2*bdiag[i]; 3624 pj = b->j + bdiag[i]; 3625 /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 3626 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3627 ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 3628 3629 /* U part */ 3630 pv = b->a + bs2*bi[2*n-i]; 3631 pj = b->j + bi[2*n-i]; 3632 nz = bi[2*n-i+1] - bi[2*n-i] - 1; 3633 for (j=0; j<nz; j++){ 3634 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3635 } 3636 } 3637 3638 ierr = PetscFree(rtmp);CHKERRQ(ierr); 3639 ierr = PetscFree(v_work);CHKERRQ(ierr); 3640 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3641 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 3642 3643 C->assembled = PETSC_TRUE; 3644 ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 3645 PetscFunctionReturn(0); 3646 } 3647 3648 /* 3649 ilu(0) with natural ordering under new data structure. 3650 See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description 3651 because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct(). 3652 */ 3653 #undef __FUNCT__ 3654 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 3655 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3656 { 3657 3658 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 3659 PetscErrorCode ierr; 3660 PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 3661 PetscInt i,j,nz,*bi,*bj,*bdiag; 3662 3663 PetscFunctionBegin; 3664 /* printf("MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct...\n"); */ 3665 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 3666 b = (Mat_SeqBAIJ*)(fact)->data; 3667 3668 /* allocate matrix arrays for new data structure */ 3669 ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,2*n+2,PetscInt,&b->i);CHKERRQ(ierr); 3670 ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(2*n+2)*sizeof(PetscInt));CHKERRQ(ierr); 3671 b->singlemalloc = PETSC_TRUE; 3672 if (!b->diag){ 3673 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 3674 } 3675 bdiag = b->diag; 3676 3677 if (n > 0) { 3678 ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 3679 } 3680 3681 /* set bi and bj with new data structure */ 3682 bi = b->i; 3683 bj = b->j; 3684 3685 /* L part */ 3686 bi[0] = 0; 3687 for (i=0; i<n; i++){ 3688 nz = adiag[i] - ai[i]; 3689 bi[i+1] = bi[i] + nz; 3690 aj = a->j + ai[i]; 3691 for (j=0; j<nz; j++){ 3692 *bj = aj[j]; bj++; 3693 } 3694 } 3695 3696 /* U part */ 3697 bi[n+1] = bi[n]; 3698 for (i=n-1; i>=0; i--){ 3699 nz = ai[i+1] - adiag[i] - 1; 3700 bi[2*n-i+1] = bi[2*n-i] + nz + 1; 3701 aj = a->j + adiag[i] + 1; 3702 for (j=0; j<nz; j++){ 3703 *bj = aj[j]; bj++; 3704 } 3705 /* diag[i] */ 3706 *bj = i; bj++; 3707 bdiag[i] = bi[2*n-i+1]-1; 3708 } 3709 PetscFunctionReturn(0); 3710 } 3711 3712 extern PetscErrorCode PetscFreeSpaceContiguous_newdatastruct(PetscFreeSpaceList *,PetscInt *,PetscInt,PetscInt *,PetscInt *); 3713 #undef __FUNCT__ 3714 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct" 3715 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3716 { 3717 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 3718 IS isicol; 3719 PetscErrorCode ierr; 3720 const PetscInt *r,*ic; 3721 PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 3722 PetscInt *bi,*cols,nnz,*cols_lvl; 3723 PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 3724 PetscInt i,levels,diagonal_fill; 3725 PetscTruth col_identity,row_identity,both_identity; 3726 PetscReal f; 3727 PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 3728 PetscBT lnkbt; 3729 PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 3730 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 3731 PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 3732 PetscTruth missing; 3733 PetscInt bs=A->rmap->bs,bs2=a->bs2; 3734 3735 PetscFunctionBegin; 3736 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 3737 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 3738 if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 3739 3740 f = info->fill; 3741 levels = (PetscInt)info->levels; 3742 diagonal_fill = (PetscInt)info->diagonal_fill; 3743 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 3744 3745 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3746 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 3747 both_identity = (PetscTruth) (row_identity && col_identity); 3748 3749 if (!levels && both_identity) { 3750 /* special case: ilu(0) with natural ordering */ 3751 ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 3752 (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 3753 /* set MatSolve routines */ 3754 switch (bs){ 3755 case 2: 3756 fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct; 3757 break; 3758 case 3: 3759 fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct; 3760 break; 3761 case 4: 3762 fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct; 3763 break; 3764 case 5: 3765 fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct; 3766 break; 3767 case 6: 3768 fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct; 3769 break; 3770 case 7: 3771 fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct; 3772 break; 3773 default: 3774 fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 3775 break; 3776 } 3777 3778 fact->factor = MAT_FACTOR_ILU; 3779 (fact)->info.factor_mallocs = 0; 3780 (fact)->info.fill_ratio_given = info->fill; 3781 (fact)->info.fill_ratio_needed = 1.0; 3782 b = (Mat_SeqBAIJ*)(fact)->data; 3783 b->row = isrow; 3784 b->col = iscol; 3785 b->icol = isicol; 3786 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3787 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3788 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3789 ierr = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3790 PetscFunctionReturn(0); 3791 } 3792 3793 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3794 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 3795 3796 /* get new row pointers */ 3797 ierr = PetscMalloc((2*n+2)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 3798 bi[0] = 0; 3799 /* bdiag is location of diagonal in factor */ 3800 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 3801 bdiag[0] = 0; 3802 3803 ierr = PetscMalloc((2*n+1)*sizeof(PetscInt**),&bj_ptr);CHKERRQ(ierr); 3804 bjlvl_ptr = (PetscInt**)(bj_ptr + n); 3805 3806 /* create a linked list for storing column indices of the active row */ 3807 nlnk = n + 1; 3808 ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 3809 3810 /* initial FreeSpace size is f*(ai[n]+1) */ 3811 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 3812 current_space = free_space; 3813 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 3814 current_space_lvl = free_space_lvl; 3815 3816 for (i=0; i<n; i++) { 3817 nzi = 0; 3818 /* copy current row into linked list */ 3819 nnz = ai[r[i]+1] - ai[r[i]]; 3820 if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 3821 cols = aj + ai[r[i]]; 3822 lnk[i] = -1; /* marker to indicate if diagonal exists */ 3823 ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 3824 nzi += nlnk; 3825 3826 /* make sure diagonal entry is included */ 3827 if (diagonal_fill && lnk[i] == -1) { 3828 fm = n; 3829 while (lnk[fm] < i) fm = lnk[fm]; 3830 lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 3831 lnk[fm] = i; 3832 lnk_lvl[i] = 0; 3833 nzi++; dcount++; 3834 } 3835 3836 /* add pivot rows into the active row */ 3837 nzbd = 0; 3838 prow = lnk[n]; 3839 while (prow < i) { 3840 nnz = bdiag[prow]; 3841 cols = bj_ptr[prow] + nnz + 1; 3842 cols_lvl = bjlvl_ptr[prow] + nnz + 1; 3843 nnz = bi[prow+1] - bi[prow] - nnz - 1; 3844 ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 3845 nzi += nlnk; 3846 prow = lnk[prow]; 3847 nzbd++; 3848 } 3849 bdiag[i] = nzbd; 3850 bi[i+1] = bi[i] + nzi; 3851 3852 /* if free space is not available, make more free space */ 3853 if (current_space->local_remaining<nzi) { 3854 nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 3855 ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 3856 ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 3857 reallocs++; 3858 } 3859 3860 /* copy data into free_space and free_space_lvl, then initialize lnk */ 3861 ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 3862 bj_ptr[i] = current_space->array; 3863 bjlvl_ptr[i] = current_space_lvl->array; 3864 3865 /* make sure the active row i has diagonal entry */ 3866 if (*(bj_ptr[i]+bdiag[i]) != i) { 3867 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 3868 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 3869 } 3870 3871 current_space->array += nzi; 3872 current_space->local_used += nzi; 3873 current_space->local_remaining -= nzi; 3874 current_space_lvl->array += nzi; 3875 current_space_lvl->local_used += nzi; 3876 current_space_lvl->local_remaining -= nzi; 3877 } 3878 3879 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 3880 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3881 3882 /* destroy list of free space and other temporary arrays */ 3883 ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 3884 3885 /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 3886 ierr = PetscFreeSpaceContiguous_newdatastruct(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 3887 3888 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 3889 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 3890 ierr = PetscFree(bj_ptr);CHKERRQ(ierr); 3891 3892 #if defined(PETSC_USE_INFO) 3893 { 3894 PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 3895 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 3896 ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 3897 ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 3898 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 3899 if (diagonal_fill) { 3900 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 3901 } 3902 } 3903 #endif 3904 3905 /* put together the new matrix */ 3906 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 3907 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 3908 b = (Mat_SeqBAIJ*)(fact)->data; 3909 b->free_a = PETSC_TRUE; 3910 b->free_ij = PETSC_TRUE; 3911 b->singlemalloc = PETSC_FALSE; 3912 ierr = PetscMalloc( (bs2*bi[2*n+1] )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 3913 b->j = bj; 3914 b->i = bi; 3915 b->diag = bdiag; 3916 b->ilen = 0; 3917 b->imax = 0; 3918 b->row = isrow; 3919 b->col = iscol; 3920 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3921 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3922 b->icol = isicol; 3923 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3924 /* In b structure: Free imax, ilen, old a, old j. 3925 Allocate bdiag, solve_work, new a, new j */ 3926 ierr = PetscLogObjectMemory(fact,bi[2*n+1] * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 3927 b->maxnz = b->nz = bi[2*n+1] ; 3928 (fact)->info.factor_mallocs = reallocs; 3929 (fact)->info.fill_ratio_given = f; 3930 (fact)->info.fill_ratio_needed = ((PetscReal)bi[2*n+1])/((PetscReal)ai[n]); 3931 (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 3932 /* set MatSolve routines */ 3933 if (both_identity){ 3934 switch (bs){ 3935 case 2: 3936 fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct; 3937 break; 3938 case 3: 3939 fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct; 3940 break; 3941 case 4: 3942 fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct; 3943 break; 3944 case 5: 3945 fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct; 3946 break; 3947 case 6: 3948 fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct; 3949 break; 3950 case 7: 3951 fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct; 3952 break; 3953 default: 3954 fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 3955 break; 3956 } 3957 } else { 3958 switch (bs){ 3959 /* not implemented yet! 3960 case 2: 3961 fact->ops->solve = MatSolve_SeqBAIJ_2_newdatastruct; 3962 break; 3963 case 3: 3964 fact->ops->solve = MatSolve_SeqBAIJ_3_newdatastruct; 3965 break; 3966 case 4: 3967 fact->ops->solve = MatSolve_SeqBAIJ_4_newdatastruct; 3968 break; 3969 case 5: 3970 fact->ops->solve = MatSolve_SeqBAIJ_5_newdatastruct; 3971 break; 3972 case 6: 3973 fact->ops->solve = MatSolve_SeqBAIJ_6_newdatastruct; 3974 break; 3975 case 7: 3976 fact->ops->solve = MatSolve_SeqBAIJ_7_newdatastruct; 3977 break; 3978 default: 3979 fact->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct; 3980 break; 3981 */ 3982 } 3983 } 3984 PetscFunctionReturn(0); 3985 } 3986 3987 /* 3988 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 3989 except that the data structure of Mat_SeqAIJ is slightly different. 3990 Not a good example of code reuse. 3991 */ 3992 #undef __FUNCT__ 3993 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 3994 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3995 { 3996 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 3997 IS isicol; 3998 PetscErrorCode ierr; 3999 const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 4000 PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 4001 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 4002 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 4003 PetscTruth col_identity,row_identity,both_identity,flg; 4004 PetscReal f; 4005 PetscTruth newdatastruct=PETSC_FALSE; 4006 4007 PetscFunctionBegin; 4008 ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 4009 if (newdatastruct){ 4010 ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 4011 PetscFunctionReturn(0); 4012 } 4013 4014 ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 4015 if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 4016 4017 f = info->fill; 4018 levels = (PetscInt)info->levels; 4019 diagonal_fill = (PetscInt)info->diagonal_fill; 4020 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 4021 4022 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 4023 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 4024 both_identity = (PetscTruth) (row_identity && col_identity); 4025 4026 if (!levels && both_identity) { /* special case copy the nonzero structure */ 4027 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 4028 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 4029 4030 fact->factor = MAT_FACTOR_ILU; 4031 b = (Mat_SeqBAIJ*)(fact)->data; 4032 b->row = isrow; 4033 b->col = iscol; 4034 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 4035 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 4036 b->icol = isicol; 4037 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 4038 ierr = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 4039 PetscFunctionReturn(0); 4040 } 4041 4042 /* general case perform the symbolic factorization */ 4043 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 4044 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 4045 4046 /* get new row pointers */ 4047 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 4048 ainew[0] = 0; 4049 /* don't know how many column pointers are needed so estimate */ 4050 jmax = (PetscInt)(f*ai[n] + 1); 4051 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 4052 /* ajfill is level of fill for each fill entry */ 4053 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 4054 /* fill is a linked list of nonzeros in active row */ 4055 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 4056 /* im is level for each filled value */ 4057 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 4058 /* dloc is location of diagonal in factor */ 4059 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 4060 dloc[0] = 0; 4061 for (prow=0; prow<n; prow++) { 4062 4063 /* copy prow into linked list */ 4064 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 4065 if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 4066 xi = aj + ai[r[prow]]; 4067 fill[n] = n; 4068 fill[prow] = -1; /* marker for diagonal entry */ 4069 while (nz--) { 4070 fm = n; 4071 idx = ic[*xi++]; 4072 do { 4073 m = fm; 4074 fm = fill[m]; 4075 } while (fm < idx); 4076 fill[m] = idx; 4077 fill[idx] = fm; 4078 im[idx] = 0; 4079 } 4080 4081 /* make sure diagonal entry is included */ 4082 if (diagonal_fill && fill[prow] == -1) { 4083 fm = n; 4084 while (fill[fm] < prow) fm = fill[fm]; 4085 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 4086 fill[fm] = prow; 4087 im[prow] = 0; 4088 nzf++; 4089 dcount++; 4090 } 4091 4092 nzi = 0; 4093 row = fill[n]; 4094 while (row < prow) { 4095 incrlev = im[row] + 1; 4096 nz = dloc[row]; 4097 xi = ajnew + ainew[row] + nz + 1; 4098 flev = ajfill + ainew[row] + nz + 1; 4099 nnz = ainew[row+1] - ainew[row] - nz - 1; 4100 fm = row; 4101 while (nnz-- > 0) { 4102 idx = *xi++; 4103 if (*flev + incrlev > levels) { 4104 flev++; 4105 continue; 4106 } 4107 do { 4108 m = fm; 4109 fm = fill[m]; 4110 } while (fm < idx); 4111 if (fm != idx) { 4112 im[idx] = *flev + incrlev; 4113 fill[m] = idx; 4114 fill[idx] = fm; 4115 fm = idx; 4116 nzf++; 4117 } else { 4118 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 4119 } 4120 flev++; 4121 } 4122 row = fill[row]; 4123 nzi++; 4124 } 4125 /* copy new filled row into permanent storage */ 4126 ainew[prow+1] = ainew[prow] + nzf; 4127 if (ainew[prow+1] > jmax) { 4128 4129 /* estimate how much additional space we will need */ 4130 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 4131 /* just double the memory each time */ 4132 PetscInt maxadd = jmax; 4133 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 4134 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 4135 jmax += maxadd; 4136 4137 /* allocate a longer ajnew and ajfill */ 4138 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 4139 ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 4140 ierr = PetscFree(ajnew);CHKERRQ(ierr); 4141 ajnew = xitmp; 4142 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 4143 ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 4144 ierr = PetscFree(ajfill);CHKERRQ(ierr); 4145 ajfill = xitmp; 4146 reallocate++; /* count how many reallocations are needed */ 4147 } 4148 xitmp = ajnew + ainew[prow]; 4149 flev = ajfill + ainew[prow]; 4150 dloc[prow] = nzi; 4151 fm = fill[n]; 4152 while (nzf--) { 4153 *xitmp++ = fm; 4154 *flev++ = im[fm]; 4155 fm = fill[fm]; 4156 } 4157 /* make sure row has diagonal entry */ 4158 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 4159 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 4160 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 4161 } 4162 } 4163 ierr = PetscFree(ajfill);CHKERRQ(ierr); 4164 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 4165 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 4166 ierr = PetscFree(fill);CHKERRQ(ierr); 4167 ierr = PetscFree(im);CHKERRQ(ierr); 4168 4169 #if defined(PETSC_USE_INFO) 4170 { 4171 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 4172 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 4173 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 4174 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 4175 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 4176 if (diagonal_fill) { 4177 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 4178 } 4179 } 4180 #endif 4181 4182 /* put together the new matrix */ 4183 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 4184 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 4185 b = (Mat_SeqBAIJ*)(fact)->data; 4186 b->free_a = PETSC_TRUE; 4187 b->free_ij = PETSC_TRUE; 4188 b->singlemalloc = PETSC_FALSE; 4189 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 4190 b->j = ajnew; 4191 b->i = ainew; 4192 for (i=0; i<n; i++) dloc[i] += ainew[i]; 4193 b->diag = dloc; 4194 b->ilen = 0; 4195 b->imax = 0; 4196 b->row = isrow; 4197 b->col = iscol; 4198 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 4199 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 4200 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 4201 b->icol = isicol; 4202 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 4203 /* In b structure: Free imax, ilen, old a, old j. 4204 Allocate dloc, solve_work, new a, new j */ 4205 ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 4206 b->maxnz = b->nz = ainew[n]; 4207 4208 (fact)->info.factor_mallocs = reallocate; 4209 (fact)->info.fill_ratio_given = f; 4210 (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 4211 4212 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 4213 PetscFunctionReturn(0); 4214 } 4215 4216 #undef __FUNCT__ 4217 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 4218 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 4219 { 4220 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 4221 /* int i,*AJ=a->j,nz=a->nz; */ 4222 PetscFunctionBegin; 4223 /* Undo Column scaling */ 4224 /* while (nz--) { */ 4225 /* AJ[i] = AJ[i]/4; */ 4226 /* } */ 4227 /* This should really invoke a push/pop logic, but we don't have that yet. */ 4228 A->ops->setunfactored = PETSC_NULL; 4229 PetscFunctionReturn(0); 4230 } 4231 4232 #undef __FUNCT__ 4233 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 4234 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 4235 { 4236 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4237 PetscInt *AJ=a->j,nz=a->nz; 4238 unsigned short *aj=(unsigned short *)AJ; 4239 PetscFunctionBegin; 4240 /* Is this really necessary? */ 4241 while (nz--) { 4242 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 4243 } 4244 A->ops->setunfactored = PETSC_NULL; 4245 PetscFunctionReturn(0); 4246 } 4247 4248 4249