1 #define PETSCMAT_DLL 2 3 /* 4 Factorization code for SBAIJ format. 5 */ 6 7 #include "../src/mat/impls/sbaij/seq/sbaij.h" 8 #include "../src/mat/impls/baij/seq/baij.h" 9 #include "../src/mat/blockinvert.h" 10 11 #undef __FUNCT__ 12 #define __FUNCT__ "MatSolve_SeqSBAIJ_N_inplace" 13 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 14 { 15 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 16 IS isrow=a->row; 17 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 18 PetscErrorCode ierr; 19 const PetscInt *r; 20 PetscInt nz,*vj,k,idx,k1; 21 PetscInt bs=A->rmap->bs,bs2 = a->bs2; 22 MatScalar *aa=a->a,*v,*diag; 23 PetscScalar *x,*xk,*xj,*b,*xk_tmp,*t; 24 25 PetscFunctionBegin; 26 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 27 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28 t = a->solve_work; 29 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 30 ierr = PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);CHKERRQ(ierr); 31 32 /* solve U^T * D * y = b by forward substitution */ 33 xk = t; 34 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 35 idx = bs*r[k]; 36 for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1]; 37 } 38 for (k=0; k<mbs; k++){ 39 v = aa + bs2*ai[k]; 40 xk = t + k*bs; /* Dk*xk = k-th block of x */ 41 ierr = PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* xk_tmp <- xk */ 42 nz = ai[k+1] - ai[k]; 43 vj = aj + ai[k]; 44 xj = t + (*vj)*bs; /* *vj-th block of x, *vj>k */ 45 while (nz--) { 46 /* x(:) += U(k,:)^T*(Dk*xk) */ 47 Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */ 48 vj++; xj = t + (*vj)*bs; 49 v += bs2; 50 } 51 /* xk = inv(Dk)*(Dk*xk) */ 52 diag = aa+k*bs2; /* ptr to inv(Dk) */ 53 Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */ 54 } 55 56 /* solve U*x = y by back substitution */ 57 for (k=mbs-1; k>=0; k--){ 58 v = aa + bs2*ai[k]; 59 xk = t + k*bs; /* xk */ 60 nz = ai[k+1] - ai[k]; 61 vj = aj + ai[k]; 62 xj = t + (*vj)*bs; 63 while (nz--) { 64 /* xk += U(k,:)*x(:) */ 65 Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */ 66 vj++; 67 v += bs2; xj = t + (*vj)*bs; 68 } 69 idx = bs*r[k]; 70 for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++; 71 } 72 73 ierr = PetscFree(xk_tmp);CHKERRQ(ierr); 74 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 75 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 76 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 77 ierr = PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);CHKERRQ(ierr); 78 PetscFunctionReturn(0); 79 } 80 81 #undef __FUNCT__ 82 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_N_inplace" 83 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 84 { 85 PetscFunctionBegin; 86 SETERRQ(PETSC_COMM_SELF,1,"not implemented yet"); 87 PetscFunctionReturn(0); 88 } 89 90 #undef __FUNCT__ 91 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_N_inplace" 92 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 93 { 94 PetscFunctionBegin; 95 SETERRQ(PETSC_COMM_SELF,1,"not implemented yet"); 96 PetscFunctionReturn(0); 97 } 98 99 #undef __FUNCT__ 100 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private" 101 PetscErrorCode ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x) 102 { 103 PetscErrorCode ierr; 104 PetscInt nz,*vj,k; 105 PetscInt bs2 = bs*bs; 106 MatScalar *v,*diag; 107 PetscScalar *xk,*xj,*xk_tmp; 108 109 PetscFunctionBegin; 110 ierr = PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);CHKERRQ(ierr); 111 for (k=0; k<mbs; k++){ 112 v = aa + bs2*ai[k]; 113 xk = x + k*bs; /* Dk*xk = k-th block of x */ 114 ierr = PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* xk_tmp <- xk */ 115 nz = ai[k+1] - ai[k]; 116 vj = aj + ai[k]; 117 xj = x + (*vj)*bs; /* *vj-th block of x, *vj>k */ 118 while (nz--) { 119 /* x(:) += U(k,:)^T*(Dk*xk) */ 120 Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */ 121 vj++; xj = x + (*vj)*bs; 122 v += bs2; 123 } 124 /* xk = inv(Dk)*(Dk*xk) */ 125 diag = aa+k*bs2; /* ptr to inv(Dk) */ 126 Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */ 127 } 128 ierr = PetscFree(xk_tmp);CHKERRQ(ierr); 129 PetscFunctionReturn(0); 130 } 131 132 #undef __FUNCT__ 133 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private" 134 PetscErrorCode BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x) 135 { 136 PetscInt nz,*vj,k; 137 PetscInt bs2 = bs*bs; 138 MatScalar *v; 139 PetscScalar *xk,*xj; 140 141 PetscFunctionBegin; 142 for (k=mbs-1; k>=0; k--){ 143 v = aa + bs2*ai[k]; 144 xk = x + k*bs; /* xk */ 145 nz = ai[k+1] - ai[k]; 146 vj = aj + ai[k]; 147 xj = x + (*vj)*bs; 148 while (nz--) { 149 /* xk += U(k,:)*x(:) */ 150 Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */ 151 vj++; 152 v += bs2; xj = x + (*vj)*bs; 153 } 154 } 155 PetscFunctionReturn(0); 156 } 157 158 #undef __FUNCT__ 159 #define __FUNCT__ "MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace" 160 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 161 { 162 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 163 PetscErrorCode ierr; 164 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 165 PetscInt bs=A->rmap->bs; 166 MatScalar *aa=a->a; 167 PetscScalar *x,*b; 168 169 PetscFunctionBegin; 170 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 171 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 172 173 /* solve U^T * D * y = b by forward substitution */ 174 ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 175 ierr = ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 176 177 /* solve U*x = y by back substitution */ 178 ierr = BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 179 180 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 181 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 182 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 183 PetscFunctionReturn(0); 184 } 185 186 #undef __FUNCT__ 187 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace" 188 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 189 { 190 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 191 PetscErrorCode ierr; 192 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 193 PetscInt bs=A->rmap->bs; 194 MatScalar *aa=a->a; 195 PetscScalar *x,*b; 196 197 PetscFunctionBegin; 198 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 199 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 200 ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 201 ierr = ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 202 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 203 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 204 ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr); 205 PetscFunctionReturn(0); 206 } 207 208 #undef __FUNCT__ 209 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace" 210 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 211 { 212 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 213 PetscErrorCode ierr; 214 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 215 PetscInt bs=A->rmap->bs; 216 MatScalar *aa=a->a; 217 PetscScalar *x,*b; 218 219 PetscFunctionBegin; 220 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 221 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 222 ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 223 ierr = BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 224 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 225 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 226 ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr); 227 PetscFunctionReturn(0); 228 } 229 230 #undef __FUNCT__ 231 #define __FUNCT__ "MatSolve_SeqSBAIJ_7_inplace" 232 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 233 { 234 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 235 IS isrow=a->row; 236 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2,bs=A->rmap->bs; 237 PetscErrorCode ierr; 238 const PetscInt *r; 239 PetscInt nz,*vj,k,idx; 240 MatScalar *aa=a->a,*v,*d; 241 PetscScalar *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp; 242 243 PetscFunctionBegin; 244 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 245 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 246 t = a->solve_work; 247 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 248 249 /* solve U^T * D * y = b by forward substitution */ 250 tp = t; 251 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 252 idx = 7*r[k]; 253 tp[0] = b[idx]; 254 tp[1] = b[idx+1]; 255 tp[2] = b[idx+2]; 256 tp[3] = b[idx+3]; 257 tp[4] = b[idx+4]; 258 tp[5] = b[idx+5]; 259 tp[6] = b[idx+6]; 260 tp += 7; 261 } 262 263 for (k=0; k<mbs; k++){ 264 v = aa + 49*ai[k]; 265 vj = aj + ai[k]; 266 tp = t + k*7; 267 x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; 268 nz = ai[k+1] - ai[k]; 269 tp = t + (*vj)*7; 270 while (nz--) { 271 tp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6; 272 tp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6; 273 tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6; 274 tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6; 275 tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6; 276 tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6; 277 tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6; 278 vj++; tp = t + (*vj)*7; 279 v += 49; 280 } 281 282 /* xk = inv(Dk)*(Dk*xk) */ 283 d = aa+k*49; /* ptr to inv(Dk) */ 284 tp = t + k*7; 285 tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6; 286 tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6; 287 tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6; 288 tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6; 289 tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6; 290 tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6; 291 tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6; 292 } 293 294 /* solve U*x = y by back substitution */ 295 for (k=mbs-1; k>=0; k--){ 296 v = aa + 49*ai[k]; 297 vj = aj + ai[k]; 298 tp = t + k*7; 299 x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; /* xk */ 300 nz = ai[k+1] - ai[k]; 301 302 tp = t + (*vj)*7; 303 while (nz--) { 304 /* xk += U(k,:)*x(:) */ 305 x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6]; 306 x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6]; 307 x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6]; 308 x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6]; 309 x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6]; 310 x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6]; 311 x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6]; 312 vj++; tp = t + (*vj)*7; 313 v += 49; 314 } 315 tp = t + k*7; 316 tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6; 317 idx = 7*r[k]; 318 x[idx] = x0; 319 x[idx+1] = x1; 320 x[idx+2] = x2; 321 x[idx+3] = x3; 322 x[idx+4] = x4; 323 x[idx+5] = x5; 324 x[idx+6] = x6; 325 } 326 327 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 328 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 329 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 330 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 331 PetscFunctionReturn(0); 332 } 333 334 #undef __FUNCT__ 335 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private" 336 PetscErrorCode ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 337 { 338 MatScalar *v,*d; 339 PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6; 340 PetscInt nz,*vj,k; 341 342 PetscFunctionBegin; 343 for (k=0; k<mbs; k++){ 344 v = aa + 49*ai[k]; 345 xp = x + k*7; 346 x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */ 347 nz = ai[k+1] - ai[k]; 348 vj = aj + ai[k]; 349 xp = x + (*vj)*7; 350 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 351 PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 352 while (nz--) { 353 /* x(:) += U(k,:)^T*(Dk*xk) */ 354 xp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6; 355 xp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6; 356 xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6; 357 xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6; 358 xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6; 359 xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6; 360 xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6; 361 vj++; xp = x + (*vj)*7; 362 v += 49; 363 } 364 /* xk = inv(Dk)*(Dk*xk) */ 365 d = aa+k*49; /* ptr to inv(Dk) */ 366 xp = x + k*7; 367 xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6; 368 xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6; 369 xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6; 370 xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6; 371 xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6; 372 xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6; 373 xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6; 374 } 375 PetscFunctionReturn(0); 376 } 377 378 #undef __FUNCT__ 379 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private" 380 PetscErrorCode BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 381 { 382 MatScalar *v; 383 PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6; 384 PetscInt nz,*vj,k; 385 386 PetscFunctionBegin; 387 for (k=mbs-1; k>=0; k--){ 388 v = aa + 49*ai[k]; 389 xp = x + k*7; 390 x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */ 391 nz = ai[k+1] - ai[k]; 392 vj = aj + ai[k]; 393 xp = x + (*vj)*7; 394 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 395 PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 396 while (nz--) { 397 /* xk += U(k,:)*x(:) */ 398 x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6]; 399 x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6]; 400 x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6]; 401 x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6]; 402 x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6]; 403 x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6]; 404 x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6]; 405 vj++; 406 v += 49; xp = x + (*vj)*7; 407 } 408 xp = x + k*7; 409 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6; 410 } 411 PetscFunctionReturn(0); 412 } 413 414 #undef __FUNCT__ 415 #define __FUNCT__ "MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace" 416 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 417 { 418 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 419 PetscErrorCode ierr; 420 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 421 MatScalar *aa=a->a; 422 PetscScalar *x,*b; 423 424 PetscFunctionBegin; 425 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 426 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 427 428 /* solve U^T * D * y = b by forward substitution */ 429 ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 430 ierr = ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr); 431 432 /* solve U*x = y by back substitution */ 433 ierr = BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr); 434 435 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 436 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 437 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 438 PetscFunctionReturn(0); 439 } 440 441 #undef __FUNCT__ 442 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace" 443 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 444 { 445 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 446 PetscErrorCode ierr; 447 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 448 MatScalar *aa=a->a; 449 PetscScalar *x,*b; 450 451 PetscFunctionBegin; 452 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 453 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 454 ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 455 ierr = ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr); 456 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 457 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 458 ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr); 459 PetscFunctionReturn(0); 460 } 461 462 #undef __FUNCT__ 463 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace" 464 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 465 { 466 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 467 PetscErrorCode ierr; 468 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 469 MatScalar *aa=a->a; 470 PetscScalar *x,*b; 471 472 PetscFunctionBegin; 473 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 474 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 475 ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 476 ierr = BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr); 477 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 478 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 479 ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr); 480 PetscFunctionReturn(0); 481 } 482 483 #undef __FUNCT__ 484 #define __FUNCT__ "MatSolve_SeqSBAIJ_6_inplace" 485 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 486 { 487 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 488 IS isrow=a->row; 489 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 490 PetscErrorCode ierr; 491 const PetscInt *r; 492 PetscInt nz,*vj,k,idx; 493 MatScalar *aa=a->a,*v,*d; 494 PetscScalar *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp; 495 496 PetscFunctionBegin; 497 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 498 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 499 t = a->solve_work; 500 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 501 502 /* solve U^T * D * y = b by forward substitution */ 503 tp = t; 504 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 505 idx = 6*r[k]; 506 tp[0] = b[idx]; 507 tp[1] = b[idx+1]; 508 tp[2] = b[idx+2]; 509 tp[3] = b[idx+3]; 510 tp[4] = b[idx+4]; 511 tp[5] = b[idx+5]; 512 tp += 6; 513 } 514 515 for (k=0; k<mbs; k++){ 516 v = aa + 36*ai[k]; 517 vj = aj + ai[k]; 518 tp = t + k*6; 519 x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; 520 nz = ai[k+1] - ai[k]; 521 tp = t + (*vj)*6; 522 while (nz--) { 523 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5; 524 tp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5; 525 tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5; 526 tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5; 527 tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5; 528 tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5; 529 vj++; tp = t + (*vj)*6; 530 v += 36; 531 } 532 533 /* xk = inv(Dk)*(Dk*xk) */ 534 d = aa+k*36; /* ptr to inv(Dk) */ 535 tp = t + k*6; 536 tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5; 537 tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5; 538 tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5; 539 tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5; 540 tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5; 541 tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5; 542 } 543 544 /* solve U*x = y by back substitution */ 545 for (k=mbs-1; k>=0; k--){ 546 v = aa + 36*ai[k]; 547 vj = aj + ai[k]; 548 tp = t + k*6; 549 x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */ 550 nz = ai[k+1] - ai[k]; 551 552 tp = t + (*vj)*6; 553 while (nz--) { 554 /* xk += U(k,:)*x(:) */ 555 x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5]; 556 x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5]; 557 x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5]; 558 x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5]; 559 x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5]; 560 x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5]; 561 vj++; tp = t + (*vj)*6; 562 v += 36; 563 } 564 tp = t + k*6; 565 tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; 566 idx = 6*r[k]; 567 x[idx] = x0; 568 x[idx+1] = x1; 569 x[idx+2] = x2; 570 x[idx+3] = x3; 571 x[idx+4] = x4; 572 x[idx+5] = x5; 573 } 574 575 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 576 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 577 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 578 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 579 PetscFunctionReturn(0); 580 } 581 582 #undef __FUNCT__ 583 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private" 584 PetscErrorCode ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 585 { 586 MatScalar *v,*d; 587 PetscScalar *xp,x0,x1,x2,x3,x4,x5; 588 PetscInt nz,*vj,k; 589 590 PetscFunctionBegin; 591 for (k=0; k<mbs; k++){ 592 v = aa + 36*ai[k]; 593 xp = x + k*6; 594 x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */ 595 nz = ai[k+1] - ai[k]; 596 vj = aj + ai[k]; 597 xp = x + (*vj)*6; 598 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 599 PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 600 while (nz--) { 601 /* x(:) += U(k,:)^T*(Dk*xk) */ 602 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5; 603 xp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5; 604 xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5; 605 xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5; 606 xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5; 607 xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5; 608 vj++; xp = x + (*vj)*6; 609 v += 36; 610 } 611 /* xk = inv(Dk)*(Dk*xk) */ 612 d = aa+k*36; /* ptr to inv(Dk) */ 613 xp = x + k*6; 614 xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5; 615 xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5; 616 xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5; 617 xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5; 618 xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5; 619 xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5; 620 } 621 PetscFunctionReturn(0); 622 } 623 #undef __FUNCT__ 624 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private" 625 PetscErrorCode BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 626 { 627 MatScalar *v; 628 PetscScalar *xp,x0,x1,x2,x3,x4,x5; 629 PetscInt nz,*vj,k; 630 631 PetscFunctionBegin; 632 for (k=mbs-1; k>=0; k--){ 633 v = aa + 36*ai[k]; 634 xp = x + k*6; 635 x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */ 636 nz = ai[k+1] - ai[k]; 637 vj = aj + ai[k]; 638 xp = x + (*vj)*6; 639 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 640 PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 641 while (nz--) { 642 /* xk += U(k,:)*x(:) */ 643 x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5]; 644 x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5]; 645 x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5]; 646 x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5]; 647 x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5]; 648 x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5]; 649 vj++; 650 v += 36; xp = x + (*vj)*6; 651 } 652 xp = x + k*6; 653 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; 654 } 655 PetscFunctionReturn(0); 656 } 657 658 659 #undef __FUNCT__ 660 #define __FUNCT__ "MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace" 661 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 662 { 663 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 664 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 665 MatScalar *aa=a->a; 666 PetscScalar *x,*b; 667 PetscErrorCode ierr; 668 669 PetscFunctionBegin; 670 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 671 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 672 673 /* solve U^T * D * y = b by forward substitution */ 674 ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 675 ierr = ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr); 676 677 /* solve U*x = y by back substitution */ 678 ierr = BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr); 679 680 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 681 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 682 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 683 PetscFunctionReturn(0); 684 } 685 686 #undef __FUNCT__ 687 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace" 688 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 689 { 690 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 691 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 692 MatScalar *aa=a->a; 693 PetscScalar *x,*b; 694 PetscErrorCode ierr; 695 696 PetscFunctionBegin; 697 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 698 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 699 ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 700 ierr = ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr); 701 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 702 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 703 ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr); 704 PetscFunctionReturn(0); 705 } 706 707 #undef __FUNCT__ 708 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace" 709 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 710 { 711 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 712 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 713 MatScalar *aa=a->a; 714 PetscScalar *x,*b; 715 PetscErrorCode ierr; 716 717 PetscFunctionBegin; 718 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 719 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 720 ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 721 ierr = BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr); 722 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 723 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 724 ierr = PetscLogFlops(2.0*bs2*(a->nz - mbs));CHKERRQ(ierr); 725 PetscFunctionReturn(0); 726 } 727 728 #undef __FUNCT__ 729 #define __FUNCT__ "MatSolve_SeqSBAIJ_5_inplace" 730 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 731 { 732 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 733 IS isrow=a->row; 734 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2; 735 PetscErrorCode ierr; 736 const PetscInt *r; 737 PetscInt nz,*vj,k,idx; 738 MatScalar *aa=a->a,*v,*diag; 739 PetscScalar *x,*b,x0,x1,x2,x3,x4,*t,*tp; 740 741 PetscFunctionBegin; 742 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 743 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 744 t = a->solve_work; 745 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 746 747 /* solve U^T * D * y = b by forward substitution */ 748 tp = t; 749 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 750 idx = 5*r[k]; 751 tp[0] = b[idx]; 752 tp[1] = b[idx+1]; 753 tp[2] = b[idx+2]; 754 tp[3] = b[idx+3]; 755 tp[4] = b[idx+4]; 756 tp += 5; 757 } 758 759 for (k=0; k<mbs; k++){ 760 v = aa + 25*ai[k]; 761 vj = aj + ai[k]; 762 tp = t + k*5; 763 x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; 764 nz = ai[k+1] - ai[k]; 765 766 tp = t + (*vj)*5; 767 while (nz--) { 768 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4; 769 tp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4; 770 tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4; 771 tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4; 772 tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4; 773 vj++; tp = t + (*vj)*5; 774 v += 25; 775 } 776 777 /* xk = inv(Dk)*(Dk*xk) */ 778 diag = aa+k*25; /* ptr to inv(Dk) */ 779 tp = t + k*5; 780 tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4; 781 tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4; 782 tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4; 783 tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4; 784 tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4; 785 } 786 787 /* solve U*x = y by back substitution */ 788 for (k=mbs-1; k>=0; k--){ 789 v = aa + 25*ai[k]; 790 vj = aj + ai[k]; 791 tp = t + k*5; 792 x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];/* xk */ 793 nz = ai[k+1] - ai[k]; 794 795 tp = t + (*vj)*5; 796 while (nz--) { 797 /* xk += U(k,:)*x(:) */ 798 x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4]; 799 x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4]; 800 x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4]; 801 x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4]; 802 x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4]; 803 vj++; tp = t + (*vj)*5; 804 v += 25; 805 } 806 tp = t + k*5; 807 tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; 808 idx = 5*r[k]; 809 x[idx] = x0; 810 x[idx+1] = x1; 811 x[idx+2] = x2; 812 x[idx+3] = x3; 813 x[idx+4] = x4; 814 } 815 816 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 817 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 818 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 819 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 820 PetscFunctionReturn(0); 821 } 822 823 #undef __FUNCT__ 824 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private" 825 PetscErrorCode ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 826 { 827 MatScalar *v,*diag; 828 PetscScalar *xp,x0,x1,x2,x3,x4; 829 PetscInt nz,*vj,k; 830 831 PetscFunctionBegin; 832 for (k=0; k<mbs; k++){ 833 v = aa + 25*ai[k]; 834 xp = x + k*5; 835 x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */ 836 nz = ai[k+1] - ai[k]; 837 vj = aj + ai[k]; 838 xp = x + (*vj)*5; 839 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 840 PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 841 while (nz--) { 842 /* x(:) += U(k,:)^T*(Dk*xk) */ 843 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4; 844 xp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4; 845 xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4; 846 xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4; 847 xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4; 848 vj++; xp = x + (*vj)*5; 849 v += 25; 850 } 851 /* xk = inv(Dk)*(Dk*xk) */ 852 diag = aa+k*25; /* ptr to inv(Dk) */ 853 xp = x + k*5; 854 xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4; 855 xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4; 856 xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4; 857 xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4; 858 xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4; 859 } 860 PetscFunctionReturn(0); 861 } 862 863 #undef __FUNCT__ 864 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private" 865 PetscErrorCode BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 866 { 867 MatScalar *v; 868 PetscScalar *xp,x0,x1,x2,x3,x4; 869 PetscInt nz,*vj,k; 870 871 PetscFunctionBegin; 872 for (k=mbs-1; k>=0; k--){ 873 v = aa + 25*ai[k]; 874 xp = x + k*5; 875 x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* xk */ 876 nz = ai[k+1] - ai[k]; 877 vj = aj + ai[k]; 878 xp = x + (*vj)*5; 879 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 880 PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 881 while (nz--) { 882 /* xk += U(k,:)*x(:) */ 883 x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4]; 884 x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4]; 885 x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4]; 886 x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4]; 887 x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4]; 888 vj++; 889 v += 25; xp = x + (*vj)*5; 890 } 891 xp = x + k*5; 892 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; 893 } 894 PetscFunctionReturn(0); 895 } 896 897 #undef __FUNCT__ 898 #define __FUNCT__ "MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace" 899 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 900 { 901 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 902 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2; 903 MatScalar *aa=a->a; 904 PetscScalar *x,*b; 905 PetscErrorCode ierr; 906 907 PetscFunctionBegin; 908 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 909 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 910 911 /* solve U^T * D * y = b by forward substitution */ 912 ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 913 ierr = ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr); 914 915 /* solve U*x = y by back substitution */ 916 ierr = BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr); 917 918 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 919 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 920 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 921 PetscFunctionReturn(0); 922 } 923 924 #undef __FUNCT__ 925 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace" 926 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 927 { 928 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 929 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 930 MatScalar *aa=a->a; 931 PetscScalar *x,*b; 932 PetscErrorCode ierr; 933 934 PetscFunctionBegin; 935 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 936 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 937 ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 938 ierr = ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr); 939 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 940 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 941 ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr); 942 PetscFunctionReturn(0); 943 } 944 945 #undef __FUNCT__ 946 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace" 947 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 948 { 949 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 950 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 951 MatScalar *aa=a->a; 952 PetscScalar *x,*b; 953 PetscErrorCode ierr; 954 955 PetscFunctionBegin; 956 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 957 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 958 ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 959 ierr = BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr); 960 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 961 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 962 ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr); 963 PetscFunctionReturn(0); 964 } 965 966 #undef __FUNCT__ 967 #define __FUNCT__ "MatSolve_SeqSBAIJ_4_inplace" 968 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 969 { 970 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 971 IS isrow=a->row; 972 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 973 PetscErrorCode ierr; 974 const PetscInt *r; 975 PetscInt nz,*vj,k,idx; 976 MatScalar *aa=a->a,*v,*diag; 977 PetscScalar *x,*b,x0,x1,x2,x3,*t,*tp; 978 979 PetscFunctionBegin; 980 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 981 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 982 t = a->solve_work; 983 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 984 985 /* solve U^T * D * y = b by forward substitution */ 986 tp = t; 987 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 988 idx = 4*r[k]; 989 tp[0] = b[idx]; 990 tp[1] = b[idx+1]; 991 tp[2] = b[idx+2]; 992 tp[3] = b[idx+3]; 993 tp += 4; 994 } 995 996 for (k=0; k<mbs; k++){ 997 v = aa + 16*ai[k]; 998 vj = aj + ai[k]; 999 tp = t + k*4; 1000 x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; 1001 nz = ai[k+1] - ai[k]; 1002 1003 tp = t + (*vj)*4; 1004 while (nz--) { 1005 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3; 1006 tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3; 1007 tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3; 1008 tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3; 1009 vj++; tp = t + (*vj)*4; 1010 v += 16; 1011 } 1012 1013 /* xk = inv(Dk)*(Dk*xk) */ 1014 diag = aa+k*16; /* ptr to inv(Dk) */ 1015 tp = t + k*4; 1016 tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3; 1017 tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3; 1018 tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3; 1019 tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3; 1020 } 1021 1022 /* solve U*x = y by back substitution */ 1023 for (k=mbs-1; k>=0; k--){ 1024 v = aa + 16*ai[k]; 1025 vj = aj + ai[k]; 1026 tp = t + k*4; 1027 x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */ 1028 nz = ai[k+1] - ai[k]; 1029 1030 tp = t + (*vj)*4; 1031 while (nz--) { 1032 /* xk += U(k,:)*x(:) */ 1033 x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3]; 1034 x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3]; 1035 x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3]; 1036 x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3]; 1037 vj++; tp = t + (*vj)*4; 1038 v += 16; 1039 } 1040 tp = t + k*4; 1041 tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; 1042 idx = 4*r[k]; 1043 x[idx] = x0; 1044 x[idx+1] = x1; 1045 x[idx+2] = x2; 1046 x[idx+3] = x3; 1047 } 1048 1049 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1050 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1051 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1052 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 1053 PetscFunctionReturn(0); 1054 } 1055 1056 #undef __FUNCT__ 1057 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private" 1058 PetscErrorCode ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 1059 { 1060 MatScalar *v,*diag; 1061 PetscScalar *xp,x0,x1,x2,x3; 1062 PetscInt nz,*vj,k; 1063 1064 PetscFunctionBegin; 1065 for (k=0; k<mbs; k++){ 1066 v = aa + 16*ai[k]; 1067 xp = x + k*4; 1068 x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */ 1069 nz = ai[k+1] - ai[k]; 1070 vj = aj + ai[k]; 1071 xp = x + (*vj)*4; 1072 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1073 PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1074 while (nz--) { 1075 /* x(:) += U(k,:)^T*(Dk*xk) */ 1076 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3; 1077 xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3; 1078 xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3; 1079 xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3; 1080 vj++; xp = x + (*vj)*4; 1081 v += 16; 1082 } 1083 /* xk = inv(Dk)*(Dk*xk) */ 1084 diag = aa+k*16; /* ptr to inv(Dk) */ 1085 xp = x + k*4; 1086 xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3; 1087 xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3; 1088 xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3; 1089 xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3; 1090 } 1091 PetscFunctionReturn(0); 1092 } 1093 1094 #undef __FUNCT__ 1095 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private" 1096 PetscErrorCode BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 1097 { 1098 MatScalar *v; 1099 PetscScalar *xp,x0,x1,x2,x3; 1100 PetscInt nz,*vj,k; 1101 1102 PetscFunctionBegin; 1103 for (k=mbs-1; k>=0; k--){ 1104 v = aa + 16*ai[k]; 1105 xp = x + k*4; 1106 x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */ 1107 nz = ai[k+1] - ai[k]; 1108 vj = aj + ai[k]; 1109 xp = x + (*vj)*4; 1110 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1111 PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1112 while (nz--) { 1113 /* xk += U(k,:)*x(:) */ 1114 x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3]; 1115 x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3]; 1116 x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3]; 1117 x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3]; 1118 vj++; 1119 v += 16; xp = x + (*vj)*4; 1120 } 1121 xp = x + k*4; 1122 xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3; 1123 } 1124 PetscFunctionReturn(0); 1125 } 1126 1127 #undef __FUNCT__ 1128 #define __FUNCT__ "MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace" 1129 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1130 { 1131 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 1132 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 1133 MatScalar *aa=a->a; 1134 PetscScalar *x,*b; 1135 PetscErrorCode ierr; 1136 1137 PetscFunctionBegin; 1138 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1139 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1140 1141 /* solve U^T * D * y = b by forward substitution */ 1142 ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 1143 ierr = ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1144 1145 /* solve U*x = y by back substitution */ 1146 ierr = BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1147 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1148 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1149 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 1150 PetscFunctionReturn(0); 1151 } 1152 1153 #undef __FUNCT__ 1154 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace" 1155 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1156 { 1157 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 1158 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 1159 MatScalar *aa=a->a; 1160 PetscScalar *x,*b; 1161 PetscErrorCode ierr; 1162 1163 PetscFunctionBegin; 1164 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1165 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1166 ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 1167 ierr = ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1168 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1169 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1170 ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr); 1171 PetscFunctionReturn(0); 1172 } 1173 1174 #undef __FUNCT__ 1175 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace" 1176 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1177 { 1178 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 1179 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 1180 MatScalar *aa=a->a; 1181 PetscScalar *x,*b; 1182 PetscErrorCode ierr; 1183 1184 PetscFunctionBegin; 1185 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1186 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1187 ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1188 ierr = BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1189 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1190 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1191 ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr); 1192 PetscFunctionReturn(0); 1193 } 1194 1195 #undef __FUNCT__ 1196 #define __FUNCT__ "MatSolve_SeqSBAIJ_3_inplace" 1197 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1198 { 1199 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 1200 IS isrow=a->row; 1201 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 1202 PetscErrorCode ierr; 1203 const PetscInt *r; 1204 PetscInt nz,*vj,k,idx; 1205 MatScalar *aa=a->a,*v,*diag; 1206 PetscScalar *x,*b,x0,x1,x2,*t,*tp; 1207 1208 PetscFunctionBegin; 1209 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1210 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1211 t = a->solve_work; 1212 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1213 1214 /* solve U^T * D * y = b by forward substitution */ 1215 tp = t; 1216 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 1217 idx = 3*r[k]; 1218 tp[0] = b[idx]; 1219 tp[1] = b[idx+1]; 1220 tp[2] = b[idx+2]; 1221 tp += 3; 1222 } 1223 1224 for (k=0; k<mbs; k++){ 1225 v = aa + 9*ai[k]; 1226 vj = aj + ai[k]; 1227 tp = t + k*3; 1228 x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; 1229 nz = ai[k+1] - ai[k]; 1230 1231 tp = t + (*vj)*3; 1232 while (nz--) { 1233 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2; 1234 tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2; 1235 tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2; 1236 vj++; tp = t + (*vj)*3; 1237 v += 9; 1238 } 1239 1240 /* xk = inv(Dk)*(Dk*xk) */ 1241 diag = aa+k*9; /* ptr to inv(Dk) */ 1242 tp = t + k*3; 1243 tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2; 1244 tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2; 1245 tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2; 1246 } 1247 1248 /* solve U*x = y by back substitution */ 1249 for (k=mbs-1; k>=0; k--){ 1250 v = aa + 9*ai[k]; 1251 vj = aj + ai[k]; 1252 tp = t + k*3; 1253 x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; /* xk */ 1254 nz = ai[k+1] - ai[k]; 1255 1256 tp = t + (*vj)*3; 1257 while (nz--) { 1258 /* xk += U(k,:)*x(:) */ 1259 x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2]; 1260 x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2]; 1261 x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2]; 1262 vj++; tp = t + (*vj)*3; 1263 v += 9; 1264 } 1265 tp = t + k*3; 1266 tp[0] = x0; tp[1] = x1; tp[2] = x2; 1267 idx = 3*r[k]; 1268 x[idx] = x0; 1269 x[idx+1] = x1; 1270 x[idx+2] = x2; 1271 } 1272 1273 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1274 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1275 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1276 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 1277 PetscFunctionReturn(0); 1278 } 1279 1280 #undef __FUNCT__ 1281 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private" 1282 PetscErrorCode ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 1283 { 1284 MatScalar *v,*diag; 1285 PetscScalar *xp,x0,x1,x2; 1286 PetscInt nz,*vj,k; 1287 1288 PetscFunctionBegin; 1289 for (k=0; k<mbs; k++){ 1290 v = aa + 9*ai[k]; 1291 xp = x + k*3; 1292 x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */ 1293 nz = ai[k+1] - ai[k]; 1294 vj = aj + ai[k]; 1295 xp = x + (*vj)*3; 1296 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1297 PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1298 while (nz--) { 1299 /* x(:) += U(k,:)^T*(Dk*xk) */ 1300 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2; 1301 xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2; 1302 xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2; 1303 vj++; xp = x + (*vj)*3; 1304 v += 9; 1305 } 1306 /* xk = inv(Dk)*(Dk*xk) */ 1307 diag = aa+k*9; /* ptr to inv(Dk) */ 1308 xp = x + k*3; 1309 xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2; 1310 xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2; 1311 xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2; 1312 } 1313 PetscFunctionReturn(0); 1314 } 1315 1316 #undef __FUNCT__ 1317 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private" 1318 PetscErrorCode BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 1319 { 1320 MatScalar *v; 1321 PetscScalar *xp,x0,x1,x2; 1322 PetscInt nz,*vj,k; 1323 1324 PetscFunctionBegin; 1325 for (k=mbs-1; k>=0; k--){ 1326 v = aa + 9*ai[k]; 1327 xp = x + k*3; 1328 x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* xk */ 1329 nz = ai[k+1] - ai[k]; 1330 vj = aj + ai[k]; 1331 xp = x + (*vj)*3; 1332 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1333 PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1334 while (nz--) { 1335 /* xk += U(k,:)*x(:) */ 1336 x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2]; 1337 x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2]; 1338 x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2]; 1339 vj++; 1340 v += 9; xp = x + (*vj)*3; 1341 } 1342 xp = x + k*3; 1343 xp[0] = x0; xp[1] = x1; xp[2] = x2; 1344 } 1345 PetscFunctionReturn(0); 1346 } 1347 1348 #undef __FUNCT__ 1349 #define __FUNCT__ "MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace" 1350 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1351 { 1352 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 1353 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 1354 MatScalar *aa=a->a; 1355 PetscScalar *x,*b; 1356 PetscErrorCode ierr; 1357 1358 PetscFunctionBegin; 1359 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1360 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1361 1362 /* solve U^T * D * y = b by forward substitution */ 1363 ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1364 ierr = ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1365 1366 /* solve U*x = y by back substitution */ 1367 ierr = BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1368 1369 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1370 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1371 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 1372 PetscFunctionReturn(0); 1373 } 1374 1375 #undef __FUNCT__ 1376 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace" 1377 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1378 { 1379 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 1380 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 1381 MatScalar *aa=a->a; 1382 PetscScalar *x,*b; 1383 PetscErrorCode ierr; 1384 1385 PetscFunctionBegin; 1386 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1387 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1388 ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1389 ierr = ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1390 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1391 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1392 ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr); 1393 PetscFunctionReturn(0); 1394 } 1395 1396 #undef __FUNCT__ 1397 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace" 1398 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1399 { 1400 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 1401 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 1402 MatScalar *aa=a->a; 1403 PetscScalar *x,*b; 1404 PetscErrorCode ierr; 1405 1406 PetscFunctionBegin; 1407 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1408 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1409 ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1410 ierr = BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1411 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1412 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1413 ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr); 1414 PetscFunctionReturn(0); 1415 } 1416 1417 #undef __FUNCT__ 1418 #define __FUNCT__ "MatSolve_SeqSBAIJ_2_inplace" 1419 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 1420 { 1421 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ *)A->data; 1422 IS isrow=a->row; 1423 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 1424 PetscErrorCode ierr; 1425 const PetscInt *r; 1426 PetscInt nz,*vj,k,k2,idx; 1427 MatScalar *aa=a->a,*v,*diag; 1428 PetscScalar *x,*b,x0,x1,*t; 1429 1430 PetscFunctionBegin; 1431 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1432 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1433 t = a->solve_work; 1434 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1435 1436 /* solve U^T * D * y = perm(b) by forward substitution */ 1437 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 1438 idx = 2*r[k]; 1439 t[k*2] = b[idx]; 1440 t[k*2+1] = b[idx+1]; 1441 } 1442 for (k=0; k<mbs; k++){ 1443 v = aa + 4*ai[k]; 1444 vj = aj + ai[k]; 1445 k2 = k*2; 1446 x0 = t[k2]; x1 = t[k2+1]; 1447 nz = ai[k+1] - ai[k]; 1448 while (nz--) { 1449 t[(*vj)*2] += v[0]*x0 + v[1]*x1; 1450 t[(*vj)*2+1] += v[2]*x0 + v[3]*x1; 1451 vj++; v += 4; 1452 } 1453 diag = aa+k*4; /* ptr to inv(Dk) */ 1454 t[k2] = diag[0]*x0 + diag[2]*x1; 1455 t[k2+1] = diag[1]*x0 + diag[3]*x1; 1456 } 1457 1458 /* solve U*x = y by back substitution */ 1459 for (k=mbs-1; k>=0; k--){ 1460 v = aa + 4*ai[k]; 1461 vj = aj + ai[k]; 1462 k2 = k*2; 1463 x0 = t[k2]; x1 = t[k2+1]; 1464 nz = ai[k+1] - ai[k]; 1465 while (nz--) { 1466 x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1]; 1467 x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1]; 1468 vj++; v += 4; 1469 } 1470 t[k2] = x0; 1471 t[k2+1] = x1; 1472 idx = 2*r[k]; 1473 x[idx] = x0; 1474 x[idx+1] = x1; 1475 } 1476 1477 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1478 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1479 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1480 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 1481 PetscFunctionReturn(0); 1482 } 1483 1484 #undef __FUNCT__ 1485 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private" 1486 PetscErrorCode ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 1487 { 1488 MatScalar *v,*diag; 1489 PetscScalar x0,x1; 1490 PetscInt nz,*vj,k,k2; 1491 1492 PetscFunctionBegin; 1493 for (k=0; k<mbs; k++){ 1494 v = aa + 4*ai[k]; 1495 vj = aj + ai[k]; 1496 k2 = k*2; 1497 x0 = x[k2]; x1 = x[k2+1]; /* Dk*xk = k-th block of x */ 1498 nz = ai[k+1] - ai[k]; 1499 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1500 PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1501 while (nz--) { 1502 /* x(:) += U(k,:)^T*(Dk*xk) */ 1503 x[(*vj)*2] += v[0]*x0 + v[1]*x1; 1504 x[(*vj)*2+1] += v[2]*x0 + v[3]*x1; 1505 vj++; v += 4; 1506 } 1507 /* xk = inv(Dk)*(Dk*xk) */ 1508 diag = aa+k*4; /* ptr to inv(Dk) */ 1509 x[k2] = diag[0]*x0 + diag[2]*x1; 1510 x[k2+1] = diag[1]*x0 + diag[3]*x1; 1511 } 1512 PetscFunctionReturn(0); 1513 } 1514 1515 #undef __FUNCT__ 1516 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private" 1517 PetscErrorCode BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 1518 { 1519 MatScalar *v; 1520 PetscScalar x0,x1; 1521 PetscInt nz,*vj,k,k2; 1522 1523 PetscFunctionBegin; 1524 for (k=mbs-1; k>=0; k--){ 1525 v = aa + 4*ai[k]; 1526 vj = aj + ai[k]; 1527 k2 = k*2; 1528 x0 = x[k2]; x1 = x[k2+1]; /* xk */ 1529 nz = ai[k+1] - ai[k]; 1530 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1531 PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1532 while (nz--) { 1533 /* xk += U(k,:)*x(:) */ 1534 x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1]; 1535 x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1]; 1536 vj++; v += 4; 1537 } 1538 x[k2] = x0; 1539 x[k2+1] = x1; 1540 } 1541 PetscFunctionReturn(0); 1542 } 1543 1544 #undef __FUNCT__ 1545 #define __FUNCT__ "MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace" 1546 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1547 { 1548 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 1549 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 1550 MatScalar *aa=a->a; 1551 PetscScalar *x,*b; 1552 PetscErrorCode ierr; 1553 1554 PetscFunctionBegin; 1555 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1556 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1557 1558 /* solve U^T * D * y = b by forward substitution */ 1559 ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1560 ierr = ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1561 1562 /* solve U*x = y by back substitution */ 1563 ierr = BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1564 1565 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1566 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1567 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 1568 PetscFunctionReturn(0); 1569 } 1570 1571 #undef __FUNCT__ 1572 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace" 1573 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1574 { 1575 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 1576 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 1577 MatScalar *aa=a->a; 1578 PetscScalar *x,*b; 1579 PetscErrorCode ierr; 1580 1581 PetscFunctionBegin; 1582 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1583 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1584 ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1585 ierr = ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1586 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1587 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1588 ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr); 1589 PetscFunctionReturn(0); 1590 } 1591 1592 #undef __FUNCT__ 1593 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace" 1594 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1595 { 1596 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 1597 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 1598 MatScalar *aa=a->a; 1599 PetscScalar *x,*b; 1600 PetscErrorCode ierr; 1601 1602 PetscFunctionBegin; 1603 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1604 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1605 ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1606 ierr = BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1607 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1608 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1609 ierr = PetscLogFlops(2.0*bs2*(a->nz - mbs));CHKERRQ(ierr); 1610 PetscFunctionReturn(0); 1611 } 1612 1613 #undef __FUNCT__ 1614 #define __FUNCT__ "MatSolve_SeqSBAIJ_1" 1615 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1616 { 1617 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1618 IS isrow=a->row; 1619 PetscErrorCode ierr; 1620 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag; 1621 const MatScalar *aa=a->a,*v; 1622 const PetscScalar *b; 1623 PetscScalar *x,xk,*t; 1624 PetscInt nz,k,j; 1625 1626 PetscFunctionBegin; 1627 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1628 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1629 t = a->solve_work; 1630 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1631 1632 /* solve U^T*D*y = perm(b) by forward substitution */ 1633 for (k=0; k<mbs; k++) t[k] = b[rp[k]]; 1634 for (k=0; k<mbs; k++){ 1635 v = aa + ai[k]; 1636 vj = aj + ai[k]; 1637 xk = t[k]; 1638 nz = ai[k+1] - ai[k] - 1; 1639 for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk; 1640 t[k] = xk*v[nz]; /* v[nz] = 1/D(k) */ 1641 } 1642 1643 /* solve U*perm(x) = y by back substitution */ 1644 for (k=mbs-1; k>=0; k--){ 1645 v = aa + adiag[k] - 1; 1646 vj = aj + adiag[k] - 1; 1647 nz = ai[k+1] - ai[k] - 1; 1648 for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]]; 1649 x[rp[k]] = t[k]; 1650 } 1651 1652 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1653 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1654 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1655 ierr = PetscLogFlops(4.0*a->nz - 3.0*mbs);CHKERRQ(ierr); 1656 PetscFunctionReturn(0); 1657 } 1658 1659 #undef __FUNCT__ 1660 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_inplace" 1661 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1662 { 1663 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1664 IS isrow=a->row; 1665 PetscErrorCode ierr; 1666 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1667 const MatScalar *aa=a->a,*v; 1668 PetscScalar *x,*b,xk,*t; 1669 PetscInt nz,k; 1670 1671 PetscFunctionBegin; 1672 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1673 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1674 t = a->solve_work; 1675 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1676 1677 /* solve U^T*D*y = perm(b) by forward substitution */ 1678 for (k=0; k<mbs; k++) t[k] = b[rp[k]]; 1679 for (k=0; k<mbs; k++){ 1680 v = aa + ai[k] + 1; 1681 vj = aj + ai[k] + 1; 1682 xk = t[k]; 1683 nz = ai[k+1] - ai[k] - 1; 1684 while (nz--) t[*vj++] += (*v++) * xk; 1685 t[k] = xk*aa[ai[k]]; /* aa[k] = 1/D(k) */ 1686 } 1687 1688 /* solve U*perm(x) = y by back substitution */ 1689 for (k=mbs-1; k>=0; k--){ 1690 v = aa + ai[k] + 1; 1691 vj = aj + ai[k] + 1; 1692 nz = ai[k+1] - ai[k] - 1; 1693 while (nz--) t[k] += (*v++) * t[*vj++]; 1694 x[rp[k]] = t[k]; 1695 } 1696 1697 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1698 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1699 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1700 ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr); 1701 PetscFunctionReturn(0); 1702 } 1703 1704 #undef __FUNCT__ 1705 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1" 1706 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1707 { 1708 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1709 IS isrow=a->row; 1710 PetscErrorCode ierr; 1711 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag; 1712 const MatScalar *aa=a->a,*v; 1713 PetscReal diagk; 1714 PetscScalar *x,*b,xk; 1715 PetscInt nz,k; 1716 1717 PetscFunctionBegin; 1718 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1719 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1720 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1721 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1722 1723 for (k=0; k<mbs; k++) x[k] = b[rp[k]]; 1724 for (k=0; k<mbs; k++){ 1725 v = aa + ai[k]; 1726 vj = aj + ai[k]; 1727 xk = x[k]; 1728 nz = ai[k+1] - ai[k] - 1; 1729 while (nz--) x[*vj++] += (*v++) * xk; 1730 1731 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1732 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1733 x[k] = xk*sqrt(diagk); 1734 } 1735 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1736 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1737 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1738 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1739 PetscFunctionReturn(0); 1740 } 1741 1742 #undef __FUNCT__ 1743 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_inplace" 1744 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1745 { 1746 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1747 IS isrow=a->row; 1748 PetscErrorCode ierr; 1749 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1750 const MatScalar *aa=a->a,*v; 1751 PetscReal diagk; 1752 PetscScalar *x,*b,xk; 1753 PetscInt nz,k; 1754 1755 PetscFunctionBegin; 1756 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1757 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1758 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1759 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1760 1761 for (k=0; k<mbs; k++) x[k] = b[rp[k]]; 1762 for (k=0; k<mbs; k++){ 1763 v = aa + ai[k] + 1; 1764 vj = aj + ai[k] + 1; 1765 xk = x[k]; 1766 nz = ai[k+1] - ai[k] - 1; 1767 while (nz--) x[*vj++] += (*v++) * xk; 1768 1769 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1770 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1771 x[k] = xk*sqrt(diagk); 1772 } 1773 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1774 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1775 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1776 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1777 PetscFunctionReturn(0); 1778 } 1779 1780 #undef __FUNCT__ 1781 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1" 1782 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1783 { 1784 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1785 IS isrow=a->row; 1786 PetscErrorCode ierr; 1787 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag; 1788 const MatScalar *aa=a->a,*v; 1789 PetscReal diagk; 1790 PetscScalar *x,*b,*t; 1791 PetscInt nz,k; 1792 1793 PetscFunctionBegin; 1794 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1795 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1796 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1797 t = a->solve_work; 1798 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1799 1800 for (k=mbs-1; k>=0; k--){ 1801 v = aa + ai[k]; 1802 vj = aj + ai[k]; 1803 diagk = PetscRealPart(aa[adiag[k]]); 1804 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1805 t[k] = b[k] * sqrt(diagk); 1806 nz = ai[k+1] - ai[k] - 1; 1807 while (nz--) t[k] += (*v++) * t[*vj++]; 1808 x[rp[k]] = t[k]; 1809 } 1810 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1811 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1812 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1813 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1814 PetscFunctionReturn(0); 1815 } 1816 1817 #undef __FUNCT__ 1818 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_inplace" 1819 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1820 { 1821 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1822 IS isrow=a->row; 1823 PetscErrorCode ierr; 1824 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1825 const MatScalar *aa=a->a,*v; 1826 PetscReal diagk; 1827 PetscScalar *x,*b,*t; 1828 PetscInt nz,k; 1829 1830 PetscFunctionBegin; 1831 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1832 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1833 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1834 t = a->solve_work; 1835 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1836 1837 for (k=mbs-1; k>=0; k--){ 1838 v = aa + ai[k] + 1; 1839 vj = aj + ai[k] + 1; 1840 diagk = PetscRealPart(aa[ai[k]]); 1841 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1842 t[k] = b[k] * sqrt(diagk); 1843 nz = ai[k+1] - ai[k] - 1; 1844 while (nz--) t[k] += (*v++) * t[*vj++]; 1845 x[rp[k]] = t[k]; 1846 } 1847 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1848 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1849 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1850 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1851 PetscFunctionReturn(0); 1852 } 1853 1854 #undef __FUNCT__ 1855 #define __FUNCT__ "MatSolves_SeqSBAIJ_1" 1856 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx) 1857 { 1858 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1859 PetscErrorCode ierr; 1860 1861 PetscFunctionBegin; 1862 if (A->rmap->bs == 1) { 1863 ierr = MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);CHKERRQ(ierr); 1864 } else { 1865 IS isrow=a->row; 1866 const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp; 1867 const MatScalar *aa=a->a,*v; 1868 PetscScalar *x,*b,*t; 1869 PetscInt nz,k,n,i,j; 1870 if (bb->n > a->solves_work_n) { 1871 ierr = PetscFree(a->solves_work);CHKERRQ(ierr); 1872 ierr = PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);CHKERRQ(ierr); 1873 a->solves_work_n = bb->n; 1874 } 1875 n = bb->n; 1876 ierr = VecGetArray(bb->v,&b);CHKERRQ(ierr); 1877 ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr); 1878 t = a->solves_work; 1879 1880 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1881 1882 /* solve U^T*D*y = perm(b) by forward substitution */ 1883 for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];} /* values are stored interlaced in t */ 1884 for (k=0; k<mbs; k++){ 1885 v = aa + ai[k]; 1886 vj = aj + ai[k]; 1887 nz = ai[k+1] - ai[k] - 1; 1888 for (j=0; j<nz; j++){ 1889 for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i]; 1890 v++;vj++; 1891 } 1892 for (i=0; i<n; i++) t[n*k+i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */ 1893 } 1894 1895 /* solve U*perm(x) = y by back substitution */ 1896 for (k=mbs-1; k>=0; k--){ 1897 v = aa + ai[k] - 1; 1898 vj = aj + ai[k] - 1; 1899 nz = ai[k+1] - ai[k] - 1; 1900 for (j=0; j<nz; j++){ 1901 for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i]; 1902 v++;vj++; 1903 } 1904 for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i]; 1905 } 1906 1907 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1908 ierr = VecRestoreArray(bb->v,&b);CHKERRQ(ierr); 1909 ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr); 1910 ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr); 1911 } 1912 PetscFunctionReturn(0); 1913 } 1914 1915 #undef __FUNCT__ 1916 #define __FUNCT__ "MatSolves_SeqSBAIJ_1_inplace" 1917 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx) 1918 { 1919 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1920 PetscErrorCode ierr; 1921 1922 PetscFunctionBegin; 1923 if (A->rmap->bs == 1) { 1924 ierr = MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);CHKERRQ(ierr); 1925 } else { 1926 IS isrow=a->row; 1927 const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp; 1928 const MatScalar *aa=a->a,*v; 1929 PetscScalar *x,*b,*t; 1930 PetscInt nz,k,n,i; 1931 if (bb->n > a->solves_work_n) { 1932 ierr = PetscFree(a->solves_work);CHKERRQ(ierr); 1933 ierr = PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);CHKERRQ(ierr); 1934 a->solves_work_n = bb->n; 1935 } 1936 n = bb->n; 1937 ierr = VecGetArray(bb->v,&b);CHKERRQ(ierr); 1938 ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr); 1939 t = a->solves_work; 1940 1941 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1942 1943 /* solve U^T*D*y = perm(b) by forward substitution */ 1944 for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];} /* values are stored interlaced in t */ 1945 for (k=0; k<mbs; k++){ 1946 v = aa + ai[k]; 1947 vj = aj + ai[k]; 1948 nz = ai[k+1] - ai[k]; 1949 while (nz--) { 1950 for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i]; 1951 v++;vj++; 1952 } 1953 for (i=0; i<n; i++) t[n*k+i] *= aa[k]; /* note: aa[k] = 1/D(k) */ 1954 } 1955 1956 /* solve U*perm(x) = y by back substitution */ 1957 for (k=mbs-1; k>=0; k--){ 1958 v = aa + ai[k]; 1959 vj = aj + ai[k]; 1960 nz = ai[k+1] - ai[k]; 1961 while (nz--) { 1962 for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i]; 1963 v++;vj++; 1964 } 1965 for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i]; 1966 } 1967 1968 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1969 ierr = VecRestoreArray(bb->v,&b);CHKERRQ(ierr); 1970 ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr); 1971 ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr); 1972 } 1973 PetscFunctionReturn(0); 1974 } 1975 1976 #undef __FUNCT__ 1977 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_NaturalOrdering" 1978 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 1979 { 1980 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1981 PetscErrorCode ierr; 1982 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag = a->diag; 1983 const MatScalar *aa=a->a,*v; 1984 const PetscScalar *b; 1985 PetscScalar *x,xi; 1986 PetscInt nz,i,j; 1987 1988 PetscFunctionBegin; 1989 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1990 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1991 1992 /* solve U^T*D*y = b by forward substitution */ 1993 ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1994 for (i=0; i<mbs; i++){ 1995 v = aa + ai[i]; 1996 vj = aj + ai[i]; 1997 xi = x[i]; 1998 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 1999 for (j=0; j<nz; j++){ 2000 x[vj[j]] += v[j]* xi; 2001 } 2002 x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */ 2003 } 2004 2005 /* solve U*x = y by backward substitution */ 2006 for (i=mbs-2; i>=0; i--){ 2007 xi = x[i]; 2008 v = aa + adiag[i] - 1; /* end of row i, excluding diag */ 2009 vj = aj + adiag[i] - 1; 2010 nz = ai[i+1] - ai[i] - 1; 2011 for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]]; 2012 x[i] = xi; 2013 } 2014 2015 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2016 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2017 ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr); 2018 PetscFunctionReturn(0); 2019 } 2020 2021 #undef __FUNCT__ 2022 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace" 2023 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2024 { 2025 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2026 PetscErrorCode ierr; 2027 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 2028 MatScalar *aa=a->a,*v; 2029 PetscScalar *x,*b,xk; 2030 PetscInt nz,*vj,k; 2031 2032 PetscFunctionBegin; 2033 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2034 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2035 2036 /* solve U^T*D*y = b by forward substitution */ 2037 ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 2038 for (k=0; k<mbs; k++){ 2039 v = aa + ai[k] + 1; 2040 vj = aj + ai[k] + 1; 2041 xk = x[k]; 2042 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2043 while (nz--) x[*vj++] += (*v++) * xk; 2044 x[k] = xk*aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */ 2045 } 2046 2047 /* solve U*x = y by back substitution */ 2048 for (k=mbs-2; k>=0; k--){ 2049 v = aa + ai[k] + 1; 2050 vj = aj + ai[k] + 1; 2051 xk = x[k]; 2052 nz = ai[k+1] - ai[k] - 1; 2053 while (nz--) { 2054 xk += (*v++) * x[*vj++]; 2055 } 2056 x[k] = xk; 2057 } 2058 2059 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2060 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2061 ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr); 2062 PetscFunctionReturn(0); 2063 } 2064 2065 #undef __FUNCT__ 2066 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_NaturalOrdering" 2067 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2068 { 2069 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2070 PetscErrorCode ierr; 2071 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag; 2072 const MatScalar *aa=a->a,*v; 2073 PetscReal diagk; 2074 PetscScalar *x,*b; 2075 PetscInt nz,*vj,k; 2076 2077 PetscFunctionBegin; 2078 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2079 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2080 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2081 ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 2082 for (k=0; k<mbs; k++){ 2083 v = aa + ai[k]; 2084 vj = aj + ai[k]; 2085 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2086 while (nz--) x[*vj++] += (*v++) * x[k]; 2087 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */ 2088 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]])); 2089 x[k] *= sqrt(diagk); 2090 } 2091 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2092 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2093 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2094 PetscFunctionReturn(0); 2095 } 2096 2097 #undef __FUNCT__ 2098 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace" 2099 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2100 { 2101 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2102 PetscErrorCode ierr; 2103 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 2104 MatScalar *aa=a->a,*v; 2105 PetscReal diagk; 2106 PetscScalar *x,*b; 2107 PetscInt nz,*vj,k; 2108 2109 PetscFunctionBegin; 2110 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2111 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2112 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2113 ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 2114 for (k=0; k<mbs; k++){ 2115 v = aa + ai[k] + 1; 2116 vj = aj + ai[k] + 1; 2117 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2118 while (nz--) x[*vj++] += (*v++) * x[k]; 2119 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2120 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]])); 2121 x[k] *= sqrt(diagk); 2122 } 2123 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2124 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2125 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2126 PetscFunctionReturn(0); 2127 } 2128 2129 #undef __FUNCT__ 2130 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering" 2131 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2132 { 2133 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2134 PetscErrorCode ierr; 2135 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag; 2136 MatScalar *aa=a->a,*v; 2137 PetscReal diagk; 2138 PetscScalar *x,*b; 2139 PetscInt nz,*vj,k; 2140 2141 PetscFunctionBegin; 2142 /* solve D^(1/2)*U*x = b by back substitution */ 2143 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2144 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2145 2146 for (k=mbs-1; k>=0; k--){ 2147 v = aa + ai[k]; 2148 vj = aj + ai[k]; 2149 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2150 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 2151 x[k] = sqrt(diagk)*b[k]; 2152 nz = ai[k+1] - ai[k] - 1; 2153 while (nz--) x[k] += (*v++) * x[*vj++]; 2154 } 2155 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2156 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2157 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2158 PetscFunctionReturn(0); 2159 } 2160 2161 #undef __FUNCT__ 2162 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace" 2163 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2164 { 2165 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2166 PetscErrorCode ierr; 2167 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 2168 MatScalar *aa=a->a,*v; 2169 PetscReal diagk; 2170 PetscScalar *x,*b; 2171 PetscInt nz,*vj,k; 2172 2173 PetscFunctionBegin; 2174 /* solve D^(1/2)*U*x = b by back substitution */ 2175 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2176 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2177 2178 for (k=mbs-1; k>=0; k--){ 2179 v = aa + ai[k] + 1; 2180 vj = aj + ai[k] + 1; 2181 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2182 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 2183 x[k] = sqrt(diagk)*b[k]; 2184 nz = ai[k+1] - ai[k] - 1; 2185 while (nz--) x[k] += (*v++) * x[*vj++]; 2186 } 2187 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2188 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2189 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2190 PetscFunctionReturn(0); 2191 } 2192 2193 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */ 2194 #undef __FUNCT__ 2195 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ_MSR" 2196 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info) 2197 { 2198 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 2199 PetscErrorCode ierr; 2200 const PetscInt *rip,mbs = a->mbs,*ai = a->i,*aj = a->j; 2201 PetscInt *jutmp,bs = A->rmap->bs,bs2=a->bs2,i; 2202 PetscInt m,reallocs = 0,*levtmp; 2203 PetscInt *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd; 2204 PetscInt incrlev,*lev,shift,prow,nz; 2205 PetscReal f = info->fill,levels = info->levels; 2206 PetscBool perm_identity; 2207 2208 PetscFunctionBegin; 2209 /* check whether perm is the identity mapping */ 2210 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 2211 2212 if (perm_identity){ 2213 a->permute = PETSC_FALSE; 2214 ai = a->i; aj = a->j; 2215 } else { /* non-trivial permutation */ 2216 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2217 a->permute = PETSC_TRUE; 2218 ierr = MatReorderingSeqSBAIJ(A, perm);CHKERRQ(ierr); 2219 ai = a->inew; aj = a->jnew; 2220 } 2221 2222 /* initialization */ 2223 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 2224 umax = (PetscInt)(f*ai[mbs] + 1); 2225 ierr = PetscMalloc(umax*sizeof(PetscInt),&lev);CHKERRQ(ierr); 2226 umax += mbs + 1; 2227 shift = mbs + 1; 2228 ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);CHKERRQ(ierr); 2229 ierr = PetscMalloc(umax*sizeof(PetscInt),&ju);CHKERRQ(ierr); 2230 iu[0] = mbs + 1; 2231 juidx = mbs + 1; 2232 /* prowl: linked list for pivot row */ 2233 ierr = PetscMalloc3(mbs,PetscInt,&prowl,mbs,PetscInt,&q,mbs,PetscInt,&levtmp);CHKERRQ(ierr); 2234 /* q: linked list for col index */ 2235 2236 for (i=0; i<mbs; i++){ 2237 prowl[i] = mbs; 2238 q[i] = 0; 2239 } 2240 2241 /* for each row k */ 2242 for (k=0; k<mbs; k++){ 2243 nzk = 0; 2244 q[k] = mbs; 2245 /* copy current row into linked list */ 2246 nz = ai[rip[k]+1] - ai[rip[k]]; 2247 j = ai[rip[k]]; 2248 while (nz--){ 2249 vj = rip[aj[j++]]; 2250 if (vj > k){ 2251 qm = k; 2252 do { 2253 m = qm; qm = q[m]; 2254 } while(qm < vj); 2255 if (qm == vj) { 2256 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n"); 2257 } 2258 nzk++; 2259 q[m] = vj; 2260 q[vj] = qm; 2261 levtmp[vj] = 0; /* initialize lev for nonzero element */ 2262 } 2263 } 2264 2265 /* modify nonzero structure of k-th row by computing fill-in 2266 for each row prow to be merged in */ 2267 prow = k; 2268 prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */ 2269 2270 while (prow < k){ 2271 /* merge row prow into k-th row */ 2272 jmin = iu[prow] + 1; 2273 jmax = iu[prow+1]; 2274 qm = k; 2275 for (j=jmin; j<jmax; j++){ 2276 incrlev = lev[j-shift] + 1; 2277 if (incrlev > levels) continue; 2278 vj = ju[j]; 2279 do { 2280 m = qm; qm = q[m]; 2281 } while (qm < vj); 2282 if (qm != vj){ /* a fill */ 2283 nzk++; q[m] = vj; q[vj] = qm; qm = vj; 2284 levtmp[vj] = incrlev; 2285 } else { 2286 if (levtmp[vj] > incrlev) levtmp[vj] = incrlev; 2287 } 2288 } 2289 prow = prowl[prow]; /* next pivot row */ 2290 } 2291 2292 /* add k to row list for first nonzero element in k-th row */ 2293 if (nzk > 1){ 2294 i = q[k]; /* col value of first nonzero element in k_th row of U */ 2295 prowl[k] = prowl[i]; prowl[i] = k; 2296 } 2297 iu[k+1] = iu[k] + nzk; 2298 2299 /* allocate more space to ju and lev if needed */ 2300 if (iu[k+1] > umax) { 2301 /* estimate how much additional space we will need */ 2302 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 2303 /* just double the memory each time */ 2304 maxadd = umax; 2305 if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2; 2306 umax += maxadd; 2307 2308 /* allocate a longer ju */ 2309 ierr = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr); 2310 ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr); 2311 ierr = PetscFree(ju);CHKERRQ(ierr); 2312 ju = jutmp; 2313 2314 ierr = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr); 2315 ierr = PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));CHKERRQ(ierr); 2316 ierr = PetscFree(lev);CHKERRQ(ierr); 2317 lev = jutmp; 2318 reallocs += 2; /* count how many times we realloc */ 2319 } 2320 2321 /* save nonzero structure of k-th row in ju */ 2322 i=k; 2323 while (nzk --) { 2324 i = q[i]; 2325 ju[juidx] = i; 2326 lev[juidx-shift] = levtmp[i]; 2327 juidx++; 2328 } 2329 } 2330 2331 #if defined(PETSC_USE_INFO) 2332 if (ai[mbs] != 0) { 2333 PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 2334 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 2335 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 2336 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 2337 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 2338 } else { 2339 ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr); 2340 } 2341 #endif 2342 2343 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 2344 ierr = PetscFree3(prowl,q,levtmp);CHKERRQ(ierr); 2345 ierr = PetscFree(lev);CHKERRQ(ierr); 2346 2347 /* put together the new matrix */ 2348 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,0,PETSC_NULL);CHKERRQ(ierr); 2349 2350 /* ierr = PetscLogObjectParent(B,iperm);CHKERRQ(ierr); */ 2351 b = (Mat_SeqSBAIJ*)(B)->data; 2352 ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr); 2353 b->singlemalloc = PETSC_FALSE; 2354 b->free_a = PETSC_TRUE; 2355 b->free_ij = PETSC_TRUE; 2356 /* the next line frees the default space generated by the Create() */ 2357 ierr = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr); 2358 ierr = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr); 2359 b->j = ju; 2360 b->i = iu; 2361 b->diag = 0; 2362 b->ilen = 0; 2363 b->imax = 0; 2364 2365 if (b->row) { 2366 ierr = ISDestroy(b->row);CHKERRQ(ierr); 2367 } 2368 if (b->icol) { 2369 ierr = ISDestroy(b->icol);CHKERRQ(ierr); 2370 } 2371 b->row = perm; 2372 b->icol = perm; 2373 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2374 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2375 ierr = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 2376 /* In b structure: Free imax, ilen, old a, old j. 2377 Allocate idnew, solve_work, new a, new j */ 2378 ierr = PetscLogObjectMemory(B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr); 2379 b->maxnz = b->nz = iu[mbs]; 2380 2381 (B)->info.factor_mallocs = reallocs; 2382 (B)->info.fill_ratio_given = f; 2383 if (ai[mbs] != 0) { 2384 (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 2385 } else { 2386 (B)->info.fill_ratio_needed = 0.0; 2387 } 2388 ierr = MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);CHKERRQ(ierr); 2389 PetscFunctionReturn(0); 2390 } 2391 2392 /* 2393 See MatICCFactorSymbolic_SeqAIJ() for description of its data structure 2394 */ 2395 #include "petscbt.h" 2396 #include "../src/mat/utils/freespace.h" 2397 #undef __FUNCT__ 2398 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ" 2399 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info) 2400 { 2401 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 2402 PetscErrorCode ierr; 2403 PetscBool perm_identity,free_ij = PETSC_TRUE,missing; 2404 PetscInt bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j; 2405 const PetscInt *rip; 2406 PetscInt reallocs=0,i,*ui,*udiag,*cols; 2407 PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow; 2408 PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr; 2409 PetscReal fill=info->fill,levels=info->levels; 2410 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 2411 PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 2412 PetscBT lnkbt; 2413 2414 PetscFunctionBegin; 2415 if (bs > 1){ 2416 ierr = MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);CHKERRQ(ierr); 2417 PetscFunctionReturn(0); 2418 } 2419 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 2420 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 2421 if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 2422 2423 /* check whether perm is the identity mapping */ 2424 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 2425 if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2426 a->permute = PETSC_FALSE; 2427 2428 ierr = PetscMalloc((am+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr); 2429 ierr = PetscMalloc((am+1)*sizeof(PetscInt),&udiag);CHKERRQ(ierr); 2430 ui[0] = 0; 2431 2432 /* ICC(0) without matrix ordering: simply rearrange column indices */ 2433 if (!levels){ 2434 /* reuse the column pointers and row offsets for memory savings */ 2435 for (i=0; i<am; i++) { 2436 ncols = ai[i+1] - ai[i]; 2437 ui[i+1] = ui[i] + ncols; 2438 udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */ 2439 } 2440 ierr = PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr); 2441 cols = uj; 2442 for (i=0; i<am; i++) { 2443 aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */ 2444 ncols = ai[i+1] - ai[i] -1; 2445 for (j=0; j<ncols; j++) *cols++ = aj[j]; 2446 *cols++ = i; /* diagoanl is located as the last entry of U(i,:) */ 2447 } 2448 } else { /* case: levels>0 */ 2449 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 2450 2451 /* initialization */ 2452 /* jl: linked list for storing indices of the pivot rows 2453 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2454 ierr = PetscMalloc4(am,PetscInt*,&uj_ptr,am,PetscInt*,&uj_lvl_ptr,am,PetscInt,&il,am,PetscInt,&jl);CHKERRQ(ierr); 2455 for (i=0; i<am; i++){ 2456 jl[i] = am; il[i] = 0; 2457 } 2458 2459 /* create and initialize a linked list for storing column indices of the active row k */ 2460 nlnk = am + 1; 2461 ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2462 2463 /* initial FreeSpace size is fill*(ai[am]+1) */ 2464 ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);CHKERRQ(ierr); 2465 current_space = free_space; 2466 ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);CHKERRQ(ierr); 2467 current_space_lvl = free_space_lvl; 2468 2469 for (k=0; k<am; k++){ /* for each active row k */ 2470 /* initialize lnk by the column indices of row k */ 2471 nzk = 0; 2472 ncols = ai[k+1] - ai[k]; 2473 if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k); 2474 cols = aj+ai[k]; 2475 ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2476 nzk += nlnk; 2477 2478 /* update lnk by computing fill-in for each pivot row to be merged in */ 2479 prow = jl[k]; /* 1st pivot row */ 2480 2481 while (prow < k){ 2482 nextprow = jl[prow]; 2483 2484 /* merge prow into k-th row */ 2485 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2486 jmax = ui[prow+1]; 2487 ncols = jmax-jmin; 2488 i = jmin - ui[prow]; 2489 2490 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2491 uj = uj_lvl_ptr[prow] + i; /* levels of cols */ 2492 j = *(uj - 1); 2493 ierr = PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr); 2494 nzk += nlnk; 2495 2496 /* update il and jl for prow */ 2497 if (jmin < jmax){ 2498 il[prow] = jmin; 2499 j = *cols; jl[prow] = jl[j]; jl[j] = prow; 2500 } 2501 prow = nextprow; 2502 } 2503 2504 /* if free space is not available, make more free space */ 2505 if (current_space->local_remaining<nzk) { 2506 i = am - k + 1; /* num of unfactored rows */ 2507 i *= PetscMin(nzk, i-1); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2508 ierr = PetscFreeSpaceGet(i,¤t_space);CHKERRQ(ierr); 2509 ierr = PetscFreeSpaceGet(i,¤t_space_lvl);CHKERRQ(ierr); 2510 reallocs++; 2511 } 2512 2513 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2514 if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k); 2515 ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 2516 2517 /* add the k-th row into il and jl */ 2518 if (nzk > 1){ 2519 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2520 jl[k] = jl[i]; jl[i] = k; 2521 il[k] = ui[k] + 1; 2522 } 2523 uj_ptr[k] = current_space->array; 2524 uj_lvl_ptr[k] = current_space_lvl->array; 2525 2526 current_space->array += nzk; 2527 current_space->local_used += nzk; 2528 current_space->local_remaining -= nzk; 2529 current_space_lvl->array += nzk; 2530 current_space_lvl->local_used += nzk; 2531 current_space_lvl->local_remaining -= nzk; 2532 2533 ui[k+1] = ui[k] + nzk; 2534 } 2535 2536 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 2537 ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr); 2538 2539 /* destroy list of free space and other temporary array(s) */ 2540 ierr = PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr); 2541 ierr = PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag);CHKERRQ(ierr); /* store matrix factor */ 2542 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 2543 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 2544 2545 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2546 2547 /* put together the new matrix in MATSEQSBAIJ format */ 2548 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 2549 2550 b = (Mat_SeqSBAIJ*)(fact)->data; 2551 ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr); 2552 b->singlemalloc = PETSC_FALSE; 2553 b->free_a = PETSC_TRUE; 2554 b->free_ij = free_ij; 2555 ierr = PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 2556 b->j = uj; 2557 b->i = ui; 2558 b->diag = udiag; 2559 b->free_diag = PETSC_TRUE; 2560 b->ilen = 0; 2561 b->imax = 0; 2562 b->row = perm; 2563 b->col = perm; 2564 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2565 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2566 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2567 ierr = PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 2568 ierr = PetscLogObjectMemory(fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr); 2569 b->maxnz = b->nz = ui[am]; 2570 2571 fact->info.factor_mallocs = reallocs; 2572 fact->info.fill_ratio_given = fill; 2573 if (ai[am] != 0) { 2574 fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am]; 2575 } else { 2576 fact->info.fill_ratio_needed = 0.0; 2577 } 2578 #if defined(PETSC_USE_INFO) 2579 if (ai[am] != 0) { 2580 PetscReal af = fact->info.fill_ratio_needed; 2581 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr); 2582 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 2583 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr); 2584 } else { 2585 ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr); 2586 } 2587 #endif 2588 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering; 2589 PetscFunctionReturn(0); 2590 } 2591 2592 #undef __FUNCT__ 2593 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ_inplace" 2594 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info) 2595 { 2596 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2597 Mat_SeqSBAIJ *b; 2598 PetscErrorCode ierr; 2599 PetscBool perm_identity,free_ij = PETSC_TRUE,missing; 2600 PetscInt bs=A->rmap->bs,am=a->mbs,d; 2601 const PetscInt *cols,*rip,*ai=a->i,*aj=a->j; 2602 PetscInt reallocs=0,i,*ui; 2603 PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow; 2604 PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr; 2605 PetscReal fill=info->fill,levels=info->levels,ratio_needed; 2606 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 2607 PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 2608 PetscBT lnkbt; 2609 2610 PetscFunctionBegin; 2611 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 2612 if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 2613 2614 /* 2615 This code originally uses Modified Sparse Row (MSR) storage 2616 (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice! 2617 Then it is rewritten so the factor B takes seqsbaij format. However the associated 2618 MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1, 2619 thus the original code in MSR format is still used for these cases. 2620 The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever 2621 MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor. 2622 */ 2623 if (bs > 1){ 2624 ierr = MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);CHKERRQ(ierr); 2625 PetscFunctionReturn(0); 2626 } 2627 2628 /* check whether perm is the identity mapping */ 2629 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 2630 if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2631 a->permute = PETSC_FALSE; 2632 2633 /* special case that simply copies fill pattern */ 2634 if (!levels ) { 2635 /* reuse the column pointers and row offsets for memory savings */ 2636 ui = a->i; 2637 uj = a->j; 2638 free_ij = PETSC_FALSE; 2639 ratio_needed = 1.0; 2640 } else { /* case: levels>0 */ 2641 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 2642 2643 /* initialization */ 2644 ierr = PetscMalloc((am+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr); 2645 ui[0] = 0; 2646 2647 /* jl: linked list for storing indices of the pivot rows 2648 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2649 ierr = PetscMalloc4(am,PetscInt*,&uj_ptr,am,PetscInt*,&uj_lvl_ptr,am,PetscInt,&il,am,PetscInt,&jl);CHKERRQ(ierr); 2650 for (i=0; i<am; i++){ 2651 jl[i] = am; il[i] = 0; 2652 } 2653 2654 /* create and initialize a linked list for storing column indices of the active row k */ 2655 nlnk = am + 1; 2656 ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2657 2658 /* initial FreeSpace size is fill*(ai[am]+1) */ 2659 ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);CHKERRQ(ierr); 2660 current_space = free_space; 2661 ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);CHKERRQ(ierr); 2662 current_space_lvl = free_space_lvl; 2663 2664 for (k=0; k<am; k++){ /* for each active row k */ 2665 /* initialize lnk by the column indices of row rip[k] */ 2666 nzk = 0; 2667 ncols = ai[rip[k]+1] - ai[rip[k]]; 2668 cols = aj+ai[rip[k]]; 2669 ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2670 nzk += nlnk; 2671 2672 /* update lnk by computing fill-in for each pivot row to be merged in */ 2673 prow = jl[k]; /* 1st pivot row */ 2674 2675 while (prow < k){ 2676 nextprow = jl[prow]; 2677 2678 /* merge prow into k-th row */ 2679 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2680 jmax = ui[prow+1]; 2681 ncols = jmax-jmin; 2682 i = jmin - ui[prow]; 2683 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2684 j = *(uj_lvl_ptr[prow] + i - 1); 2685 cols_lvl = uj_lvl_ptr[prow]+i; 2686 ierr = PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr); 2687 nzk += nlnk; 2688 2689 /* update il and jl for prow */ 2690 if (jmin < jmax){ 2691 il[prow] = jmin; 2692 j = *cols; jl[prow] = jl[j]; jl[j] = prow; 2693 } 2694 prow = nextprow; 2695 } 2696 2697 /* if free space is not available, make more free space */ 2698 if (current_space->local_remaining<nzk) { 2699 i = am - k + 1; /* num of unfactored rows */ 2700 i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2701 ierr = PetscFreeSpaceGet(i,¤t_space);CHKERRQ(ierr); 2702 ierr = PetscFreeSpaceGet(i,¤t_space_lvl);CHKERRQ(ierr); 2703 reallocs++; 2704 } 2705 2706 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2707 ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 2708 2709 /* add the k-th row into il and jl */ 2710 if (nzk-1 > 0){ 2711 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2712 jl[k] = jl[i]; jl[i] = k; 2713 il[k] = ui[k] + 1; 2714 } 2715 uj_ptr[k] = current_space->array; 2716 uj_lvl_ptr[k] = current_space_lvl->array; 2717 2718 current_space->array += nzk; 2719 current_space->local_used += nzk; 2720 current_space->local_remaining -= nzk; 2721 current_space_lvl->array += nzk; 2722 current_space_lvl->local_used += nzk; 2723 current_space_lvl->local_remaining -= nzk; 2724 2725 ui[k+1] = ui[k] + nzk; 2726 } 2727 2728 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 2729 ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr); 2730 2731 /* destroy list of free space and other temporary array(s) */ 2732 ierr = PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr); 2733 ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr); 2734 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 2735 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 2736 if (ai[am] != 0) { 2737 ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]); 2738 } else { 2739 ratio_needed = 0.0; 2740 } 2741 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2742 2743 /* put together the new matrix in MATSEQSBAIJ format */ 2744 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 2745 2746 b = (Mat_SeqSBAIJ*)(fact)->data; 2747 ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr); 2748 b->singlemalloc = PETSC_FALSE; 2749 b->free_a = PETSC_TRUE; 2750 b->free_ij = free_ij; 2751 ierr = PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 2752 b->j = uj; 2753 b->i = ui; 2754 b->diag = 0; 2755 b->ilen = 0; 2756 b->imax = 0; 2757 b->row = perm; 2758 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2759 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2760 b->icol = perm; 2761 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2762 ierr = PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 2763 b->maxnz = b->nz = ui[am]; 2764 2765 fact->info.factor_mallocs = reallocs; 2766 fact->info.fill_ratio_given = fill; 2767 fact->info.fill_ratio_needed = ratio_needed; 2768 #if defined(PETSC_USE_INFO) 2769 if (ai[am] != 0) { 2770 PetscReal af = fact->info.fill_ratio_needed; 2771 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr); 2772 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 2773 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr); 2774 } else { 2775 ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr); 2776 } 2777 #endif 2778 if (perm_identity){ 2779 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace; 2780 } else { 2781 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace; 2782 } 2783 PetscFunctionReturn(0); 2784 } 2785 2786