1 2 /* 3 Factorization code for SBAIJ format. 4 */ 5 6 #include <../src/mat/impls/sbaij/seq/sbaij.h> 7 #include <../src/mat/impls/baij/seq/baij.h> 8 #include <petsc/private/kernels/blockinvert.h> 9 10 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 11 { 12 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 13 IS isrow=a->row; 14 PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 15 PetscErrorCode ierr; 16 const PetscInt *r; 17 PetscInt nz,*vj,k,idx,k1; 18 PetscInt bs =A->rmap->bs,bs2 = a->bs2; 19 const MatScalar *aa=a->a,*v,*diag; 20 PetscScalar *x,*xk,*xj,*xk_tmp,*t; 21 const PetscScalar *b; 22 23 PetscFunctionBegin; 24 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 25 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 26 t = a->solve_work; 27 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 28 ierr = PetscMalloc1(bs,&xk_tmp);CHKERRQ(ierr); 29 30 /* solve U^T * D * y = b by forward substitution */ 31 xk = t; 32 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 33 idx = bs*r[k]; 34 for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1]; 35 } 36 for (k=0; k<mbs; k++) { 37 v = aa + bs2*ai[k]; 38 xk = t + k*bs; /* Dk*xk = k-th block of x */ 39 ierr = PetscArraycpy(xk_tmp,xk,bs);CHKERRQ(ierr); /* xk_tmp <- xk */ 40 nz = ai[k+1] - ai[k]; 41 vj = aj + ai[k]; 42 xj = t + (*vj)*bs; /* *vj-th block of x, *vj>k */ 43 while (nz--) { 44 /* x(:) += U(k,:)^T*(Dk*xk) */ 45 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */ 46 vj++; xj = t + (*vj)*bs; 47 v += bs2; 48 } 49 /* xk = inv(Dk)*(Dk*xk) */ 50 diag = aa+k*bs2; /* ptr to inv(Dk) */ 51 PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */ 52 } 53 54 /* solve U*x = y by back substitution */ 55 for (k=mbs-1; k>=0; k--) { 56 v = aa + bs2*ai[k]; 57 xk = t + k*bs; /* xk */ 58 nz = ai[k+1] - ai[k]; 59 vj = aj + ai[k]; 60 xj = t + (*vj)*bs; 61 while (nz--) { 62 /* xk += U(k,:)*x(:) */ 63 PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */ 64 vj++; 65 v += bs2; xj = t + (*vj)*bs; 66 } 67 idx = bs*r[k]; 68 for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++; 69 } 70 71 ierr = PetscFree(xk_tmp);CHKERRQ(ierr); 72 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 73 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 74 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 75 ierr = PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);CHKERRQ(ierr); 76 PetscFunctionReturn(0); 77 } 78 79 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 80 { 81 PetscFunctionBegin; 82 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"not implemented yet"); 83 } 84 85 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 86 { 87 PetscFunctionBegin; 88 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not yet implemented"); 89 } 90 91 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x) 92 { 93 PetscErrorCode ierr; 94 PetscInt nz,k; 95 const PetscInt *vj,bs2 = bs*bs; 96 const MatScalar *v,*diag; 97 PetscScalar *xk,*xj,*xk_tmp; 98 99 PetscFunctionBegin; 100 ierr = PetscMalloc1(bs,&xk_tmp);CHKERRQ(ierr); 101 for (k=0; k<mbs; k++) { 102 v = aa + bs2*ai[k]; 103 xk = x + k*bs; /* Dk*xk = k-th block of x */ 104 ierr = PetscArraycpy(xk_tmp,xk,bs);CHKERRQ(ierr); /* xk_tmp <- xk */ 105 nz = ai[k+1] - ai[k]; 106 vj = aj + ai[k]; 107 xj = x + (*vj)*bs; /* *vj-th block of x, *vj>k */ 108 while (nz--) { 109 /* x(:) += U(k,:)^T*(Dk*xk) */ 110 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */ 111 vj++; xj = x + (*vj)*bs; 112 v += bs2; 113 } 114 /* xk = inv(Dk)*(Dk*xk) */ 115 diag = aa+k*bs2; /* ptr to inv(Dk) */ 116 PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */ 117 } 118 ierr = PetscFree(xk_tmp);CHKERRQ(ierr); 119 PetscFunctionReturn(0); 120 } 121 122 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x) 123 { 124 PetscInt nz,k; 125 const PetscInt *vj,bs2 = bs*bs; 126 const MatScalar *v; 127 PetscScalar *xk,*xj; 128 129 PetscFunctionBegin; 130 for (k=mbs-1; k>=0; k--) { 131 v = aa + bs2*ai[k]; 132 xk = x + k*bs; /* xk */ 133 nz = ai[k+1] - ai[k]; 134 vj = aj + ai[k]; 135 xj = x + (*vj)*bs; 136 while (nz--) { 137 /* xk += U(k,:)*x(:) */ 138 PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */ 139 vj++; 140 v += bs2; xj = x + (*vj)*bs; 141 } 142 } 143 PetscFunctionReturn(0); 144 } 145 146 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 147 { 148 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 149 PetscErrorCode ierr; 150 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 151 PetscInt bs =A->rmap->bs; 152 const MatScalar *aa=a->a; 153 PetscScalar *x; 154 const PetscScalar *b; 155 156 PetscFunctionBegin; 157 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 158 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 159 160 /* solve U^T * D * y = b by forward substitution */ 161 ierr = PetscArraycpy(x,b,bs*mbs);CHKERRQ(ierr); /* x <- b */ 162 ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 163 164 /* solve U*x = y by back substitution */ 165 ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 166 167 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 168 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 169 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 170 PetscFunctionReturn(0); 171 } 172 173 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 174 { 175 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 176 PetscErrorCode ierr; 177 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 178 PetscInt bs =A->rmap->bs; 179 const MatScalar *aa=a->a; 180 const PetscScalar *b; 181 PetscScalar *x; 182 183 PetscFunctionBegin; 184 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 185 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 186 ierr = PetscArraycpy(x,b,bs*mbs);CHKERRQ(ierr); /* x <- b */ 187 ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 188 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 189 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 190 ierr = PetscLogFlops(2.0*a->bs2*a->nz - bs*mbs);CHKERRQ(ierr); 191 PetscFunctionReturn(0); 192 } 193 194 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 195 { 196 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 197 PetscErrorCode ierr; 198 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 199 PetscInt bs =A->rmap->bs; 200 const MatScalar *aa=a->a; 201 const PetscScalar *b; 202 PetscScalar *x; 203 204 PetscFunctionBegin; 205 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 206 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 207 ierr = PetscArraycpy(x,b,bs*mbs);CHKERRQ(ierr); 208 ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 209 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 210 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 211 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 212 PetscFunctionReturn(0); 213 } 214 215 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 216 { 217 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 218 IS isrow=a->row; 219 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,*r,*vj; 220 PetscErrorCode ierr; 221 PetscInt nz,k,idx; 222 const MatScalar *aa=a->a,*v,*d; 223 PetscScalar *x,x0,x1,x2,x3,x4,x5,x6,*t,*tp; 224 const PetscScalar *b; 225 226 PetscFunctionBegin; 227 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 228 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 229 t = a->solve_work; 230 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 231 232 /* solve U^T * D * y = b by forward substitution */ 233 tp = t; 234 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 235 idx = 7*r[k]; 236 tp[0] = b[idx]; 237 tp[1] = b[idx+1]; 238 tp[2] = b[idx+2]; 239 tp[3] = b[idx+3]; 240 tp[4] = b[idx+4]; 241 tp[5] = b[idx+5]; 242 tp[6] = b[idx+6]; 243 tp += 7; 244 } 245 246 for (k=0; k<mbs; k++) { 247 v = aa + 49*ai[k]; 248 vj = aj + ai[k]; 249 tp = t + k*7; 250 x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; 251 nz = ai[k+1] - ai[k]; 252 tp = t + (*vj)*7; 253 while (nz--) { 254 tp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6; 255 tp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6; 256 tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6; 257 tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6; 258 tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6; 259 tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6; 260 tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6; 261 vj++; 262 tp = t + (*vj)*7; 263 v += 49; 264 } 265 266 /* xk = inv(Dk)*(Dk*xk) */ 267 d = aa+k*49; /* ptr to inv(Dk) */ 268 tp = t + k*7; 269 tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6; 270 tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6; 271 tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6; 272 tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6; 273 tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6; 274 tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6; 275 tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6; 276 } 277 278 /* solve U*x = y by back substitution */ 279 for (k=mbs-1; k>=0; k--) { 280 v = aa + 49*ai[k]; 281 vj = aj + ai[k]; 282 tp = t + k*7; 283 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; /* xk */ 284 nz = ai[k+1] - ai[k]; 285 286 tp = t + (*vj)*7; 287 while (nz--) { 288 /* xk += U(k,:)*x(:) */ 289 x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6]; 290 x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6]; 291 x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6]; 292 x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6]; 293 x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6]; 294 x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6]; 295 x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6]; 296 vj++; 297 tp = t + (*vj)*7; 298 v += 49; 299 } 300 tp = t + k*7; 301 tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6; 302 idx = 7*r[k]; 303 x[idx] = x0; 304 x[idx+1] = x1; 305 x[idx+2] = x2; 306 x[idx+3] = x3; 307 x[idx+4] = x4; 308 x[idx+5] = x5; 309 x[idx+6] = x6; 310 } 311 312 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 313 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 314 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 315 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 316 PetscFunctionReturn(0); 317 } 318 319 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 320 { 321 const MatScalar *v,*d; 322 PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6; 323 PetscInt nz,k; 324 const PetscInt *vj; 325 326 PetscFunctionBegin; 327 for (k=0; k<mbs; k++) { 328 v = aa + 49*ai[k]; 329 xp = x + k*7; 330 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */ 331 nz = ai[k+1] - ai[k]; 332 vj = aj + ai[k]; 333 xp = x + (*vj)*7; 334 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 335 PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 336 while (nz--) { 337 /* x(:) += U(k,:)^T*(Dk*xk) */ 338 xp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6; 339 xp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6; 340 xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6; 341 xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6; 342 xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6; 343 xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6; 344 xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6; 345 vj++; 346 xp = x + (*vj)*7; 347 v += 49; 348 } 349 /* xk = inv(Dk)*(Dk*xk) */ 350 d = aa+k*49; /* ptr to inv(Dk) */ 351 xp = x + k*7; 352 xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6; 353 xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6; 354 xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6; 355 xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6; 356 xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6; 357 xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6; 358 xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6; 359 } 360 PetscFunctionReturn(0); 361 } 362 363 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 364 { 365 const MatScalar *v; 366 PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6; 367 PetscInt nz,k; 368 const PetscInt *vj; 369 370 PetscFunctionBegin; 371 for (k=mbs-1; k>=0; k--) { 372 v = aa + 49*ai[k]; 373 xp = x + k*7; 374 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */ 375 nz = ai[k+1] - ai[k]; 376 vj = aj + ai[k]; 377 xp = x + (*vj)*7; 378 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 379 PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 380 while (nz--) { 381 /* xk += U(k,:)*x(:) */ 382 x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6]; 383 x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6]; 384 x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6]; 385 x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6]; 386 x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6]; 387 x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6]; 388 x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6]; 389 vj++; 390 v += 49; xp = x + (*vj)*7; 391 } 392 xp = x + k*7; 393 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6; 394 } 395 PetscFunctionReturn(0); 396 } 397 398 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 399 { 400 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 401 PetscErrorCode ierr; 402 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 403 const MatScalar *aa=a->a; 404 PetscScalar *x; 405 const PetscScalar *b; 406 407 PetscFunctionBegin; 408 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 409 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 410 411 /* solve U^T * D * y = b by forward substitution */ 412 ierr = PetscArraycpy(x,b,7*mbs);CHKERRQ(ierr); /* x <- b */ 413 ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 414 415 /* solve U*x = y by back substitution */ 416 ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 417 418 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 419 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 420 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 421 PetscFunctionReturn(0); 422 } 423 424 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 425 { 426 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 427 PetscErrorCode ierr; 428 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 429 const MatScalar *aa=a->a; 430 PetscScalar *x; 431 const PetscScalar *b; 432 433 PetscFunctionBegin; 434 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 435 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 436 ierr = PetscArraycpy(x,b,7*mbs);CHKERRQ(ierr); 437 ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 438 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 439 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 440 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 441 PetscFunctionReturn(0); 442 } 443 444 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 445 { 446 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 447 PetscErrorCode ierr; 448 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 449 const MatScalar *aa=a->a; 450 PetscScalar *x; 451 const PetscScalar *b; 452 453 PetscFunctionBegin; 454 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 455 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 456 ierr = PetscArraycpy(x,b,7*mbs);CHKERRQ(ierr); 457 ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 458 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 459 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 460 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 461 PetscFunctionReturn(0); 462 } 463 464 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 465 { 466 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 467 IS isrow=a->row; 468 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,*r,*vj; 469 PetscErrorCode ierr; 470 PetscInt nz,k,idx; 471 const MatScalar *aa=a->a,*v,*d; 472 PetscScalar *x,x0,x1,x2,x3,x4,x5,*t,*tp; 473 const PetscScalar *b; 474 475 PetscFunctionBegin; 476 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 477 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 478 t = a->solve_work; 479 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 480 481 /* solve U^T * D * y = b by forward substitution */ 482 tp = t; 483 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 484 idx = 6*r[k]; 485 tp[0] = b[idx]; 486 tp[1] = b[idx+1]; 487 tp[2] = b[idx+2]; 488 tp[3] = b[idx+3]; 489 tp[4] = b[idx+4]; 490 tp[5] = b[idx+5]; 491 tp += 6; 492 } 493 494 for (k=0; k<mbs; k++) { 495 v = aa + 36*ai[k]; 496 vj = aj + ai[k]; 497 tp = t + k*6; 498 x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; 499 nz = ai[k+1] - ai[k]; 500 tp = t + (*vj)*6; 501 while (nz--) { 502 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5; 503 tp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5; 504 tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5; 505 tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5; 506 tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5; 507 tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5; 508 vj++; 509 tp = t + (*vj)*6; 510 v += 36; 511 } 512 513 /* xk = inv(Dk)*(Dk*xk) */ 514 d = aa+k*36; /* ptr to inv(Dk) */ 515 tp = t + k*6; 516 tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5; 517 tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5; 518 tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5; 519 tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5; 520 tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5; 521 tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5; 522 } 523 524 /* solve U*x = y by back substitution */ 525 for (k=mbs-1; k>=0; k--) { 526 v = aa + 36*ai[k]; 527 vj = aj + ai[k]; 528 tp = t + k*6; 529 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */ 530 nz = ai[k+1] - ai[k]; 531 532 tp = t + (*vj)*6; 533 while (nz--) { 534 /* xk += U(k,:)*x(:) */ 535 x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5]; 536 x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5]; 537 x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5]; 538 x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5]; 539 x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5]; 540 x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5]; 541 vj++; 542 tp = t + (*vj)*6; 543 v += 36; 544 } 545 tp = t + k*6; 546 tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; 547 idx = 6*r[k]; 548 x[idx] = x0; 549 x[idx+1] = x1; 550 x[idx+2] = x2; 551 x[idx+3] = x3; 552 x[idx+4] = x4; 553 x[idx+5] = x5; 554 } 555 556 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 557 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 558 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 559 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 560 PetscFunctionReturn(0); 561 } 562 563 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 564 { 565 const MatScalar *v,*d; 566 PetscScalar *xp,x0,x1,x2,x3,x4,x5; 567 PetscInt nz,k; 568 const PetscInt *vj; 569 570 PetscFunctionBegin; 571 for (k=0; k<mbs; k++) { 572 v = aa + 36*ai[k]; 573 xp = x + k*6; 574 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */ 575 nz = ai[k+1] - ai[k]; 576 vj = aj + ai[k]; 577 xp = x + (*vj)*6; 578 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 579 PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 580 while (nz--) { 581 /* x(:) += U(k,:)^T*(Dk*xk) */ 582 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5; 583 xp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5; 584 xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5; 585 xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5; 586 xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5; 587 xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5; 588 vj++; 589 xp = x + (*vj)*6; 590 v += 36; 591 } 592 /* xk = inv(Dk)*(Dk*xk) */ 593 d = aa+k*36; /* ptr to inv(Dk) */ 594 xp = x + k*6; 595 xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5; 596 xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5; 597 xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5; 598 xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5; 599 xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5; 600 xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5; 601 } 602 PetscFunctionReturn(0); 603 } 604 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 605 { 606 const MatScalar *v; 607 PetscScalar *xp,x0,x1,x2,x3,x4,x5; 608 PetscInt nz,k; 609 const PetscInt *vj; 610 611 PetscFunctionBegin; 612 for (k=mbs-1; k>=0; k--) { 613 v = aa + 36*ai[k]; 614 xp = x + k*6; 615 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */ 616 nz = ai[k+1] - ai[k]; 617 vj = aj + ai[k]; 618 xp = x + (*vj)*6; 619 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 620 PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 621 while (nz--) { 622 /* xk += U(k,:)*x(:) */ 623 x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5]; 624 x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5]; 625 x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5]; 626 x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5]; 627 x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5]; 628 x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5]; 629 vj++; 630 v += 36; xp = x + (*vj)*6; 631 } 632 xp = x + k*6; 633 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; 634 } 635 PetscFunctionReturn(0); 636 } 637 638 639 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 640 { 641 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 642 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 643 const MatScalar *aa=a->a; 644 PetscScalar *x; 645 const PetscScalar *b; 646 PetscErrorCode ierr; 647 648 PetscFunctionBegin; 649 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 650 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 651 652 /* solve U^T * D * y = b by forward substitution */ 653 ierr = PetscArraycpy(x,b,6*mbs);CHKERRQ(ierr); /* x <- b */ 654 ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 655 656 /* solve U*x = y by back substitution */ 657 ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 658 659 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 660 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 661 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 662 PetscFunctionReturn(0); 663 } 664 665 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 666 { 667 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 668 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 669 const MatScalar *aa=a->a; 670 PetscScalar *x; 671 const PetscScalar *b; 672 PetscErrorCode ierr; 673 674 PetscFunctionBegin; 675 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 676 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 677 ierr = PetscArraycpy(x,b,6*mbs);CHKERRQ(ierr); /* x <- b */ 678 ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 679 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 680 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 681 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 682 PetscFunctionReturn(0); 683 } 684 685 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 686 { 687 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 688 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 689 const MatScalar *aa=a->a; 690 PetscScalar *x; 691 const PetscScalar *b; 692 PetscErrorCode ierr; 693 694 PetscFunctionBegin; 695 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 696 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 697 ierr = PetscArraycpy(x,b,6*mbs);CHKERRQ(ierr); /* x <- b */ 698 ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 699 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 700 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 701 ierr = PetscLogFlops(2.0*a->bs2*(a->nz - mbs));CHKERRQ(ierr); 702 PetscFunctionReturn(0); 703 } 704 705 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 706 { 707 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 708 IS isrow=a->row; 709 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 710 PetscErrorCode ierr; 711 const PetscInt *r,*vj; 712 PetscInt nz,k,idx; 713 const MatScalar *aa=a->a,*v,*diag; 714 PetscScalar *x,x0,x1,x2,x3,x4,*t,*tp; 715 const PetscScalar *b; 716 717 PetscFunctionBegin; 718 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 719 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 720 t = a->solve_work; 721 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 722 723 /* solve U^T * D * y = b by forward substitution */ 724 tp = t; 725 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 726 idx = 5*r[k]; 727 tp[0] = b[idx]; 728 tp[1] = b[idx+1]; 729 tp[2] = b[idx+2]; 730 tp[3] = b[idx+3]; 731 tp[4] = b[idx+4]; 732 tp += 5; 733 } 734 735 for (k=0; k<mbs; k++) { 736 v = aa + 25*ai[k]; 737 vj = aj + ai[k]; 738 tp = t + k*5; 739 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; 740 nz = ai[k+1] - ai[k]; 741 742 tp = t + (*vj)*5; 743 while (nz--) { 744 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4; 745 tp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4; 746 tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4; 747 tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4; 748 tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4; 749 vj++; 750 tp = t + (*vj)*5; 751 v += 25; 752 } 753 754 /* xk = inv(Dk)*(Dk*xk) */ 755 diag = aa+k*25; /* ptr to inv(Dk) */ 756 tp = t + k*5; 757 tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4; 758 tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4; 759 tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4; 760 tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4; 761 tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4; 762 } 763 764 /* solve U*x = y by back substitution */ 765 for (k=mbs-1; k>=0; k--) { 766 v = aa + 25*ai[k]; 767 vj = aj + ai[k]; 768 tp = t + k*5; 769 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */ 770 nz = ai[k+1] - ai[k]; 771 772 tp = t + (*vj)*5; 773 while (nz--) { 774 /* xk += U(k,:)*x(:) */ 775 x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4]; 776 x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4]; 777 x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4]; 778 x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4]; 779 x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4]; 780 vj++; 781 tp = t + (*vj)*5; 782 v += 25; 783 } 784 tp = t + k*5; 785 tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; 786 idx = 5*r[k]; 787 x[idx] = x0; 788 x[idx+1] = x1; 789 x[idx+2] = x2; 790 x[idx+3] = x3; 791 x[idx+4] = x4; 792 } 793 794 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 795 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 796 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 797 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 798 PetscFunctionReturn(0); 799 } 800 801 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 802 { 803 const MatScalar *v,*diag; 804 PetscScalar *xp,x0,x1,x2,x3,x4; 805 PetscInt nz,k; 806 const PetscInt *vj; 807 808 PetscFunctionBegin; 809 for (k=0; k<mbs; k++) { 810 v = aa + 25*ai[k]; 811 xp = x + k*5; 812 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */ 813 nz = ai[k+1] - ai[k]; 814 vj = aj + ai[k]; 815 xp = x + (*vj)*5; 816 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 817 PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 818 while (nz--) { 819 /* x(:) += U(k,:)^T*(Dk*xk) */ 820 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4; 821 xp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4; 822 xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4; 823 xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4; 824 xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4; 825 vj++; 826 xp = x + (*vj)*5; 827 v += 25; 828 } 829 /* xk = inv(Dk)*(Dk*xk) */ 830 diag = aa+k*25; /* ptr to inv(Dk) */ 831 xp = x + k*5; 832 xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4; 833 xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4; 834 xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4; 835 xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4; 836 xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4; 837 } 838 PetscFunctionReturn(0); 839 } 840 841 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 842 { 843 const MatScalar *v; 844 PetscScalar *xp,x0,x1,x2,x3,x4; 845 PetscInt nz,k; 846 const PetscInt *vj; 847 848 PetscFunctionBegin; 849 for (k=mbs-1; k>=0; k--) { 850 v = aa + 25*ai[k]; 851 xp = x + k*5; 852 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */ 853 nz = ai[k+1] - ai[k]; 854 vj = aj + ai[k]; 855 xp = x + (*vj)*5; 856 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 857 PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 858 while (nz--) { 859 /* xk += U(k,:)*x(:) */ 860 x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4]; 861 x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4]; 862 x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4]; 863 x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4]; 864 x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4]; 865 vj++; 866 v += 25; xp = x + (*vj)*5; 867 } 868 xp = x + k*5; 869 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; 870 } 871 PetscFunctionReturn(0); 872 } 873 874 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 875 { 876 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 877 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 878 const MatScalar *aa=a->a; 879 PetscScalar *x; 880 const PetscScalar *b; 881 PetscErrorCode ierr; 882 883 PetscFunctionBegin; 884 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 885 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 886 887 /* solve U^T * D * y = b by forward substitution */ 888 ierr = PetscArraycpy(x,b,5*mbs);CHKERRQ(ierr); /* x <- b */ 889 ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 890 891 /* solve U*x = y by back substitution */ 892 ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 893 894 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 895 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 896 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 897 PetscFunctionReturn(0); 898 } 899 900 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 901 { 902 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 903 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 904 const MatScalar *aa=a->a; 905 PetscScalar *x; 906 const PetscScalar *b; 907 PetscErrorCode ierr; 908 909 PetscFunctionBegin; 910 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 911 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 912 ierr = PetscArraycpy(x,b,5*mbs);CHKERRQ(ierr); /* x <- b */ 913 ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 914 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 915 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 916 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 917 PetscFunctionReturn(0); 918 } 919 920 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 921 { 922 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 923 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 924 const MatScalar *aa=a->a; 925 PetscScalar *x; 926 const PetscScalar *b; 927 PetscErrorCode ierr; 928 929 PetscFunctionBegin; 930 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 931 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 932 ierr = PetscArraycpy(x,b,5*mbs);CHKERRQ(ierr); 933 ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 934 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 935 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 936 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 937 PetscFunctionReturn(0); 938 } 939 940 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 941 { 942 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 943 IS isrow=a->row; 944 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 945 PetscErrorCode ierr; 946 const PetscInt *r,*vj; 947 PetscInt nz,k,idx; 948 const MatScalar *aa=a->a,*v,*diag; 949 PetscScalar *x,x0,x1,x2,x3,*t,*tp; 950 const PetscScalar *b; 951 952 PetscFunctionBegin; 953 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 954 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 955 t = a->solve_work; 956 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 957 958 /* solve U^T * D * y = b by forward substitution */ 959 tp = t; 960 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 961 idx = 4*r[k]; 962 tp[0] = b[idx]; 963 tp[1] = b[idx+1]; 964 tp[2] = b[idx+2]; 965 tp[3] = b[idx+3]; 966 tp += 4; 967 } 968 969 for (k=0; k<mbs; k++) { 970 v = aa + 16*ai[k]; 971 vj = aj + ai[k]; 972 tp = t + k*4; 973 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; 974 nz = ai[k+1] - ai[k]; 975 976 tp = t + (*vj)*4; 977 while (nz--) { 978 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3; 979 tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3; 980 tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3; 981 tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3; 982 vj++; 983 tp = t + (*vj)*4; 984 v += 16; 985 } 986 987 /* xk = inv(Dk)*(Dk*xk) */ 988 diag = aa+k*16; /* ptr to inv(Dk) */ 989 tp = t + k*4; 990 tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3; 991 tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3; 992 tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3; 993 tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3; 994 } 995 996 /* solve U*x = y by back substitution */ 997 for (k=mbs-1; k>=0; k--) { 998 v = aa + 16*ai[k]; 999 vj = aj + ai[k]; 1000 tp = t + k*4; 1001 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */ 1002 nz = ai[k+1] - ai[k]; 1003 1004 tp = t + (*vj)*4; 1005 while (nz--) { 1006 /* xk += U(k,:)*x(:) */ 1007 x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3]; 1008 x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3]; 1009 x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3]; 1010 x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3]; 1011 vj++; 1012 tp = t + (*vj)*4; 1013 v += 16; 1014 } 1015 tp = t + k*4; 1016 tp[0] =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; 1017 idx = 4*r[k]; 1018 x[idx] = x0; 1019 x[idx+1] = x1; 1020 x[idx+2] = x2; 1021 x[idx+3] = x3; 1022 } 1023 1024 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1025 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1026 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1027 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1028 PetscFunctionReturn(0); 1029 } 1030 1031 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1032 { 1033 const MatScalar *v,*diag; 1034 PetscScalar *xp,x0,x1,x2,x3; 1035 PetscInt nz,k; 1036 const PetscInt *vj; 1037 1038 PetscFunctionBegin; 1039 for (k=0; k<mbs; k++) { 1040 v = aa + 16*ai[k]; 1041 xp = x + k*4; 1042 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */ 1043 nz = ai[k+1] - ai[k]; 1044 vj = aj + ai[k]; 1045 xp = x + (*vj)*4; 1046 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1047 PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1048 while (nz--) { 1049 /* x(:) += U(k,:)^T*(Dk*xk) */ 1050 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3; 1051 xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3; 1052 xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3; 1053 xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3; 1054 vj++; 1055 xp = x + (*vj)*4; 1056 v += 16; 1057 } 1058 /* xk = inv(Dk)*(Dk*xk) */ 1059 diag = aa+k*16; /* ptr to inv(Dk) */ 1060 xp = x + k*4; 1061 xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3; 1062 xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3; 1063 xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3; 1064 xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3; 1065 } 1066 PetscFunctionReturn(0); 1067 } 1068 1069 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1070 { 1071 const MatScalar *v; 1072 PetscScalar *xp,x0,x1,x2,x3; 1073 PetscInt nz,k; 1074 const PetscInt *vj; 1075 1076 PetscFunctionBegin; 1077 for (k=mbs-1; k>=0; k--) { 1078 v = aa + 16*ai[k]; 1079 xp = x + k*4; 1080 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */ 1081 nz = ai[k+1] - ai[k]; 1082 vj = aj + ai[k]; 1083 xp = x + (*vj)*4; 1084 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1085 PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1086 while (nz--) { 1087 /* xk += U(k,:)*x(:) */ 1088 x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3]; 1089 x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3]; 1090 x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3]; 1091 x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3]; 1092 vj++; 1093 v += 16; xp = x + (*vj)*4; 1094 } 1095 xp = x + k*4; 1096 xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3; 1097 } 1098 PetscFunctionReturn(0); 1099 } 1100 1101 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1102 { 1103 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1104 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1105 const MatScalar *aa=a->a; 1106 PetscScalar *x; 1107 const PetscScalar *b; 1108 PetscErrorCode ierr; 1109 1110 PetscFunctionBegin; 1111 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1112 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1113 1114 /* solve U^T * D * y = b by forward substitution */ 1115 ierr = PetscArraycpy(x,b,4*mbs);CHKERRQ(ierr); /* x <- b */ 1116 ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1117 1118 /* solve U*x = y by back substitution */ 1119 ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1120 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1121 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1122 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1123 PetscFunctionReturn(0); 1124 } 1125 1126 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1127 { 1128 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1129 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1130 const MatScalar *aa=a->a; 1131 PetscScalar *x; 1132 const PetscScalar *b; 1133 PetscErrorCode ierr; 1134 1135 PetscFunctionBegin; 1136 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1137 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1138 ierr = PetscArraycpy(x,b,4*mbs);CHKERRQ(ierr); /* x <- b */ 1139 ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1140 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1141 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1142 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 1143 PetscFunctionReturn(0); 1144 } 1145 1146 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1147 { 1148 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1149 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1150 const MatScalar *aa=a->a; 1151 PetscScalar *x; 1152 const PetscScalar *b; 1153 PetscErrorCode ierr; 1154 1155 PetscFunctionBegin; 1156 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1157 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1158 ierr = PetscArraycpy(x,b,4*mbs);CHKERRQ(ierr); 1159 ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1160 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1161 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1162 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 1163 PetscFunctionReturn(0); 1164 } 1165 1166 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1167 { 1168 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1169 IS isrow=a->row; 1170 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 1171 PetscErrorCode ierr; 1172 const PetscInt *r; 1173 PetscInt nz,k,idx; 1174 const PetscInt *vj; 1175 const MatScalar *aa=a->a,*v,*diag; 1176 PetscScalar *x,x0,x1,x2,*t,*tp; 1177 const PetscScalar *b; 1178 1179 PetscFunctionBegin; 1180 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1181 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1182 t = a->solve_work; 1183 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1184 1185 /* solve U^T * D * y = b by forward substitution */ 1186 tp = t; 1187 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 1188 idx = 3*r[k]; 1189 tp[0] = b[idx]; 1190 tp[1] = b[idx+1]; 1191 tp[2] = b[idx+2]; 1192 tp += 3; 1193 } 1194 1195 for (k=0; k<mbs; k++) { 1196 v = aa + 9*ai[k]; 1197 vj = aj + ai[k]; 1198 tp = t + k*3; 1199 x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; 1200 nz = ai[k+1] - ai[k]; 1201 1202 tp = t + (*vj)*3; 1203 while (nz--) { 1204 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2; 1205 tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2; 1206 tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2; 1207 vj++; 1208 tp = t + (*vj)*3; 1209 v += 9; 1210 } 1211 1212 /* xk = inv(Dk)*(Dk*xk) */ 1213 diag = aa+k*9; /* ptr to inv(Dk) */ 1214 tp = t + k*3; 1215 tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2; 1216 tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2; 1217 tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2; 1218 } 1219 1220 /* solve U*x = y by back substitution */ 1221 for (k=mbs-1; k>=0; k--) { 1222 v = aa + 9*ai[k]; 1223 vj = aj + ai[k]; 1224 tp = t + k*3; 1225 x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; /* xk */ 1226 nz = ai[k+1] - ai[k]; 1227 1228 tp = t + (*vj)*3; 1229 while (nz--) { 1230 /* xk += U(k,:)*x(:) */ 1231 x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2]; 1232 x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2]; 1233 x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2]; 1234 vj++; 1235 tp = t + (*vj)*3; 1236 v += 9; 1237 } 1238 tp = t + k*3; 1239 tp[0] = x0; tp[1] = x1; tp[2] = x2; 1240 idx = 3*r[k]; 1241 x[idx] = x0; 1242 x[idx+1] = x1; 1243 x[idx+2] = x2; 1244 } 1245 1246 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1247 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1248 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1249 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1250 PetscFunctionReturn(0); 1251 } 1252 1253 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1254 { 1255 const MatScalar *v,*diag; 1256 PetscScalar *xp,x0,x1,x2; 1257 PetscInt nz,k; 1258 const PetscInt *vj; 1259 1260 PetscFunctionBegin; 1261 for (k=0; k<mbs; k++) { 1262 v = aa + 9*ai[k]; 1263 xp = x + k*3; 1264 x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */ 1265 nz = ai[k+1] - ai[k]; 1266 vj = aj + ai[k]; 1267 xp = x + (*vj)*3; 1268 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1269 PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1270 while (nz--) { 1271 /* x(:) += U(k,:)^T*(Dk*xk) */ 1272 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2; 1273 xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2; 1274 xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2; 1275 vj++; 1276 xp = x + (*vj)*3; 1277 v += 9; 1278 } 1279 /* xk = inv(Dk)*(Dk*xk) */ 1280 diag = aa+k*9; /* ptr to inv(Dk) */ 1281 xp = x + k*3; 1282 xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2; 1283 xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2; 1284 xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2; 1285 } 1286 PetscFunctionReturn(0); 1287 } 1288 1289 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1290 { 1291 const MatScalar *v; 1292 PetscScalar *xp,x0,x1,x2; 1293 PetscInt nz,k; 1294 const PetscInt *vj; 1295 1296 PetscFunctionBegin; 1297 for (k=mbs-1; k>=0; k--) { 1298 v = aa + 9*ai[k]; 1299 xp = x + k*3; 1300 x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* xk */ 1301 nz = ai[k+1] - ai[k]; 1302 vj = aj + ai[k]; 1303 xp = x + (*vj)*3; 1304 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1305 PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1306 while (nz--) { 1307 /* xk += U(k,:)*x(:) */ 1308 x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2]; 1309 x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2]; 1310 x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2]; 1311 vj++; 1312 v += 9; xp = x + (*vj)*3; 1313 } 1314 xp = x + k*3; 1315 xp[0] = x0; xp[1] = x1; xp[2] = x2; 1316 } 1317 PetscFunctionReturn(0); 1318 } 1319 1320 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1321 { 1322 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1323 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1324 const MatScalar *aa=a->a; 1325 PetscScalar *x; 1326 const PetscScalar *b; 1327 PetscErrorCode ierr; 1328 1329 PetscFunctionBegin; 1330 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1331 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1332 1333 /* solve U^T * D * y = b by forward substitution */ 1334 ierr = PetscArraycpy(x,b,3*mbs);CHKERRQ(ierr); 1335 ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1336 1337 /* solve U*x = y by back substitution */ 1338 ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1339 1340 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1341 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1342 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1343 PetscFunctionReturn(0); 1344 } 1345 1346 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1347 { 1348 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1349 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1350 const MatScalar *aa=a->a; 1351 PetscScalar *x; 1352 const PetscScalar *b; 1353 PetscErrorCode ierr; 1354 1355 PetscFunctionBegin; 1356 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1357 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1358 ierr = PetscArraycpy(x,b,3*mbs);CHKERRQ(ierr); 1359 ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1360 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1361 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1362 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 1363 PetscFunctionReturn(0); 1364 } 1365 1366 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1367 { 1368 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1369 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1370 const MatScalar *aa=a->a; 1371 PetscScalar *x; 1372 const PetscScalar *b; 1373 PetscErrorCode ierr; 1374 1375 PetscFunctionBegin; 1376 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1377 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1378 ierr = PetscArraycpy(x,b,3*mbs);CHKERRQ(ierr); 1379 ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1380 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1381 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1382 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 1383 PetscFunctionReturn(0); 1384 } 1385 1386 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 1387 { 1388 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1389 IS isrow=a->row; 1390 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 1391 PetscErrorCode ierr; 1392 const PetscInt *r,*vj; 1393 PetscInt nz,k,k2,idx; 1394 const MatScalar *aa=a->a,*v,*diag; 1395 PetscScalar *x,x0,x1,*t; 1396 const PetscScalar *b; 1397 1398 PetscFunctionBegin; 1399 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1400 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1401 t = a->solve_work; 1402 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1403 1404 /* solve U^T * D * y = perm(b) by forward substitution */ 1405 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 1406 idx = 2*r[k]; 1407 t[k*2] = b[idx]; 1408 t[k*2+1] = b[idx+1]; 1409 } 1410 for (k=0; k<mbs; k++) { 1411 v = aa + 4*ai[k]; 1412 vj = aj + ai[k]; 1413 k2 = k*2; 1414 x0 = t[k2]; x1 = t[k2+1]; 1415 nz = ai[k+1] - ai[k]; 1416 while (nz--) { 1417 t[(*vj)*2] += v[0]*x0 + v[1]*x1; 1418 t[(*vj)*2+1] += v[2]*x0 + v[3]*x1; 1419 vj++; v += 4; 1420 } 1421 diag = aa+k*4; /* ptr to inv(Dk) */ 1422 t[k2] = diag[0]*x0 + diag[2]*x1; 1423 t[k2+1] = diag[1]*x0 + diag[3]*x1; 1424 } 1425 1426 /* solve U*x = y by back substitution */ 1427 for (k=mbs-1; k>=0; k--) { 1428 v = aa + 4*ai[k]; 1429 vj = aj + ai[k]; 1430 k2 = k*2; 1431 x0 = t[k2]; x1 = t[k2+1]; 1432 nz = ai[k+1] - ai[k]; 1433 while (nz--) { 1434 x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1]; 1435 x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1]; 1436 vj++; 1437 v += 4; 1438 } 1439 t[k2] = x0; 1440 t[k2+1] = x1; 1441 idx = 2*r[k]; 1442 x[idx] = x0; 1443 x[idx+1] = x1; 1444 } 1445 1446 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1447 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1448 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1449 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1450 PetscFunctionReturn(0); 1451 } 1452 1453 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1454 { 1455 const MatScalar *v,*diag; 1456 PetscScalar x0,x1; 1457 PetscInt nz,k,k2; 1458 const PetscInt *vj; 1459 1460 PetscFunctionBegin; 1461 for (k=0; k<mbs; k++) { 1462 v = aa + 4*ai[k]; 1463 vj = aj + ai[k]; 1464 k2 = k*2; 1465 x0 = x[k2]; x1 = x[k2+1]; /* Dk*xk = k-th block of x */ 1466 nz = ai[k+1] - ai[k]; 1467 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1468 PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1469 while (nz--) { 1470 /* x(:) += U(k,:)^T*(Dk*xk) */ 1471 x[(*vj)*2] += v[0]*x0 + v[1]*x1; 1472 x[(*vj)*2+1] += v[2]*x0 + v[3]*x1; 1473 vj++; v += 4; 1474 } 1475 /* xk = inv(Dk)*(Dk*xk) */ 1476 diag = aa+k*4; /* ptr to inv(Dk) */ 1477 x[k2] = diag[0]*x0 + diag[2]*x1; 1478 x[k2+1] = diag[1]*x0 + diag[3]*x1; 1479 } 1480 PetscFunctionReturn(0); 1481 } 1482 1483 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1484 { 1485 const MatScalar *v; 1486 PetscScalar x0,x1; 1487 PetscInt nz,k,k2; 1488 const PetscInt *vj; 1489 1490 PetscFunctionBegin; 1491 for (k=mbs-1; k>=0; k--) { 1492 v = aa + 4*ai[k]; 1493 vj = aj + ai[k]; 1494 k2 = k*2; 1495 x0 = x[k2]; x1 = x[k2+1]; /* xk */ 1496 nz = ai[k+1] - ai[k]; 1497 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1498 PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1499 while (nz--) { 1500 /* xk += U(k,:)*x(:) */ 1501 x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1]; 1502 x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1]; 1503 vj++; 1504 v += 4; 1505 } 1506 x[k2] = x0; 1507 x[k2+1] = x1; 1508 } 1509 PetscFunctionReturn(0); 1510 } 1511 1512 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1513 { 1514 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1515 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1516 const MatScalar *aa=a->a; 1517 PetscScalar *x; 1518 const PetscScalar *b; 1519 PetscErrorCode ierr; 1520 1521 PetscFunctionBegin; 1522 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1523 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1524 1525 /* solve U^T * D * y = b by forward substitution */ 1526 ierr = PetscArraycpy(x,b,2*mbs);CHKERRQ(ierr); 1527 ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1528 1529 /* solve U*x = y by back substitution */ 1530 ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1531 1532 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1533 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1534 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1535 PetscFunctionReturn(0); 1536 } 1537 1538 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1539 { 1540 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1541 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1542 const MatScalar *aa=a->a; 1543 PetscScalar *x; 1544 const PetscScalar *b; 1545 PetscErrorCode ierr; 1546 1547 PetscFunctionBegin; 1548 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1549 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1550 ierr = PetscArraycpy(x,b,2*mbs);CHKERRQ(ierr); 1551 ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1552 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1553 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1554 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 1555 PetscFunctionReturn(0); 1556 } 1557 1558 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1559 { 1560 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1561 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1562 const MatScalar *aa=a->a; 1563 PetscScalar *x; 1564 const PetscScalar *b; 1565 PetscErrorCode ierr; 1566 1567 PetscFunctionBegin; 1568 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1569 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1570 ierr = PetscArraycpy(x,b,2*mbs);CHKERRQ(ierr); 1571 ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1572 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1573 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1574 ierr = PetscLogFlops(2.0*a->bs2*(a->nz - mbs));CHKERRQ(ierr); 1575 PetscFunctionReturn(0); 1576 } 1577 1578 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1579 { 1580 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1581 IS isrow=a->row; 1582 PetscErrorCode ierr; 1583 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag; 1584 const MatScalar *aa=a->a,*v; 1585 const PetscScalar *b; 1586 PetscScalar *x,xk,*t; 1587 PetscInt nz,k,j; 1588 1589 PetscFunctionBegin; 1590 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1591 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1592 t = a->solve_work; 1593 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1594 1595 /* solve U^T*D*y = perm(b) by forward substitution */ 1596 for (k=0; k<mbs; k++) t[k] = b[rp[k]]; 1597 for (k=0; k<mbs; k++) { 1598 v = aa + ai[k]; 1599 vj = aj + ai[k]; 1600 xk = t[k]; 1601 nz = ai[k+1] - ai[k] - 1; 1602 for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk; 1603 t[k] = xk*v[nz]; /* v[nz] = 1/D(k) */ 1604 } 1605 1606 /* solve U*perm(x) = y by back substitution */ 1607 for (k=mbs-1; k>=0; k--) { 1608 v = aa + adiag[k] - 1; 1609 vj = aj + adiag[k] - 1; 1610 nz = ai[k+1] - ai[k] - 1; 1611 for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]]; 1612 x[rp[k]] = t[k]; 1613 } 1614 1615 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1616 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1617 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1618 ierr = PetscLogFlops(4.0*a->nz - 3.0*mbs);CHKERRQ(ierr); 1619 PetscFunctionReturn(0); 1620 } 1621 1622 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1623 { 1624 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1625 IS isrow=a->row; 1626 PetscErrorCode ierr; 1627 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1628 const MatScalar *aa=a->a,*v; 1629 PetscScalar *x,xk,*t; 1630 const PetscScalar *b; 1631 PetscInt nz,k; 1632 1633 PetscFunctionBegin; 1634 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1635 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1636 t = a->solve_work; 1637 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1638 1639 /* solve U^T*D*y = perm(b) by forward substitution */ 1640 for (k=0; k<mbs; k++) t[k] = b[rp[k]]; 1641 for (k=0; k<mbs; k++) { 1642 v = aa + ai[k] + 1; 1643 vj = aj + ai[k] + 1; 1644 xk = t[k]; 1645 nz = ai[k+1] - ai[k] - 1; 1646 while (nz--) t[*vj++] += (*v++) * xk; 1647 t[k] = xk*aa[ai[k]]; /* aa[k] = 1/D(k) */ 1648 } 1649 1650 /* solve U*perm(x) = y by back substitution */ 1651 for (k=mbs-1; k>=0; k--) { 1652 v = aa + ai[k] + 1; 1653 vj = aj + ai[k] + 1; 1654 nz = ai[k+1] - ai[k] - 1; 1655 while (nz--) t[k] += (*v++) * t[*vj++]; 1656 x[rp[k]] = t[k]; 1657 } 1658 1659 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1660 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1661 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1662 ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr); 1663 PetscFunctionReturn(0); 1664 } 1665 1666 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1667 { 1668 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1669 IS isrow=a->row; 1670 PetscErrorCode ierr; 1671 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag; 1672 const MatScalar *aa=a->a,*v; 1673 PetscReal diagk; 1674 PetscScalar *x,xk; 1675 const PetscScalar *b; 1676 PetscInt nz,k; 1677 1678 PetscFunctionBegin; 1679 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1680 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1681 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1682 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1683 1684 for (k=0; k<mbs; k++) x[k] = b[rp[k]]; 1685 for (k=0; k<mbs; k++) { 1686 v = aa + ai[k]; 1687 vj = aj + ai[k]; 1688 xk = x[k]; 1689 nz = ai[k+1] - ai[k] - 1; 1690 while (nz--) x[*vj++] += (*v++) * xk; 1691 1692 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1693 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1694 x[k] = xk*PetscSqrtReal(diagk); 1695 } 1696 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1697 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1698 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1699 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1700 PetscFunctionReturn(0); 1701 } 1702 1703 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1704 { 1705 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1706 IS isrow=a->row; 1707 PetscErrorCode ierr; 1708 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1709 const MatScalar *aa=a->a,*v; 1710 PetscReal diagk; 1711 PetscScalar *x,xk; 1712 const PetscScalar *b; 1713 PetscInt nz,k; 1714 1715 PetscFunctionBegin; 1716 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1717 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1718 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1719 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1720 1721 for (k=0; k<mbs; k++) x[k] = b[rp[k]]; 1722 for (k=0; k<mbs; k++) { 1723 v = aa + ai[k] + 1; 1724 vj = aj + ai[k] + 1; 1725 xk = x[k]; 1726 nz = ai[k+1] - ai[k] - 1; 1727 while (nz--) x[*vj++] += (*v++) * xk; 1728 1729 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1730 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1731 x[k] = xk*PetscSqrtReal(diagk); 1732 } 1733 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1734 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1735 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1736 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1737 PetscFunctionReturn(0); 1738 } 1739 1740 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1741 { 1742 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1743 IS isrow=a->row; 1744 PetscErrorCode ierr; 1745 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag; 1746 const MatScalar *aa=a->a,*v; 1747 PetscReal diagk; 1748 PetscScalar *x,*t; 1749 const PetscScalar *b; 1750 PetscInt nz,k; 1751 1752 PetscFunctionBegin; 1753 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1754 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1755 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1756 t = a->solve_work; 1757 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1758 1759 for (k=mbs-1; k>=0; k--) { 1760 v = aa + ai[k]; 1761 vj = aj + ai[k]; 1762 diagk = PetscRealPart(aa[adiag[k]]); 1763 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1764 t[k] = b[k] * PetscSqrtReal(diagk); 1765 nz = ai[k+1] - ai[k] - 1; 1766 while (nz--) t[k] += (*v++) * t[*vj++]; 1767 x[rp[k]] = t[k]; 1768 } 1769 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1770 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1771 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1772 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1773 PetscFunctionReturn(0); 1774 } 1775 1776 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1777 { 1778 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1779 IS isrow=a->row; 1780 PetscErrorCode ierr; 1781 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1782 const MatScalar *aa=a->a,*v; 1783 PetscReal diagk; 1784 PetscScalar *x,*t; 1785 const PetscScalar *b; 1786 PetscInt nz,k; 1787 1788 PetscFunctionBegin; 1789 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1790 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1791 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1792 t = a->solve_work; 1793 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1794 1795 for (k=mbs-1; k>=0; k--) { 1796 v = aa + ai[k] + 1; 1797 vj = aj + ai[k] + 1; 1798 diagk = PetscRealPart(aa[ai[k]]); 1799 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1800 t[k] = b[k] * PetscSqrtReal(diagk); 1801 nz = ai[k+1] - ai[k] - 1; 1802 while (nz--) t[k] += (*v++) * t[*vj++]; 1803 x[rp[k]] = t[k]; 1804 } 1805 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1806 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1807 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1808 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1809 PetscFunctionReturn(0); 1810 } 1811 1812 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx) 1813 { 1814 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1815 PetscErrorCode ierr; 1816 1817 PetscFunctionBegin; 1818 if (A->rmap->bs == 1) { 1819 ierr = MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);CHKERRQ(ierr); 1820 } else { 1821 IS isrow=a->row; 1822 const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp; 1823 const MatScalar *aa=a->a,*v; 1824 PetscScalar *x,*t; 1825 const PetscScalar *b; 1826 PetscInt nz,k,n,i,j; 1827 1828 if (bb->n > a->solves_work_n) { 1829 ierr = PetscFree(a->solves_work);CHKERRQ(ierr); 1830 ierr = PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);CHKERRQ(ierr); 1831 a->solves_work_n = bb->n; 1832 } 1833 n = bb->n; 1834 ierr = VecGetArrayRead(bb->v,&b);CHKERRQ(ierr); 1835 ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr); 1836 t = a->solves_work; 1837 1838 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1839 1840 /* solve U^T*D*y = perm(b) by forward substitution */ 1841 for (k=0; k<mbs; k++) { 1842 for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */ 1843 } 1844 for (k=0; k<mbs; k++) { 1845 v = aa + ai[k]; 1846 vj = aj + ai[k]; 1847 nz = ai[k+1] - ai[k] - 1; 1848 for (j=0; j<nz; j++) { 1849 for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i]; 1850 v++;vj++; 1851 } 1852 for (i=0; i<n; i++) t[n*k+i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */ 1853 } 1854 1855 /* solve U*perm(x) = y by back substitution */ 1856 for (k=mbs-1; k>=0; k--) { 1857 v = aa + ai[k] - 1; 1858 vj = aj + ai[k] - 1; 1859 nz = ai[k+1] - ai[k] - 1; 1860 for (j=0; j<nz; j++) { 1861 for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i]; 1862 v++;vj++; 1863 } 1864 for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i]; 1865 } 1866 1867 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1868 ierr = VecRestoreArrayRead(bb->v,&b);CHKERRQ(ierr); 1869 ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr); 1870 ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr); 1871 } 1872 PetscFunctionReturn(0); 1873 } 1874 1875 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx) 1876 { 1877 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1878 PetscErrorCode ierr; 1879 1880 PetscFunctionBegin; 1881 if (A->rmap->bs == 1) { 1882 ierr = MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);CHKERRQ(ierr); 1883 } else { 1884 IS isrow=a->row; 1885 const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp; 1886 const MatScalar *aa=a->a,*v; 1887 PetscScalar *x,*t; 1888 const PetscScalar *b; 1889 PetscInt nz,k,n,i; 1890 1891 if (bb->n > a->solves_work_n) { 1892 ierr = PetscFree(a->solves_work);CHKERRQ(ierr); 1893 ierr = PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);CHKERRQ(ierr); 1894 a->solves_work_n = bb->n; 1895 } 1896 n = bb->n; 1897 ierr = VecGetArrayRead(bb->v,&b);CHKERRQ(ierr); 1898 ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr); 1899 t = a->solves_work; 1900 1901 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1902 1903 /* solve U^T*D*y = perm(b) by forward substitution */ 1904 for (k=0; k<mbs; k++) { 1905 for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */ 1906 } 1907 for (k=0; k<mbs; k++) { 1908 v = aa + ai[k]; 1909 vj = aj + ai[k]; 1910 nz = ai[k+1] - ai[k]; 1911 while (nz--) { 1912 for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i]; 1913 v++;vj++; 1914 } 1915 for (i=0; i<n; i++) t[n*k+i] *= aa[k]; /* note: aa[k] = 1/D(k) */ 1916 } 1917 1918 /* solve U*perm(x) = y by back substitution */ 1919 for (k=mbs-1; k>=0; k--) { 1920 v = aa + ai[k]; 1921 vj = aj + ai[k]; 1922 nz = ai[k+1] - ai[k]; 1923 while (nz--) { 1924 for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i]; 1925 v++;vj++; 1926 } 1927 for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i]; 1928 } 1929 1930 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1931 ierr = VecRestoreArrayRead(bb->v,&b);CHKERRQ(ierr); 1932 ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr); 1933 ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr); 1934 } 1935 PetscFunctionReturn(0); 1936 } 1937 1938 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 1939 { 1940 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1941 PetscErrorCode ierr; 1942 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag; 1943 const MatScalar *aa=a->a,*v; 1944 const PetscScalar *b; 1945 PetscScalar *x,xi; 1946 PetscInt nz,i,j; 1947 1948 PetscFunctionBegin; 1949 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1950 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1951 /* solve U^T*D*y = b by forward substitution */ 1952 ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr); 1953 for (i=0; i<mbs; i++) { 1954 v = aa + ai[i]; 1955 vj = aj + ai[i]; 1956 xi = x[i]; 1957 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 1958 for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi; 1959 x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */ 1960 } 1961 /* solve U*x = y by backward substitution */ 1962 for (i=mbs-2; i>=0; i--) { 1963 xi = x[i]; 1964 v = aa + adiag[i] - 1; /* end of row i, excluding diag */ 1965 vj = aj + adiag[i] - 1; 1966 nz = ai[i+1] - ai[i] - 1; 1967 for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]]; 1968 x[i] = xi; 1969 } 1970 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1971 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1972 ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr); 1973 PetscFunctionReturn(0); 1974 } 1975 1976 PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat B,Mat X) 1977 { 1978 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1979 PetscErrorCode ierr; 1980 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag; 1981 const MatScalar *aa=a->a,*v; 1982 const PetscScalar *b; 1983 PetscScalar *x,xi; 1984 PetscInt nz,i,j,neq,ldb,ldx; 1985 PetscBool isdense; 1986 1987 PetscFunctionBegin; 1988 if (!mbs) PetscFunctionReturn(0); 1989 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 1990 if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"B matrix must be a SeqDense matrix"); 1991 if (X != B) { 1992 ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQDENSE,&isdense);CHKERRQ(ierr); 1993 if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"X matrix must be a SeqDense matrix"); 1994 } 1995 ierr = MatDenseGetArrayRead(B,&b);CHKERRQ(ierr); 1996 ierr = MatDenseGetLDA(B,&ldb);CHKERRQ(ierr); 1997 ierr = MatDenseGetArray(X,&x);CHKERRQ(ierr); 1998 ierr = MatDenseGetLDA(X,&ldx);CHKERRQ(ierr); 1999 for (neq=0; neq<B->cmap->n; neq++) { 2000 /* solve U^T*D*y = b by forward substitution */ 2001 ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr); 2002 for (i=0; i<mbs; i++) { 2003 v = aa + ai[i]; 2004 vj = aj + ai[i]; 2005 xi = x[i]; 2006 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 2007 for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi; 2008 x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */ 2009 } 2010 /* solve U*x = y by backward substitution */ 2011 for (i=mbs-2; i>=0; i--) { 2012 xi = x[i]; 2013 v = aa + adiag[i] - 1; /* end of row i, excluding diag */ 2014 vj = aj + adiag[i] - 1; 2015 nz = ai[i+1] - ai[i] - 1; 2016 for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]]; 2017 x[i] = xi; 2018 } 2019 b += ldb; 2020 x += ldx; 2021 } 2022 ierr = MatDenseRestoreArrayRead(B,&b);CHKERRQ(ierr); 2023 ierr = MatDenseRestoreArray(X,&x);CHKERRQ(ierr); 2024 ierr = PetscLogFlops(B->cmap->n*(4.0*a->nz - 3*mbs));CHKERRQ(ierr); 2025 PetscFunctionReturn(0); 2026 } 2027 2028 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2029 { 2030 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2031 PetscErrorCode ierr; 2032 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj; 2033 const MatScalar *aa=a->a,*v; 2034 PetscScalar *x,xk; 2035 const PetscScalar *b; 2036 PetscInt nz,k; 2037 2038 PetscFunctionBegin; 2039 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2040 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2041 2042 /* solve U^T*D*y = b by forward substitution */ 2043 ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr); 2044 for (k=0; k<mbs; k++) { 2045 v = aa + ai[k] + 1; 2046 vj = aj + ai[k] + 1; 2047 xk = x[k]; 2048 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2049 while (nz--) x[*vj++] += (*v++) * xk; 2050 x[k] = xk*aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */ 2051 } 2052 2053 /* solve U*x = y by back substitution */ 2054 for (k=mbs-2; k>=0; k--) { 2055 v = aa + ai[k] + 1; 2056 vj = aj + ai[k] + 1; 2057 xk = x[k]; 2058 nz = ai[k+1] - ai[k] - 1; 2059 while (nz--) { 2060 xk += (*v++) * x[*vj++]; 2061 } 2062 x[k] = xk; 2063 } 2064 2065 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2066 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2067 ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr); 2068 PetscFunctionReturn(0); 2069 } 2070 2071 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2072 { 2073 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2074 PetscErrorCode ierr; 2075 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj; 2076 const MatScalar *aa=a->a,*v; 2077 PetscReal diagk; 2078 PetscScalar *x; 2079 const PetscScalar *b; 2080 PetscInt nz,k; 2081 2082 PetscFunctionBegin; 2083 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2084 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2085 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2086 ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr); 2087 for (k=0; k<mbs; k++) { 2088 v = aa + ai[k]; 2089 vj = aj + ai[k]; 2090 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2091 while (nz--) x[*vj++] += (*v++) * x[k]; 2092 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */ 2093 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]])); 2094 x[k] *= PetscSqrtReal(diagk); 2095 } 2096 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2097 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2098 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2099 PetscFunctionReturn(0); 2100 } 2101 2102 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2103 { 2104 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2105 PetscErrorCode ierr; 2106 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj; 2107 const MatScalar *aa=a->a,*v; 2108 PetscReal diagk; 2109 PetscScalar *x; 2110 const PetscScalar *b; 2111 PetscInt nz,k; 2112 2113 PetscFunctionBegin; 2114 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2115 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2116 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2117 ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr); 2118 for (k=0; k<mbs; k++) { 2119 v = aa + ai[k] + 1; 2120 vj = aj + ai[k] + 1; 2121 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2122 while (nz--) x[*vj++] += (*v++) * x[k]; 2123 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2124 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]])); 2125 x[k] *= PetscSqrtReal(diagk); 2126 } 2127 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2128 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2129 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2130 PetscFunctionReturn(0); 2131 } 2132 2133 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2134 { 2135 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2136 PetscErrorCode ierr; 2137 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj; 2138 const MatScalar *aa=a->a,*v; 2139 PetscReal diagk; 2140 PetscScalar *x; 2141 const PetscScalar *b; 2142 PetscInt nz,k; 2143 2144 PetscFunctionBegin; 2145 /* solve D^(1/2)*U*x = b by back substitution */ 2146 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2147 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2148 2149 for (k=mbs-1; k>=0; k--) { 2150 v = aa + ai[k]; 2151 vj = aj + ai[k]; 2152 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2153 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 2154 x[k] = PetscSqrtReal(diagk)*b[k]; 2155 nz = ai[k+1] - ai[k] - 1; 2156 while (nz--) x[k] += (*v++) * x[*vj++]; 2157 } 2158 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2159 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2160 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2161 PetscFunctionReturn(0); 2162 } 2163 2164 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2165 { 2166 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2167 PetscErrorCode ierr; 2168 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj; 2169 const MatScalar *aa=a->a,*v; 2170 PetscReal diagk; 2171 PetscScalar *x; 2172 const PetscScalar *b; 2173 PetscInt nz,k; 2174 2175 PetscFunctionBegin; 2176 /* solve D^(1/2)*U*x = b by back substitution */ 2177 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2178 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2179 2180 for (k=mbs-1; k>=0; k--) { 2181 v = aa + ai[k] + 1; 2182 vj = aj + ai[k] + 1; 2183 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2184 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 2185 x[k] = PetscSqrtReal(diagk)*b[k]; 2186 nz = ai[k+1] - ai[k] - 1; 2187 while (nz--) x[k] += (*v++) * x[*vj++]; 2188 } 2189 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2190 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2191 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2192 PetscFunctionReturn(0); 2193 } 2194 2195 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */ 2196 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info) 2197 { 2198 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 2199 PetscErrorCode ierr; 2200 const PetscInt *rip,mbs = a->mbs,*ai,*aj; 2201 PetscInt *jutmp,bs = A->rmap->bs,i; 2202 PetscInt m,reallocs = 0,*levtmp; 2203 PetscInt *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd; 2204 PetscInt incrlev,*lev,shift,prow,nz; 2205 PetscReal f = info->fill,levels = info->levels; 2206 PetscBool perm_identity; 2207 2208 PetscFunctionBegin; 2209 /* check whether perm is the identity mapping */ 2210 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 2211 2212 if (perm_identity) { 2213 a->permute = PETSC_FALSE; 2214 ai = a->i; aj = a->j; 2215 } else { /* non-trivial permutation */ 2216 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2217 } 2218 2219 /* initialization */ 2220 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 2221 umax = (PetscInt)(f*ai[mbs] + 1); 2222 ierr = PetscMalloc1(umax,&lev);CHKERRQ(ierr); 2223 umax += mbs + 1; 2224 shift = mbs + 1; 2225 ierr = PetscMalloc1(mbs+1,&iu);CHKERRQ(ierr); 2226 ierr = PetscMalloc1(umax,&ju);CHKERRQ(ierr); 2227 iu[0] = mbs + 1; 2228 juidx = mbs + 1; 2229 /* prowl: linked list for pivot row */ 2230 ierr = PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp);CHKERRQ(ierr); 2231 /* q: linked list for col index */ 2232 2233 for (i=0; i<mbs; i++) { 2234 prowl[i] = mbs; 2235 q[i] = 0; 2236 } 2237 2238 /* for each row k */ 2239 for (k=0; k<mbs; k++) { 2240 nzk = 0; 2241 q[k] = mbs; 2242 /* copy current row into linked list */ 2243 nz = ai[rip[k]+1] - ai[rip[k]]; 2244 j = ai[rip[k]]; 2245 while (nz--) { 2246 vj = rip[aj[j++]]; 2247 if (vj > k) { 2248 qm = k; 2249 do { 2250 m = qm; qm = q[m]; 2251 } while (qm < vj); 2252 if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n"); 2253 nzk++; 2254 q[m] = vj; 2255 q[vj] = qm; 2256 levtmp[vj] = 0; /* initialize lev for nonzero element */ 2257 } 2258 } 2259 2260 /* modify nonzero structure of k-th row by computing fill-in 2261 for each row prow to be merged in */ 2262 prow = k; 2263 prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */ 2264 2265 while (prow < k) { 2266 /* merge row prow into k-th row */ 2267 jmin = iu[prow] + 1; 2268 jmax = iu[prow+1]; 2269 qm = k; 2270 for (j=jmin; j<jmax; j++) { 2271 incrlev = lev[j-shift] + 1; 2272 if (incrlev > levels) continue; 2273 vj = ju[j]; 2274 do { 2275 m = qm; qm = q[m]; 2276 } while (qm < vj); 2277 if (qm != vj) { /* a fill */ 2278 nzk++; q[m] = vj; q[vj] = qm; qm = vj; 2279 levtmp[vj] = incrlev; 2280 } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev; 2281 } 2282 prow = prowl[prow]; /* next pivot row */ 2283 } 2284 2285 /* add k to row list for first nonzero element in k-th row */ 2286 if (nzk > 1) { 2287 i = q[k]; /* col value of first nonzero element in k_th row of U */ 2288 prowl[k] = prowl[i]; prowl[i] = k; 2289 } 2290 iu[k+1] = iu[k] + nzk; 2291 2292 /* allocate more space to ju and lev if needed */ 2293 if (iu[k+1] > umax) { 2294 /* estimate how much additional space we will need */ 2295 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 2296 /* just double the memory each time */ 2297 maxadd = umax; 2298 if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2; 2299 umax += maxadd; 2300 2301 /* allocate a longer ju */ 2302 ierr = PetscMalloc1(umax,&jutmp);CHKERRQ(ierr); 2303 ierr = PetscArraycpy(jutmp,ju,iu[k]);CHKERRQ(ierr); 2304 ierr = PetscFree(ju);CHKERRQ(ierr); 2305 ju = jutmp; 2306 2307 ierr = PetscMalloc1(umax,&jutmp);CHKERRQ(ierr); 2308 ierr = PetscArraycpy(jutmp,lev,iu[k]-shift);CHKERRQ(ierr); 2309 ierr = PetscFree(lev);CHKERRQ(ierr); 2310 lev = jutmp; 2311 reallocs += 2; /* count how many times we realloc */ 2312 } 2313 2314 /* save nonzero structure of k-th row in ju */ 2315 i=k; 2316 while (nzk--) { 2317 i = q[i]; 2318 ju[juidx] = i; 2319 lev[juidx-shift] = levtmp[i]; 2320 juidx++; 2321 } 2322 } 2323 2324 #if defined(PETSC_USE_INFO) 2325 if (ai[mbs] != 0) { 2326 PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 2327 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);CHKERRQ(ierr); 2328 ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr); 2329 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);CHKERRQ(ierr); 2330 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 2331 } else { 2332 ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr); 2333 } 2334 #endif 2335 2336 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 2337 ierr = PetscFree3(prowl,q,levtmp);CHKERRQ(ierr); 2338 ierr = PetscFree(lev);CHKERRQ(ierr); 2339 2340 /* put together the new matrix */ 2341 ierr = MatSeqSBAIJSetPreallocation(B,bs,0,NULL);CHKERRQ(ierr); 2342 2343 /* ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)iperm);CHKERRQ(ierr); */ 2344 b = (Mat_SeqSBAIJ*)(B)->data; 2345 ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr); 2346 2347 b->singlemalloc = PETSC_FALSE; 2348 b->free_a = PETSC_TRUE; 2349 b->free_ij = PETSC_TRUE; 2350 /* the next line frees the default space generated by the Create() */ 2351 ierr = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr); 2352 ierr = PetscMalloc1((iu[mbs]+1)*a->bs2,&b->a);CHKERRQ(ierr); 2353 b->j = ju; 2354 b->i = iu; 2355 b->diag = NULL; 2356 b->ilen = NULL; 2357 b->imax = NULL; 2358 2359 ierr = ISDestroy(&b->row);CHKERRQ(ierr); 2360 ierr = ISDestroy(&b->icol);CHKERRQ(ierr); 2361 b->row = perm; 2362 b->icol = perm; 2363 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2364 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2365 ierr = PetscMalloc1(bs*mbs+bs,&b->solve_work);CHKERRQ(ierr); 2366 /* In b structure: Free imax, ilen, old a, old j. 2367 Allocate idnew, solve_work, new a, new j */ 2368 ierr = PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr); 2369 b->maxnz = b->nz = iu[mbs]; 2370 2371 (B)->info.factor_mallocs = reallocs; 2372 (B)->info.fill_ratio_given = f; 2373 if (ai[mbs] != 0) { 2374 (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 2375 } else { 2376 (B)->info.fill_ratio_needed = 0.0; 2377 } 2378 ierr = MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);CHKERRQ(ierr); 2379 PetscFunctionReturn(0); 2380 } 2381 2382 /* 2383 See MatICCFactorSymbolic_SeqAIJ() for description of its data structure 2384 */ 2385 #include <petscbt.h> 2386 #include <../src/mat/utils/freespace.h> 2387 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info) 2388 { 2389 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 2390 PetscErrorCode ierr; 2391 PetscBool perm_identity,free_ij = PETSC_TRUE,missing; 2392 PetscInt bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j; 2393 const PetscInt *rip; 2394 PetscInt reallocs=0,i,*ui,*udiag,*cols; 2395 PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow; 2396 PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr; 2397 PetscReal fill =info->fill,levels=info->levels; 2398 PetscFreeSpaceList free_space =NULL,current_space=NULL; 2399 PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL; 2400 PetscBT lnkbt; 2401 2402 PetscFunctionBegin; 2403 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 2404 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 2405 if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 2406 if (bs > 1) { 2407 ierr = MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);CHKERRQ(ierr); 2408 PetscFunctionReturn(0); 2409 } 2410 2411 /* check whether perm is the identity mapping */ 2412 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 2413 if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2414 a->permute = PETSC_FALSE; 2415 2416 ierr = PetscMalloc1(am+1,&ui);CHKERRQ(ierr); 2417 ierr = PetscMalloc1(am+1,&udiag);CHKERRQ(ierr); 2418 ui[0] = 0; 2419 2420 /* ICC(0) without matrix ordering: simply rearrange column indices */ 2421 if (!levels) { 2422 /* reuse the column pointers and row offsets for memory savings */ 2423 for (i=0; i<am; i++) { 2424 ncols = ai[i+1] - ai[i]; 2425 ui[i+1] = ui[i] + ncols; 2426 udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */ 2427 } 2428 ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr); 2429 cols = uj; 2430 for (i=0; i<am; i++) { 2431 aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */ 2432 ncols = ai[i+1] - ai[i] -1; 2433 for (j=0; j<ncols; j++) *cols++ = aj[j]; 2434 *cols++ = i; /* diagonal is located as the last entry of U(i,:) */ 2435 } 2436 } else { /* case: levels>0 */ 2437 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 2438 2439 /* initialization */ 2440 /* jl: linked list for storing indices of the pivot rows 2441 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2442 ierr = PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);CHKERRQ(ierr); 2443 for (i=0; i<am; i++) { 2444 jl[i] = am; il[i] = 0; 2445 } 2446 2447 /* create and initialize a linked list for storing column indices of the active row k */ 2448 nlnk = am + 1; 2449 ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2450 2451 /* initial FreeSpace size is fill*(ai[am]+1) */ 2452 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);CHKERRQ(ierr); 2453 2454 current_space = free_space; 2455 2456 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);CHKERRQ(ierr); 2457 2458 current_space_lvl = free_space_lvl; 2459 2460 for (k=0; k<am; k++) { /* for each active row k */ 2461 /* initialize lnk by the column indices of row k */ 2462 nzk = 0; 2463 ncols = ai[k+1] - ai[k]; 2464 if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k); 2465 cols = aj+ai[k]; 2466 ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2467 nzk += nlnk; 2468 2469 /* update lnk by computing fill-in for each pivot row to be merged in */ 2470 prow = jl[k]; /* 1st pivot row */ 2471 2472 while (prow < k) { 2473 nextprow = jl[prow]; 2474 2475 /* merge prow into k-th row */ 2476 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2477 jmax = ui[prow+1]; 2478 ncols = jmax-jmin; 2479 i = jmin - ui[prow]; 2480 2481 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2482 uj = uj_lvl_ptr[prow] + i; /* levels of cols */ 2483 j = *(uj - 1); 2484 ierr = PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr); 2485 nzk += nlnk; 2486 2487 /* update il and jl for prow */ 2488 if (jmin < jmax) { 2489 il[prow] = jmin; 2490 2491 j = *cols; jl[prow] = jl[j]; jl[j] = prow; 2492 } 2493 prow = nextprow; 2494 } 2495 2496 /* if free space is not available, make more free space */ 2497 if (current_space->local_remaining<nzk) { 2498 i = am - k + 1; /* num of unfactored rows */ 2499 i = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2500 ierr = PetscFreeSpaceGet(i,¤t_space);CHKERRQ(ierr); 2501 ierr = PetscFreeSpaceGet(i,¤t_space_lvl);CHKERRQ(ierr); 2502 reallocs++; 2503 } 2504 2505 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2506 if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k); 2507 ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 2508 2509 /* add the k-th row into il and jl */ 2510 if (nzk > 1) { 2511 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2512 jl[k] = jl[i]; jl[i] = k; 2513 il[k] = ui[k] + 1; 2514 } 2515 uj_ptr[k] = current_space->array; 2516 uj_lvl_ptr[k] = current_space_lvl->array; 2517 2518 current_space->array += nzk; 2519 current_space->local_used += nzk; 2520 current_space->local_remaining -= nzk; 2521 current_space_lvl->array += nzk; 2522 current_space_lvl->local_used += nzk; 2523 current_space_lvl->local_remaining -= nzk; 2524 2525 ui[k+1] = ui[k] + nzk; 2526 } 2527 2528 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 2529 ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr); 2530 2531 /* destroy list of free space and other temporary array(s) */ 2532 ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr); 2533 ierr = PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag);CHKERRQ(ierr); /* store matrix factor */ 2534 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 2535 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 2536 2537 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2538 2539 /* put together the new matrix in MATSEQSBAIJ format */ 2540 ierr = MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 2541 2542 b = (Mat_SeqSBAIJ*)(fact)->data; 2543 ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr); 2544 2545 b->singlemalloc = PETSC_FALSE; 2546 b->free_a = PETSC_TRUE; 2547 b->free_ij = free_ij; 2548 2549 ierr = PetscMalloc1(ui[am]+1,&b->a);CHKERRQ(ierr); 2550 2551 b->j = uj; 2552 b->i = ui; 2553 b->diag = udiag; 2554 b->free_diag = PETSC_TRUE; 2555 b->ilen = NULL; 2556 b->imax = NULL; 2557 b->row = perm; 2558 b->col = perm; 2559 2560 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2561 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2562 2563 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2564 2565 ierr = PetscMalloc1(am+1,&b->solve_work);CHKERRQ(ierr); 2566 ierr = PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr); 2567 2568 b->maxnz = b->nz = ui[am]; 2569 2570 fact->info.factor_mallocs = reallocs; 2571 fact->info.fill_ratio_given = fill; 2572 if (ai[am] != 0) { 2573 fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am]; 2574 } else { 2575 fact->info.fill_ratio_needed = 0.0; 2576 } 2577 #if defined(PETSC_USE_INFO) 2578 if (ai[am] != 0) { 2579 PetscReal af = fact->info.fill_ratio_needed; 2580 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr); 2581 ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr); 2582 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr); 2583 } else { 2584 ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr); 2585 } 2586 #endif 2587 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering; 2588 PetscFunctionReturn(0); 2589 } 2590 2591 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info) 2592 { 2593 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2594 Mat_SeqSBAIJ *b; 2595 PetscErrorCode ierr; 2596 PetscBool perm_identity,free_ij = PETSC_TRUE; 2597 PetscInt bs=A->rmap->bs,am=a->mbs; 2598 const PetscInt *cols,*rip,*ai=a->i,*aj=a->j; 2599 PetscInt reallocs=0,i,*ui; 2600 PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow; 2601 PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr; 2602 PetscReal fill =info->fill,levels=info->levels,ratio_needed; 2603 PetscFreeSpaceList free_space =NULL,current_space=NULL; 2604 PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL; 2605 PetscBT lnkbt; 2606 2607 PetscFunctionBegin; 2608 /* 2609 This code originally uses Modified Sparse Row (MSR) storage 2610 (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice! 2611 Then it is rewritten so the factor B takes seqsbaij format. However the associated 2612 MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1, 2613 thus the original code in MSR format is still used for these cases. 2614 The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever 2615 MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor. 2616 */ 2617 if (bs > 1) { 2618 ierr = MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);CHKERRQ(ierr); 2619 PetscFunctionReturn(0); 2620 } 2621 2622 /* check whether perm is the identity mapping */ 2623 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 2624 if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2625 a->permute = PETSC_FALSE; 2626 2627 /* special case that simply copies fill pattern */ 2628 if (!levels) { 2629 /* reuse the column pointers and row offsets for memory savings */ 2630 ui = a->i; 2631 uj = a->j; 2632 free_ij = PETSC_FALSE; 2633 ratio_needed = 1.0; 2634 } else { /* case: levels>0 */ 2635 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 2636 2637 /* initialization */ 2638 ierr = PetscMalloc1(am+1,&ui);CHKERRQ(ierr); 2639 ui[0] = 0; 2640 2641 /* jl: linked list for storing indices of the pivot rows 2642 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2643 ierr = PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);CHKERRQ(ierr); 2644 for (i=0; i<am; i++) { 2645 jl[i] = am; il[i] = 0; 2646 } 2647 2648 /* create and initialize a linked list for storing column indices of the active row k */ 2649 nlnk = am + 1; 2650 ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2651 2652 /* initial FreeSpace size is fill*(ai[am]+1) */ 2653 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);CHKERRQ(ierr); 2654 2655 current_space = free_space; 2656 2657 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);CHKERRQ(ierr); 2658 2659 current_space_lvl = free_space_lvl; 2660 2661 for (k=0; k<am; k++) { /* for each active row k */ 2662 /* initialize lnk by the column indices of row rip[k] */ 2663 nzk = 0; 2664 ncols = ai[rip[k]+1] - ai[rip[k]]; 2665 cols = aj+ai[rip[k]]; 2666 ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2667 nzk += nlnk; 2668 2669 /* update lnk by computing fill-in for each pivot row to be merged in */ 2670 prow = jl[k]; /* 1st pivot row */ 2671 2672 while (prow < k) { 2673 nextprow = jl[prow]; 2674 2675 /* merge prow into k-th row */ 2676 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2677 jmax = ui[prow+1]; 2678 ncols = jmax-jmin; 2679 i = jmin - ui[prow]; 2680 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2681 j = *(uj_lvl_ptr[prow] + i - 1); 2682 cols_lvl = uj_lvl_ptr[prow]+i; 2683 ierr = PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr); 2684 nzk += nlnk; 2685 2686 /* update il and jl for prow */ 2687 if (jmin < jmax) { 2688 il[prow] = jmin; 2689 j = *cols; 2690 jl[prow] = jl[j]; 2691 jl[j] = prow; 2692 } 2693 prow = nextprow; 2694 } 2695 2696 /* if free space is not available, make more free space */ 2697 if (current_space->local_remaining<nzk) { 2698 i = am - k + 1; /* num of unfactored rows */ 2699 i = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2700 ierr = PetscFreeSpaceGet(i,¤t_space);CHKERRQ(ierr); 2701 ierr = PetscFreeSpaceGet(i,¤t_space_lvl);CHKERRQ(ierr); 2702 reallocs++; 2703 } 2704 2705 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2706 ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 2707 2708 /* add the k-th row into il and jl */ 2709 if (nzk-1 > 0) { 2710 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2711 jl[k] = jl[i]; jl[i] = k; 2712 il[k] = ui[k] + 1; 2713 } 2714 uj_ptr[k] = current_space->array; 2715 uj_lvl_ptr[k] = current_space_lvl->array; 2716 2717 current_space->array += nzk; 2718 current_space->local_used += nzk; 2719 current_space->local_remaining -= nzk; 2720 current_space_lvl->array += nzk; 2721 current_space_lvl->local_used += nzk; 2722 current_space_lvl->local_remaining -= nzk; 2723 2724 ui[k+1] = ui[k] + nzk; 2725 } 2726 2727 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 2728 ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr); 2729 2730 /* destroy list of free space and other temporary array(s) */ 2731 ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr); 2732 ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr); 2733 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 2734 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 2735 if (ai[am] != 0) { 2736 ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]); 2737 } else { 2738 ratio_needed = 0.0; 2739 } 2740 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2741 2742 /* put together the new matrix in MATSEQSBAIJ format */ 2743 ierr = MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 2744 2745 b = (Mat_SeqSBAIJ*)(fact)->data; 2746 2747 ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr); 2748 2749 b->singlemalloc = PETSC_FALSE; 2750 b->free_a = PETSC_TRUE; 2751 b->free_ij = free_ij; 2752 2753 ierr = PetscMalloc1(ui[am]+1,&b->a);CHKERRQ(ierr); 2754 2755 b->j = uj; 2756 b->i = ui; 2757 b->diag = NULL; 2758 b->ilen = NULL; 2759 b->imax = NULL; 2760 b->row = perm; 2761 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2762 2763 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2764 b->icol = perm; 2765 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2766 ierr = PetscMalloc1(am+1,&b->solve_work);CHKERRQ(ierr); 2767 2768 b->maxnz = b->nz = ui[am]; 2769 2770 fact->info.factor_mallocs = reallocs; 2771 fact->info.fill_ratio_given = fill; 2772 fact->info.fill_ratio_needed = ratio_needed; 2773 #if defined(PETSC_USE_INFO) 2774 if (ai[am] != 0) { 2775 PetscReal af = fact->info.fill_ratio_needed; 2776 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr); 2777 ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr); 2778 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr); 2779 } else { 2780 ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr); 2781 } 2782 #endif 2783 if (perm_identity) { 2784 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace; 2785 } else { 2786 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace; 2787 } 2788 PetscFunctionReturn(0); 2789 } 2790