1 2 /* 3 Factorization code for SBAIJ format. 4 */ 5 6 #include <../src/mat/impls/sbaij/seq/sbaij.h> 7 #include <../src/mat/impls/baij/seq/baij.h> 8 #include <petsc/private/kernels/blockinvert.h> 9 10 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 11 { 12 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 13 IS isrow=a->row; 14 PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 15 const PetscInt *r; 16 PetscInt nz,*vj,k,idx,k1; 17 PetscInt bs =A->rmap->bs,bs2 = a->bs2; 18 const MatScalar *aa=a->a,*v,*diag; 19 PetscScalar *x,*xk,*xj,*xk_tmp,*t; 20 const PetscScalar *b; 21 22 PetscFunctionBegin; 23 PetscCall(VecGetArrayRead(bb,&b)); 24 PetscCall(VecGetArray(xx,&x)); 25 t = a->solve_work; 26 PetscCall(ISGetIndices(isrow,&r)); 27 PetscCall(PetscMalloc1(bs,&xk_tmp)); 28 29 /* solve U^T * D * y = b by forward substitution */ 30 xk = t; 31 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 32 idx = bs*r[k]; 33 for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1]; 34 } 35 for (k=0; k<mbs; k++) { 36 v = aa + bs2*ai[k]; 37 xk = t + k*bs; /* Dk*xk = k-th block of x */ 38 PetscCall(PetscArraycpy(xk_tmp,xk,bs)); /* xk_tmp <- xk */ 39 nz = ai[k+1] - ai[k]; 40 vj = aj + ai[k]; 41 xj = t + (*vj)*bs; /* *vj-th block of x, *vj>k */ 42 while (nz--) { 43 /* x(:) += U(k,:)^T*(Dk*xk) */ 44 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */ 45 vj++; xj = t + (*vj)*bs; 46 v += bs2; 47 } 48 /* xk = inv(Dk)*(Dk*xk) */ 49 diag = aa+k*bs2; /* ptr to inv(Dk) */ 50 PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */ 51 } 52 53 /* solve U*x = y by back substitution */ 54 for (k=mbs-1; k>=0; k--) { 55 v = aa + bs2*ai[k]; 56 xk = t + k*bs; /* xk */ 57 nz = ai[k+1] - ai[k]; 58 vj = aj + ai[k]; 59 xj = t + (*vj)*bs; 60 while (nz--) { 61 /* xk += U(k,:)*x(:) */ 62 PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */ 63 vj++; 64 v += bs2; xj = t + (*vj)*bs; 65 } 66 idx = bs*r[k]; 67 for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++; 68 } 69 70 PetscCall(PetscFree(xk_tmp)); 71 PetscCall(ISRestoreIndices(isrow,&r)); 72 PetscCall(VecRestoreArrayRead(bb,&b)); 73 PetscCall(VecRestoreArray(xx,&x)); 74 PetscCall(PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs)); 75 PetscFunctionReturn(0); 76 } 77 78 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 79 { 80 PetscFunctionBegin; 81 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"not implemented yet"); 82 } 83 84 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 85 { 86 PetscFunctionBegin; 87 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not yet implemented"); 88 } 89 90 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x) 91 { 92 PetscInt nz,k; 93 const PetscInt *vj,bs2 = bs*bs; 94 const MatScalar *v,*diag; 95 PetscScalar *xk,*xj,*xk_tmp; 96 97 PetscFunctionBegin; 98 PetscCall(PetscMalloc1(bs,&xk_tmp)); 99 for (k=0; k<mbs; k++) { 100 v = aa + bs2*ai[k]; 101 xk = x + k*bs; /* Dk*xk = k-th block of x */ 102 PetscCall(PetscArraycpy(xk_tmp,xk,bs)); /* xk_tmp <- xk */ 103 nz = ai[k+1] - ai[k]; 104 vj = aj + ai[k]; 105 xj = x + (*vj)*bs; /* *vj-th block of x, *vj>k */ 106 while (nz--) { 107 /* x(:) += U(k,:)^T*(Dk*xk) */ 108 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */ 109 vj++; xj = x + (*vj)*bs; 110 v += bs2; 111 } 112 /* xk = inv(Dk)*(Dk*xk) */ 113 diag = aa+k*bs2; /* ptr to inv(Dk) */ 114 PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */ 115 } 116 PetscCall(PetscFree(xk_tmp)); 117 PetscFunctionReturn(0); 118 } 119 120 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x) 121 { 122 PetscInt nz,k; 123 const PetscInt *vj,bs2 = bs*bs; 124 const MatScalar *v; 125 PetscScalar *xk,*xj; 126 127 PetscFunctionBegin; 128 for (k=mbs-1; k>=0; k--) { 129 v = aa + bs2*ai[k]; 130 xk = x + k*bs; /* xk */ 131 nz = ai[k+1] - ai[k]; 132 vj = aj + ai[k]; 133 xj = x + (*vj)*bs; 134 while (nz--) { 135 /* xk += U(k,:)*x(:) */ 136 PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */ 137 vj++; 138 v += bs2; xj = x + (*vj)*bs; 139 } 140 } 141 PetscFunctionReturn(0); 142 } 143 144 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 145 { 146 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 147 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 148 PetscInt bs =A->rmap->bs; 149 const MatScalar *aa=a->a; 150 PetscScalar *x; 151 const PetscScalar *b; 152 153 PetscFunctionBegin; 154 PetscCall(VecGetArrayRead(bb,&b)); 155 PetscCall(VecGetArray(xx,&x)); 156 157 /* solve U^T * D * y = b by forward substitution */ 158 PetscCall(PetscArraycpy(x,b,bs*mbs)); /* x <- b */ 159 PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x)); 160 161 /* solve U*x = y by back substitution */ 162 PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x)); 163 164 PetscCall(VecRestoreArrayRead(bb,&b)); 165 PetscCall(VecRestoreArray(xx,&x)); 166 PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (bs+2.0*a->bs2)*mbs)); 167 PetscFunctionReturn(0); 168 } 169 170 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 171 { 172 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 173 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 174 PetscInt bs =A->rmap->bs; 175 const MatScalar *aa=a->a; 176 const PetscScalar *b; 177 PetscScalar *x; 178 179 PetscFunctionBegin; 180 PetscCall(VecGetArrayRead(bb,&b)); 181 PetscCall(VecGetArray(xx,&x)); 182 PetscCall(PetscArraycpy(x,b,bs*mbs)); /* x <- b */ 183 PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x)); 184 PetscCall(VecRestoreArrayRead(bb,&b)); 185 PetscCall(VecRestoreArray(xx,&x)); 186 PetscCall(PetscLogFlops(2.0*a->bs2*a->nz - bs*mbs)); 187 PetscFunctionReturn(0); 188 } 189 190 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 191 { 192 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 193 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 194 PetscInt bs =A->rmap->bs; 195 const MatScalar *aa=a->a; 196 const PetscScalar *b; 197 PetscScalar *x; 198 199 PetscFunctionBegin; 200 PetscCall(VecGetArrayRead(bb,&b)); 201 PetscCall(VecGetArray(xx,&x)); 202 PetscCall(PetscArraycpy(x,b,bs*mbs)); 203 PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x)); 204 PetscCall(VecRestoreArrayRead(bb,&b)); 205 PetscCall(VecRestoreArray(xx,&x)); 206 PetscCall(PetscLogFlops(2.0*a->bs2*(a->nz-mbs))); 207 PetscFunctionReturn(0); 208 } 209 210 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 211 { 212 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 213 IS isrow=a->row; 214 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,*r,*vj; 215 PetscInt nz,k,idx; 216 const MatScalar *aa=a->a,*v,*d; 217 PetscScalar *x,x0,x1,x2,x3,x4,x5,x6,*t,*tp; 218 const PetscScalar *b; 219 220 PetscFunctionBegin; 221 PetscCall(VecGetArrayRead(bb,&b)); 222 PetscCall(VecGetArray(xx,&x)); 223 t = a->solve_work; 224 PetscCall(ISGetIndices(isrow,&r)); 225 226 /* solve U^T * D * y = b by forward substitution */ 227 tp = t; 228 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 229 idx = 7*r[k]; 230 tp[0] = b[idx]; 231 tp[1] = b[idx+1]; 232 tp[2] = b[idx+2]; 233 tp[3] = b[idx+3]; 234 tp[4] = b[idx+4]; 235 tp[5] = b[idx+5]; 236 tp[6] = b[idx+6]; 237 tp += 7; 238 } 239 240 for (k=0; k<mbs; k++) { 241 v = aa + 49*ai[k]; 242 vj = aj + ai[k]; 243 tp = t + k*7; 244 x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; 245 nz = ai[k+1] - ai[k]; 246 tp = t + (*vj)*7; 247 while (nz--) { 248 tp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6; 249 tp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6; 250 tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6; 251 tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6; 252 tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6; 253 tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6; 254 tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6; 255 vj++; 256 tp = t + (*vj)*7; 257 v += 49; 258 } 259 260 /* xk = inv(Dk)*(Dk*xk) */ 261 d = aa+k*49; /* ptr to inv(Dk) */ 262 tp = t + k*7; 263 tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6; 264 tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6; 265 tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6; 266 tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6; 267 tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6; 268 tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6; 269 tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6; 270 } 271 272 /* solve U*x = y by back substitution */ 273 for (k=mbs-1; k>=0; k--) { 274 v = aa + 49*ai[k]; 275 vj = aj + ai[k]; 276 tp = t + k*7; 277 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; /* xk */ 278 nz = ai[k+1] - ai[k]; 279 280 tp = t + (*vj)*7; 281 while (nz--) { 282 /* xk += U(k,:)*x(:) */ 283 x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6]; 284 x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6]; 285 x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6]; 286 x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6]; 287 x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6]; 288 x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6]; 289 x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6]; 290 vj++; 291 tp = t + (*vj)*7; 292 v += 49; 293 } 294 tp = t + k*7; 295 tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6; 296 idx = 7*r[k]; 297 x[idx] = x0; 298 x[idx+1] = x1; 299 x[idx+2] = x2; 300 x[idx+3] = x3; 301 x[idx+4] = x4; 302 x[idx+5] = x5; 303 x[idx+6] = x6; 304 } 305 306 PetscCall(ISRestoreIndices(isrow,&r)); 307 PetscCall(VecRestoreArrayRead(bb,&b)); 308 PetscCall(VecRestoreArray(xx,&x)); 309 PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs)); 310 PetscFunctionReturn(0); 311 } 312 313 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 314 { 315 const MatScalar *v,*d; 316 PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6; 317 PetscInt nz,k; 318 const PetscInt *vj; 319 320 PetscFunctionBegin; 321 for (k=0; k<mbs; k++) { 322 v = aa + 49*ai[k]; 323 xp = x + k*7; 324 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */ 325 nz = ai[k+1] - ai[k]; 326 vj = aj + ai[k]; 327 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 328 PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 329 while (nz--) { 330 xp = x + (*vj)*7; 331 /* x(:) += U(k,:)^T*(Dk*xk) */ 332 xp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6; 333 xp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6; 334 xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6; 335 xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6; 336 xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6; 337 xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6; 338 xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6; 339 vj++; 340 v += 49; 341 } 342 /* xk = inv(Dk)*(Dk*xk) */ 343 d = aa+k*49; /* ptr to inv(Dk) */ 344 xp = x + k*7; 345 xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6; 346 xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6; 347 xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6; 348 xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6; 349 xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6; 350 xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6; 351 xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6; 352 } 353 PetscFunctionReturn(0); 354 } 355 356 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 357 { 358 const MatScalar *v; 359 PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6; 360 PetscInt nz,k; 361 const PetscInt *vj; 362 363 PetscFunctionBegin; 364 for (k=mbs-1; k>=0; k--) { 365 v = aa + 49*ai[k]; 366 xp = x + k*7; 367 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */ 368 nz = ai[k+1] - ai[k]; 369 vj = aj + ai[k]; 370 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 371 PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 372 while (nz--) { 373 xp = x + (*vj)*7; 374 /* xk += U(k,:)*x(:) */ 375 x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6]; 376 x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6]; 377 x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6]; 378 x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6]; 379 x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6]; 380 x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6]; 381 x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6]; 382 vj++; 383 v += 49; 384 } 385 xp = x + k*7; 386 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6; 387 } 388 PetscFunctionReturn(0); 389 } 390 391 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 392 { 393 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 394 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 395 const MatScalar *aa=a->a; 396 PetscScalar *x; 397 const PetscScalar *b; 398 399 PetscFunctionBegin; 400 PetscCall(VecGetArrayRead(bb,&b)); 401 PetscCall(VecGetArray(xx,&x)); 402 403 /* solve U^T * D * y = b by forward substitution */ 404 PetscCall(PetscArraycpy(x,b,7*mbs)); /* x <- b */ 405 PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x)); 406 407 /* solve U*x = y by back substitution */ 408 PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x)); 409 410 PetscCall(VecRestoreArrayRead(bb,&b)); 411 PetscCall(VecRestoreArray(xx,&x)); 412 PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs)); 413 PetscFunctionReturn(0); 414 } 415 416 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 417 { 418 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 419 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 420 const MatScalar *aa=a->a; 421 PetscScalar *x; 422 const PetscScalar *b; 423 424 PetscFunctionBegin; 425 PetscCall(VecGetArrayRead(bb,&b)); 426 PetscCall(VecGetArray(xx,&x)); 427 PetscCall(PetscArraycpy(x,b,7*mbs)); 428 PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x)); 429 PetscCall(VecRestoreArrayRead(bb,&b)); 430 PetscCall(VecRestoreArray(xx,&x)); 431 PetscCall(PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs)); 432 PetscFunctionReturn(0); 433 } 434 435 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 436 { 437 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 438 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 439 const MatScalar *aa=a->a; 440 PetscScalar *x; 441 const PetscScalar *b; 442 443 PetscFunctionBegin; 444 PetscCall(VecGetArrayRead(bb,&b)); 445 PetscCall(VecGetArray(xx,&x)); 446 PetscCall(PetscArraycpy(x,b,7*mbs)); 447 PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x)); 448 PetscCall(VecRestoreArrayRead(bb,&b)); 449 PetscCall(VecRestoreArray(xx,&x)); 450 PetscCall(PetscLogFlops(2.0*a->bs2*(a->nz-mbs))); 451 PetscFunctionReturn(0); 452 } 453 454 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 455 { 456 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 457 IS isrow=a->row; 458 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,*r,*vj; 459 PetscInt nz,k,idx; 460 const MatScalar *aa=a->a,*v,*d; 461 PetscScalar *x,x0,x1,x2,x3,x4,x5,*t,*tp; 462 const PetscScalar *b; 463 464 PetscFunctionBegin; 465 PetscCall(VecGetArrayRead(bb,&b)); 466 PetscCall(VecGetArray(xx,&x)); 467 t = a->solve_work; 468 PetscCall(ISGetIndices(isrow,&r)); 469 470 /* solve U^T * D * y = b by forward substitution */ 471 tp = t; 472 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 473 idx = 6*r[k]; 474 tp[0] = b[idx]; 475 tp[1] = b[idx+1]; 476 tp[2] = b[idx+2]; 477 tp[3] = b[idx+3]; 478 tp[4] = b[idx+4]; 479 tp[5] = b[idx+5]; 480 tp += 6; 481 } 482 483 for (k=0; k<mbs; k++) { 484 v = aa + 36*ai[k]; 485 vj = aj + ai[k]; 486 tp = t + k*6; 487 x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; 488 nz = ai[k+1] - ai[k]; 489 tp = t + (*vj)*6; 490 while (nz--) { 491 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5; 492 tp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5; 493 tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5; 494 tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5; 495 tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5; 496 tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5; 497 vj++; 498 tp = t + (*vj)*6; 499 v += 36; 500 } 501 502 /* xk = inv(Dk)*(Dk*xk) */ 503 d = aa+k*36; /* ptr to inv(Dk) */ 504 tp = t + k*6; 505 tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5; 506 tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5; 507 tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5; 508 tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5; 509 tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5; 510 tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5; 511 } 512 513 /* solve U*x = y by back substitution */ 514 for (k=mbs-1; k>=0; k--) { 515 v = aa + 36*ai[k]; 516 vj = aj + ai[k]; 517 tp = t + k*6; 518 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */ 519 nz = ai[k+1] - ai[k]; 520 521 tp = t + (*vj)*6; 522 while (nz--) { 523 /* xk += U(k,:)*x(:) */ 524 x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5]; 525 x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5]; 526 x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5]; 527 x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5]; 528 x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5]; 529 x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5]; 530 vj++; 531 tp = t + (*vj)*6; 532 v += 36; 533 } 534 tp = t + k*6; 535 tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; 536 idx = 6*r[k]; 537 x[idx] = x0; 538 x[idx+1] = x1; 539 x[idx+2] = x2; 540 x[idx+3] = x3; 541 x[idx+4] = x4; 542 x[idx+5] = x5; 543 } 544 545 PetscCall(ISRestoreIndices(isrow,&r)); 546 PetscCall(VecRestoreArrayRead(bb,&b)); 547 PetscCall(VecRestoreArray(xx,&x)); 548 PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs)); 549 PetscFunctionReturn(0); 550 } 551 552 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 553 { 554 const MatScalar *v,*d; 555 PetscScalar *xp,x0,x1,x2,x3,x4,x5; 556 PetscInt nz,k; 557 const PetscInt *vj; 558 559 PetscFunctionBegin; 560 for (k=0; k<mbs; k++) { 561 v = aa + 36*ai[k]; 562 xp = x + k*6; 563 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */ 564 nz = ai[k+1] - ai[k]; 565 vj = aj + ai[k]; 566 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 567 PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 568 while (nz--) { 569 xp = x + (*vj)*6; 570 /* x(:) += U(k,:)^T*(Dk*xk) */ 571 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5; 572 xp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5; 573 xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5; 574 xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5; 575 xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5; 576 xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5; 577 vj++; 578 v += 36; 579 } 580 /* xk = inv(Dk)*(Dk*xk) */ 581 d = aa+k*36; /* ptr to inv(Dk) */ 582 xp = x + k*6; 583 xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5; 584 xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5; 585 xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5; 586 xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5; 587 xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5; 588 xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5; 589 } 590 PetscFunctionReturn(0); 591 } 592 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 593 { 594 const MatScalar *v; 595 PetscScalar *xp,x0,x1,x2,x3,x4,x5; 596 PetscInt nz,k; 597 const PetscInt *vj; 598 599 PetscFunctionBegin; 600 for (k=mbs-1; k>=0; k--) { 601 v = aa + 36*ai[k]; 602 xp = x + k*6; 603 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */ 604 nz = ai[k+1] - ai[k]; 605 vj = aj + ai[k]; 606 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 607 PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 608 while (nz--) { 609 xp = x + (*vj)*6; 610 /* xk += U(k,:)*x(:) */ 611 x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5]; 612 x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5]; 613 x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5]; 614 x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5]; 615 x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5]; 616 x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5]; 617 vj++; 618 v += 36; 619 } 620 xp = x + k*6; 621 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; 622 } 623 PetscFunctionReturn(0); 624 } 625 626 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 627 { 628 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 629 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 630 const MatScalar *aa=a->a; 631 PetscScalar *x; 632 const PetscScalar *b; 633 634 PetscFunctionBegin; 635 PetscCall(VecGetArrayRead(bb,&b)); 636 PetscCall(VecGetArray(xx,&x)); 637 638 /* solve U^T * D * y = b by forward substitution */ 639 PetscCall(PetscArraycpy(x,b,6*mbs)); /* x <- b */ 640 PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x)); 641 642 /* solve U*x = y by back substitution */ 643 PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x)); 644 645 PetscCall(VecRestoreArrayRead(bb,&b)); 646 PetscCall(VecRestoreArray(xx,&x)); 647 PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs)); 648 PetscFunctionReturn(0); 649 } 650 651 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 652 { 653 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 654 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 655 const MatScalar *aa=a->a; 656 PetscScalar *x; 657 const PetscScalar *b; 658 659 PetscFunctionBegin; 660 PetscCall(VecGetArrayRead(bb,&b)); 661 PetscCall(VecGetArray(xx,&x)); 662 PetscCall(PetscArraycpy(x,b,6*mbs)); /* x <- b */ 663 PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x)); 664 PetscCall(VecRestoreArrayRead(bb,&b)); 665 PetscCall(VecRestoreArray(xx,&x)); 666 PetscCall(PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs)); 667 PetscFunctionReturn(0); 668 } 669 670 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 671 { 672 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 673 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 674 const MatScalar *aa=a->a; 675 PetscScalar *x; 676 const PetscScalar *b; 677 678 PetscFunctionBegin; 679 PetscCall(VecGetArrayRead(bb,&b)); 680 PetscCall(VecGetArray(xx,&x)); 681 PetscCall(PetscArraycpy(x,b,6*mbs)); /* x <- b */ 682 PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x)); 683 PetscCall(VecRestoreArrayRead(bb,&b)); 684 PetscCall(VecRestoreArray(xx,&x)); 685 PetscCall(PetscLogFlops(2.0*a->bs2*(a->nz - mbs))); 686 PetscFunctionReturn(0); 687 } 688 689 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 690 { 691 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 692 IS isrow=a->row; 693 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 694 const PetscInt *r,*vj; 695 PetscInt nz,k,idx; 696 const MatScalar *aa=a->a,*v,*diag; 697 PetscScalar *x,x0,x1,x2,x3,x4,*t,*tp; 698 const PetscScalar *b; 699 700 PetscFunctionBegin; 701 PetscCall(VecGetArrayRead(bb,&b)); 702 PetscCall(VecGetArray(xx,&x)); 703 t = a->solve_work; 704 PetscCall(ISGetIndices(isrow,&r)); 705 706 /* solve U^T * D * y = b by forward substitution */ 707 tp = t; 708 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 709 idx = 5*r[k]; 710 tp[0] = b[idx]; 711 tp[1] = b[idx+1]; 712 tp[2] = b[idx+2]; 713 tp[3] = b[idx+3]; 714 tp[4] = b[idx+4]; 715 tp += 5; 716 } 717 718 for (k=0; k<mbs; k++) { 719 v = aa + 25*ai[k]; 720 vj = aj + ai[k]; 721 tp = t + k*5; 722 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; 723 nz = ai[k+1] - ai[k]; 724 725 tp = t + (*vj)*5; 726 while (nz--) { 727 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4; 728 tp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4; 729 tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4; 730 tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4; 731 tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4; 732 vj++; 733 tp = t + (*vj)*5; 734 v += 25; 735 } 736 737 /* xk = inv(Dk)*(Dk*xk) */ 738 diag = aa+k*25; /* ptr to inv(Dk) */ 739 tp = t + k*5; 740 tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4; 741 tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4; 742 tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4; 743 tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4; 744 tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4; 745 } 746 747 /* solve U*x = y by back substitution */ 748 for (k=mbs-1; k>=0; k--) { 749 v = aa + 25*ai[k]; 750 vj = aj + ai[k]; 751 tp = t + k*5; 752 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */ 753 nz = ai[k+1] - ai[k]; 754 755 tp = t + (*vj)*5; 756 while (nz--) { 757 /* xk += U(k,:)*x(:) */ 758 x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4]; 759 x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4]; 760 x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4]; 761 x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4]; 762 x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4]; 763 vj++; 764 tp = t + (*vj)*5; 765 v += 25; 766 } 767 tp = t + k*5; 768 tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; 769 idx = 5*r[k]; 770 x[idx] = x0; 771 x[idx+1] = x1; 772 x[idx+2] = x2; 773 x[idx+3] = x3; 774 x[idx+4] = x4; 775 } 776 777 PetscCall(ISRestoreIndices(isrow,&r)); 778 PetscCall(VecRestoreArrayRead(bb,&b)); 779 PetscCall(VecRestoreArray(xx,&x)); 780 PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs)); 781 PetscFunctionReturn(0); 782 } 783 784 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 785 { 786 const MatScalar *v,*diag; 787 PetscScalar *xp,x0,x1,x2,x3,x4; 788 PetscInt nz,k; 789 const PetscInt *vj; 790 791 PetscFunctionBegin; 792 for (k=0; k<mbs; k++) { 793 v = aa + 25*ai[k]; 794 xp = x + k*5; 795 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */ 796 nz = ai[k+1] - ai[k]; 797 vj = aj + ai[k]; 798 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 799 PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 800 while (nz--) { 801 xp = x + (*vj)*5; 802 /* x(:) += U(k,:)^T*(Dk*xk) */ 803 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4; 804 xp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4; 805 xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4; 806 xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4; 807 xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4; 808 vj++; 809 v += 25; 810 } 811 /* xk = inv(Dk)*(Dk*xk) */ 812 diag = aa+k*25; /* ptr to inv(Dk) */ 813 xp = x + k*5; 814 xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4; 815 xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4; 816 xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4; 817 xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4; 818 xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4; 819 } 820 PetscFunctionReturn(0); 821 } 822 823 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 824 { 825 const MatScalar *v; 826 PetscScalar *xp,x0,x1,x2,x3,x4; 827 PetscInt nz,k; 828 const PetscInt *vj; 829 830 PetscFunctionBegin; 831 for (k=mbs-1; k>=0; k--) { 832 v = aa + 25*ai[k]; 833 xp = x + k*5; 834 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */ 835 nz = ai[k+1] - ai[k]; 836 vj = aj + ai[k]; 837 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 838 PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 839 while (nz--) { 840 xp = x + (*vj)*5; 841 /* xk += U(k,:)*x(:) */ 842 x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4]; 843 x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4]; 844 x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4]; 845 x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4]; 846 x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4]; 847 vj++; 848 v += 25; 849 } 850 xp = x + k*5; 851 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; 852 } 853 PetscFunctionReturn(0); 854 } 855 856 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 857 { 858 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 859 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 860 const MatScalar *aa=a->a; 861 PetscScalar *x; 862 const PetscScalar *b; 863 864 PetscFunctionBegin; 865 PetscCall(VecGetArrayRead(bb,&b)); 866 PetscCall(VecGetArray(xx,&x)); 867 868 /* solve U^T * D * y = b by forward substitution */ 869 PetscCall(PetscArraycpy(x,b,5*mbs)); /* x <- b */ 870 PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x)); 871 872 /* solve U*x = y by back substitution */ 873 PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x)); 874 875 PetscCall(VecRestoreArrayRead(bb,&b)); 876 PetscCall(VecRestoreArray(xx,&x)); 877 PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs)); 878 PetscFunctionReturn(0); 879 } 880 881 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 882 { 883 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 884 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 885 const MatScalar *aa=a->a; 886 PetscScalar *x; 887 const PetscScalar *b; 888 889 PetscFunctionBegin; 890 PetscCall(VecGetArrayRead(bb,&b)); 891 PetscCall(VecGetArray(xx,&x)); 892 PetscCall(PetscArraycpy(x,b,5*mbs)); /* x <- b */ 893 PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x)); 894 PetscCall(VecRestoreArrayRead(bb,&b)); 895 PetscCall(VecRestoreArray(xx,&x)); 896 PetscCall(PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs)); 897 PetscFunctionReturn(0); 898 } 899 900 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 901 { 902 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 903 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 904 const MatScalar *aa=a->a; 905 PetscScalar *x; 906 const PetscScalar *b; 907 908 PetscFunctionBegin; 909 PetscCall(VecGetArrayRead(bb,&b)); 910 PetscCall(VecGetArray(xx,&x)); 911 PetscCall(PetscArraycpy(x,b,5*mbs)); 912 PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x)); 913 PetscCall(VecRestoreArrayRead(bb,&b)); 914 PetscCall(VecRestoreArray(xx,&x)); 915 PetscCall(PetscLogFlops(2.0*a->bs2*(a->nz-mbs))); 916 PetscFunctionReturn(0); 917 } 918 919 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 920 { 921 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 922 IS isrow=a->row; 923 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 924 const PetscInt *r,*vj; 925 PetscInt nz,k,idx; 926 const MatScalar *aa=a->a,*v,*diag; 927 PetscScalar *x,x0,x1,x2,x3,*t,*tp; 928 const PetscScalar *b; 929 930 PetscFunctionBegin; 931 PetscCall(VecGetArrayRead(bb,&b)); 932 PetscCall(VecGetArray(xx,&x)); 933 t = a->solve_work; 934 PetscCall(ISGetIndices(isrow,&r)); 935 936 /* solve U^T * D * y = b by forward substitution */ 937 tp = t; 938 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 939 idx = 4*r[k]; 940 tp[0] = b[idx]; 941 tp[1] = b[idx+1]; 942 tp[2] = b[idx+2]; 943 tp[3] = b[idx+3]; 944 tp += 4; 945 } 946 947 for (k=0; k<mbs; k++) { 948 v = aa + 16*ai[k]; 949 vj = aj + ai[k]; 950 tp = t + k*4; 951 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; 952 nz = ai[k+1] - ai[k]; 953 954 tp = t + (*vj)*4; 955 while (nz--) { 956 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3; 957 tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3; 958 tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3; 959 tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3; 960 vj++; 961 tp = t + (*vj)*4; 962 v += 16; 963 } 964 965 /* xk = inv(Dk)*(Dk*xk) */ 966 diag = aa+k*16; /* ptr to inv(Dk) */ 967 tp = t + k*4; 968 tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3; 969 tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3; 970 tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3; 971 tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3; 972 } 973 974 /* solve U*x = y by back substitution */ 975 for (k=mbs-1; k>=0; k--) { 976 v = aa + 16*ai[k]; 977 vj = aj + ai[k]; 978 tp = t + k*4; 979 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */ 980 nz = ai[k+1] - ai[k]; 981 982 tp = t + (*vj)*4; 983 while (nz--) { 984 /* xk += U(k,:)*x(:) */ 985 x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3]; 986 x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3]; 987 x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3]; 988 x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3]; 989 vj++; 990 tp = t + (*vj)*4; 991 v += 16; 992 } 993 tp = t + k*4; 994 tp[0] =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; 995 idx = 4*r[k]; 996 x[idx] = x0; 997 x[idx+1] = x1; 998 x[idx+2] = x2; 999 x[idx+3] = x3; 1000 } 1001 1002 PetscCall(ISRestoreIndices(isrow,&r)); 1003 PetscCall(VecRestoreArrayRead(bb,&b)); 1004 PetscCall(VecRestoreArray(xx,&x)); 1005 PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs)); 1006 PetscFunctionReturn(0); 1007 } 1008 1009 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1010 { 1011 const MatScalar *v,*diag; 1012 PetscScalar *xp,x0,x1,x2,x3; 1013 PetscInt nz,k; 1014 const PetscInt *vj; 1015 1016 PetscFunctionBegin; 1017 for (k=0; k<mbs; k++) { 1018 v = aa + 16*ai[k]; 1019 xp = x + k*4; 1020 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */ 1021 nz = ai[k+1] - ai[k]; 1022 vj = aj + ai[k]; 1023 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1024 PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1025 while (nz--) { 1026 xp = x + (*vj)*4; 1027 /* x(:) += U(k,:)^T*(Dk*xk) */ 1028 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3; 1029 xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3; 1030 xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3; 1031 xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3; 1032 vj++; 1033 v += 16; 1034 } 1035 /* xk = inv(Dk)*(Dk*xk) */ 1036 diag = aa+k*16; /* ptr to inv(Dk) */ 1037 xp = x + k*4; 1038 xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3; 1039 xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3; 1040 xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3; 1041 xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3; 1042 } 1043 PetscFunctionReturn(0); 1044 } 1045 1046 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1047 { 1048 const MatScalar *v; 1049 PetscScalar *xp,x0,x1,x2,x3; 1050 PetscInt nz,k; 1051 const PetscInt *vj; 1052 1053 PetscFunctionBegin; 1054 for (k=mbs-1; k>=0; k--) { 1055 v = aa + 16*ai[k]; 1056 xp = x + k*4; 1057 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */ 1058 nz = ai[k+1] - ai[k]; 1059 vj = aj + ai[k]; 1060 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1061 PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1062 while (nz--) { 1063 xp = x + (*vj)*4; 1064 /* xk += U(k,:)*x(:) */ 1065 x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3]; 1066 x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3]; 1067 x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3]; 1068 x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3]; 1069 vj++; 1070 v += 16; 1071 } 1072 xp = x + k*4; 1073 xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3; 1074 } 1075 PetscFunctionReturn(0); 1076 } 1077 1078 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1079 { 1080 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1081 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1082 const MatScalar *aa=a->a; 1083 PetscScalar *x; 1084 const PetscScalar *b; 1085 1086 PetscFunctionBegin; 1087 PetscCall(VecGetArrayRead(bb,&b)); 1088 PetscCall(VecGetArray(xx,&x)); 1089 1090 /* solve U^T * D * y = b by forward substitution */ 1091 PetscCall(PetscArraycpy(x,b,4*mbs)); /* x <- b */ 1092 PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x)); 1093 1094 /* solve U*x = y by back substitution */ 1095 PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x)); 1096 PetscCall(VecRestoreArrayRead(bb,&b)); 1097 PetscCall(VecRestoreArray(xx,&x)); 1098 PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs)); 1099 PetscFunctionReturn(0); 1100 } 1101 1102 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1103 { 1104 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1105 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1106 const MatScalar *aa=a->a; 1107 PetscScalar *x; 1108 const PetscScalar *b; 1109 1110 PetscFunctionBegin; 1111 PetscCall(VecGetArrayRead(bb,&b)); 1112 PetscCall(VecGetArray(xx,&x)); 1113 PetscCall(PetscArraycpy(x,b,4*mbs)); /* x <- b */ 1114 PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x)); 1115 PetscCall(VecRestoreArrayRead(bb,&b)); 1116 PetscCall(VecRestoreArray(xx,&x)); 1117 PetscCall(PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs)); 1118 PetscFunctionReturn(0); 1119 } 1120 1121 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1122 { 1123 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1124 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1125 const MatScalar *aa=a->a; 1126 PetscScalar *x; 1127 const PetscScalar *b; 1128 1129 PetscFunctionBegin; 1130 PetscCall(VecGetArrayRead(bb,&b)); 1131 PetscCall(VecGetArray(xx,&x)); 1132 PetscCall(PetscArraycpy(x,b,4*mbs)); 1133 PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x)); 1134 PetscCall(VecRestoreArrayRead(bb,&b)); 1135 PetscCall(VecRestoreArray(xx,&x)); 1136 PetscCall(PetscLogFlops(2.0*a->bs2*(a->nz-mbs))); 1137 PetscFunctionReturn(0); 1138 } 1139 1140 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1141 { 1142 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1143 IS isrow=a->row; 1144 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 1145 const PetscInt *r; 1146 PetscInt nz,k,idx; 1147 const PetscInt *vj; 1148 const MatScalar *aa=a->a,*v,*diag; 1149 PetscScalar *x,x0,x1,x2,*t,*tp; 1150 const PetscScalar *b; 1151 1152 PetscFunctionBegin; 1153 PetscCall(VecGetArrayRead(bb,&b)); 1154 PetscCall(VecGetArray(xx,&x)); 1155 t = a->solve_work; 1156 PetscCall(ISGetIndices(isrow,&r)); 1157 1158 /* solve U^T * D * y = b by forward substitution */ 1159 tp = t; 1160 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 1161 idx = 3*r[k]; 1162 tp[0] = b[idx]; 1163 tp[1] = b[idx+1]; 1164 tp[2] = b[idx+2]; 1165 tp += 3; 1166 } 1167 1168 for (k=0; k<mbs; k++) { 1169 v = aa + 9*ai[k]; 1170 vj = aj + ai[k]; 1171 tp = t + k*3; 1172 x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; 1173 nz = ai[k+1] - ai[k]; 1174 1175 tp = t + (*vj)*3; 1176 while (nz--) { 1177 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2; 1178 tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2; 1179 tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2; 1180 vj++; 1181 tp = t + (*vj)*3; 1182 v += 9; 1183 } 1184 1185 /* xk = inv(Dk)*(Dk*xk) */ 1186 diag = aa+k*9; /* ptr to inv(Dk) */ 1187 tp = t + k*3; 1188 tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2; 1189 tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2; 1190 tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2; 1191 } 1192 1193 /* solve U*x = y by back substitution */ 1194 for (k=mbs-1; k>=0; k--) { 1195 v = aa + 9*ai[k]; 1196 vj = aj + ai[k]; 1197 tp = t + k*3; 1198 x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; /* xk */ 1199 nz = ai[k+1] - ai[k]; 1200 1201 tp = t + (*vj)*3; 1202 while (nz--) { 1203 /* xk += U(k,:)*x(:) */ 1204 x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2]; 1205 x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2]; 1206 x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2]; 1207 vj++; 1208 tp = t + (*vj)*3; 1209 v += 9; 1210 } 1211 tp = t + k*3; 1212 tp[0] = x0; tp[1] = x1; tp[2] = x2; 1213 idx = 3*r[k]; 1214 x[idx] = x0; 1215 x[idx+1] = x1; 1216 x[idx+2] = x2; 1217 } 1218 1219 PetscCall(ISRestoreIndices(isrow,&r)); 1220 PetscCall(VecRestoreArrayRead(bb,&b)); 1221 PetscCall(VecRestoreArray(xx,&x)); 1222 PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs)); 1223 PetscFunctionReturn(0); 1224 } 1225 1226 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1227 { 1228 const MatScalar *v,*diag; 1229 PetscScalar *xp,x0,x1,x2; 1230 PetscInt nz,k; 1231 const PetscInt *vj; 1232 1233 PetscFunctionBegin; 1234 for (k=0; k<mbs; k++) { 1235 v = aa + 9*ai[k]; 1236 xp = x + k*3; 1237 x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */ 1238 nz = ai[k+1] - ai[k]; 1239 vj = aj + ai[k]; 1240 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1241 PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1242 while (nz--) { 1243 xp = x + (*vj)*3; 1244 /* x(:) += U(k,:)^T*(Dk*xk) */ 1245 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2; 1246 xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2; 1247 xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2; 1248 vj++; 1249 v += 9; 1250 } 1251 /* xk = inv(Dk)*(Dk*xk) */ 1252 diag = aa+k*9; /* ptr to inv(Dk) */ 1253 xp = x + k*3; 1254 xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2; 1255 xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2; 1256 xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2; 1257 } 1258 PetscFunctionReturn(0); 1259 } 1260 1261 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1262 { 1263 const MatScalar *v; 1264 PetscScalar *xp,x0,x1,x2; 1265 PetscInt nz,k; 1266 const PetscInt *vj; 1267 1268 PetscFunctionBegin; 1269 for (k=mbs-1; k>=0; k--) { 1270 v = aa + 9*ai[k]; 1271 xp = x + k*3; 1272 x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* xk */ 1273 nz = ai[k+1] - ai[k]; 1274 vj = aj + ai[k]; 1275 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1276 PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1277 while (nz--) { 1278 xp = x + (*vj)*3; 1279 /* xk += U(k,:)*x(:) */ 1280 x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2]; 1281 x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2]; 1282 x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2]; 1283 vj++; 1284 v += 9; 1285 } 1286 xp = x + k*3; 1287 xp[0] = x0; xp[1] = x1; xp[2] = x2; 1288 } 1289 PetscFunctionReturn(0); 1290 } 1291 1292 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1293 { 1294 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1295 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1296 const MatScalar *aa=a->a; 1297 PetscScalar *x; 1298 const PetscScalar *b; 1299 1300 PetscFunctionBegin; 1301 PetscCall(VecGetArrayRead(bb,&b)); 1302 PetscCall(VecGetArray(xx,&x)); 1303 1304 /* solve U^T * D * y = b by forward substitution */ 1305 PetscCall(PetscArraycpy(x,b,3*mbs)); 1306 PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x)); 1307 1308 /* solve U*x = y by back substitution */ 1309 PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x)); 1310 1311 PetscCall(VecRestoreArrayRead(bb,&b)); 1312 PetscCall(VecRestoreArray(xx,&x)); 1313 PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs)); 1314 PetscFunctionReturn(0); 1315 } 1316 1317 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1318 { 1319 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1320 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1321 const MatScalar *aa=a->a; 1322 PetscScalar *x; 1323 const PetscScalar *b; 1324 1325 PetscFunctionBegin; 1326 PetscCall(VecGetArrayRead(bb,&b)); 1327 PetscCall(VecGetArray(xx,&x)); 1328 PetscCall(PetscArraycpy(x,b,3*mbs)); 1329 PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x)); 1330 PetscCall(VecRestoreArrayRead(bb,&b)); 1331 PetscCall(VecRestoreArray(xx,&x)); 1332 PetscCall(PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs)); 1333 PetscFunctionReturn(0); 1334 } 1335 1336 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1337 { 1338 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1339 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1340 const MatScalar *aa=a->a; 1341 PetscScalar *x; 1342 const PetscScalar *b; 1343 1344 PetscFunctionBegin; 1345 PetscCall(VecGetArrayRead(bb,&b)); 1346 PetscCall(VecGetArray(xx,&x)); 1347 PetscCall(PetscArraycpy(x,b,3*mbs)); 1348 PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x)); 1349 PetscCall(VecRestoreArrayRead(bb,&b)); 1350 PetscCall(VecRestoreArray(xx,&x)); 1351 PetscCall(PetscLogFlops(2.0*a->bs2*(a->nz-mbs))); 1352 PetscFunctionReturn(0); 1353 } 1354 1355 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 1356 { 1357 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1358 IS isrow=a->row; 1359 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 1360 const PetscInt *r,*vj; 1361 PetscInt nz,k,k2,idx; 1362 const MatScalar *aa=a->a,*v,*diag; 1363 PetscScalar *x,x0,x1,*t; 1364 const PetscScalar *b; 1365 1366 PetscFunctionBegin; 1367 PetscCall(VecGetArrayRead(bb,&b)); 1368 PetscCall(VecGetArray(xx,&x)); 1369 t = a->solve_work; 1370 PetscCall(ISGetIndices(isrow,&r)); 1371 1372 /* solve U^T * D * y = perm(b) by forward substitution */ 1373 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 1374 idx = 2*r[k]; 1375 t[k*2] = b[idx]; 1376 t[k*2+1] = b[idx+1]; 1377 } 1378 for (k=0; k<mbs; k++) { 1379 v = aa + 4*ai[k]; 1380 vj = aj + ai[k]; 1381 k2 = k*2; 1382 x0 = t[k2]; x1 = t[k2+1]; 1383 nz = ai[k+1] - ai[k]; 1384 while (nz--) { 1385 t[(*vj)*2] += v[0]*x0 + v[1]*x1; 1386 t[(*vj)*2+1] += v[2]*x0 + v[3]*x1; 1387 vj++; v += 4; 1388 } 1389 diag = aa+k*4; /* ptr to inv(Dk) */ 1390 t[k2] = diag[0]*x0 + diag[2]*x1; 1391 t[k2+1] = diag[1]*x0 + diag[3]*x1; 1392 } 1393 1394 /* solve U*x = y by back substitution */ 1395 for (k=mbs-1; k>=0; k--) { 1396 v = aa + 4*ai[k]; 1397 vj = aj + ai[k]; 1398 k2 = k*2; 1399 x0 = t[k2]; x1 = t[k2+1]; 1400 nz = ai[k+1] - ai[k]; 1401 while (nz--) { 1402 x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1]; 1403 x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1]; 1404 vj++; 1405 v += 4; 1406 } 1407 t[k2] = x0; 1408 t[k2+1] = x1; 1409 idx = 2*r[k]; 1410 x[idx] = x0; 1411 x[idx+1] = x1; 1412 } 1413 1414 PetscCall(ISRestoreIndices(isrow,&r)); 1415 PetscCall(VecRestoreArrayRead(bb,&b)); 1416 PetscCall(VecRestoreArray(xx,&x)); 1417 PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs)); 1418 PetscFunctionReturn(0); 1419 } 1420 1421 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1422 { 1423 const MatScalar *v,*diag; 1424 PetscScalar x0,x1; 1425 PetscInt nz,k,k2; 1426 const PetscInt *vj; 1427 1428 PetscFunctionBegin; 1429 for (k=0; k<mbs; k++) { 1430 v = aa + 4*ai[k]; 1431 vj = aj + ai[k]; 1432 k2 = k*2; 1433 x0 = x[k2]; x1 = x[k2+1]; /* Dk*xk = k-th block of x */ 1434 nz = ai[k+1] - ai[k]; 1435 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1436 PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1437 while (nz--) { 1438 /* x(:) += U(k,:)^T*(Dk*xk) */ 1439 x[(*vj)*2] += v[0]*x0 + v[1]*x1; 1440 x[(*vj)*2+1] += v[2]*x0 + v[3]*x1; 1441 vj++; v += 4; 1442 } 1443 /* xk = inv(Dk)*(Dk*xk) */ 1444 diag = aa+k*4; /* ptr to inv(Dk) */ 1445 x[k2] = diag[0]*x0 + diag[2]*x1; 1446 x[k2+1] = diag[1]*x0 + diag[3]*x1; 1447 } 1448 PetscFunctionReturn(0); 1449 } 1450 1451 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1452 { 1453 const MatScalar *v; 1454 PetscScalar x0,x1; 1455 PetscInt nz,k,k2; 1456 const PetscInt *vj; 1457 1458 PetscFunctionBegin; 1459 for (k=mbs-1; k>=0; k--) { 1460 v = aa + 4*ai[k]; 1461 vj = aj + ai[k]; 1462 k2 = k*2; 1463 x0 = x[k2]; x1 = x[k2+1]; /* xk */ 1464 nz = ai[k+1] - ai[k]; 1465 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1466 PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1467 while (nz--) { 1468 /* xk += U(k,:)*x(:) */ 1469 x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1]; 1470 x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1]; 1471 vj++; 1472 v += 4; 1473 } 1474 x[k2] = x0; 1475 x[k2+1] = x1; 1476 } 1477 PetscFunctionReturn(0); 1478 } 1479 1480 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1481 { 1482 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1483 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1484 const MatScalar *aa=a->a; 1485 PetscScalar *x; 1486 const PetscScalar *b; 1487 1488 PetscFunctionBegin; 1489 PetscCall(VecGetArrayRead(bb,&b)); 1490 PetscCall(VecGetArray(xx,&x)); 1491 1492 /* solve U^T * D * y = b by forward substitution */ 1493 PetscCall(PetscArraycpy(x,b,2*mbs)); 1494 PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x)); 1495 1496 /* solve U*x = y by back substitution */ 1497 PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x)); 1498 1499 PetscCall(VecRestoreArrayRead(bb,&b)); 1500 PetscCall(VecRestoreArray(xx,&x)); 1501 PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs)); 1502 PetscFunctionReturn(0); 1503 } 1504 1505 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1506 { 1507 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1508 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1509 const MatScalar *aa=a->a; 1510 PetscScalar *x; 1511 const PetscScalar *b; 1512 1513 PetscFunctionBegin; 1514 PetscCall(VecGetArrayRead(bb,&b)); 1515 PetscCall(VecGetArray(xx,&x)); 1516 PetscCall(PetscArraycpy(x,b,2*mbs)); 1517 PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x)); 1518 PetscCall(VecRestoreArrayRead(bb,&b)); 1519 PetscCall(VecRestoreArray(xx,&x)); 1520 PetscCall(PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs)); 1521 PetscFunctionReturn(0); 1522 } 1523 1524 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1525 { 1526 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1527 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1528 const MatScalar *aa=a->a; 1529 PetscScalar *x; 1530 const PetscScalar *b; 1531 1532 PetscFunctionBegin; 1533 PetscCall(VecGetArrayRead(bb,&b)); 1534 PetscCall(VecGetArray(xx,&x)); 1535 PetscCall(PetscArraycpy(x,b,2*mbs)); 1536 PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x)); 1537 PetscCall(VecRestoreArrayRead(bb,&b)); 1538 PetscCall(VecRestoreArray(xx,&x)); 1539 PetscCall(PetscLogFlops(2.0*a->bs2*(a->nz - mbs))); 1540 PetscFunctionReturn(0); 1541 } 1542 1543 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1544 { 1545 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1546 IS isrow=a->row; 1547 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag; 1548 const MatScalar *aa=a->a,*v; 1549 const PetscScalar *b; 1550 PetscScalar *x,xk,*t; 1551 PetscInt nz,k,j; 1552 1553 PetscFunctionBegin; 1554 PetscCall(VecGetArrayRead(bb,&b)); 1555 PetscCall(VecGetArray(xx,&x)); 1556 t = a->solve_work; 1557 PetscCall(ISGetIndices(isrow,&rp)); 1558 1559 /* solve U^T*D*y = perm(b) by forward substitution */ 1560 for (k=0; k<mbs; k++) t[k] = b[rp[k]]; 1561 for (k=0; k<mbs; k++) { 1562 v = aa + ai[k]; 1563 vj = aj + ai[k]; 1564 xk = t[k]; 1565 nz = ai[k+1] - ai[k] - 1; 1566 for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk; 1567 t[k] = xk*v[nz]; /* v[nz] = 1/D(k) */ 1568 } 1569 1570 /* solve U*perm(x) = y by back substitution */ 1571 for (k=mbs-1; k>=0; k--) { 1572 v = aa + adiag[k] - 1; 1573 vj = aj + adiag[k] - 1; 1574 nz = ai[k+1] - ai[k] - 1; 1575 for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]]; 1576 x[rp[k]] = t[k]; 1577 } 1578 1579 PetscCall(ISRestoreIndices(isrow,&rp)); 1580 PetscCall(VecRestoreArrayRead(bb,&b)); 1581 PetscCall(VecRestoreArray(xx,&x)); 1582 PetscCall(PetscLogFlops(4.0*a->nz - 3.0*mbs)); 1583 PetscFunctionReturn(0); 1584 } 1585 1586 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1587 { 1588 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1589 IS isrow=a->row; 1590 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1591 const MatScalar *aa=a->a,*v; 1592 PetscScalar *x,xk,*t; 1593 const PetscScalar *b; 1594 PetscInt nz,k; 1595 1596 PetscFunctionBegin; 1597 PetscCall(VecGetArrayRead(bb,&b)); 1598 PetscCall(VecGetArray(xx,&x)); 1599 t = a->solve_work; 1600 PetscCall(ISGetIndices(isrow,&rp)); 1601 1602 /* solve U^T*D*y = perm(b) by forward substitution */ 1603 for (k=0; k<mbs; k++) t[k] = b[rp[k]]; 1604 for (k=0; k<mbs; k++) { 1605 v = aa + ai[k] + 1; 1606 vj = aj + ai[k] + 1; 1607 xk = t[k]; 1608 nz = ai[k+1] - ai[k] - 1; 1609 while (nz--) t[*vj++] += (*v++) * xk; 1610 t[k] = xk*aa[ai[k]]; /* aa[k] = 1/D(k) */ 1611 } 1612 1613 /* solve U*perm(x) = y by back substitution */ 1614 for (k=mbs-1; k>=0; k--) { 1615 v = aa + ai[k] + 1; 1616 vj = aj + ai[k] + 1; 1617 nz = ai[k+1] - ai[k] - 1; 1618 while (nz--) t[k] += (*v++) * t[*vj++]; 1619 x[rp[k]] = t[k]; 1620 } 1621 1622 PetscCall(ISRestoreIndices(isrow,&rp)); 1623 PetscCall(VecRestoreArrayRead(bb,&b)); 1624 PetscCall(VecRestoreArray(xx,&x)); 1625 PetscCall(PetscLogFlops(4.0*a->nz - 3*mbs)); 1626 PetscFunctionReturn(0); 1627 } 1628 1629 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1630 { 1631 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1632 IS isrow=a->row; 1633 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag; 1634 const MatScalar *aa=a->a,*v; 1635 PetscReal diagk; 1636 PetscScalar *x,xk; 1637 const PetscScalar *b; 1638 PetscInt nz,k; 1639 1640 PetscFunctionBegin; 1641 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1642 PetscCall(VecGetArrayRead(bb,&b)); 1643 PetscCall(VecGetArray(xx,&x)); 1644 PetscCall(ISGetIndices(isrow,&rp)); 1645 1646 for (k=0; k<mbs; k++) x[k] = b[rp[k]]; 1647 for (k=0; k<mbs; k++) { 1648 v = aa + ai[k]; 1649 vj = aj + ai[k]; 1650 xk = x[k]; 1651 nz = ai[k+1] - ai[k] - 1; 1652 while (nz--) x[*vj++] += (*v++) * xk; 1653 1654 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1655 PetscCheckFalse(PetscImaginaryPart(aa[adiag[k]]) || diagk < 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1656 x[k] = xk*PetscSqrtReal(diagk); 1657 } 1658 PetscCall(ISRestoreIndices(isrow,&rp)); 1659 PetscCall(VecRestoreArrayRead(bb,&b)); 1660 PetscCall(VecRestoreArray(xx,&x)); 1661 PetscCall(PetscLogFlops(2.0*a->nz - mbs)); 1662 PetscFunctionReturn(0); 1663 } 1664 1665 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1666 { 1667 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1668 IS isrow=a->row; 1669 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1670 const MatScalar *aa=a->a,*v; 1671 PetscReal diagk; 1672 PetscScalar *x,xk; 1673 const PetscScalar *b; 1674 PetscInt nz,k; 1675 1676 PetscFunctionBegin; 1677 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1678 PetscCall(VecGetArrayRead(bb,&b)); 1679 PetscCall(VecGetArray(xx,&x)); 1680 PetscCall(ISGetIndices(isrow,&rp)); 1681 1682 for (k=0; k<mbs; k++) x[k] = b[rp[k]]; 1683 for (k=0; k<mbs; k++) { 1684 v = aa + ai[k] + 1; 1685 vj = aj + ai[k] + 1; 1686 xk = x[k]; 1687 nz = ai[k+1] - ai[k] - 1; 1688 while (nz--) x[*vj++] += (*v++) * xk; 1689 1690 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1691 PetscCheckFalse(PetscImaginaryPart(aa[ai[k]]) || diagk < 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1692 x[k] = xk*PetscSqrtReal(diagk); 1693 } 1694 PetscCall(ISRestoreIndices(isrow,&rp)); 1695 PetscCall(VecRestoreArrayRead(bb,&b)); 1696 PetscCall(VecRestoreArray(xx,&x)); 1697 PetscCall(PetscLogFlops(2.0*a->nz - mbs)); 1698 PetscFunctionReturn(0); 1699 } 1700 1701 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1702 { 1703 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1704 IS isrow=a->row; 1705 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag; 1706 const MatScalar *aa=a->a,*v; 1707 PetscReal diagk; 1708 PetscScalar *x,*t; 1709 const PetscScalar *b; 1710 PetscInt nz,k; 1711 1712 PetscFunctionBegin; 1713 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1714 PetscCall(VecGetArrayRead(bb,&b)); 1715 PetscCall(VecGetArray(xx,&x)); 1716 t = a->solve_work; 1717 PetscCall(ISGetIndices(isrow,&rp)); 1718 1719 for (k=mbs-1; k>=0; k--) { 1720 v = aa + ai[k]; 1721 vj = aj + ai[k]; 1722 diagk = PetscRealPart(aa[adiag[k]]); 1723 PetscCheckFalse(PetscImaginaryPart(aa[adiag[k]]) || diagk < 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1724 t[k] = b[k] * PetscSqrtReal(diagk); 1725 nz = ai[k+1] - ai[k] - 1; 1726 while (nz--) t[k] += (*v++) * t[*vj++]; 1727 x[rp[k]] = t[k]; 1728 } 1729 PetscCall(ISRestoreIndices(isrow,&rp)); 1730 PetscCall(VecRestoreArrayRead(bb,&b)); 1731 PetscCall(VecRestoreArray(xx,&x)); 1732 PetscCall(PetscLogFlops(2.0*a->nz - mbs)); 1733 PetscFunctionReturn(0); 1734 } 1735 1736 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1737 { 1738 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1739 IS isrow=a->row; 1740 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1741 const MatScalar *aa=a->a,*v; 1742 PetscReal diagk; 1743 PetscScalar *x,*t; 1744 const PetscScalar *b; 1745 PetscInt nz,k; 1746 1747 PetscFunctionBegin; 1748 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1749 PetscCall(VecGetArrayRead(bb,&b)); 1750 PetscCall(VecGetArray(xx,&x)); 1751 t = a->solve_work; 1752 PetscCall(ISGetIndices(isrow,&rp)); 1753 1754 for (k=mbs-1; k>=0; k--) { 1755 v = aa + ai[k] + 1; 1756 vj = aj + ai[k] + 1; 1757 diagk = PetscRealPart(aa[ai[k]]); 1758 PetscCheckFalse(PetscImaginaryPart(aa[ai[k]]) || diagk < 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1759 t[k] = b[k] * PetscSqrtReal(diagk); 1760 nz = ai[k+1] - ai[k] - 1; 1761 while (nz--) t[k] += (*v++) * t[*vj++]; 1762 x[rp[k]] = t[k]; 1763 } 1764 PetscCall(ISRestoreIndices(isrow,&rp)); 1765 PetscCall(VecRestoreArrayRead(bb,&b)); 1766 PetscCall(VecRestoreArray(xx,&x)); 1767 PetscCall(PetscLogFlops(2.0*a->nz - mbs)); 1768 PetscFunctionReturn(0); 1769 } 1770 1771 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx) 1772 { 1773 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1774 1775 PetscFunctionBegin; 1776 if (A->rmap->bs == 1) { 1777 PetscCall(MatSolve_SeqSBAIJ_1(A,bb->v,xx->v)); 1778 } else { 1779 IS isrow=a->row; 1780 const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp; 1781 const MatScalar *aa=a->a,*v; 1782 PetscScalar *x,*t; 1783 const PetscScalar *b; 1784 PetscInt nz,k,n,i,j; 1785 1786 if (bb->n > a->solves_work_n) { 1787 PetscCall(PetscFree(a->solves_work)); 1788 PetscCall(PetscMalloc1(bb->n*A->rmap->N,&a->solves_work)); 1789 a->solves_work_n = bb->n; 1790 } 1791 n = bb->n; 1792 PetscCall(VecGetArrayRead(bb->v,&b)); 1793 PetscCall(VecGetArray(xx->v,&x)); 1794 t = a->solves_work; 1795 1796 PetscCall(ISGetIndices(isrow,&rp)); 1797 1798 /* solve U^T*D*y = perm(b) by forward substitution */ 1799 for (k=0; k<mbs; k++) { 1800 for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */ 1801 } 1802 for (k=0; k<mbs; k++) { 1803 v = aa + ai[k]; 1804 vj = aj + ai[k]; 1805 nz = ai[k+1] - ai[k] - 1; 1806 for (j=0; j<nz; j++) { 1807 for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i]; 1808 v++;vj++; 1809 } 1810 for (i=0; i<n; i++) t[n*k+i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */ 1811 } 1812 1813 /* solve U*perm(x) = y by back substitution */ 1814 for (k=mbs-1; k>=0; k--) { 1815 v = aa + ai[k] - 1; 1816 vj = aj + ai[k] - 1; 1817 nz = ai[k+1] - ai[k] - 1; 1818 for (j=0; j<nz; j++) { 1819 for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i]; 1820 v++;vj++; 1821 } 1822 for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i]; 1823 } 1824 1825 PetscCall(ISRestoreIndices(isrow,&rp)); 1826 PetscCall(VecRestoreArrayRead(bb->v,&b)); 1827 PetscCall(VecRestoreArray(xx->v,&x)); 1828 PetscCall(PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs))); 1829 } 1830 PetscFunctionReturn(0); 1831 } 1832 1833 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx) 1834 { 1835 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1836 1837 PetscFunctionBegin; 1838 if (A->rmap->bs == 1) { 1839 PetscCall(MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v)); 1840 } else { 1841 IS isrow=a->row; 1842 const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp; 1843 const MatScalar *aa=a->a,*v; 1844 PetscScalar *x,*t; 1845 const PetscScalar *b; 1846 PetscInt nz,k,n,i; 1847 1848 if (bb->n > a->solves_work_n) { 1849 PetscCall(PetscFree(a->solves_work)); 1850 PetscCall(PetscMalloc1(bb->n*A->rmap->N,&a->solves_work)); 1851 a->solves_work_n = bb->n; 1852 } 1853 n = bb->n; 1854 PetscCall(VecGetArrayRead(bb->v,&b)); 1855 PetscCall(VecGetArray(xx->v,&x)); 1856 t = a->solves_work; 1857 1858 PetscCall(ISGetIndices(isrow,&rp)); 1859 1860 /* solve U^T*D*y = perm(b) by forward substitution */ 1861 for (k=0; k<mbs; k++) { 1862 for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */ 1863 } 1864 for (k=0; k<mbs; k++) { 1865 v = aa + ai[k]; 1866 vj = aj + ai[k]; 1867 nz = ai[k+1] - ai[k]; 1868 while (nz--) { 1869 for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i]; 1870 v++;vj++; 1871 } 1872 for (i=0; i<n; i++) t[n*k+i] *= aa[k]; /* note: aa[k] = 1/D(k) */ 1873 } 1874 1875 /* solve U*perm(x) = y by back substitution */ 1876 for (k=mbs-1; k>=0; k--) { 1877 v = aa + ai[k]; 1878 vj = aj + ai[k]; 1879 nz = ai[k+1] - ai[k]; 1880 while (nz--) { 1881 for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i]; 1882 v++;vj++; 1883 } 1884 for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i]; 1885 } 1886 1887 PetscCall(ISRestoreIndices(isrow,&rp)); 1888 PetscCall(VecRestoreArrayRead(bb->v,&b)); 1889 PetscCall(VecRestoreArray(xx->v,&x)); 1890 PetscCall(PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs))); 1891 } 1892 PetscFunctionReturn(0); 1893 } 1894 1895 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 1896 { 1897 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1898 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag; 1899 const MatScalar *aa=a->a,*v; 1900 const PetscScalar *b; 1901 PetscScalar *x,xi; 1902 PetscInt nz,i,j; 1903 1904 PetscFunctionBegin; 1905 PetscCall(VecGetArrayRead(bb,&b)); 1906 PetscCall(VecGetArray(xx,&x)); 1907 /* solve U^T*D*y = b by forward substitution */ 1908 PetscCall(PetscArraycpy(x,b,mbs)); 1909 for (i=0; i<mbs; i++) { 1910 v = aa + ai[i]; 1911 vj = aj + ai[i]; 1912 xi = x[i]; 1913 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 1914 for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi; 1915 x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */ 1916 } 1917 /* solve U*x = y by backward substitution */ 1918 for (i=mbs-2; i>=0; i--) { 1919 xi = x[i]; 1920 v = aa + adiag[i] - 1; /* end of row i, excluding diag */ 1921 vj = aj + adiag[i] - 1; 1922 nz = ai[i+1] - ai[i] - 1; 1923 for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]]; 1924 x[i] = xi; 1925 } 1926 PetscCall(VecRestoreArrayRead(bb,&b)); 1927 PetscCall(VecRestoreArray(xx,&x)); 1928 PetscCall(PetscLogFlops(4.0*a->nz - 3*mbs)); 1929 PetscFunctionReturn(0); 1930 } 1931 1932 PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat B,Mat X) 1933 { 1934 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1935 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag; 1936 const MatScalar *aa=a->a,*v; 1937 const PetscScalar *b; 1938 PetscScalar *x,xi; 1939 PetscInt nz,i,j,neq,ldb,ldx; 1940 PetscBool isdense; 1941 1942 PetscFunctionBegin; 1943 if (!mbs) PetscFunctionReturn(0); 1944 PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSE,&isdense)); 1945 PetscCheck(isdense,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"B matrix must be a SeqDense matrix"); 1946 if (X != B) { 1947 PetscCall(PetscObjectTypeCompare((PetscObject)X,MATSEQDENSE,&isdense)); 1948 PetscCheck(isdense,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"X matrix must be a SeqDense matrix"); 1949 } 1950 PetscCall(MatDenseGetArrayRead(B,&b)); 1951 PetscCall(MatDenseGetLDA(B,&ldb)); 1952 PetscCall(MatDenseGetArray(X,&x)); 1953 PetscCall(MatDenseGetLDA(X,&ldx)); 1954 for (neq=0; neq<B->cmap->n; neq++) { 1955 /* solve U^T*D*y = b by forward substitution */ 1956 PetscCall(PetscArraycpy(x,b,mbs)); 1957 for (i=0; i<mbs; i++) { 1958 v = aa + ai[i]; 1959 vj = aj + ai[i]; 1960 xi = x[i]; 1961 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 1962 for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi; 1963 x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */ 1964 } 1965 /* solve U*x = y by backward substitution */ 1966 for (i=mbs-2; i>=0; i--) { 1967 xi = x[i]; 1968 v = aa + adiag[i] - 1; /* end of row i, excluding diag */ 1969 vj = aj + adiag[i] - 1; 1970 nz = ai[i+1] - ai[i] - 1; 1971 for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]]; 1972 x[i] = xi; 1973 } 1974 b += ldb; 1975 x += ldx; 1976 } 1977 PetscCall(MatDenseRestoreArrayRead(B,&b)); 1978 PetscCall(MatDenseRestoreArray(X,&x)); 1979 PetscCall(PetscLogFlops(B->cmap->n*(4.0*a->nz - 3*mbs))); 1980 PetscFunctionReturn(0); 1981 } 1982 1983 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1984 { 1985 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1986 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj; 1987 const MatScalar *aa=a->a,*v; 1988 PetscScalar *x,xk; 1989 const PetscScalar *b; 1990 PetscInt nz,k; 1991 1992 PetscFunctionBegin; 1993 PetscCall(VecGetArrayRead(bb,&b)); 1994 PetscCall(VecGetArray(xx,&x)); 1995 1996 /* solve U^T*D*y = b by forward substitution */ 1997 PetscCall(PetscArraycpy(x,b,mbs)); 1998 for (k=0; k<mbs; k++) { 1999 v = aa + ai[k] + 1; 2000 vj = aj + ai[k] + 1; 2001 xk = x[k]; 2002 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2003 while (nz--) x[*vj++] += (*v++) * xk; 2004 x[k] = xk*aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */ 2005 } 2006 2007 /* solve U*x = y by back substitution */ 2008 for (k=mbs-2; k>=0; k--) { 2009 v = aa + ai[k] + 1; 2010 vj = aj + ai[k] + 1; 2011 xk = x[k]; 2012 nz = ai[k+1] - ai[k] - 1; 2013 while (nz--) { 2014 xk += (*v++) * x[*vj++]; 2015 } 2016 x[k] = xk; 2017 } 2018 2019 PetscCall(VecRestoreArrayRead(bb,&b)); 2020 PetscCall(VecRestoreArray(xx,&x)); 2021 PetscCall(PetscLogFlops(4.0*a->nz - 3*mbs)); 2022 PetscFunctionReturn(0); 2023 } 2024 2025 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2026 { 2027 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2028 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj; 2029 const MatScalar *aa=a->a,*v; 2030 PetscReal diagk; 2031 PetscScalar *x; 2032 const PetscScalar *b; 2033 PetscInt nz,k; 2034 2035 PetscFunctionBegin; 2036 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2037 PetscCall(VecGetArrayRead(bb,&b)); 2038 PetscCall(VecGetArray(xx,&x)); 2039 PetscCall(PetscArraycpy(x,b,mbs)); 2040 for (k=0; k<mbs; k++) { 2041 v = aa + ai[k]; 2042 vj = aj + ai[k]; 2043 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2044 while (nz--) x[*vj++] += (*v++) * x[k]; 2045 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */ 2046 PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",(double)PetscRealPart(aa[adiag[k]]),(double)PetscImaginaryPart(aa[adiag[k]])); 2047 x[k] *= PetscSqrtReal(diagk); 2048 } 2049 PetscCall(VecRestoreArrayRead(bb,&b)); 2050 PetscCall(VecRestoreArray(xx,&x)); 2051 PetscCall(PetscLogFlops(2.0*a->nz - mbs)); 2052 PetscFunctionReturn(0); 2053 } 2054 2055 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2056 { 2057 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2058 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj; 2059 const MatScalar *aa=a->a,*v; 2060 PetscReal diagk; 2061 PetscScalar *x; 2062 const PetscScalar *b; 2063 PetscInt nz,k; 2064 2065 PetscFunctionBegin; 2066 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2067 PetscCall(VecGetArrayRead(bb,&b)); 2068 PetscCall(VecGetArray(xx,&x)); 2069 PetscCall(PetscArraycpy(x,b,mbs)); 2070 for (k=0; k<mbs; k++) { 2071 v = aa + ai[k] + 1; 2072 vj = aj + ai[k] + 1; 2073 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2074 while (nz--) x[*vj++] += (*v++) * x[k]; 2075 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2076 PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",(double)PetscRealPart(aa[ai[k]]),(double)PetscImaginaryPart(aa[ai[k]])); 2077 x[k] *= PetscSqrtReal(diagk); 2078 } 2079 PetscCall(VecRestoreArrayRead(bb,&b)); 2080 PetscCall(VecRestoreArray(xx,&x)); 2081 PetscCall(PetscLogFlops(2.0*a->nz - mbs)); 2082 PetscFunctionReturn(0); 2083 } 2084 2085 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2086 { 2087 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2088 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj; 2089 const MatScalar *aa=a->a,*v; 2090 PetscReal diagk; 2091 PetscScalar *x; 2092 const PetscScalar *b; 2093 PetscInt nz,k; 2094 2095 PetscFunctionBegin; 2096 /* solve D^(1/2)*U*x = b by back substitution */ 2097 PetscCall(VecGetArrayRead(bb,&b)); 2098 PetscCall(VecGetArray(xx,&x)); 2099 2100 for (k=mbs-1; k>=0; k--) { 2101 v = aa + ai[k]; 2102 vj = aj + ai[k]; 2103 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2104 PetscCheckFalse(PetscImaginaryPart(aa[adiag[k]]) || diagk < 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 2105 x[k] = PetscSqrtReal(diagk)*b[k]; 2106 nz = ai[k+1] - ai[k] - 1; 2107 while (nz--) x[k] += (*v++) * x[*vj++]; 2108 } 2109 PetscCall(VecRestoreArrayRead(bb,&b)); 2110 PetscCall(VecRestoreArray(xx,&x)); 2111 PetscCall(PetscLogFlops(2.0*a->nz - mbs)); 2112 PetscFunctionReturn(0); 2113 } 2114 2115 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2116 { 2117 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2118 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj; 2119 const MatScalar *aa=a->a,*v; 2120 PetscReal diagk; 2121 PetscScalar *x; 2122 const PetscScalar *b; 2123 PetscInt nz,k; 2124 2125 PetscFunctionBegin; 2126 /* solve D^(1/2)*U*x = b by back substitution */ 2127 PetscCall(VecGetArrayRead(bb,&b)); 2128 PetscCall(VecGetArray(xx,&x)); 2129 2130 for (k=mbs-1; k>=0; k--) { 2131 v = aa + ai[k] + 1; 2132 vj = aj + ai[k] + 1; 2133 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2134 PetscCheckFalse(PetscImaginaryPart(aa[ai[k]]) || diagk < 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 2135 x[k] = PetscSqrtReal(diagk)*b[k]; 2136 nz = ai[k+1] - ai[k] - 1; 2137 while (nz--) x[k] += (*v++) * x[*vj++]; 2138 } 2139 PetscCall(VecRestoreArrayRead(bb,&b)); 2140 PetscCall(VecRestoreArray(xx,&x)); 2141 PetscCall(PetscLogFlops(2.0*a->nz - mbs)); 2142 PetscFunctionReturn(0); 2143 } 2144 2145 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */ 2146 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info) 2147 { 2148 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 2149 const PetscInt *rip,mbs = a->mbs,*ai,*aj; 2150 PetscInt *jutmp,bs = A->rmap->bs,i; 2151 PetscInt m,reallocs = 0,*levtmp; 2152 PetscInt *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd; 2153 PetscInt incrlev,*lev,shift,prow,nz; 2154 PetscReal f = info->fill,levels = info->levels; 2155 PetscBool perm_identity; 2156 2157 PetscFunctionBegin; 2158 /* check whether perm is the identity mapping */ 2159 PetscCall(ISIdentity(perm,&perm_identity)); 2160 2161 if (perm_identity) { 2162 a->permute = PETSC_FALSE; 2163 ai = a->i; aj = a->j; 2164 } else { /* non-trivial permutation */ 2165 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2166 } 2167 2168 /* initialization */ 2169 PetscCall(ISGetIndices(perm,&rip)); 2170 umax = (PetscInt)(f*ai[mbs] + 1); 2171 PetscCall(PetscMalloc1(umax,&lev)); 2172 umax += mbs + 1; 2173 shift = mbs + 1; 2174 PetscCall(PetscMalloc1(mbs+1,&iu)); 2175 PetscCall(PetscMalloc1(umax,&ju)); 2176 iu[0] = mbs + 1; 2177 juidx = mbs + 1; 2178 /* prowl: linked list for pivot row */ 2179 PetscCall(PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp)); 2180 /* q: linked list for col index */ 2181 2182 for (i=0; i<mbs; i++) { 2183 prowl[i] = mbs; 2184 q[i] = 0; 2185 } 2186 2187 /* for each row k */ 2188 for (k=0; k<mbs; k++) { 2189 nzk = 0; 2190 q[k] = mbs; 2191 /* copy current row into linked list */ 2192 nz = ai[rip[k]+1] - ai[rip[k]]; 2193 j = ai[rip[k]]; 2194 while (nz--) { 2195 vj = rip[aj[j++]]; 2196 if (vj > k) { 2197 qm = k; 2198 do { 2199 m = qm; qm = q[m]; 2200 } while (qm < vj); 2201 PetscCheck(qm != vj,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A"); 2202 nzk++; 2203 q[m] = vj; 2204 q[vj] = qm; 2205 levtmp[vj] = 0; /* initialize lev for nonzero element */ 2206 } 2207 } 2208 2209 /* modify nonzero structure of k-th row by computing fill-in 2210 for each row prow to be merged in */ 2211 prow = k; 2212 prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */ 2213 2214 while (prow < k) { 2215 /* merge row prow into k-th row */ 2216 jmin = iu[prow] + 1; 2217 jmax = iu[prow+1]; 2218 qm = k; 2219 for (j=jmin; j<jmax; j++) { 2220 incrlev = lev[j-shift] + 1; 2221 if (incrlev > levels) continue; 2222 vj = ju[j]; 2223 do { 2224 m = qm; qm = q[m]; 2225 } while (qm < vj); 2226 if (qm != vj) { /* a fill */ 2227 nzk++; q[m] = vj; q[vj] = qm; qm = vj; 2228 levtmp[vj] = incrlev; 2229 } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev; 2230 } 2231 prow = prowl[prow]; /* next pivot row */ 2232 } 2233 2234 /* add k to row list for first nonzero element in k-th row */ 2235 if (nzk > 1) { 2236 i = q[k]; /* col value of first nonzero element in k_th row of U */ 2237 prowl[k] = prowl[i]; prowl[i] = k; 2238 } 2239 iu[k+1] = iu[k] + nzk; 2240 2241 /* allocate more space to ju and lev if needed */ 2242 if (iu[k+1] > umax) { 2243 /* estimate how much additional space we will need */ 2244 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 2245 /* just double the memory each time */ 2246 maxadd = umax; 2247 if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2; 2248 umax += maxadd; 2249 2250 /* allocate a longer ju */ 2251 PetscCall(PetscMalloc1(umax,&jutmp)); 2252 PetscCall(PetscArraycpy(jutmp,ju,iu[k])); 2253 PetscCall(PetscFree(ju)); 2254 ju = jutmp; 2255 2256 PetscCall(PetscMalloc1(umax,&jutmp)); 2257 PetscCall(PetscArraycpy(jutmp,lev,iu[k]-shift)); 2258 PetscCall(PetscFree(lev)); 2259 lev = jutmp; 2260 reallocs += 2; /* count how many times we realloc */ 2261 } 2262 2263 /* save nonzero structure of k-th row in ju */ 2264 i=k; 2265 while (nzk--) { 2266 i = q[i]; 2267 ju[juidx] = i; 2268 lev[juidx-shift] = levtmp[i]; 2269 juidx++; 2270 } 2271 } 2272 2273 #if defined(PETSC_USE_INFO) 2274 if (ai[mbs] != 0) { 2275 PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 2276 PetscCall(PetscInfo(A,"Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af)); 2277 PetscCall(PetscInfo(A,"Run with -pc_factor_fill %g or use \n",(double)af)); 2278 PetscCall(PetscInfo(A,"PCFactorSetFill(pc,%g);\n",(double)af)); 2279 PetscCall(PetscInfo(A,"for best performance.\n")); 2280 } else { 2281 PetscCall(PetscInfo(A,"Empty matrix\n")); 2282 } 2283 #endif 2284 2285 PetscCall(ISRestoreIndices(perm,&rip)); 2286 PetscCall(PetscFree3(prowl,q,levtmp)); 2287 PetscCall(PetscFree(lev)); 2288 2289 /* put together the new matrix */ 2290 PetscCall(MatSeqSBAIJSetPreallocation(B,bs,0,NULL)); 2291 2292 /* PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)iperm)); */ 2293 b = (Mat_SeqSBAIJ*)(B)->data; 2294 PetscCall(PetscFree2(b->imax,b->ilen)); 2295 2296 b->singlemalloc = PETSC_FALSE; 2297 b->free_a = PETSC_TRUE; 2298 b->free_ij = PETSC_TRUE; 2299 /* the next line frees the default space generated by the Create() */ 2300 PetscCall(PetscFree3(b->a,b->j,b->i)); 2301 PetscCall(PetscMalloc1((iu[mbs]+1)*a->bs2,&b->a)); 2302 b->j = ju; 2303 b->i = iu; 2304 b->diag = NULL; 2305 b->ilen = NULL; 2306 b->imax = NULL; 2307 2308 PetscCall(ISDestroy(&b->row)); 2309 PetscCall(ISDestroy(&b->icol)); 2310 b->row = perm; 2311 b->icol = perm; 2312 PetscCall(PetscObjectReference((PetscObject)perm)); 2313 PetscCall(PetscObjectReference((PetscObject)perm)); 2314 PetscCall(PetscMalloc1(bs*mbs+bs,&b->solve_work)); 2315 /* In b structure: Free imax, ilen, old a, old j. 2316 Allocate idnew, solve_work, new a, new j */ 2317 PetscCall(PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)))); 2318 b->maxnz = b->nz = iu[mbs]; 2319 2320 (B)->info.factor_mallocs = reallocs; 2321 (B)->info.fill_ratio_given = f; 2322 if (ai[mbs] != 0) { 2323 (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 2324 } else { 2325 (B)->info.fill_ratio_needed = 0.0; 2326 } 2327 PetscCall(MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity)); 2328 PetscFunctionReturn(0); 2329 } 2330 2331 /* 2332 See MatICCFactorSymbolic_SeqAIJ() for description of its data structure 2333 */ 2334 #include <petscbt.h> 2335 #include <../src/mat/utils/freespace.h> 2336 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info) 2337 { 2338 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 2339 PetscBool perm_identity,free_ij = PETSC_TRUE,missing; 2340 PetscInt bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j; 2341 const PetscInt *rip; 2342 PetscInt reallocs=0,i,*ui,*udiag,*cols; 2343 PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow; 2344 PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr; 2345 PetscReal fill =info->fill,levels=info->levels; 2346 PetscFreeSpaceList free_space =NULL,current_space=NULL; 2347 PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL; 2348 PetscBT lnkbt; 2349 2350 PetscFunctionBegin; 2351 PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n); 2352 PetscCall(MatMissingDiagonal(A,&missing,&d)); 2353 PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,d); 2354 if (bs > 1) { 2355 PetscCall(MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info)); 2356 PetscFunctionReturn(0); 2357 } 2358 2359 /* check whether perm is the identity mapping */ 2360 PetscCall(ISIdentity(perm,&perm_identity)); 2361 PetscCheck(perm_identity,PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2362 a->permute = PETSC_FALSE; 2363 2364 PetscCall(PetscMalloc1(am+1,&ui)); 2365 PetscCall(PetscMalloc1(am+1,&udiag)); 2366 ui[0] = 0; 2367 2368 /* ICC(0) without matrix ordering: simply rearrange column indices */ 2369 if (!levels) { 2370 /* reuse the column pointers and row offsets for memory savings */ 2371 for (i=0; i<am; i++) { 2372 ncols = ai[i+1] - ai[i]; 2373 ui[i+1] = ui[i] + ncols; 2374 udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */ 2375 } 2376 PetscCall(PetscMalloc1(ui[am]+1,&uj)); 2377 cols = uj; 2378 for (i=0; i<am; i++) { 2379 aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */ 2380 ncols = ai[i+1] - ai[i] -1; 2381 for (j=0; j<ncols; j++) *cols++ = aj[j]; 2382 *cols++ = i; /* diagonal is located as the last entry of U(i,:) */ 2383 } 2384 } else { /* case: levels>0 */ 2385 PetscCall(ISGetIndices(perm,&rip)); 2386 2387 /* initialization */ 2388 /* jl: linked list for storing indices of the pivot rows 2389 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2390 PetscCall(PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl)); 2391 for (i=0; i<am; i++) { 2392 jl[i] = am; il[i] = 0; 2393 } 2394 2395 /* create and initialize a linked list for storing column indices of the active row k */ 2396 nlnk = am + 1; 2397 PetscCall(PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt)); 2398 2399 /* initial FreeSpace size is fill*(ai[am]+1) */ 2400 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space)); 2401 2402 current_space = free_space; 2403 2404 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl)); 2405 2406 current_space_lvl = free_space_lvl; 2407 2408 for (k=0; k<am; k++) { /* for each active row k */ 2409 /* initialize lnk by the column indices of row k */ 2410 nzk = 0; 2411 ncols = ai[k+1] - ai[k]; 2412 PetscCheck(ncols,PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %" PetscInt_FMT " in matrix ",k); 2413 cols = aj+ai[k]; 2414 PetscCall(PetscIncompleteLLInit(ncols,cols,am,rip,&nlnk,lnk,lnk_lvl,lnkbt)); 2415 nzk += nlnk; 2416 2417 /* update lnk by computing fill-in for each pivot row to be merged in */ 2418 prow = jl[k]; /* 1st pivot row */ 2419 2420 while (prow < k) { 2421 nextprow = jl[prow]; 2422 2423 /* merge prow into k-th row */ 2424 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2425 jmax = ui[prow+1]; 2426 ncols = jmax-jmin; 2427 i = jmin - ui[prow]; 2428 2429 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2430 uj = uj_lvl_ptr[prow] + i; /* levels of cols */ 2431 j = *(uj - 1); 2432 PetscCall(PetscICCLLAddSorted(ncols,cols,levels,uj,am,&nlnk,lnk,lnk_lvl,lnkbt,j)); 2433 nzk += nlnk; 2434 2435 /* update il and jl for prow */ 2436 if (jmin < jmax) { 2437 il[prow] = jmin; 2438 2439 j = *cols; jl[prow] = jl[j]; jl[j] = prow; 2440 } 2441 prow = nextprow; 2442 } 2443 2444 /* if free space is not available, make more free space */ 2445 if (current_space->local_remaining<nzk) { 2446 i = am - k + 1; /* num of unfactored rows */ 2447 i = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2448 PetscCall(PetscFreeSpaceGet(i,¤t_space)); 2449 PetscCall(PetscFreeSpaceGet(i,¤t_space_lvl)); 2450 reallocs++; 2451 } 2452 2453 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2454 PetscCheck(nzk != 0,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %" PetscInt_FMT " in ICC matrix factor",k); 2455 PetscCall(PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt)); 2456 2457 /* add the k-th row into il and jl */ 2458 if (nzk > 1) { 2459 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2460 jl[k] = jl[i]; jl[i] = k; 2461 il[k] = ui[k] + 1; 2462 } 2463 uj_ptr[k] = current_space->array; 2464 uj_lvl_ptr[k] = current_space_lvl->array; 2465 2466 current_space->array += nzk; 2467 current_space->local_used += nzk; 2468 current_space->local_remaining -= nzk; 2469 current_space_lvl->array += nzk; 2470 current_space_lvl->local_used += nzk; 2471 current_space_lvl->local_remaining -= nzk; 2472 2473 ui[k+1] = ui[k] + nzk; 2474 } 2475 2476 PetscCall(ISRestoreIndices(perm,&rip)); 2477 PetscCall(PetscFree4(uj_ptr,uj_lvl_ptr,il,jl)); 2478 2479 /* destroy list of free space and other temporary array(s) */ 2480 PetscCall(PetscMalloc1(ui[am]+1,&uj)); 2481 PetscCall(PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag)); /* store matrix factor */ 2482 PetscCall(PetscIncompleteLLDestroy(lnk,lnkbt)); 2483 PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 2484 2485 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2486 2487 /* put together the new matrix in MATSEQSBAIJ format */ 2488 PetscCall(MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL)); 2489 2490 b = (Mat_SeqSBAIJ*)(fact)->data; 2491 PetscCall(PetscFree2(b->imax,b->ilen)); 2492 2493 b->singlemalloc = PETSC_FALSE; 2494 b->free_a = PETSC_TRUE; 2495 b->free_ij = free_ij; 2496 2497 PetscCall(PetscMalloc1(ui[am]+1,&b->a)); 2498 2499 b->j = uj; 2500 b->i = ui; 2501 b->diag = udiag; 2502 b->free_diag = PETSC_TRUE; 2503 b->ilen = NULL; 2504 b->imax = NULL; 2505 b->row = perm; 2506 b->col = perm; 2507 2508 PetscCall(PetscObjectReference((PetscObject)perm)); 2509 PetscCall(PetscObjectReference((PetscObject)perm)); 2510 2511 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2512 2513 PetscCall(PetscMalloc1(am+1,&b->solve_work)); 2514 PetscCall(PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)))); 2515 2516 b->maxnz = b->nz = ui[am]; 2517 2518 fact->info.factor_mallocs = reallocs; 2519 fact->info.fill_ratio_given = fill; 2520 if (ai[am] != 0) { 2521 fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am]; 2522 } else { 2523 fact->info.fill_ratio_needed = 0.0; 2524 } 2525 #if defined(PETSC_USE_INFO) 2526 if (ai[am] != 0) { 2527 PetscReal af = fact->info.fill_ratio_needed; 2528 PetscCall(PetscInfo(A,"Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af)); 2529 PetscCall(PetscInfo(A,"Run with -pc_factor_fill %g or use \n",(double)af)); 2530 PetscCall(PetscInfo(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af)); 2531 } else { 2532 PetscCall(PetscInfo(A,"Empty matrix\n")); 2533 } 2534 #endif 2535 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering; 2536 PetscFunctionReturn(0); 2537 } 2538 2539 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info) 2540 { 2541 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2542 Mat_SeqSBAIJ *b; 2543 PetscBool perm_identity,free_ij = PETSC_TRUE; 2544 PetscInt bs=A->rmap->bs,am=a->mbs; 2545 const PetscInt *cols,*rip,*ai=a->i,*aj=a->j; 2546 PetscInt reallocs=0,i,*ui; 2547 PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow; 2548 PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr; 2549 PetscReal fill =info->fill,levels=info->levels,ratio_needed; 2550 PetscFreeSpaceList free_space =NULL,current_space=NULL; 2551 PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL; 2552 PetscBT lnkbt; 2553 2554 PetscFunctionBegin; 2555 /* 2556 This code originally uses Modified Sparse Row (MSR) storage 2557 (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice! 2558 Then it is rewritten so the factor B takes seqsbaij format. However the associated 2559 MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1, 2560 thus the original code in MSR format is still used for these cases. 2561 The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever 2562 MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor. 2563 */ 2564 if (bs > 1) { 2565 PetscCall(MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info)); 2566 PetscFunctionReturn(0); 2567 } 2568 2569 /* check whether perm is the identity mapping */ 2570 PetscCall(ISIdentity(perm,&perm_identity)); 2571 PetscCheck(perm_identity,PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2572 a->permute = PETSC_FALSE; 2573 2574 /* special case that simply copies fill pattern */ 2575 if (!levels) { 2576 /* reuse the column pointers and row offsets for memory savings */ 2577 ui = a->i; 2578 uj = a->j; 2579 free_ij = PETSC_FALSE; 2580 ratio_needed = 1.0; 2581 } else { /* case: levels>0 */ 2582 PetscCall(ISGetIndices(perm,&rip)); 2583 2584 /* initialization */ 2585 PetscCall(PetscMalloc1(am+1,&ui)); 2586 ui[0] = 0; 2587 2588 /* jl: linked list for storing indices of the pivot rows 2589 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2590 PetscCall(PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl)); 2591 for (i=0; i<am; i++) { 2592 jl[i] = am; il[i] = 0; 2593 } 2594 2595 /* create and initialize a linked list for storing column indices of the active row k */ 2596 nlnk = am + 1; 2597 PetscCall(PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt)); 2598 2599 /* initial FreeSpace size is fill*(ai[am]+1) */ 2600 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space)); 2601 2602 current_space = free_space; 2603 2604 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl)); 2605 2606 current_space_lvl = free_space_lvl; 2607 2608 for (k=0; k<am; k++) { /* for each active row k */ 2609 /* initialize lnk by the column indices of row rip[k] */ 2610 nzk = 0; 2611 ncols = ai[rip[k]+1] - ai[rip[k]]; 2612 cols = aj+ai[rip[k]]; 2613 PetscCall(PetscIncompleteLLInit(ncols,cols,am,rip,&nlnk,lnk,lnk_lvl,lnkbt)); 2614 nzk += nlnk; 2615 2616 /* update lnk by computing fill-in for each pivot row to be merged in */ 2617 prow = jl[k]; /* 1st pivot row */ 2618 2619 while (prow < k) { 2620 nextprow = jl[prow]; 2621 2622 /* merge prow into k-th row */ 2623 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2624 jmax = ui[prow+1]; 2625 ncols = jmax-jmin; 2626 i = jmin - ui[prow]; 2627 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2628 j = *(uj_lvl_ptr[prow] + i - 1); 2629 cols_lvl = uj_lvl_ptr[prow]+i; 2630 PetscCall(PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,&nlnk,lnk,lnk_lvl,lnkbt,j)); 2631 nzk += nlnk; 2632 2633 /* update il and jl for prow */ 2634 if (jmin < jmax) { 2635 il[prow] = jmin; 2636 j = *cols; 2637 jl[prow] = jl[j]; 2638 jl[j] = prow; 2639 } 2640 prow = nextprow; 2641 } 2642 2643 /* if free space is not available, make more free space */ 2644 if (current_space->local_remaining<nzk) { 2645 i = am - k + 1; /* num of unfactored rows */ 2646 i = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2647 PetscCall(PetscFreeSpaceGet(i,¤t_space)); 2648 PetscCall(PetscFreeSpaceGet(i,¤t_space_lvl)); 2649 reallocs++; 2650 } 2651 2652 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2653 PetscCall(PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt)); 2654 2655 /* add the k-th row into il and jl */ 2656 if (nzk-1 > 0) { 2657 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2658 jl[k] = jl[i]; jl[i] = k; 2659 il[k] = ui[k] + 1; 2660 } 2661 uj_ptr[k] = current_space->array; 2662 uj_lvl_ptr[k] = current_space_lvl->array; 2663 2664 current_space->array += nzk; 2665 current_space->local_used += nzk; 2666 current_space->local_remaining -= nzk; 2667 current_space_lvl->array += nzk; 2668 current_space_lvl->local_used += nzk; 2669 current_space_lvl->local_remaining -= nzk; 2670 2671 ui[k+1] = ui[k] + nzk; 2672 } 2673 2674 PetscCall(ISRestoreIndices(perm,&rip)); 2675 PetscCall(PetscFree4(uj_ptr,uj_lvl_ptr,il,jl)); 2676 2677 /* destroy list of free space and other temporary array(s) */ 2678 PetscCall(PetscMalloc1(ui[am]+1,&uj)); 2679 PetscCall(PetscFreeSpaceContiguous(&free_space,uj)); 2680 PetscCall(PetscIncompleteLLDestroy(lnk,lnkbt)); 2681 PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 2682 if (ai[am] != 0) { 2683 ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]); 2684 } else { 2685 ratio_needed = 0.0; 2686 } 2687 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2688 2689 /* put together the new matrix in MATSEQSBAIJ format */ 2690 PetscCall(MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL)); 2691 2692 b = (Mat_SeqSBAIJ*)(fact)->data; 2693 2694 PetscCall(PetscFree2(b->imax,b->ilen)); 2695 2696 b->singlemalloc = PETSC_FALSE; 2697 b->free_a = PETSC_TRUE; 2698 b->free_ij = free_ij; 2699 2700 PetscCall(PetscMalloc1(ui[am]+1,&b->a)); 2701 2702 b->j = uj; 2703 b->i = ui; 2704 b->diag = NULL; 2705 b->ilen = NULL; 2706 b->imax = NULL; 2707 b->row = perm; 2708 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2709 2710 PetscCall(PetscObjectReference((PetscObject)perm)); 2711 b->icol = perm; 2712 PetscCall(PetscObjectReference((PetscObject)perm)); 2713 PetscCall(PetscMalloc1(am+1,&b->solve_work)); 2714 2715 b->maxnz = b->nz = ui[am]; 2716 2717 fact->info.factor_mallocs = reallocs; 2718 fact->info.fill_ratio_given = fill; 2719 fact->info.fill_ratio_needed = ratio_needed; 2720 #if defined(PETSC_USE_INFO) 2721 if (ai[am] != 0) { 2722 PetscReal af = fact->info.fill_ratio_needed; 2723 PetscCall(PetscInfo(A,"Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af)); 2724 PetscCall(PetscInfo(A,"Run with -pc_factor_fill %g or use \n",(double)af)); 2725 PetscCall(PetscInfo(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af)); 2726 } else { 2727 PetscCall(PetscInfo(A,"Empty matrix\n")); 2728 } 2729 #endif 2730 if (perm_identity) { 2731 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace; 2732 } else { 2733 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace; 2734 } 2735 PetscFunctionReturn(0); 2736 } 2737