1 /*$Id: sbaijfact.c,v 1.61 2001/08/06 21:15:47 bsmith Exp $*/ 2 3 #include "src/mat/impls/baij/seq/baij.h" 4 #include "src/mat/impls/sbaij/seq/sbaij.h" 5 #include "src/vec/vecimpl.h" 6 #include "src/inline/ilu.h" 7 #include "include/petscis.h" 8 9 #if !defined(PETSC_USE_COMPLEX) 10 /* 11 input: 12 F -- numeric factor 13 output: 14 nneg, nzero, npos: matrix inertia 15 */ 16 17 #undef __FUNCT__ 18 #define __FUNCT__ "MatGetInertia_SeqSBAIJ" 19 int MatGetInertia_SeqSBAIJ(Mat F,int *nneig,int *nzero,int *npos) 20 { 21 Mat_SeqSBAIJ *fact_ptr = (Mat_SeqSBAIJ*)F->data; 22 PetscScalar *dd = fact_ptr->a; 23 int mbs=fact_ptr->mbs,bs=fact_ptr->bs,i,nneig_tmp,npos_tmp, 24 *fi = fact_ptr->i; 25 26 PetscFunctionBegin; 27 if (bs != 1) SETERRQ1(PETSC_ERR_SUP,"No support for bs: %d >1 yet",bs); 28 nneig_tmp = 0; npos_tmp = 0; 29 for (i=0; i<mbs; i++){ 30 if (PetscRealPart(dd[*fi]) > 0.0){ 31 npos_tmp++; 32 } else if (PetscRealPart(dd[*fi]) < 0.0){ 33 nneig_tmp++; 34 } 35 fi++; 36 } 37 if (nneig) *nneig = nneig_tmp; 38 if (npos) *npos = npos_tmp; 39 if (nzero) *nzero = mbs - nneig_tmp - npos_tmp; 40 41 PetscFunctionReturn(0); 42 } 43 #endif /* !defined(PETSC_USE_COMPLEX) */ 44 45 /* Using Modified Sparse Row (MSR) storage. 46 See page 85, "Iterative Methods ..." by Saad. */ 47 /* 48 Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP. 49 */ 50 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */ 51 #undef __FUNCT__ 52 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ" 53 int MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info,Mat *B) 54 { 55 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 56 int *rip,ierr,i,mbs = a->mbs,*ai,*aj; 57 int *jutmp,bs = a->bs,bs2=a->bs2; 58 int m,realloc = 0,prow; 59 int *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd; 60 int *il,ili,nextprow; 61 PetscReal f = info->fill; 62 PetscTruth perm_identity; 63 64 PetscFunctionBegin; 65 /* check whether perm is the identity mapping */ 66 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 67 68 /* -- inplace factorization, i.e., use sbaij for *B -- */ 69 if (perm_identity && bs==1 ){ 70 if (!perm_identity) a->permute = PETSC_TRUE; 71 72 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 73 74 if (perm_identity){ /* without permutation */ 75 ai = a->i; aj = a->j; 76 } else { /* non-trivial permutation */ 77 ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr); 78 ai = a->inew; aj = a->jnew; 79 } 80 81 /* initialization */ 82 ierr = PetscMalloc((mbs+1)*sizeof(int),&iu);CHKERRQ(ierr); 83 umax = (int)(f*ai[mbs] + 1); 84 ierr = PetscMalloc(umax*sizeof(int),&ju);CHKERRQ(ierr); 85 iu[0] = 0; 86 juidx = 0; /* index for ju */ 87 ierr = PetscMalloc((3*mbs+1)*sizeof(int),&jl);CHKERRQ(ierr); /* linked list for getting pivot row */ 88 q = jl + mbs; /* linked list for col index of active row */ 89 il = q + mbs; 90 for (i=0; i<mbs; i++){ 91 jl[i] = mbs; 92 q[i] = 0; 93 il[i] = 0; 94 } 95 96 /* for each row k */ 97 for (k=0; k<mbs; k++){ 98 nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */ 99 q[k] = mbs; 100 /* initialize nonzero structure of k-th row to row rip[k] of A */ 101 jmin = ai[rip[k]] +1; /* exclude diag[k] */ 102 jmax = ai[rip[k]+1]; 103 for (j=jmin; j<jmax; j++){ 104 vj = rip[aj[j]]; /* col. value */ 105 if(vj > k){ 106 qm = k; 107 do { 108 m = qm; qm = q[m]; 109 } while(qm < vj); 110 if (qm == vj) { 111 SETERRQ(1," error: duplicate entry in A\n"); 112 } 113 nzk++; 114 q[m] = vj; 115 q[vj] = qm; 116 } /* if(vj > k) */ 117 } /* for (j=jmin; j<jmax; j++) */ 118 119 /* modify nonzero structure of k-th row by computing fill-in 120 for each row i to be merged in */ 121 prow = k; 122 prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */ 123 124 while (prow < k){ 125 nextprow = jl[prow]; 126 127 /* merge row prow into k-th row */ 128 ili = il[prow]; 129 jmin = ili + 1; /* points to 2nd nzero entry in U(prow,k:mbs-1) */ 130 jmax = iu[prow+1]; 131 qm = k; 132 for (j=jmin; j<jmax; j++){ 133 vj = ju[j]; 134 do { 135 m = qm; qm = q[m]; 136 } while (qm < vj); 137 if (qm != vj){ /* a fill */ 138 nzk++; q[m] = vj; q[vj] = qm; qm = vj; 139 } 140 } /* end of for (j=jmin; j<jmax; j++) */ 141 if (jmin < jmax){ 142 il[prow] = jmin; 143 j = ju[jmin]; 144 jl[prow] = jl[j]; jl[j] = prow; /* update jl */ 145 } 146 prow = nextprow; 147 } 148 149 /* update il and jl */ 150 if (nzk > 0){ 151 i = q[k]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */ 152 jl[k] = jl[i]; jl[i] = k; 153 il[k] = iu[k] + 1; 154 } 155 iu[k+1] = iu[k] + nzk + 1; /* include diag[k] */ 156 157 /* allocate more space to ju if needed */ 158 if (iu[k+1] > umax) { 159 /* estimate how much additional space we will need */ 160 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 161 /* just double the memory each time */ 162 maxadd = umax; 163 if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2; 164 umax += maxadd; 165 166 /* allocate a longer ju */ 167 ierr = PetscMalloc(umax*sizeof(int),&jutmp);CHKERRQ(ierr); 168 ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));CHKERRQ(ierr); 169 ierr = PetscFree(ju);CHKERRQ(ierr); 170 ju = jutmp; 171 realloc++; /* count how many times we realloc */ 172 } 173 174 /* save nonzero structure of k-th row in ju */ 175 ju[juidx++] = k; /* diag[k] */ 176 i = k; 177 while (nzk --) { 178 i = q[i]; 179 ju[juidx++] = i; 180 } 181 } 182 183 if (ai[mbs] != 0) { 184 PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 185 PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af); 186 PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af); 187 PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af); 188 PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n"); 189 } else { 190 PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n"); 191 } 192 193 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 194 /* ierr = PetscFree(q);CHKERRQ(ierr); */ 195 ierr = PetscFree(jl);CHKERRQ(ierr); 196 197 /* put together the new matrix */ 198 ierr = MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);CHKERRQ(ierr); 199 /* PetscLogObjectParent(*B,iperm); */ 200 b = (Mat_SeqSBAIJ*)(*B)->data; 201 ierr = PetscFree(b->imax);CHKERRQ(ierr); 202 b->singlemalloc = PETSC_FALSE; 203 /* the next line frees the default space generated by the Create() */ 204 ierr = PetscFree(b->a);CHKERRQ(ierr); 205 ierr = PetscFree(b->ilen);CHKERRQ(ierr); 206 ierr = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr); 207 b->j = ju; 208 b->i = iu; 209 b->diag = 0; 210 b->ilen = 0; 211 b->imax = 0; 212 b->row = perm; 213 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 214 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 215 b->icol = perm; 216 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 217 ierr = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 218 /* In b structure: Free imax, ilen, old a, old j. 219 Allocate idnew, solve_work, new a, new j */ 220 PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar))); 221 b->s_maxnz = b->s_nz = iu[mbs]; 222 223 (*B)->factor = FACTOR_CHOLESKY; 224 (*B)->info.factor_mallocs = realloc; 225 (*B)->info.fill_ratio_given = f; 226 if (ai[mbs] != 0) { 227 (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 228 } else { 229 (*B)->info.fill_ratio_needed = 0.0; 230 } 231 232 233 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering; 234 (*B)->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering; 235 PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n"); 236 237 PetscFunctionReturn(0); 238 } 239 /* ----------- end of new code --------------------*/ 240 241 242 if (!perm_identity) a->permute = PETSC_TRUE; 243 244 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 245 246 if (perm_identity){ /* without permutation */ 247 ai = a->i; aj = a->j; 248 } else { /* non-trivial permutation */ 249 ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr); 250 ai = a->inew; aj = a->jnew; 251 } 252 253 /* initialization */ 254 ierr = PetscMalloc((mbs+1)*sizeof(int),&iu);CHKERRQ(ierr); 255 umax = (int)(f*ai[mbs] + 1); umax += mbs + 1; 256 ierr = PetscMalloc(umax*sizeof(int),&ju);CHKERRQ(ierr); 257 iu[0] = mbs+1; 258 juidx = mbs + 1; /* index for ju */ 259 ierr = PetscMalloc(2*mbs*sizeof(int),&jl);CHKERRQ(ierr); /* linked list for pivot row */ 260 q = jl + mbs; /* linked list for col index */ 261 for (i=0; i<mbs; i++){ 262 jl[i] = mbs; 263 q[i] = 0; 264 } 265 266 /* for each row k */ 267 for (k=0; k<mbs; k++){ 268 for (i=0; i<mbs; i++) q[i] = 0; /* to be removed! */ 269 nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */ 270 q[k] = mbs; 271 /* initialize nonzero structure of k-th row to row rip[k] of A */ 272 jmin = ai[rip[k]] +1; /* exclude diag[k] */ 273 jmax = ai[rip[k]+1]; 274 for (j=jmin; j<jmax; j++){ 275 vj = rip[aj[j]]; /* col. value */ 276 if(vj > k){ 277 qm = k; 278 do { 279 m = qm; qm = q[m]; 280 } while(qm < vj); 281 if (qm == vj) { 282 SETERRQ(1," error: duplicate entry in A\n"); 283 } 284 nzk++; 285 q[m] = vj; 286 q[vj] = qm; 287 } /* if(vj > k) */ 288 } /* for (j=jmin; j<jmax; j++) */ 289 290 /* modify nonzero structure of k-th row by computing fill-in 291 for each row i to be merged in */ 292 prow = k; 293 prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */ 294 295 while (prow < k){ 296 /* merge row prow into k-th row */ 297 jmin = iu[prow] + 1; jmax = iu[prow+1]; 298 qm = k; 299 for (j=jmin; j<jmax; j++){ 300 vj = ju[j]; 301 do { 302 m = qm; qm = q[m]; 303 } while (qm < vj); 304 if (qm != vj){ 305 nzk++; q[m] = vj; q[vj] = qm; qm = vj; 306 } 307 } 308 prow = jl[prow]; /* next pivot row */ 309 } 310 311 /* add k to row list for first nonzero element in k-th row */ 312 if (nzk > 0){ 313 i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */ 314 jl[k] = jl[i]; jl[i] = k; 315 } 316 iu[k+1] = iu[k] + nzk; 317 318 /* allocate more space to ju if needed */ 319 if (iu[k+1] > umax) { 320 /* estimate how much additional space we will need */ 321 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 322 /* just double the memory each time */ 323 maxadd = umax; 324 if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2; 325 umax += maxadd; 326 327 /* allocate a longer ju */ 328 ierr = PetscMalloc(umax*sizeof(int),&jutmp);CHKERRQ(ierr); 329 ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));CHKERRQ(ierr); 330 ierr = PetscFree(ju);CHKERRQ(ierr); 331 ju = jutmp; 332 realloc++; /* count how many times we realloc */ 333 } 334 335 /* save nonzero structure of k-th row in ju */ 336 i=k; 337 while (nzk --) { 338 i = q[i]; 339 ju[juidx++] = i; 340 } 341 } 342 343 if (ai[mbs] != 0) { 344 PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 345 PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af); 346 PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af); 347 PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af); 348 PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n"); 349 } else { 350 PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n"); 351 } 352 353 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 354 /* ierr = PetscFree(q);CHKERRQ(ierr); */ 355 ierr = PetscFree(jl);CHKERRQ(ierr); 356 357 /* put together the new matrix */ 358 ierr = MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);CHKERRQ(ierr); 359 /* PetscLogObjectParent(*B,iperm); */ 360 b = (Mat_SeqSBAIJ*)(*B)->data; 361 ierr = PetscFree(b->imax);CHKERRQ(ierr); 362 b->singlemalloc = PETSC_FALSE; 363 /* the next line frees the default space generated by the Create() */ 364 ierr = PetscFree(b->a);CHKERRQ(ierr); 365 ierr = PetscFree(b->ilen);CHKERRQ(ierr); 366 ierr = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr); 367 b->j = ju; 368 b->i = iu; 369 b->diag = 0; 370 b->ilen = 0; 371 b->imax = 0; 372 b->row = perm; 373 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 374 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 375 b->icol = perm; 376 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 377 ierr = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 378 /* In b structure: Free imax, ilen, old a, old j. 379 Allocate idnew, solve_work, new a, new j */ 380 PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar))); 381 b->s_maxnz = b->s_nz = iu[mbs]; 382 383 (*B)->factor = FACTOR_CHOLESKY; 384 (*B)->info.factor_mallocs = realloc; 385 (*B)->info.fill_ratio_given = f; 386 if (ai[mbs] != 0) { 387 (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 388 } else { 389 (*B)->info.fill_ratio_needed = 0.0; 390 } 391 392 if (perm_identity){ 393 switch (bs) { 394 case 1: 395 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering; 396 (*B)->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering; 397 PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n"); 398 break; 399 case 2: 400 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering; 401 (*B)->ops->solve = MatSolve_SeqSBAIJ_2_NaturalOrdering; 402 PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n"); 403 break; 404 case 3: 405 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering; 406 (*B)->ops->solve = MatSolve_SeqSBAIJ_3_NaturalOrdering; 407 PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n"); 408 break; 409 case 4: 410 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering; 411 (*B)->ops->solve = MatSolve_SeqSBAIJ_4_NaturalOrdering; 412 PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n"); 413 break; 414 case 5: 415 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering; 416 (*B)->ops->solve = MatSolve_SeqSBAIJ_5_NaturalOrdering; 417 PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n"); 418 break; 419 case 6: 420 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering; 421 (*B)->ops->solve = MatSolve_SeqSBAIJ_6_NaturalOrdering; 422 PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n"); 423 break; 424 case 7: 425 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering; 426 (*B)->ops->solve = MatSolve_SeqSBAIJ_7_NaturalOrdering; 427 PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n"); 428 break; 429 default: 430 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering; 431 (*B)->ops->solve = MatSolve_SeqSBAIJ_N_NaturalOrdering; 432 PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n"); 433 break; 434 } 435 } 436 437 PetscFunctionReturn(0); 438 } 439 440 441 #undef __FUNCT__ 442 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N" 443 int MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,Mat *B) 444 { 445 Mat C = *B; 446 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 447 IS perm = b->row; 448 int *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 449 int *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 450 int bs=a->bs,bs2 = a->bs2; 451 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 452 MatScalar *u,*diag,*rtmp,*rtmp_ptr; 453 MatScalar *work; 454 int *pivots; 455 456 PetscFunctionBegin; 457 458 /* initialization */ 459 ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 460 ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr); 461 ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr); 462 jl = il + mbs; 463 for (i=0; i<mbs; i++) { 464 jl[i] = mbs; il[0] = 0; 465 } 466 ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr); 467 uik = dk + bs2; 468 work = uik + bs2; 469 ierr = PetscMalloc(bs*sizeof(int),&pivots);CHKERRQ(ierr); 470 471 ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr); 472 473 /* check permutation */ 474 if (!a->permute){ 475 ai = a->i; aj = a->j; aa = a->a; 476 } else { 477 ai = a->inew; aj = a->jnew; 478 ierr = PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr); 479 ierr = PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr); 480 ierr = PetscMalloc(ai[mbs]*sizeof(int),&a2anew);CHKERRQ(ierr); 481 ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr); 482 483 for (i=0; i<mbs; i++){ 484 jmin = ai[i]; jmax = ai[i+1]; 485 for (j=jmin; j<jmax; j++){ 486 while (a2anew[j] != j){ 487 k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k; 488 for (k1=0; k1<bs2; k1++){ 489 dk[k1] = aa[k*bs2+k1]; 490 aa[k*bs2+k1] = aa[j*bs2+k1]; 491 aa[j*bs2+k1] = dk[k1]; 492 } 493 } 494 /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */ 495 if (i > aj[j]){ 496 /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */ 497 ap = aa + j*bs2; /* ptr to the beginning of j-th block of aa */ 498 for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */ 499 for (k=0; k<bs; k++){ /* j-th block of aa <- dk^T */ 500 for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1]; 501 } 502 } 503 } 504 } 505 ierr = PetscFree(a2anew);CHKERRQ(ierr); 506 } 507 508 /* for each row k */ 509 for (k = 0; k<mbs; k++){ 510 511 /*initialize k-th row with elements nonzero in row perm(k) of A */ 512 jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1]; 513 514 ap = aa + jmin*bs2; 515 for (j = jmin; j < jmax; j++){ 516 vj = perm_ptr[aj[j]]; /* block col. index */ 517 rtmp_ptr = rtmp + vj*bs2; 518 for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++; 519 } 520 521 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 522 ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 523 i = jl[k]; /* first row to be added to k_th row */ 524 525 while (i < k){ 526 nexti = jl[i]; /* next row to be added to k_th row */ 527 528 /* compute multiplier */ 529 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 530 531 /* uik = -inv(Di)*U_bar(i,k) */ 532 diag = ba + i*bs2; 533 u = ba + ili*bs2; 534 ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr); 535 Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u); 536 537 /* update D(k) += -U(i,k)^T * U_bar(i,k) */ 538 Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u); 539 540 /* update -U(i,k) */ 541 ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr); 542 543 /* add multiple of row i to k-th row ... */ 544 jmin = ili + 1; jmax = bi[i+1]; 545 if (jmin < jmax){ 546 for (j=jmin; j<jmax; j++) { 547 /* rtmp += -U(i,k)^T * U_bar(i,j) */ 548 rtmp_ptr = rtmp + bj[j]*bs2; 549 u = ba + j*bs2; 550 Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u); 551 } 552 553 /* ... add i to row list for next nonzero entry */ 554 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 555 j = bj[jmin]; 556 jl[i] = jl[j]; jl[j] = i; /* update jl */ 557 } 558 i = nexti; 559 } 560 561 /* save nonzero entries in k-th row of U ... */ 562 563 /* invert diagonal block */ 564 diag = ba+k*bs2; 565 ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr); 566 Kernel_A_gets_inverse_A(bs,diag,pivots,work); 567 568 jmin = bi[k]; jmax = bi[k+1]; 569 if (jmin < jmax) { 570 for (j=jmin; j<jmax; j++){ 571 vj = bj[j]; /* block col. index of U */ 572 u = ba + j*bs2; 573 rtmp_ptr = rtmp + vj*bs2; 574 for (k1=0; k1<bs2; k1++){ 575 *u++ = *rtmp_ptr; 576 *rtmp_ptr++ = 0.0; 577 } 578 } 579 580 /* ... add k to row list for first nonzero entry in k-th row */ 581 il[k] = jmin; 582 i = bj[jmin]; 583 jl[k] = jl[i]; jl[i] = k; 584 } 585 } 586 587 ierr = PetscFree(rtmp);CHKERRQ(ierr); 588 ierr = PetscFree(il);CHKERRQ(ierr); 589 ierr = PetscFree(dk);CHKERRQ(ierr); 590 ierr = PetscFree(pivots);CHKERRQ(ierr); 591 if (a->permute){ 592 ierr = PetscFree(aa);CHKERRQ(ierr); 593 } 594 595 ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr); 596 C->factor = FACTOR_CHOLESKY; 597 C->assembled = PETSC_TRUE; 598 C->preallocated = PETSC_TRUE; 599 PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */ 600 PetscFunctionReturn(0); 601 } 602 603 #undef __FUNCT__ 604 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering" 605 int MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat A,Mat *B) 606 { 607 Mat C = *B; 608 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 609 int ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 610 int *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 611 int bs=a->bs,bs2 = a->bs2; 612 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 613 MatScalar *u,*diag,*rtmp,*rtmp_ptr; 614 MatScalar *work; 615 int *pivots; 616 617 PetscFunctionBegin; 618 619 /* initialization */ 620 621 ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 622 ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr); 623 ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr); 624 jl = il + mbs; 625 for (i=0; i<mbs; i++) { 626 jl[i] = mbs; il[0] = 0; 627 } 628 ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr); 629 uik = dk + bs2; 630 work = uik + bs2; 631 ierr = PetscMalloc(bs*sizeof(int),&pivots);CHKERRQ(ierr); 632 633 ai = a->i; aj = a->j; aa = a->a; 634 635 /* for each row k */ 636 for (k = 0; k<mbs; k++){ 637 638 /*initialize k-th row with elements nonzero in row k of A */ 639 jmin = ai[k]; jmax = ai[k+1]; 640 ap = aa + jmin*bs2; 641 for (j = jmin; j < jmax; j++){ 642 vj = aj[j]; /* block col. index */ 643 rtmp_ptr = rtmp + vj*bs2; 644 for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++; 645 } 646 647 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 648 ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 649 i = jl[k]; /* first row to be added to k_th row */ 650 651 while (i < k){ 652 nexti = jl[i]; /* next row to be added to k_th row */ 653 654 /* compute multiplier */ 655 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 656 657 /* uik = -inv(Di)*U_bar(i,k) */ 658 diag = ba + i*bs2; 659 u = ba + ili*bs2; 660 ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr); 661 Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u); 662 663 /* update D(k) += -U(i,k)^T * U_bar(i,k) */ 664 Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u); 665 666 /* update -U(i,k) */ 667 ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr); 668 669 /* add multiple of row i to k-th row ... */ 670 jmin = ili + 1; jmax = bi[i+1]; 671 if (jmin < jmax){ 672 for (j=jmin; j<jmax; j++) { 673 /* rtmp += -U(i,k)^T * U_bar(i,j) */ 674 rtmp_ptr = rtmp + bj[j]*bs2; 675 u = ba + j*bs2; 676 Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u); 677 } 678 679 /* ... add i to row list for next nonzero entry */ 680 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 681 j = bj[jmin]; 682 jl[i] = jl[j]; jl[j] = i; /* update jl */ 683 } 684 i = nexti; 685 } 686 687 /* save nonzero entries in k-th row of U ... */ 688 689 /* invert diagonal block */ 690 diag = ba+k*bs2; 691 ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr); 692 Kernel_A_gets_inverse_A(bs,diag,pivots,work); 693 694 jmin = bi[k]; jmax = bi[k+1]; 695 if (jmin < jmax) { 696 for (j=jmin; j<jmax; j++){ 697 vj = bj[j]; /* block col. index of U */ 698 u = ba + j*bs2; 699 rtmp_ptr = rtmp + vj*bs2; 700 for (k1=0; k1<bs2; k1++){ 701 *u++ = *rtmp_ptr; 702 *rtmp_ptr++ = 0.0; 703 } 704 } 705 706 /* ... add k to row list for first nonzero entry in k-th row */ 707 il[k] = jmin; 708 i = bj[jmin]; 709 jl[k] = jl[i]; jl[i] = k; 710 } 711 } 712 713 ierr = PetscFree(rtmp);CHKERRQ(ierr); 714 ierr = PetscFree(il);CHKERRQ(ierr); 715 ierr = PetscFree(dk);CHKERRQ(ierr); 716 ierr = PetscFree(pivots);CHKERRQ(ierr); 717 718 C->factor = FACTOR_CHOLESKY; 719 C->assembled = PETSC_TRUE; 720 C->preallocated = PETSC_TRUE; 721 PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */ 722 PetscFunctionReturn(0); 723 } 724 725 /* 726 Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP. 727 Version for blocks 2 by 2. 728 */ 729 #undef __FUNCT__ 730 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2" 731 int MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,Mat *B) 732 { 733 Mat C = *B; 734 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 735 IS perm = b->row; 736 int *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 737 int *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 738 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 739 MatScalar *u,*diag,*rtmp,*rtmp_ptr; 740 741 PetscFunctionBegin; 742 743 /* initialization */ 744 /* il and jl record the first nonzero element in each row of the accessing 745 window U(0:k, k:mbs-1). 746 jl: list of rows to be added to uneliminated rows 747 i>= k: jl(i) is the first row to be added to row i 748 i< k: jl(i) is the row following row i in some list of rows 749 jl(i) = mbs indicates the end of a list 750 il(i): points to the first nonzero element in columns k,...,mbs-1 of 751 row i of U */ 752 ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 753 ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr); 754 ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr); 755 jl = il + mbs; 756 for (i=0; i<mbs; i++) { 757 jl[i] = mbs; il[0] = 0; 758 } 759 ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr); 760 uik = dk + 4; 761 ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr); 762 763 /* check permutation */ 764 if (!a->permute){ 765 ai = a->i; aj = a->j; aa = a->a; 766 } else { 767 ai = a->inew; aj = a->jnew; 768 ierr = PetscMalloc(4*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr); 769 ierr = PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr); 770 ierr = PetscMalloc(ai[mbs]*sizeof(int),&a2anew);CHKERRQ(ierr); 771 ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr); 772 773 for (i=0; i<mbs; i++){ 774 jmin = ai[i]; jmax = ai[i+1]; 775 for (j=jmin; j<jmax; j++){ 776 while (a2anew[j] != j){ 777 k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k; 778 for (k1=0; k1<4; k1++){ 779 dk[k1] = aa[k*4+k1]; 780 aa[k*4+k1] = aa[j*4+k1]; 781 aa[j*4+k1] = dk[k1]; 782 } 783 } 784 /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */ 785 if (i > aj[j]){ 786 /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */ 787 ap = aa + j*4; /* ptr to the beginning of the block */ 788 dk[1] = ap[1]; /* swap ap[1] and ap[2] */ 789 ap[1] = ap[2]; 790 ap[2] = dk[1]; 791 } 792 } 793 } 794 ierr = PetscFree(a2anew);CHKERRQ(ierr); 795 } 796 797 /* for each row k */ 798 for (k = 0; k<mbs; k++){ 799 800 /*initialize k-th row with elements nonzero in row perm(k) of A */ 801 jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1]; 802 ap = aa + jmin*4; 803 for (j = jmin; j < jmax; j++){ 804 vj = perm_ptr[aj[j]]; /* block col. index */ 805 rtmp_ptr = rtmp + vj*4; 806 for (i=0; i<4; i++) *rtmp_ptr++ = *ap++; 807 } 808 809 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 810 ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr); 811 i = jl[k]; /* first row to be added to k_th row */ 812 813 while (i < k){ 814 nexti = jl[i]; /* next row to be added to k_th row */ 815 816 /* compute multiplier */ 817 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 818 819 /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */ 820 diag = ba + i*4; 821 u = ba + ili*4; 822 uik[0] = -(diag[0]*u[0] + diag[2]*u[1]); 823 uik[1] = -(diag[1]*u[0] + diag[3]*u[1]); 824 uik[2] = -(diag[0]*u[2] + diag[2]*u[3]); 825 uik[3] = -(diag[1]*u[2] + diag[3]*u[3]); 826 827 /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */ 828 dk[0] += uik[0]*u[0] + uik[1]*u[1]; 829 dk[1] += uik[2]*u[0] + uik[3]*u[1]; 830 dk[2] += uik[0]*u[2] + uik[1]*u[3]; 831 dk[3] += uik[2]*u[2] + uik[3]*u[3]; 832 833 /* update -U(i,k): ba[ili] = uik */ 834 ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr); 835 836 /* add multiple of row i to k-th row ... */ 837 jmin = ili + 1; jmax = bi[i+1]; 838 if (jmin < jmax){ 839 for (j=jmin; j<jmax; j++) { 840 /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */ 841 rtmp_ptr = rtmp + bj[j]*4; 842 u = ba + j*4; 843 rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1]; 844 rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1]; 845 rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3]; 846 rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3]; 847 } 848 849 /* ... add i to row list for next nonzero entry */ 850 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 851 j = bj[jmin]; 852 jl[i] = jl[j]; jl[j] = i; /* update jl */ 853 } 854 i = nexti; 855 } 856 857 /* save nonzero entries in k-th row of U ... */ 858 859 /* invert diagonal block */ 860 diag = ba+k*4; 861 ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr); 862 ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr); 863 864 jmin = bi[k]; jmax = bi[k+1]; 865 if (jmin < jmax) { 866 for (j=jmin; j<jmax; j++){ 867 vj = bj[j]; /* block col. index of U */ 868 u = ba + j*4; 869 rtmp_ptr = rtmp + vj*4; 870 for (k1=0; k1<4; k1++){ 871 *u++ = *rtmp_ptr; 872 *rtmp_ptr++ = 0.0; 873 } 874 } 875 876 /* ... add k to row list for first nonzero entry in k-th row */ 877 il[k] = jmin; 878 i = bj[jmin]; 879 jl[k] = jl[i]; jl[i] = k; 880 } 881 } 882 883 ierr = PetscFree(rtmp);CHKERRQ(ierr); 884 ierr = PetscFree(il);CHKERRQ(ierr); 885 ierr = PetscFree(dk);CHKERRQ(ierr); 886 if (a->permute) { 887 ierr = PetscFree(aa);CHKERRQ(ierr); 888 } 889 ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr); 890 C->factor = FACTOR_CHOLESKY; 891 C->assembled = PETSC_TRUE; 892 C->preallocated = PETSC_TRUE; 893 PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */ 894 PetscFunctionReturn(0); 895 } 896 897 /* 898 Version for when blocks are 2 by 2 Using natural ordering 899 */ 900 #undef __FUNCT__ 901 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering" 902 int MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,Mat *B) 903 { 904 Mat C = *B; 905 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 906 int ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 907 int *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 908 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 909 MatScalar *u,*diag,*rtmp,*rtmp_ptr; 910 911 PetscFunctionBegin; 912 913 /* initialization */ 914 /* il and jl record the first nonzero element in each row of the accessing 915 window U(0:k, k:mbs-1). 916 jl: list of rows to be added to uneliminated rows 917 i>= k: jl(i) is the first row to be added to row i 918 i< k: jl(i) is the row following row i in some list of rows 919 jl(i) = mbs indicates the end of a list 920 il(i): points to the first nonzero element in columns k,...,mbs-1 of 921 row i of U */ 922 ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 923 ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr); 924 ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr); 925 jl = il + mbs; 926 for (i=0; i<mbs; i++) { 927 jl[i] = mbs; il[0] = 0; 928 } 929 ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr); 930 uik = dk + 4; 931 932 ai = a->i; aj = a->j; aa = a->a; 933 934 /* for each row k */ 935 for (k = 0; k<mbs; k++){ 936 937 /*initialize k-th row with elements nonzero in row k of A */ 938 jmin = ai[k]; jmax = ai[k+1]; 939 ap = aa + jmin*4; 940 for (j = jmin; j < jmax; j++){ 941 vj = aj[j]; /* block col. index */ 942 rtmp_ptr = rtmp + vj*4; 943 for (i=0; i<4; i++) *rtmp_ptr++ = *ap++; 944 } 945 946 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 947 ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr); 948 i = jl[k]; /* first row to be added to k_th row */ 949 950 while (i < k){ 951 nexti = jl[i]; /* next row to be added to k_th row */ 952 953 /* compute multiplier */ 954 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 955 956 /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */ 957 diag = ba + i*4; 958 u = ba + ili*4; 959 uik[0] = -(diag[0]*u[0] + diag[2]*u[1]); 960 uik[1] = -(diag[1]*u[0] + diag[3]*u[1]); 961 uik[2] = -(diag[0]*u[2] + diag[2]*u[3]); 962 uik[3] = -(diag[1]*u[2] + diag[3]*u[3]); 963 964 /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */ 965 dk[0] += uik[0]*u[0] + uik[1]*u[1]; 966 dk[1] += uik[2]*u[0] + uik[3]*u[1]; 967 dk[2] += uik[0]*u[2] + uik[1]*u[3]; 968 dk[3] += uik[2]*u[2] + uik[3]*u[3]; 969 970 /* update -U(i,k): ba[ili] = uik */ 971 ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr); 972 973 /* add multiple of row i to k-th row ... */ 974 jmin = ili + 1; jmax = bi[i+1]; 975 if (jmin < jmax){ 976 for (j=jmin; j<jmax; j++) { 977 /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */ 978 rtmp_ptr = rtmp + bj[j]*4; 979 u = ba + j*4; 980 rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1]; 981 rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1]; 982 rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3]; 983 rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3]; 984 } 985 986 /* ... add i to row list for next nonzero entry */ 987 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 988 j = bj[jmin]; 989 jl[i] = jl[j]; jl[j] = i; /* update jl */ 990 } 991 i = nexti; 992 } 993 994 /* save nonzero entries in k-th row of U ... */ 995 996 /* invert diagonal block */ 997 diag = ba+k*4; 998 ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr); 999 ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr); 1000 1001 jmin = bi[k]; jmax = bi[k+1]; 1002 if (jmin < jmax) { 1003 for (j=jmin; j<jmax; j++){ 1004 vj = bj[j]; /* block col. index of U */ 1005 u = ba + j*4; 1006 rtmp_ptr = rtmp + vj*4; 1007 for (k1=0; k1<4; k1++){ 1008 *u++ = *rtmp_ptr; 1009 *rtmp_ptr++ = 0.0; 1010 } 1011 } 1012 1013 /* ... add k to row list for first nonzero entry in k-th row */ 1014 il[k] = jmin; 1015 i = bj[jmin]; 1016 jl[k] = jl[i]; jl[i] = k; 1017 } 1018 } 1019 1020 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1021 ierr = PetscFree(il);CHKERRQ(ierr); 1022 ierr = PetscFree(dk);CHKERRQ(ierr); 1023 1024 C->factor = FACTOR_CHOLESKY; 1025 C->assembled = PETSC_TRUE; 1026 C->preallocated = PETSC_TRUE; 1027 PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */ 1028 PetscFunctionReturn(0); 1029 } 1030 1031 /* 1032 Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP. 1033 Version for blocks are 1 by 1. 1034 */ 1035 #undef __FUNCT__ 1036 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1" 1037 int MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,Mat *B) 1038 { 1039 Mat C = *B; 1040 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 1041 IS ip = b->row; 1042 int *rip,ierr,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j; 1043 int *ai,*aj,*r; 1044 int k,jmin,jmax,*jl,*il,vj,nexti,ili; 1045 MatScalar *rtmp; 1046 MatScalar *ba = b->a,*aa,ak; 1047 MatScalar dk,uikdi; 1048 1049 PetscFunctionBegin; 1050 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 1051 if (!a->permute){ 1052 ai = a->i; aj = a->j; aa = a->a; 1053 } else { 1054 ai = a->inew; aj = a->jnew; 1055 ierr = PetscMalloc(ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr); 1056 ierr = PetscMemcpy(aa,a->a,ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1057 ierr = PetscMalloc(ai[mbs]*sizeof(int),&r);CHKERRQ(ierr); 1058 ierr= PetscMemcpy(r,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr); 1059 1060 jmin = ai[0]; jmax = ai[mbs]; 1061 for (j=jmin; j<jmax; j++){ 1062 while (r[j] != j){ 1063 k = r[j]; r[j] = r[k]; r[k] = k; 1064 ak = aa[k]; aa[k] = aa[j]; aa[j] = ak; 1065 } 1066 } 1067 ierr = PetscFree(r);CHKERRQ(ierr); 1068 } 1069 1070 /* initialization */ 1071 /* il and jl record the first nonzero element in each row of the accessing 1072 window U(0:k, k:mbs-1). 1073 jl: list of rows to be added to uneliminated rows 1074 i>= k: jl(i) is the first row to be added to row i 1075 i< k: jl(i) is the row following row i in some list of rows 1076 jl(i) = mbs indicates the end of a list 1077 il(i): points to the first nonzero element in columns k,...,mbs-1 of 1078 row i of U */ 1079 ierr = PetscMalloc(mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 1080 ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr); 1081 jl = il + mbs; 1082 for (i=0; i<mbs; i++) { 1083 rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0; 1084 } 1085 1086 /* for each row k */ 1087 for (k = 0; k<mbs; k++){ 1088 1089 /*initialize k-th row with elements nonzero in row perm(k) of A */ 1090 jmin = ai[rip[k]]; jmax = ai[rip[k]+1]; 1091 1092 for (j = jmin; j < jmax; j++){ 1093 vj = rip[aj[j]]; 1094 rtmp[vj] = aa[j]; 1095 } 1096 1097 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 1098 dk = rtmp[k]; 1099 i = jl[k]; /* first row to be added to k_th row */ 1100 1101 while (i < k){ 1102 nexti = jl[i]; /* next row to be added to k_th row */ 1103 1104 /* compute multiplier, update D(k) and U(i,k) */ 1105 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 1106 uikdi = - ba[ili]*ba[i]; 1107 dk += uikdi*ba[ili]; 1108 ba[ili] = uikdi; /* -U(i,k) */ 1109 1110 /* add multiple of row i to k-th row ... */ 1111 jmin = ili + 1; jmax = bi[i+1]; 1112 if (jmin < jmax){ 1113 for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j]; 1114 /* ... add i to row list for next nonzero entry */ 1115 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 1116 j = bj[jmin]; 1117 jl[i] = jl[j]; jl[j] = i; /* update jl */ 1118 } 1119 i = nexti; 1120 } 1121 1122 /* check for zero pivot and save diagoanl element */ 1123 if (dk == 0.0){ 1124 SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot"); 1125 /* 1126 } else if (PetscRealPart(dk) < 0.0){ 1127 SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Negative pivot: d[%d] = %g\n",k,dk); 1128 */ 1129 } 1130 1131 /* save nonzero entries in k-th row of U ... */ 1132 ba[k] = 1.0/dk; 1133 jmin = bi[k]; jmax = bi[k+1]; 1134 if (jmin < jmax) { 1135 for (j=jmin; j<jmax; j++){ 1136 vj = bj[j]; ba[j] = rtmp[vj]; rtmp[vj] = 0.0; 1137 } 1138 /* ... add k to row list for first nonzero entry in k-th row */ 1139 il[k] = jmin; 1140 i = bj[jmin]; 1141 jl[k] = jl[i]; jl[i] = k; 1142 } 1143 } 1144 1145 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1146 ierr = PetscFree(il);CHKERRQ(ierr); 1147 if (a->permute){ 1148 ierr = PetscFree(aa);CHKERRQ(ierr); 1149 } 1150 1151 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 1152 C->factor = FACTOR_CHOLESKY; 1153 C->assembled = PETSC_TRUE; 1154 C->preallocated = PETSC_TRUE; 1155 PetscLogFlops(b->mbs); 1156 PetscFunctionReturn(0); 1157 } 1158 1159 /* 1160 Version for when blocks are 1 by 1 Using natural ordering 1161 */ 1162 #undef __FUNCT__ 1163 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering" 1164 int MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat *B) 1165 { 1166 Mat C = *B; 1167 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data; 1168 int ierr,i,j,mbs = a->mbs; 1169 int *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 1170 int k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz,ndamp = 0; 1171 MatScalar *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval; 1172 PetscReal damping=b->factor_damping, zeropivot=b->factor_zeropivot,shift_amount; 1173 PetscTruth damp,chshift; 1174 int nshift=0; 1175 1176 PetscFunctionBegin; 1177 /* initialization */ 1178 /* il and jl record the first nonzero element in each row of the accessing 1179 window U(0:k, k:mbs-1). 1180 jl: list of rows to be added to uneliminated rows 1181 i>= k: jl(i) is the first row to be added to row i 1182 i< k: jl(i) is the row following row i in some list of rows 1183 jl(i) = mbs indicates the end of a list 1184 il(i): points to the first nonzero element in U(i,k:mbs-1) 1185 */ 1186 ierr = PetscMalloc(mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 1187 ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr); 1188 jl = il + mbs; 1189 1190 shift_amount = 0; 1191 do { 1192 damp = PETSC_FALSE; 1193 chshift = PETSC_FALSE; 1194 for (i=0; i<mbs; i++) { 1195 rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0; 1196 } 1197 1198 for (k = 0; k<mbs; k++){ /* row k */ 1199 /*initialize k-th row with elements nonzero in row perm(k) of A */ 1200 nz = ai[k+1] - ai[k]; 1201 acol = aj + ai[k]; 1202 aval = aa + ai[k]; 1203 bval = ba + bi[k]; 1204 while (nz -- ){ 1205 rtmp[*acol++] = *aval++; 1206 *bval++ = 0.0; /* for in-place factorization */ 1207 } 1208 /* damp the diagonal of the matrix */ 1209 if (ndamp||nshift) rtmp[k] += damping+shift_amount; 1210 1211 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 1212 dk = rtmp[k]; 1213 i = jl[k]; /* first row to be added to k_th row */ 1214 1215 while (i < k){ 1216 nexti = jl[i]; /* next row to be added to k_th row */ 1217 1218 /* compute multiplier, update D(k) and U(i,k) */ 1219 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 1220 uikdi = - ba[ili]*ba[bi[i]]; 1221 dk += uikdi*ba[ili]; 1222 ba[ili] = uikdi; /* -U(i,k) */ 1223 1224 /* add multiple of row i to k-th row ... */ 1225 jmin = ili + 1; 1226 nz = bi[i+1] - jmin; 1227 if (nz > 0){ 1228 bcol = bj + jmin; 1229 bval = ba + jmin; 1230 while (nz --) rtmp[*bcol++] += uikdi*(*bval++); 1231 /* ... add i to row list for next nonzero entry */ 1232 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 1233 j = bj[jmin]; 1234 jl[i] = jl[j]; jl[j] = i; /* update jl */ 1235 } 1236 i = nexti; 1237 } 1238 1239 if (PetscRealPart(dk) < zeropivot && b->factor_shift){ 1240 /* calculate a shift that would make this row diagonally dominant */ 1241 PetscReal rs = PetscAbs(PetscRealPart(dk)); 1242 jmin = bi[k]+1; 1243 nz = bi[k+1] - jmin; 1244 if (nz){ 1245 bcol = bj + jmin; 1246 bval = ba + jmin; 1247 while (nz--){ 1248 rs += PetscAbsScalar(rtmp[*bcol++]); 1249 } 1250 } 1251 /* if this shift is less than the previous, just up the previous 1252 one by a bit */ 1253 shift_amount = PetscMax(rs,1.1*shift_amount); 1254 chshift = PETSC_TRUE; 1255 /* Unlike in the ILU case there is no exit condition on nshift: 1256 we increase the shift until it converges. There is no guarantee that 1257 this algorithm converges faster or slower, or is better or worse 1258 than the ILU algorithm. */ 1259 nshift++; 1260 break; 1261 } 1262 if (PetscRealPart(dk) < zeropivot){ 1263 if (damping == (PetscReal) PETSC_DECIDE) damping = -PetscRealPart(dk)/(k+1); 1264 if (damping > 0.0) { 1265 if (ndamp) damping *= 2.0; 1266 damp = PETSC_TRUE; 1267 ndamp++; 1268 break; 1269 } else if (PetscAbsScalar(dk) < zeropivot){ 1270 SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",k,PetscRealPart(dk),zeropivot); 1271 } else { 1272 PetscLogInfo((PetscObject)A,"Negative pivot %g in row %d of Cholesky factorization\n",PetscRealPart(dk),k); 1273 } 1274 } 1275 1276 /* save nonzero entries in k-th row of U ... */ 1277 /* printf("%d, dk: %g, 1/dk: %g\n",k,dk,1/dk); */ 1278 ba[bi[k]] = 1.0/dk; 1279 jmin = bi[k]+1; 1280 nz = bi[k+1] - jmin; 1281 if (nz){ 1282 bcol = bj + jmin; 1283 bval = ba + jmin; 1284 while (nz--){ 1285 *bval++ = rtmp[*bcol]; 1286 rtmp[*bcol++] = 0.0; 1287 } 1288 /* ... add k to row list for first nonzero entry in k-th row */ 1289 il[k] = jmin; 1290 i = bj[jmin]; 1291 jl[k] = jl[i]; jl[i] = k; 1292 } 1293 } /* end of for (k = 0; k<mbs; k++) */ 1294 } while (damp||chshift); 1295 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1296 ierr = PetscFree(il);CHKERRQ(ierr); 1297 1298 C->factor = FACTOR_CHOLESKY; 1299 C->assembled = PETSC_TRUE; 1300 C->preallocated = PETSC_TRUE; 1301 PetscLogFlops(b->mbs); 1302 if (ndamp) { 1303 PetscLogInfo(0,"MatCholeskyFactorNumerical_SeqSBAIJ_1_NaturalOrdering: number of damping tries %d damping value %g\n",ndamp,damping); 1304 } 1305 if (nshift) { 1306 PetscLogInfo(0,"MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering diagonal shifted %d shifts\n",nshift); 1307 } 1308 1309 PetscFunctionReturn(0); 1310 } 1311 1312 #undef __FUNCT__ 1313 #define __FUNCT__ "MatCholeskyFactor_SeqSBAIJ" 1314 int MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info) 1315 { 1316 int ierr; 1317 Mat C; 1318 1319 PetscFunctionBegin; 1320 ierr = MatCholeskyFactorSymbolic(A,perm,info,&C);CHKERRQ(ierr); 1321 ierr = MatCholeskyFactorNumeric(A,&C);CHKERRQ(ierr); 1322 ierr = MatHeaderCopy(A,C);CHKERRQ(ierr); 1323 PetscFunctionReturn(0); 1324 } 1325 1326 1327