1 /*$Id: sbaijfact.c,v 1.61 2001/08/06 21:15:47 bsmith Exp $*/ 2 3 #include "src/mat/impls/baij/seq/baij.h" 4 #include "src/mat/impls/sbaij/seq/sbaij.h" 5 #include "src/inline/ilu.h" 6 #include "include/petscis.h" 7 8 #if !defined(PETSC_USE_COMPLEX) 9 /* 10 input: 11 F -- numeric factor 12 output: 13 nneg, nzero, npos: matrix inertia 14 */ 15 16 #undef __FUNCT__ 17 #define __FUNCT__ "MatGetInertia_SeqSBAIJ" 18 int MatGetInertia_SeqSBAIJ(Mat F,int *nneig,int *nzero,int *npos) 19 { 20 Mat_SeqSBAIJ *fact_ptr = (Mat_SeqSBAIJ*)F->data; 21 PetscScalar *dd = fact_ptr->a; 22 int mbs=fact_ptr->mbs,bs=fact_ptr->bs,i,nneig_tmp,npos_tmp, 23 *fi = fact_ptr->i; 24 25 PetscFunctionBegin; 26 if (bs != 1) SETERRQ1(PETSC_ERR_SUP,"No support for bs: %d >1 yet",bs); 27 nneig_tmp = 0; npos_tmp = 0; 28 for (i=0; i<mbs; i++){ 29 if (PetscRealPart(dd[*fi]) > 0.0){ 30 npos_tmp++; 31 } else if (PetscRealPart(dd[*fi]) < 0.0){ 32 nneig_tmp++; 33 } 34 fi++; 35 } 36 if (nneig) *nneig = nneig_tmp; 37 if (npos) *npos = npos_tmp; 38 if (nzero) *nzero = mbs - nneig_tmp - npos_tmp; 39 40 PetscFunctionReturn(0); 41 } 42 #endif /* !defined(PETSC_USE_COMPLEX) */ 43 44 /* Using Modified Sparse Row (MSR) storage. 45 See page 85, "Iterative Methods ..." by Saad. */ 46 /* 47 Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP. 48 */ 49 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */ 50 #undef __FUNCT__ 51 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ" 52 int MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info,Mat *B) 53 { 54 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 55 int *rip,ierr,i,mbs = a->mbs,*ai,*aj; 56 int *jutmp,bs = a->bs,bs2=a->bs2; 57 int m,realloc = 0,prow; 58 int *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd; 59 int *il,ili,nextprow; 60 PetscReal f = info->fill; 61 PetscTruth perm_identity; 62 63 PetscFunctionBegin; 64 /* check whether perm is the identity mapping */ 65 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 66 67 /* -- inplace factorization, i.e., use sbaij for *B -- */ 68 if (perm_identity && bs==1 ){ 69 if (!perm_identity) a->permute = PETSC_TRUE; 70 71 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 72 73 if (perm_identity){ /* without permutation */ 74 ai = a->i; aj = a->j; 75 } else { /* non-trivial permutation */ 76 ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr); 77 ai = a->inew; aj = a->jnew; 78 } 79 80 /* initialization */ 81 ierr = PetscMalloc((mbs+1)*sizeof(int),&iu);CHKERRQ(ierr); 82 umax = (int)(f*ai[mbs] + 1); 83 ierr = PetscMalloc(umax*sizeof(int),&ju);CHKERRQ(ierr); 84 iu[0] = 0; 85 juidx = 0; /* index for ju */ 86 ierr = PetscMalloc((3*mbs+1)*sizeof(int),&jl);CHKERRQ(ierr); /* linked list for getting pivot row */ 87 q = jl + mbs; /* linked list for col index of active row */ 88 il = q + mbs; 89 for (i=0; i<mbs; i++){ 90 jl[i] = mbs; 91 q[i] = 0; 92 il[i] = 0; 93 } 94 95 /* for each row k */ 96 for (k=0; k<mbs; k++){ 97 nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */ 98 q[k] = mbs; 99 /* initialize nonzero structure of k-th row to row rip[k] of A */ 100 jmin = ai[rip[k]] +1; /* exclude diag[k] */ 101 jmax = ai[rip[k]+1]; 102 for (j=jmin; j<jmax; j++){ 103 vj = rip[aj[j]]; /* col. value */ 104 if(vj > k){ 105 qm = k; 106 do { 107 m = qm; qm = q[m]; 108 } while(qm < vj); 109 if (qm == vj) { 110 SETERRQ(1," error: duplicate entry in A\n"); 111 } 112 nzk++; 113 q[m] = vj; 114 q[vj] = qm; 115 } /* if(vj > k) */ 116 } /* for (j=jmin; j<jmax; j++) */ 117 118 /* modify nonzero structure of k-th row by computing fill-in 119 for each row i to be merged in */ 120 prow = k; 121 prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */ 122 123 while (prow < k){ 124 nextprow = jl[prow]; 125 126 /* merge row prow into k-th row */ 127 ili = il[prow]; 128 jmin = ili + 1; /* points to 2nd nzero entry in U(prow,k:mbs-1) */ 129 jmax = iu[prow+1]; 130 qm = k; 131 for (j=jmin; j<jmax; j++){ 132 vj = ju[j]; 133 do { 134 m = qm; qm = q[m]; 135 } while (qm < vj); 136 if (qm != vj){ /* a fill */ 137 nzk++; q[m] = vj; q[vj] = qm; qm = vj; 138 } 139 } /* end of for (j=jmin; j<jmax; j++) */ 140 if (jmin < jmax){ 141 il[prow] = jmin; 142 j = ju[jmin]; 143 jl[prow] = jl[j]; jl[j] = prow; /* update jl */ 144 } 145 prow = nextprow; 146 } 147 148 /* update il and jl */ 149 if (nzk > 0){ 150 i = q[k]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */ 151 jl[k] = jl[i]; jl[i] = k; 152 il[k] = iu[k] + 1; 153 } 154 iu[k+1] = iu[k] + nzk + 1; /* include diag[k] */ 155 156 /* allocate more space to ju if needed */ 157 if (iu[k+1] > umax) { 158 /* estimate how much additional space we will need */ 159 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 160 /* just double the memory each time */ 161 maxadd = umax; 162 if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2; 163 umax += maxadd; 164 165 /* allocate a longer ju */ 166 ierr = PetscMalloc(umax*sizeof(int),&jutmp);CHKERRQ(ierr); 167 ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));CHKERRQ(ierr); 168 ierr = PetscFree(ju);CHKERRQ(ierr); 169 ju = jutmp; 170 realloc++; /* count how many times we realloc */ 171 } 172 173 /* save nonzero structure of k-th row in ju */ 174 ju[juidx++] = k; /* diag[k] */ 175 i = k; 176 while (nzk --) { 177 i = q[i]; 178 ju[juidx++] = i; 179 } 180 } 181 182 if (ai[mbs] != 0) { 183 PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 184 PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af); 185 PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af); 186 PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af); 187 PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n"); 188 } else { 189 PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n"); 190 } 191 192 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 193 /* ierr = PetscFree(q);CHKERRQ(ierr); */ 194 ierr = PetscFree(jl);CHKERRQ(ierr); 195 196 /* put together the new matrix */ 197 ierr = MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);CHKERRQ(ierr); 198 /* PetscLogObjectParent(*B,iperm); */ 199 b = (Mat_SeqSBAIJ*)(*B)->data; 200 ierr = PetscFree(b->imax);CHKERRQ(ierr); 201 b->singlemalloc = PETSC_FALSE; 202 /* the next line frees the default space generated by the Create() */ 203 ierr = PetscFree(b->a);CHKERRQ(ierr); 204 ierr = PetscFree(b->ilen);CHKERRQ(ierr); 205 ierr = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr); 206 b->j = ju; 207 b->i = iu; 208 b->diag = 0; 209 b->ilen = 0; 210 b->imax = 0; 211 b->row = perm; 212 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 213 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 214 b->icol = perm; 215 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 216 ierr = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 217 /* In b structure: Free imax, ilen, old a, old j. 218 Allocate idnew, solve_work, new a, new j */ 219 PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar))); 220 b->maxnz = b->nz = iu[mbs]; 221 222 (*B)->factor = FACTOR_CHOLESKY; 223 (*B)->info.factor_mallocs = realloc; 224 (*B)->info.fill_ratio_given = f; 225 if (ai[mbs] != 0) { 226 (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 227 } else { 228 (*B)->info.fill_ratio_needed = 0.0; 229 } 230 231 232 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering; 233 (*B)->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering; 234 PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n"); 235 236 PetscFunctionReturn(0); 237 } 238 /* ----------- end of new code --------------------*/ 239 240 241 if (!perm_identity) a->permute = PETSC_TRUE; 242 243 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 244 245 if (perm_identity){ /* without permutation */ 246 ai = a->i; aj = a->j; 247 } else { /* non-trivial permutation */ 248 ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr); 249 ai = a->inew; aj = a->jnew; 250 } 251 252 /* initialization */ 253 ierr = PetscMalloc((mbs+1)*sizeof(int),&iu);CHKERRQ(ierr); 254 umax = (int)(f*ai[mbs] + 1); umax += mbs + 1; 255 ierr = PetscMalloc(umax*sizeof(int),&ju);CHKERRQ(ierr); 256 iu[0] = mbs+1; 257 juidx = mbs + 1; /* index for ju */ 258 ierr = PetscMalloc(2*mbs*sizeof(int),&jl);CHKERRQ(ierr); /* linked list for pivot row */ 259 q = jl + mbs; /* linked list for col index */ 260 for (i=0; i<mbs; i++){ 261 jl[i] = mbs; 262 q[i] = 0; 263 } 264 265 /* for each row k */ 266 for (k=0; k<mbs; k++){ 267 for (i=0; i<mbs; i++) q[i] = 0; /* to be removed! */ 268 nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */ 269 q[k] = mbs; 270 /* initialize nonzero structure of k-th row to row rip[k] of A */ 271 jmin = ai[rip[k]] +1; /* exclude diag[k] */ 272 jmax = ai[rip[k]+1]; 273 for (j=jmin; j<jmax; j++){ 274 vj = rip[aj[j]]; /* col. value */ 275 if(vj > k){ 276 qm = k; 277 do { 278 m = qm; qm = q[m]; 279 } while(qm < vj); 280 if (qm == vj) { 281 SETERRQ(1," error: duplicate entry in A\n"); 282 } 283 nzk++; 284 q[m] = vj; 285 q[vj] = qm; 286 } /* if(vj > k) */ 287 } /* for (j=jmin; j<jmax; j++) */ 288 289 /* modify nonzero structure of k-th row by computing fill-in 290 for each row i to be merged in */ 291 prow = k; 292 prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */ 293 294 while (prow < k){ 295 /* merge row prow into k-th row */ 296 jmin = iu[prow] + 1; jmax = iu[prow+1]; 297 qm = k; 298 for (j=jmin; j<jmax; j++){ 299 vj = ju[j]; 300 do { 301 m = qm; qm = q[m]; 302 } while (qm < vj); 303 if (qm != vj){ 304 nzk++; q[m] = vj; q[vj] = qm; qm = vj; 305 } 306 } 307 prow = jl[prow]; /* next pivot row */ 308 } 309 310 /* add k to row list for first nonzero element in k-th row */ 311 if (nzk > 0){ 312 i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */ 313 jl[k] = jl[i]; jl[i] = k; 314 } 315 iu[k+1] = iu[k] + nzk; 316 317 /* allocate more space to ju if needed */ 318 if (iu[k+1] > umax) { 319 /* estimate how much additional space we will need */ 320 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 321 /* just double the memory each time */ 322 maxadd = umax; 323 if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2; 324 umax += maxadd; 325 326 /* allocate a longer ju */ 327 ierr = PetscMalloc(umax*sizeof(int),&jutmp);CHKERRQ(ierr); 328 ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));CHKERRQ(ierr); 329 ierr = PetscFree(ju);CHKERRQ(ierr); 330 ju = jutmp; 331 realloc++; /* count how many times we realloc */ 332 } 333 334 /* save nonzero structure of k-th row in ju */ 335 i=k; 336 while (nzk --) { 337 i = q[i]; 338 ju[juidx++] = i; 339 } 340 } 341 342 if (ai[mbs] != 0) { 343 PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 344 PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af); 345 PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af); 346 PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af); 347 PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n"); 348 } else { 349 PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n"); 350 } 351 352 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 353 /* ierr = PetscFree(q);CHKERRQ(ierr); */ 354 ierr = PetscFree(jl);CHKERRQ(ierr); 355 356 /* put together the new matrix */ 357 ierr = MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);CHKERRQ(ierr); 358 /* PetscLogObjectParent(*B,iperm); */ 359 b = (Mat_SeqSBAIJ*)(*B)->data; 360 ierr = PetscFree(b->imax);CHKERRQ(ierr); 361 b->singlemalloc = PETSC_FALSE; 362 /* the next line frees the default space generated by the Create() */ 363 ierr = PetscFree(b->a);CHKERRQ(ierr); 364 ierr = PetscFree(b->ilen);CHKERRQ(ierr); 365 ierr = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr); 366 b->j = ju; 367 b->i = iu; 368 b->diag = 0; 369 b->ilen = 0; 370 b->imax = 0; 371 b->row = perm; 372 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 373 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 374 b->icol = perm; 375 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 376 ierr = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 377 /* In b structure: Free imax, ilen, old a, old j. 378 Allocate idnew, solve_work, new a, new j */ 379 PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar))); 380 b->maxnz = b->nz = iu[mbs]; 381 382 (*B)->factor = FACTOR_CHOLESKY; 383 (*B)->info.factor_mallocs = realloc; 384 (*B)->info.fill_ratio_given = f; 385 if (ai[mbs] != 0) { 386 (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 387 } else { 388 (*B)->info.fill_ratio_needed = 0.0; 389 } 390 391 if (perm_identity){ 392 switch (bs) { 393 case 1: 394 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering; 395 (*B)->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering; 396 PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n"); 397 break; 398 case 2: 399 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering; 400 (*B)->ops->solve = MatSolve_SeqSBAIJ_2_NaturalOrdering; 401 PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n"); 402 break; 403 case 3: 404 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering; 405 (*B)->ops->solve = MatSolve_SeqSBAIJ_3_NaturalOrdering; 406 PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n"); 407 break; 408 case 4: 409 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering; 410 (*B)->ops->solve = MatSolve_SeqSBAIJ_4_NaturalOrdering; 411 PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n"); 412 break; 413 case 5: 414 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering; 415 (*B)->ops->solve = MatSolve_SeqSBAIJ_5_NaturalOrdering; 416 PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n"); 417 break; 418 case 6: 419 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering; 420 (*B)->ops->solve = MatSolve_SeqSBAIJ_6_NaturalOrdering; 421 PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n"); 422 break; 423 case 7: 424 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering; 425 (*B)->ops->solve = MatSolve_SeqSBAIJ_7_NaturalOrdering; 426 PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n"); 427 break; 428 default: 429 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering; 430 (*B)->ops->solve = MatSolve_SeqSBAIJ_N_NaturalOrdering; 431 PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n"); 432 break; 433 } 434 } 435 436 PetscFunctionReturn(0); 437 } 438 439 440 #undef __FUNCT__ 441 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N" 442 int MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,Mat *B) 443 { 444 Mat C = *B; 445 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 446 IS perm = b->row; 447 int *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 448 int *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 449 int bs=a->bs,bs2 = a->bs2; 450 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 451 MatScalar *u,*diag,*rtmp,*rtmp_ptr; 452 MatScalar *work; 453 int *pivots; 454 455 PetscFunctionBegin; 456 457 /* initialization */ 458 ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 459 ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr); 460 ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr); 461 jl = il + mbs; 462 for (i=0; i<mbs; i++) { 463 jl[i] = mbs; il[0] = 0; 464 } 465 ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr); 466 uik = dk + bs2; 467 work = uik + bs2; 468 ierr = PetscMalloc(bs*sizeof(int),&pivots);CHKERRQ(ierr); 469 470 ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr); 471 472 /* check permutation */ 473 if (!a->permute){ 474 ai = a->i; aj = a->j; aa = a->a; 475 } else { 476 ai = a->inew; aj = a->jnew; 477 ierr = PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr); 478 ierr = PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr); 479 ierr = PetscMalloc(ai[mbs]*sizeof(int),&a2anew);CHKERRQ(ierr); 480 ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr); 481 482 for (i=0; i<mbs; i++){ 483 jmin = ai[i]; jmax = ai[i+1]; 484 for (j=jmin; j<jmax; j++){ 485 while (a2anew[j] != j){ 486 k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k; 487 for (k1=0; k1<bs2; k1++){ 488 dk[k1] = aa[k*bs2+k1]; 489 aa[k*bs2+k1] = aa[j*bs2+k1]; 490 aa[j*bs2+k1] = dk[k1]; 491 } 492 } 493 /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */ 494 if (i > aj[j]){ 495 /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */ 496 ap = aa + j*bs2; /* ptr to the beginning of j-th block of aa */ 497 for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */ 498 for (k=0; k<bs; k++){ /* j-th block of aa <- dk^T */ 499 for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1]; 500 } 501 } 502 } 503 } 504 ierr = PetscFree(a2anew);CHKERRQ(ierr); 505 } 506 507 /* for each row k */ 508 for (k = 0; k<mbs; k++){ 509 510 /*initialize k-th row with elements nonzero in row perm(k) of A */ 511 jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1]; 512 513 ap = aa + jmin*bs2; 514 for (j = jmin; j < jmax; j++){ 515 vj = perm_ptr[aj[j]]; /* block col. index */ 516 rtmp_ptr = rtmp + vj*bs2; 517 for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++; 518 } 519 520 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 521 ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 522 i = jl[k]; /* first row to be added to k_th row */ 523 524 while (i < k){ 525 nexti = jl[i]; /* next row to be added to k_th row */ 526 527 /* compute multiplier */ 528 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 529 530 /* uik = -inv(Di)*U_bar(i,k) */ 531 diag = ba + i*bs2; 532 u = ba + ili*bs2; 533 ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr); 534 Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u); 535 536 /* update D(k) += -U(i,k)^T * U_bar(i,k) */ 537 Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u); 538 539 /* update -U(i,k) */ 540 ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr); 541 542 /* add multiple of row i to k-th row ... */ 543 jmin = ili + 1; jmax = bi[i+1]; 544 if (jmin < jmax){ 545 for (j=jmin; j<jmax; j++) { 546 /* rtmp += -U(i,k)^T * U_bar(i,j) */ 547 rtmp_ptr = rtmp + bj[j]*bs2; 548 u = ba + j*bs2; 549 Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u); 550 } 551 552 /* ... add i to row list for next nonzero entry */ 553 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 554 j = bj[jmin]; 555 jl[i] = jl[j]; jl[j] = i; /* update jl */ 556 } 557 i = nexti; 558 } 559 560 /* save nonzero entries in k-th row of U ... */ 561 562 /* invert diagonal block */ 563 diag = ba+k*bs2; 564 ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr); 565 Kernel_A_gets_inverse_A(bs,diag,pivots,work); 566 567 jmin = bi[k]; jmax = bi[k+1]; 568 if (jmin < jmax) { 569 for (j=jmin; j<jmax; j++){ 570 vj = bj[j]; /* block col. index of U */ 571 u = ba + j*bs2; 572 rtmp_ptr = rtmp + vj*bs2; 573 for (k1=0; k1<bs2; k1++){ 574 *u++ = *rtmp_ptr; 575 *rtmp_ptr++ = 0.0; 576 } 577 } 578 579 /* ... add k to row list for first nonzero entry in k-th row */ 580 il[k] = jmin; 581 i = bj[jmin]; 582 jl[k] = jl[i]; jl[i] = k; 583 } 584 } 585 586 ierr = PetscFree(rtmp);CHKERRQ(ierr); 587 ierr = PetscFree(il);CHKERRQ(ierr); 588 ierr = PetscFree(dk);CHKERRQ(ierr); 589 ierr = PetscFree(pivots);CHKERRQ(ierr); 590 if (a->permute){ 591 ierr = PetscFree(aa);CHKERRQ(ierr); 592 } 593 594 ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr); 595 C->factor = FACTOR_CHOLESKY; 596 C->assembled = PETSC_TRUE; 597 C->preallocated = PETSC_TRUE; 598 PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */ 599 PetscFunctionReturn(0); 600 } 601 602 #undef __FUNCT__ 603 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering" 604 int MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat A,Mat *B) 605 { 606 Mat C = *B; 607 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 608 int ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 609 int *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 610 int bs=a->bs,bs2 = a->bs2; 611 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 612 MatScalar *u,*diag,*rtmp,*rtmp_ptr; 613 MatScalar *work; 614 int *pivots; 615 616 PetscFunctionBegin; 617 618 /* initialization */ 619 620 ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 621 ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr); 622 ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr); 623 jl = il + mbs; 624 for (i=0; i<mbs; i++) { 625 jl[i] = mbs; il[0] = 0; 626 } 627 ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr); 628 uik = dk + bs2; 629 work = uik + bs2; 630 ierr = PetscMalloc(bs*sizeof(int),&pivots);CHKERRQ(ierr); 631 632 ai = a->i; aj = a->j; aa = a->a; 633 634 /* for each row k */ 635 for (k = 0; k<mbs; k++){ 636 637 /*initialize k-th row with elements nonzero in row k of A */ 638 jmin = ai[k]; jmax = ai[k+1]; 639 ap = aa + jmin*bs2; 640 for (j = jmin; j < jmax; j++){ 641 vj = aj[j]; /* block col. index */ 642 rtmp_ptr = rtmp + vj*bs2; 643 for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++; 644 } 645 646 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 647 ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 648 i = jl[k]; /* first row to be added to k_th row */ 649 650 while (i < k){ 651 nexti = jl[i]; /* next row to be added to k_th row */ 652 653 /* compute multiplier */ 654 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 655 656 /* uik = -inv(Di)*U_bar(i,k) */ 657 diag = ba + i*bs2; 658 u = ba + ili*bs2; 659 ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr); 660 Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u); 661 662 /* update D(k) += -U(i,k)^T * U_bar(i,k) */ 663 Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u); 664 665 /* update -U(i,k) */ 666 ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr); 667 668 /* add multiple of row i to k-th row ... */ 669 jmin = ili + 1; jmax = bi[i+1]; 670 if (jmin < jmax){ 671 for (j=jmin; j<jmax; j++) { 672 /* rtmp += -U(i,k)^T * U_bar(i,j) */ 673 rtmp_ptr = rtmp + bj[j]*bs2; 674 u = ba + j*bs2; 675 Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u); 676 } 677 678 /* ... add i to row list for next nonzero entry */ 679 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 680 j = bj[jmin]; 681 jl[i] = jl[j]; jl[j] = i; /* update jl */ 682 } 683 i = nexti; 684 } 685 686 /* save nonzero entries in k-th row of U ... */ 687 688 /* invert diagonal block */ 689 diag = ba+k*bs2; 690 ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr); 691 Kernel_A_gets_inverse_A(bs,diag,pivots,work); 692 693 jmin = bi[k]; jmax = bi[k+1]; 694 if (jmin < jmax) { 695 for (j=jmin; j<jmax; j++){ 696 vj = bj[j]; /* block col. index of U */ 697 u = ba + j*bs2; 698 rtmp_ptr = rtmp + vj*bs2; 699 for (k1=0; k1<bs2; k1++){ 700 *u++ = *rtmp_ptr; 701 *rtmp_ptr++ = 0.0; 702 } 703 } 704 705 /* ... add k to row list for first nonzero entry in k-th row */ 706 il[k] = jmin; 707 i = bj[jmin]; 708 jl[k] = jl[i]; jl[i] = k; 709 } 710 } 711 712 ierr = PetscFree(rtmp);CHKERRQ(ierr); 713 ierr = PetscFree(il);CHKERRQ(ierr); 714 ierr = PetscFree(dk);CHKERRQ(ierr); 715 ierr = PetscFree(pivots);CHKERRQ(ierr); 716 717 C->factor = FACTOR_CHOLESKY; 718 C->assembled = PETSC_TRUE; 719 C->preallocated = PETSC_TRUE; 720 PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */ 721 PetscFunctionReturn(0); 722 } 723 724 /* 725 Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP. 726 Version for blocks 2 by 2. 727 */ 728 #undef __FUNCT__ 729 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2" 730 int MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,Mat *B) 731 { 732 Mat C = *B; 733 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 734 IS perm = b->row; 735 int *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 736 int *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 737 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 738 MatScalar *u,*diag,*rtmp,*rtmp_ptr; 739 740 PetscFunctionBegin; 741 742 /* initialization */ 743 /* il and jl record the first nonzero element in each row of the accessing 744 window U(0:k, k:mbs-1). 745 jl: list of rows to be added to uneliminated rows 746 i>= k: jl(i) is the first row to be added to row i 747 i< k: jl(i) is the row following row i in some list of rows 748 jl(i) = mbs indicates the end of a list 749 il(i): points to the first nonzero element in columns k,...,mbs-1 of 750 row i of U */ 751 ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 752 ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr); 753 ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr); 754 jl = il + mbs; 755 for (i=0; i<mbs; i++) { 756 jl[i] = mbs; il[0] = 0; 757 } 758 ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr); 759 uik = dk + 4; 760 ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr); 761 762 /* check permutation */ 763 if (!a->permute){ 764 ai = a->i; aj = a->j; aa = a->a; 765 } else { 766 ai = a->inew; aj = a->jnew; 767 ierr = PetscMalloc(4*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr); 768 ierr = PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr); 769 ierr = PetscMalloc(ai[mbs]*sizeof(int),&a2anew);CHKERRQ(ierr); 770 ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr); 771 772 for (i=0; i<mbs; i++){ 773 jmin = ai[i]; jmax = ai[i+1]; 774 for (j=jmin; j<jmax; j++){ 775 while (a2anew[j] != j){ 776 k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k; 777 for (k1=0; k1<4; k1++){ 778 dk[k1] = aa[k*4+k1]; 779 aa[k*4+k1] = aa[j*4+k1]; 780 aa[j*4+k1] = dk[k1]; 781 } 782 } 783 /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */ 784 if (i > aj[j]){ 785 /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */ 786 ap = aa + j*4; /* ptr to the beginning of the block */ 787 dk[1] = ap[1]; /* swap ap[1] and ap[2] */ 788 ap[1] = ap[2]; 789 ap[2] = dk[1]; 790 } 791 } 792 } 793 ierr = PetscFree(a2anew);CHKERRQ(ierr); 794 } 795 796 /* for each row k */ 797 for (k = 0; k<mbs; k++){ 798 799 /*initialize k-th row with elements nonzero in row perm(k) of A */ 800 jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1]; 801 ap = aa + jmin*4; 802 for (j = jmin; j < jmax; j++){ 803 vj = perm_ptr[aj[j]]; /* block col. index */ 804 rtmp_ptr = rtmp + vj*4; 805 for (i=0; i<4; i++) *rtmp_ptr++ = *ap++; 806 } 807 808 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 809 ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr); 810 i = jl[k]; /* first row to be added to k_th row */ 811 812 while (i < k){ 813 nexti = jl[i]; /* next row to be added to k_th row */ 814 815 /* compute multiplier */ 816 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 817 818 /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */ 819 diag = ba + i*4; 820 u = ba + ili*4; 821 uik[0] = -(diag[0]*u[0] + diag[2]*u[1]); 822 uik[1] = -(diag[1]*u[0] + diag[3]*u[1]); 823 uik[2] = -(diag[0]*u[2] + diag[2]*u[3]); 824 uik[3] = -(diag[1]*u[2] + diag[3]*u[3]); 825 826 /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */ 827 dk[0] += uik[0]*u[0] + uik[1]*u[1]; 828 dk[1] += uik[2]*u[0] + uik[3]*u[1]; 829 dk[2] += uik[0]*u[2] + uik[1]*u[3]; 830 dk[3] += uik[2]*u[2] + uik[3]*u[3]; 831 832 /* update -U(i,k): ba[ili] = uik */ 833 ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr); 834 835 /* add multiple of row i to k-th row ... */ 836 jmin = ili + 1; jmax = bi[i+1]; 837 if (jmin < jmax){ 838 for (j=jmin; j<jmax; j++) { 839 /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */ 840 rtmp_ptr = rtmp + bj[j]*4; 841 u = ba + j*4; 842 rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1]; 843 rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1]; 844 rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3]; 845 rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3]; 846 } 847 848 /* ... add i to row list for next nonzero entry */ 849 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 850 j = bj[jmin]; 851 jl[i] = jl[j]; jl[j] = i; /* update jl */ 852 } 853 i = nexti; 854 } 855 856 /* save nonzero entries in k-th row of U ... */ 857 858 /* invert diagonal block */ 859 diag = ba+k*4; 860 ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr); 861 ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr); 862 863 jmin = bi[k]; jmax = bi[k+1]; 864 if (jmin < jmax) { 865 for (j=jmin; j<jmax; j++){ 866 vj = bj[j]; /* block col. index of U */ 867 u = ba + j*4; 868 rtmp_ptr = rtmp + vj*4; 869 for (k1=0; k1<4; k1++){ 870 *u++ = *rtmp_ptr; 871 *rtmp_ptr++ = 0.0; 872 } 873 } 874 875 /* ... add k to row list for first nonzero entry in k-th row */ 876 il[k] = jmin; 877 i = bj[jmin]; 878 jl[k] = jl[i]; jl[i] = k; 879 } 880 } 881 882 ierr = PetscFree(rtmp);CHKERRQ(ierr); 883 ierr = PetscFree(il);CHKERRQ(ierr); 884 ierr = PetscFree(dk);CHKERRQ(ierr); 885 if (a->permute) { 886 ierr = PetscFree(aa);CHKERRQ(ierr); 887 } 888 ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr); 889 C->factor = FACTOR_CHOLESKY; 890 C->assembled = PETSC_TRUE; 891 C->preallocated = PETSC_TRUE; 892 PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */ 893 PetscFunctionReturn(0); 894 } 895 896 /* 897 Version for when blocks are 2 by 2 Using natural ordering 898 */ 899 #undef __FUNCT__ 900 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering" 901 int MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,Mat *B) 902 { 903 Mat C = *B; 904 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 905 int ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 906 int *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 907 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 908 MatScalar *u,*diag,*rtmp,*rtmp_ptr; 909 910 PetscFunctionBegin; 911 912 /* initialization */ 913 /* il and jl record the first nonzero element in each row of the accessing 914 window U(0:k, k:mbs-1). 915 jl: list of rows to be added to uneliminated rows 916 i>= k: jl(i) is the first row to be added to row i 917 i< k: jl(i) is the row following row i in some list of rows 918 jl(i) = mbs indicates the end of a list 919 il(i): points to the first nonzero element in columns k,...,mbs-1 of 920 row i of U */ 921 ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 922 ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr); 923 ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr); 924 jl = il + mbs; 925 for (i=0; i<mbs; i++) { 926 jl[i] = mbs; il[0] = 0; 927 } 928 ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr); 929 uik = dk + 4; 930 931 ai = a->i; aj = a->j; aa = a->a; 932 933 /* for each row k */ 934 for (k = 0; k<mbs; k++){ 935 936 /*initialize k-th row with elements nonzero in row k of A */ 937 jmin = ai[k]; jmax = ai[k+1]; 938 ap = aa + jmin*4; 939 for (j = jmin; j < jmax; j++){ 940 vj = aj[j]; /* block col. index */ 941 rtmp_ptr = rtmp + vj*4; 942 for (i=0; i<4; i++) *rtmp_ptr++ = *ap++; 943 } 944 945 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 946 ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr); 947 i = jl[k]; /* first row to be added to k_th row */ 948 949 while (i < k){ 950 nexti = jl[i]; /* next row to be added to k_th row */ 951 952 /* compute multiplier */ 953 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 954 955 /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */ 956 diag = ba + i*4; 957 u = ba + ili*4; 958 uik[0] = -(diag[0]*u[0] + diag[2]*u[1]); 959 uik[1] = -(diag[1]*u[0] + diag[3]*u[1]); 960 uik[2] = -(diag[0]*u[2] + diag[2]*u[3]); 961 uik[3] = -(diag[1]*u[2] + diag[3]*u[3]); 962 963 /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */ 964 dk[0] += uik[0]*u[0] + uik[1]*u[1]; 965 dk[1] += uik[2]*u[0] + uik[3]*u[1]; 966 dk[2] += uik[0]*u[2] + uik[1]*u[3]; 967 dk[3] += uik[2]*u[2] + uik[3]*u[3]; 968 969 /* update -U(i,k): ba[ili] = uik */ 970 ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr); 971 972 /* add multiple of row i to k-th row ... */ 973 jmin = ili + 1; jmax = bi[i+1]; 974 if (jmin < jmax){ 975 for (j=jmin; j<jmax; j++) { 976 /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */ 977 rtmp_ptr = rtmp + bj[j]*4; 978 u = ba + j*4; 979 rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1]; 980 rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1]; 981 rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3]; 982 rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3]; 983 } 984 985 /* ... add i to row list for next nonzero entry */ 986 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 987 j = bj[jmin]; 988 jl[i] = jl[j]; jl[j] = i; /* update jl */ 989 } 990 i = nexti; 991 } 992 993 /* save nonzero entries in k-th row of U ... */ 994 995 /* invert diagonal block */ 996 diag = ba+k*4; 997 ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr); 998 ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr); 999 1000 jmin = bi[k]; jmax = bi[k+1]; 1001 if (jmin < jmax) { 1002 for (j=jmin; j<jmax; j++){ 1003 vj = bj[j]; /* block col. index of U */ 1004 u = ba + j*4; 1005 rtmp_ptr = rtmp + vj*4; 1006 for (k1=0; k1<4; k1++){ 1007 *u++ = *rtmp_ptr; 1008 *rtmp_ptr++ = 0.0; 1009 } 1010 } 1011 1012 /* ... add k to row list for first nonzero entry in k-th row */ 1013 il[k] = jmin; 1014 i = bj[jmin]; 1015 jl[k] = jl[i]; jl[i] = k; 1016 } 1017 } 1018 1019 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1020 ierr = PetscFree(il);CHKERRQ(ierr); 1021 ierr = PetscFree(dk);CHKERRQ(ierr); 1022 1023 C->factor = FACTOR_CHOLESKY; 1024 C->assembled = PETSC_TRUE; 1025 C->preallocated = PETSC_TRUE; 1026 PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */ 1027 PetscFunctionReturn(0); 1028 } 1029 1030 /* 1031 Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP. 1032 Version for blocks are 1 by 1. 1033 */ 1034 #undef __FUNCT__ 1035 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1" 1036 int MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,Mat *B) 1037 { 1038 Mat C = *B; 1039 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 1040 IS ip = b->row; 1041 int *rip,ierr,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j; 1042 int *ai,*aj,*r; 1043 int k,jmin,jmax,*jl,*il,vj,nexti,ili; 1044 MatScalar *rtmp; 1045 MatScalar *ba = b->a,*aa,ak; 1046 MatScalar dk,uikdi; 1047 1048 PetscFunctionBegin; 1049 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 1050 if (!a->permute){ 1051 ai = a->i; aj = a->j; aa = a->a; 1052 } else { 1053 ai = a->inew; aj = a->jnew; 1054 ierr = PetscMalloc(ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr); 1055 ierr = PetscMemcpy(aa,a->a,ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1056 ierr = PetscMalloc(ai[mbs]*sizeof(int),&r);CHKERRQ(ierr); 1057 ierr= PetscMemcpy(r,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr); 1058 1059 jmin = ai[0]; jmax = ai[mbs]; 1060 for (j=jmin; j<jmax; j++){ 1061 while (r[j] != j){ 1062 k = r[j]; r[j] = r[k]; r[k] = k; 1063 ak = aa[k]; aa[k] = aa[j]; aa[j] = ak; 1064 } 1065 } 1066 ierr = PetscFree(r);CHKERRQ(ierr); 1067 } 1068 1069 /* initialization */ 1070 /* il and jl record the first nonzero element in each row of the accessing 1071 window U(0:k, k:mbs-1). 1072 jl: list of rows to be added to uneliminated rows 1073 i>= k: jl(i) is the first row to be added to row i 1074 i< k: jl(i) is the row following row i in some list of rows 1075 jl(i) = mbs indicates the end of a list 1076 il(i): points to the first nonzero element in columns k,...,mbs-1 of 1077 row i of U */ 1078 ierr = PetscMalloc(mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 1079 ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr); 1080 jl = il + mbs; 1081 for (i=0; i<mbs; i++) { 1082 rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0; 1083 } 1084 1085 /* for each row k */ 1086 for (k = 0; k<mbs; k++){ 1087 1088 /*initialize k-th row with elements nonzero in row perm(k) of A */ 1089 jmin = ai[rip[k]]; jmax = ai[rip[k]+1]; 1090 1091 for (j = jmin; j < jmax; j++){ 1092 vj = rip[aj[j]]; 1093 rtmp[vj] = aa[j]; 1094 } 1095 1096 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 1097 dk = rtmp[k]; 1098 i = jl[k]; /* first row to be added to k_th row */ 1099 1100 while (i < k){ 1101 nexti = jl[i]; /* next row to be added to k_th row */ 1102 1103 /* compute multiplier, update D(k) and U(i,k) */ 1104 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 1105 uikdi = - ba[ili]*ba[i]; 1106 dk += uikdi*ba[ili]; 1107 ba[ili] = uikdi; /* -U(i,k) */ 1108 1109 /* add multiple of row i to k-th row ... */ 1110 jmin = ili + 1; jmax = bi[i+1]; 1111 if (jmin < jmax){ 1112 for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j]; 1113 /* ... add i to row list for next nonzero entry */ 1114 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 1115 j = bj[jmin]; 1116 jl[i] = jl[j]; jl[j] = i; /* update jl */ 1117 } 1118 i = nexti; 1119 } 1120 1121 /* check for zero pivot and save diagoanl element */ 1122 if (dk == 0.0){ 1123 SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot"); 1124 /* 1125 } else if (PetscRealPart(dk) < 0.0){ 1126 SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Negative pivot: d[%d] = %g\n",k,dk); 1127 */ 1128 } 1129 1130 /* save nonzero entries in k-th row of U ... */ 1131 ba[k] = 1.0/dk; 1132 jmin = bi[k]; jmax = bi[k+1]; 1133 if (jmin < jmax) { 1134 for (j=jmin; j<jmax; j++){ 1135 vj = bj[j]; ba[j] = rtmp[vj]; rtmp[vj] = 0.0; 1136 } 1137 /* ... add k to row list for first nonzero entry in k-th row */ 1138 il[k] = jmin; 1139 i = bj[jmin]; 1140 jl[k] = jl[i]; jl[i] = k; 1141 } 1142 } 1143 1144 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1145 ierr = PetscFree(il);CHKERRQ(ierr); 1146 if (a->permute){ 1147 ierr = PetscFree(aa);CHKERRQ(ierr); 1148 } 1149 1150 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 1151 C->factor = FACTOR_CHOLESKY; 1152 C->assembled = PETSC_TRUE; 1153 C->preallocated = PETSC_TRUE; 1154 PetscLogFlops(b->mbs); 1155 PetscFunctionReturn(0); 1156 } 1157 1158 /* 1159 Version for when blocks are 1 by 1 Using natural ordering 1160 */ 1161 #undef __FUNCT__ 1162 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering" 1163 int MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat *B) 1164 { 1165 Mat C = *B; 1166 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data; 1167 int ierr,i,j,mbs = a->mbs; 1168 int *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 1169 int k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz,ndamp = 0; 1170 MatScalar *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval; 1171 PetscReal damping=b->factor_damping, zeropivot=b->factor_zeropivot,shift_amount; 1172 PetscTruth damp,chshift; 1173 int nshift=0; 1174 1175 PetscFunctionBegin; 1176 /* initialization */ 1177 /* il and jl record the first nonzero element in each row of the accessing 1178 window U(0:k, k:mbs-1). 1179 jl: list of rows to be added to uneliminated rows 1180 i>= k: jl(i) is the first row to be added to row i 1181 i< k: jl(i) is the row following row i in some list of rows 1182 jl(i) = mbs indicates the end of a list 1183 il(i): points to the first nonzero element in U(i,k:mbs-1) 1184 */ 1185 ierr = PetscMalloc(mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 1186 ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr); 1187 jl = il + mbs; 1188 1189 shift_amount = 0; 1190 do { 1191 damp = PETSC_FALSE; 1192 chshift = PETSC_FALSE; 1193 for (i=0; i<mbs; i++) { 1194 rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0; 1195 } 1196 1197 for (k = 0; k<mbs; k++){ /* row k */ 1198 /*initialize k-th row with elements nonzero in row perm(k) of A */ 1199 nz = ai[k+1] - ai[k]; 1200 acol = aj + ai[k]; 1201 aval = aa + ai[k]; 1202 bval = ba + bi[k]; 1203 while (nz -- ){ 1204 rtmp[*acol++] = *aval++; 1205 *bval++ = 0.0; /* for in-place factorization */ 1206 } 1207 /* damp the diagonal of the matrix */ 1208 if (ndamp||nshift) rtmp[k] += damping+shift_amount; 1209 1210 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 1211 dk = rtmp[k]; 1212 i = jl[k]; /* first row to be added to k_th row */ 1213 1214 while (i < k){ 1215 nexti = jl[i]; /* next row to be added to k_th row */ 1216 1217 /* compute multiplier, update D(k) and U(i,k) */ 1218 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 1219 uikdi = - ba[ili]*ba[bi[i]]; 1220 dk += uikdi*ba[ili]; 1221 ba[ili] = uikdi; /* -U(i,k) */ 1222 1223 /* add multiple of row i to k-th row ... */ 1224 jmin = ili + 1; 1225 nz = bi[i+1] - jmin; 1226 if (nz > 0){ 1227 bcol = bj + jmin; 1228 bval = ba + jmin; 1229 while (nz --) rtmp[*bcol++] += uikdi*(*bval++); 1230 /* ... add i to row list for next nonzero entry */ 1231 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 1232 j = bj[jmin]; 1233 jl[i] = jl[j]; jl[j] = i; /* update jl */ 1234 } 1235 i = nexti; 1236 } 1237 1238 if (PetscRealPart(dk) < zeropivot && b->factor_shift){ 1239 /* calculate a shift that would make this row diagonally dominant */ 1240 PetscReal rs = PetscAbs(PetscRealPart(dk)); 1241 jmin = bi[k]+1; 1242 nz = bi[k+1] - jmin; 1243 if (nz){ 1244 bcol = bj + jmin; 1245 bval = ba + jmin; 1246 while (nz--){ 1247 rs += PetscAbsScalar(rtmp[*bcol++]); 1248 } 1249 } 1250 /* if this shift is less than the previous, just up the previous 1251 one by a bit */ 1252 shift_amount = PetscMax(rs,1.1*shift_amount); 1253 chshift = PETSC_TRUE; 1254 /* Unlike in the ILU case there is no exit condition on nshift: 1255 we increase the shift until it converges. There is no guarantee that 1256 this algorithm converges faster or slower, or is better or worse 1257 than the ILU algorithm. */ 1258 nshift++; 1259 break; 1260 } 1261 if (PetscRealPart(dk) < zeropivot){ 1262 if (damping == (PetscReal) PETSC_DECIDE) damping = -PetscRealPart(dk)/(k+1); 1263 if (damping > 0.0) { 1264 if (ndamp) damping *= 2.0; 1265 damp = PETSC_TRUE; 1266 ndamp++; 1267 break; 1268 } else if (PetscAbsScalar(dk) < zeropivot){ 1269 SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",k,PetscRealPart(dk),zeropivot); 1270 } else { 1271 PetscLogInfo((PetscObject)A,"Negative pivot %g in row %d of Cholesky factorization\n",PetscRealPart(dk),k); 1272 } 1273 } 1274 1275 /* save nonzero entries in k-th row of U ... */ 1276 /* printf("%d, dk: %g, 1/dk: %g\n",k,dk,1/dk); */ 1277 ba[bi[k]] = 1.0/dk; 1278 jmin = bi[k]+1; 1279 nz = bi[k+1] - jmin; 1280 if (nz){ 1281 bcol = bj + jmin; 1282 bval = ba + jmin; 1283 while (nz--){ 1284 *bval++ = rtmp[*bcol]; 1285 rtmp[*bcol++] = 0.0; 1286 } 1287 /* ... add k to row list for first nonzero entry in k-th row */ 1288 il[k] = jmin; 1289 i = bj[jmin]; 1290 jl[k] = jl[i]; jl[i] = k; 1291 } 1292 } /* end of for (k = 0; k<mbs; k++) */ 1293 } while (damp||chshift); 1294 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1295 ierr = PetscFree(il);CHKERRQ(ierr); 1296 1297 C->factor = FACTOR_CHOLESKY; 1298 C->assembled = PETSC_TRUE; 1299 C->preallocated = PETSC_TRUE; 1300 PetscLogFlops(b->mbs); 1301 if (ndamp) { 1302 PetscLogInfo(0,"MatCholeskyFactorNumerical_SeqSBAIJ_1_NaturalOrdering: number of damping tries %d damping value %g\n",ndamp,damping); 1303 } 1304 if (nshift) { 1305 PetscLogInfo(0,"MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering diagonal shifted %d shifts\n",nshift); 1306 } 1307 1308 PetscFunctionReturn(0); 1309 } 1310 1311 #undef __FUNCT__ 1312 #define __FUNCT__ "MatCholeskyFactor_SeqSBAIJ" 1313 int MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info) 1314 { 1315 int ierr; 1316 Mat C; 1317 1318 PetscFunctionBegin; 1319 ierr = MatCholeskyFactorSymbolic(A,perm,info,&C);CHKERRQ(ierr); 1320 ierr = MatCholeskyFactorNumeric(A,&C);CHKERRQ(ierr); 1321 ierr = MatHeaderCopy(A,C);CHKERRQ(ierr); 1322 PetscFunctionReturn(0); 1323 } 1324 1325 1326