1 /* Using Modified Sparse Row (MSR) storage. 2 See page 85, "Iterative Methods ..." by Saad. */ 3 4 /*$Id: sbaijfact.c,v 1.47 2000/11/07 02:00:25 hzhang Exp hzhang $*/ 5 /* 6 Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP. 7 */ 8 #include "sbaij.h" 9 #include "src/mat/impls/baij/seq/baij.h" 10 #include "src/vec/vecimpl.h" 11 #include "src/inline/ilu.h" 12 #include "include/petscis.h" 13 14 #undef __FUNC__ 15 #define __FUNC__ "MatCholeskyFactorSymbolic_SeqSBAIJ" 16 int MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS perm,PetscReal f,Mat *B) 17 { 18 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 19 int *rip,ierr,i,mbs = a->mbs,*ai,*aj; 20 int *jutmp,bs = a->bs,bs2=a->bs2; 21 int m,nzi,realloc = 0; 22 int *jl,*q,jumin,jmin,jmax,juptr,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd; 23 PetscTruth perm_identity; 24 25 PetscFunctionBegin; 26 if (!perm) { 27 ierr = ISCreateStride(PETSC_COMM_SELF,mbs,0,1,&perm);CHKERRQ(ierr); 28 } 29 PetscValidHeaderSpecific(perm,IS_COOKIE); 30 31 /* check whether perm is the identity mapping */ 32 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 33 if (!perm_identity) a->permute = PETSC_TRUE; 34 35 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 36 37 if (perm_identity){ /* without permutation */ 38 ai = a->i; aj = a->j; 39 } else { /* non-trivial permutation */ 40 ierr = MatReorderingSeqSBAIJ(A, perm);CHKERRA(ierr); 41 ai = a->inew; aj = a->jnew; 42 } 43 44 /* initialization */ 45 /* Don't know how many column pointers are needed so estimate. 46 Use Modified Sparse Row storage for u and ju, see Sasd pp.85 */ 47 iu = (int*)PetscMalloc((mbs+1)*sizeof(int));CHKPTRQ(iu); 48 umax = (int)(f*ai[mbs] + 1); umax += mbs + 1; 49 ju = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(ju); 50 iu[0] = mbs+1; 51 juptr = mbs; 52 jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl); 53 q = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(q); 54 for (i=0; i<mbs; i++){ 55 jl[i] = mbs; q[i] = 0; 56 } 57 58 /* for each row k */ 59 for (k=0; k<mbs; k++){ 60 nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */ 61 q[k] = mbs; 62 /* initialize nonzero structure of k-th row to row rip[k] of A */ 63 jmin = ai[rip[k]]; 64 jmax = ai[rip[k]+1]; 65 for (j=jmin; j<jmax; j++){ 66 vj = rip[aj[j]]; /* col. value */ 67 if(vj > k){ 68 qm = k; 69 do { 70 m = qm; qm = q[m]; 71 } while(qm < vj); 72 if (qm == vj) { 73 printf(" error: duplicate entry in A\n"); break; 74 } 75 nzk++; 76 q[m] = vj; 77 q[vj] = qm; 78 } /* if(vj > k) */ 79 } /* for (j=jmin; j<jmax; j++) */ 80 81 /* modify nonzero structure of k-th row by computing fill-in 82 for each row i to be merged in */ 83 i = k; 84 i = jl[i]; /* next pivot row (== mbs for symbolic factorization) */ 85 /* printf(" next pivot row i=%d\n",i); */ 86 while (i < mbs){ 87 /* merge row i into k-th row */ 88 nzi = iu[i+1] - (iu[i]+1); 89 jmin = iu[i] + 1; jmax = iu[i] + nzi; 90 qm = k; 91 for (j=jmin; j<jmax+1; j++){ 92 vj = ju[j]; 93 do { 94 m = qm; qm = q[m]; 95 } while (qm < vj); 96 if (qm != vj){ 97 nzk++; q[m] = vj; q[vj] = qm; qm = vj; 98 } 99 } 100 i = jl[i]; /* next pivot row */ 101 } 102 103 /* add k to row list for first nonzero element in k-th row */ 104 if (nzk > 0){ 105 i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */ 106 jl[k] = jl[i]; jl[i] = k; 107 } 108 iu[k+1] = iu[k] + nzk; /* printf(" iu[%d]=%d, umax=%d\n", k+1, iu[k+1],umax);*/ 109 110 /* allocate more space to ju if needed */ 111 if (iu[k+1] > umax) { printf("allocate more space, iu[%d]=%d > umax=%d\n",k+1, iu[k+1],umax); 112 /* estimate how much additional space we will need */ 113 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 114 /* just double the memory each time */ 115 maxadd = umax; 116 if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2; 117 umax += maxadd; 118 119 /* allocate a longer ju */ 120 jutmp = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(jutmp); 121 ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));CHKERRQ(ierr); 122 ierr = PetscFree(ju);CHKERRQ(ierr); 123 ju = jutmp; 124 realloc++; /* count how many times we realloc */ 125 } 126 127 /* save nonzero structure of k-th row in ju */ 128 i=k; 129 jumin = juptr + 1; juptr += nzk; 130 for (j=jumin; j<juptr+1; j++){ 131 i=q[i]; 132 ju[j]=i; 133 } 134 } 135 136 if (ai[mbs] != 0) { 137 PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 138 PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af); 139 PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_lu_fill %g or use \n",af); 140 PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCLUSetFill(pc,%g);\n",af); 141 PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n"); 142 } else { 143 PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n"); 144 } 145 146 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 147 ierr = PetscFree(q);CHKERRQ(ierr); 148 ierr = PetscFree(jl);CHKERRQ(ierr); 149 150 /* put together the new matrix */ 151 ierr = MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);CHKERRQ(ierr); 152 /* PLogObjectParent(*B,iperm); */ 153 b = (Mat_SeqSBAIJ*)(*B)->data; 154 ierr = PetscFree(b->imax);CHKERRQ(ierr); 155 b->singlemalloc = PETSC_FALSE; 156 /* the next line frees the default space generated by the Create() */ 157 ierr = PetscFree(b->a);CHKERRQ(ierr); 158 ierr = PetscFree(b->ilen);CHKERRQ(ierr); 159 b->a = (MatScalar*)PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2);CHKPTRQ(b->a); 160 b->j = ju; 161 b->i = iu; 162 b->diag = 0; 163 b->ilen = 0; 164 b->imax = 0; 165 b->row = perm; 166 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 167 b->icol = perm; 168 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 169 b->solve_work = (Scalar*)PetscMalloc((bs*mbs+bs)*sizeof(Scalar));CHKPTRQ(b->solve_work); 170 /* In b structure: Free imax, ilen, old a, old j. 171 Allocate idnew, solve_work, new a, new j */ 172 PLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar))); 173 b->s_maxnz = b->s_nz = iu[mbs]; 174 175 (*B)->factor = FACTOR_CHOLESKY; 176 (*B)->info.factor_mallocs = realloc; 177 (*B)->info.fill_ratio_given = f; 178 if (ai[mbs] != 0) { 179 (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 180 } else { 181 (*B)->info.fill_ratio_needed = 0.0; 182 } 183 184 if (perm_identity){ 185 switch (bs) { 186 case 1: 187 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering; 188 (*B)->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering; 189 PLogInfo(A,"MatIncompleteCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n"); 190 break; 191 case 2: 192 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering; 193 (*B)->ops->solve = MatSolve_SeqSBAIJ_2_NaturalOrdering; 194 PLogInfo(A,"MatIncompleteCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n"); 195 break; 196 case 3: 197 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering; 198 (*B)->ops->solve = MatSolve_SeqSBAIJ_3_NaturalOrdering; 199 PLogInfo(A,"MatIncompleteCholeskyFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n"); 200 break; 201 case 4: 202 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering; 203 (*B)->ops->solve = MatSolve_SeqSBAIJ_4_NaturalOrdering; 204 PLogInfo(A,"MatIncompleteCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n"); 205 break; 206 case 5: 207 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering; 208 (*B)->ops->solve = MatSolve_SeqSBAIJ_5_NaturalOrdering; 209 PLogInfo(A,"MatIncompleteCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n"); 210 break; 211 case 6: 212 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering; 213 (*B)->ops->solve = MatSolve_SeqSBAIJ_6_NaturalOrdering; 214 PLogInfo(A,"MatIncompleteCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n"); 215 break; 216 case 7: 217 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering; 218 (*B)->ops->solve = MatSolve_SeqSBAIJ_7_NaturalOrdering; 219 PLogInfo(A,"MatIncompleteCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n"); 220 break; 221 default: 222 (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering; 223 (*B)->ops->solve = MatSolve_SeqSBAIJ_N_NaturalOrdering; 224 PLogInfo(A,"MatIncompleteCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n"); 225 break; 226 } 227 } 228 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNC__ 233 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_N" 234 int MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,Mat *B) 235 { 236 Mat C = *B; 237 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 238 IS perm = b->row; 239 int *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 240 int *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 241 int bs=a->bs,bs2 = a->bs2; 242 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 243 MatScalar *u,*diag,*rtmp,*rtmp_ptr; 244 MatScalar *W,*work; 245 int *pivots; 246 247 PetscFunctionBegin; 248 249 /* initialization */ 250 rtmp = (MatScalar*)PetscMalloc(bs2*mbs*sizeof(MatScalar));CHKPTRQ(rtmp); 251 ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr); 252 il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il); 253 jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl); 254 for (i=0; i<mbs; i++) { 255 jl[i] = mbs; il[0] = 0; 256 } 257 dk = (MatScalar*)PetscMalloc(bs2*sizeof(MatScalar));CHKPTRQ(dk); 258 uik = (MatScalar*)PetscMalloc(bs2*sizeof(MatScalar));CHKPTRQ(uik); 259 W = (MatScalar*)PetscMalloc(bs2*sizeof(MatScalar));CHKPTRQ(W); 260 work = (MatScalar*)PetscMalloc(bs*sizeof(MatScalar));CHKPTRQ(work); 261 pivots= (int*)PetscMalloc(bs*sizeof(int));CHKPTRQ(pivots); 262 263 ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr); 264 265 /* check permutation */ 266 if (!a->permute){ 267 ai = a->i; aj = a->j; aa = a->a; 268 } else { 269 ai = a->inew; aj = a->jnew; 270 aa = (MatScalar*)PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa); 271 ierr = PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr); 272 a2anew = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew); 273 ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr); 274 275 for (i=0; i<mbs; i++){ 276 jmin = ai[i]; jmax = ai[i+1]; 277 for (j=jmin; j<jmax; j++){ 278 while (a2anew[j] != j){ 279 k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k; 280 for (k1=0; k1<bs2; k1++){ 281 dk[k1] = aa[k*bs2+k1]; 282 aa[k*bs2+k1] = aa[j*bs2+k1]; 283 aa[j*bs2+k1] = dk[k1]; 284 } 285 } 286 /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */ 287 if (i > aj[j]){ 288 /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */ 289 ap = aa + j*bs2; /* ptr to the beginning of j-th block of aa */ 290 for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */ 291 for (k=0; k<bs; k++){ /* j-th block of aa <- dk^T */ 292 for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1]; 293 } 294 } 295 } 296 } 297 ierr = PetscFree(a2anew);CHKERRA(ierr); 298 } 299 300 /* for each row k */ 301 for (k = 0; k<mbs; k++){ 302 303 /*initialize k-th row with elements nonzero in row perm(k) of A */ 304 jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1]; 305 if (jmin < jmax) { 306 ap = aa + jmin*bs2; 307 for (j = jmin; j < jmax; j++){ 308 vj = perm_ptr[aj[j]]; /* block col. index */ 309 rtmp_ptr = rtmp + vj*bs2; 310 for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++; 311 } 312 } 313 314 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 315 ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 316 i = jl[k]; /* first row to be added to k_th row */ 317 318 while (i < mbs){ 319 nexti = jl[i]; /* next row to be added to k_th row */ 320 321 /* compute multiplier */ 322 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 323 324 /* uik = -inv(Di)*U_bar(i,k) */ 325 diag = ba + i*bs2; 326 u = ba + ili*bs2; 327 ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr); 328 Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u); 329 330 /* update D(k) += -U(i,k)^T * U_bar(i,k) */ 331 Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u); 332 333 /* update -U(i,k) */ 334 ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr); 335 336 /* add multiple of row i to k-th row ... */ 337 jmin = ili + 1; jmax = bi[i+1]; 338 if (jmin < jmax){ 339 for (j=jmin; j<jmax; j++) { 340 /* rtmp += -U(i,k)^T * U_bar(i,j) */ 341 rtmp_ptr = rtmp + bj[j]*bs2; 342 u = ba + j*bs2; 343 Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u); 344 } 345 346 /* ... add i to row list for next nonzero entry */ 347 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 348 j = bj[jmin]; 349 jl[i] = jl[j]; jl[j] = i; /* update jl */ 350 } 351 i = nexti; 352 } 353 354 /* save nonzero entries in k-th row of U ... */ 355 356 /* invert diagonal block */ 357 diag = ba+k*bs2; 358 ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr); 359 Kernel_A_gets_inverse_A(bs,diag,pivots,work); 360 361 jmin = bi[k]; jmax = bi[k+1]; 362 if (jmin < jmax) { 363 for (j=jmin; j<jmax; j++){ 364 vj = bj[j]; /* block col. index of U */ 365 u = ba + j*bs2; 366 rtmp_ptr = rtmp + vj*bs2; 367 for (k1=0; k1<bs2; k1++){ 368 *u++ = *rtmp_ptr; 369 *rtmp_ptr++ = 0.0; 370 } 371 } 372 373 /* ... add k to row list for first nonzero entry in k-th row */ 374 il[k] = jmin; 375 i = bj[jmin]; 376 jl[k] = jl[i]; jl[i] = k; 377 } 378 } 379 380 ierr = PetscFree(rtmp);CHKERRQ(ierr); 381 ierr = PetscFree(il);CHKERRQ(ierr); 382 ierr = PetscFree(jl);CHKERRQ(ierr); 383 ierr = PetscFree(dk);CHKERRQ(ierr); 384 ierr = PetscFree(uik);CHKERRQ(ierr); 385 ierr = PetscFree(W);CHKERRQ(ierr); 386 ierr = PetscFree(work);CHKERRQ(ierr); 387 ierr = PetscFree(pivots);CHKERRQ(ierr); 388 if (a->permute){ 389 ierr = PetscFree(aa);CHKERRQ(ierr); 390 } 391 392 ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr); 393 C->factor = FACTOR_CHOLESKY; 394 C->assembled = PETSC_TRUE; 395 C->preallocated = PETSC_TRUE; 396 PLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */ 397 PetscFunctionReturn(0); 398 } 399 400 #undef __FUNC__ 401 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering" 402 int MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat A,Mat *B) 403 { 404 Mat C = *B; 405 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 406 int ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 407 int *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 408 int bs=a->bs,bs2 = a->bs2; 409 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 410 MatScalar *u,*diag,*rtmp,*rtmp_ptr; 411 MatScalar *W,*work; 412 int *pivots; 413 414 PetscFunctionBegin; 415 416 /* initialization */ 417 418 rtmp = (MatScalar*)PetscMalloc(bs2*mbs*sizeof(MatScalar));CHKPTRQ(rtmp); 419 ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr); 420 il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il); 421 jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl); 422 for (i=0; i<mbs; i++) { 423 jl[i] = mbs; il[0] = 0; 424 } 425 dk = (MatScalar*)PetscMalloc(bs2*sizeof(MatScalar));CHKPTRQ(dk); 426 uik = (MatScalar*)PetscMalloc(bs2*sizeof(MatScalar));CHKPTRQ(uik); 427 W = (MatScalar*)PetscMalloc(bs2*sizeof(MatScalar));CHKPTRQ(W); 428 work = (MatScalar*)PetscMalloc(bs*sizeof(MatScalar));CHKPTRQ(work); 429 pivots= (int*)PetscMalloc(bs*sizeof(int));CHKPTRQ(pivots); 430 431 ai = a->i; aj = a->j; aa = a->a; 432 433 /* for each row k */ 434 for (k = 0; k<mbs; k++){ 435 436 /*initialize k-th row with elements nonzero in row k of A */ 437 jmin = ai[k]; jmax = ai[k+1]; 438 if (jmin < jmax) { 439 ap = aa + jmin*bs2; 440 for (j = jmin; j < jmax; j++){ 441 vj = aj[j]; /* block col. index */ 442 rtmp_ptr = rtmp + vj*bs2; 443 for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++; 444 } 445 } 446 447 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 448 ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 449 i = jl[k]; /* first row to be added to k_th row */ 450 451 while (i < mbs){ 452 nexti = jl[i]; /* next row to be added to k_th row */ 453 454 /* compute multiplier */ 455 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 456 457 /* uik = -inv(Di)*U_bar(i,k) */ 458 diag = ba + i*bs2; 459 u = ba + ili*bs2; 460 ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr); 461 Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u); 462 463 /* update D(k) += -U(i,k)^T * U_bar(i,k) */ 464 Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u); 465 466 /* update -U(i,k) */ 467 ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr); 468 469 /* add multiple of row i to k-th row ... */ 470 jmin = ili + 1; jmax = bi[i+1]; 471 if (jmin < jmax){ 472 for (j=jmin; j<jmax; j++) { 473 /* rtmp += -U(i,k)^T * U_bar(i,j) */ 474 rtmp_ptr = rtmp + bj[j]*bs2; 475 u = ba + j*bs2; 476 Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u); 477 } 478 479 /* ... add i to row list for next nonzero entry */ 480 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 481 j = bj[jmin]; 482 jl[i] = jl[j]; jl[j] = i; /* update jl */ 483 } 484 i = nexti; 485 } 486 487 /* save nonzero entries in k-th row of U ... */ 488 489 /* invert diagonal block */ 490 diag = ba+k*bs2; 491 ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr); 492 Kernel_A_gets_inverse_A(bs,diag,pivots,work); 493 494 jmin = bi[k]; jmax = bi[k+1]; 495 if (jmin < jmax) { 496 for (j=jmin; j<jmax; j++){ 497 vj = bj[j]; /* block col. index of U */ 498 u = ba + j*bs2; 499 rtmp_ptr = rtmp + vj*bs2; 500 for (k1=0; k1<bs2; k1++){ 501 *u++ = *rtmp_ptr; 502 *rtmp_ptr++ = 0.0; 503 } 504 } 505 506 /* ... add k to row list for first nonzero entry in k-th row */ 507 il[k] = jmin; 508 i = bj[jmin]; 509 jl[k] = jl[i]; jl[i] = k; 510 } 511 } 512 513 ierr = PetscFree(rtmp);CHKERRQ(ierr); 514 ierr = PetscFree(il);CHKERRQ(ierr); 515 ierr = PetscFree(jl);CHKERRQ(ierr); 516 ierr = PetscFree(dk);CHKERRQ(ierr); 517 ierr = PetscFree(uik);CHKERRQ(ierr); 518 ierr = PetscFree(W);CHKERRQ(ierr); 519 ierr = PetscFree(work);CHKERRQ(ierr); 520 ierr = PetscFree(pivots);CHKERRQ(ierr); 521 522 C->factor = FACTOR_CHOLESKY; 523 C->assembled = PETSC_TRUE; 524 C->preallocated = PETSC_TRUE; 525 PLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */ 526 PetscFunctionReturn(0); 527 } 528 529 /* Version for when blocks are 7 by 7 */ 530 #undef __FUNC__ 531 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_7" 532 int MatCholeskyFactorNumeric_SeqSBAIJ_7(Mat A,Mat *B) 533 { 534 Mat C = *B; 535 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 536 IS perm = b->row; 537 int *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 538 int *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 539 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 540 MatScalar *u,*d,*w,*wp; 541 542 PetscFunctionBegin; 543 /* initialization */ 544 w = (MatScalar*)PetscMalloc(49*mbs*sizeof(MatScalar));CHKPTRQ(w); 545 ierr = PetscMemzero(w,49*mbs*sizeof(MatScalar));CHKERRQ(ierr); 546 il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il); 547 jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl); 548 for (i=0; i<mbs; i++) { 549 jl[i] = mbs; il[0] = 0; 550 } 551 dk = (MatScalar*)PetscMalloc(49*sizeof(MatScalar));CHKPTRQ(dk); 552 uik = (MatScalar*)PetscMalloc(49*sizeof(MatScalar));CHKPTRQ(uik); 553 ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr); 554 555 /* check permutation */ 556 if (!a->permute){ 557 ai = a->i; aj = a->j; aa = a->a; 558 } else { 559 ai = a->inew; aj = a->jnew; 560 aa = (MatScalar*)PetscMalloc(49*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa); 561 ierr = PetscMemcpy(aa,a->a,49*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr); 562 a2anew = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew); 563 ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr); 564 565 for (i=0; i<mbs; i++){ 566 jmin = ai[i]; jmax = ai[i+1]; 567 for (j=jmin; j<jmax; j++){ 568 while (a2anew[j] != j){ 569 k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k; 570 for (k1=0; k1<49; k1++){ 571 dk[k1] = aa[k*49+k1]; 572 aa[k*49+k1] = aa[j*49+k1]; 573 aa[j*49+k1] = dk[k1]; 574 } 575 } 576 /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */ 577 if (i > aj[j]){ 578 /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */ 579 ap = aa + j*49; /* ptr to the beginning of j-th block of aa */ 580 for (k=0; k<49; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */ 581 for (k=0; k<7; k++){ /* j-th block of aa <- dk^T */ 582 for (k1=0; k1<7; k1++) *ap++ = dk[k + 7*k1]; 583 } 584 } 585 } 586 } 587 ierr = PetscFree(a2anew);CHKERRA(ierr); 588 } 589 590 /* for each row k */ 591 for (k = 0; k<mbs; k++){ 592 593 /*initialize k-th row with elements nonzero in row perm(k) of A */ 594 jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1]; 595 if (jmin < jmax) { 596 ap = aa + jmin*49; 597 for (j = jmin; j < jmax; j++){ 598 vj = perm_ptr[aj[j]]; /* block col. index */ 599 wp = w + vj*49; 600 for (i=0; i<49; i++) *wp++ = *ap++; 601 } 602 } 603 604 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 605 ierr = PetscMemcpy(dk,w+k*49,49*sizeof(MatScalar));CHKERRQ(ierr); 606 i = jl[k]; /* first row to be added to k_th row */ 607 608 while (i < mbs){ 609 nexti = jl[i]; /* next row to be added to k_th row */ 610 611 /* compute multiplier */ 612 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 613 614 /* uik = -inv(Di)*U_bar(i,k) */ 615 d = ba + i*49; 616 u = ba + ili*49; 617 618 uik[0] = -(d[0]*u[0] + d[7]*u[1]+ d[14]*u[2]+ d[21]*u[3]+ d[28]*u[4]+ d[35]*u[5]+ d[42]*u[6]); 619 uik[1] = -(d[1]*u[0] + d[8]*u[1]+ d[15]*u[2]+ d[22]*u[3]+ d[29]*u[4]+ d[36]*u[5]+ d[43]*u[6]); 620 uik[2] = -(d[2]*u[0] + d[9]*u[1]+ d[16]*u[2]+ d[23]*u[3]+ d[30]*u[4]+ d[37]*u[5]+ d[44]*u[6]); 621 uik[3] = -(d[3]*u[0]+ d[10]*u[1]+ d[17]*u[2]+ d[24]*u[3]+ d[31]*u[4]+ d[38]*u[5]+ d[45]*u[6]); 622 uik[4] = -(d[4]*u[0]+ d[11]*u[1]+ d[18]*u[2]+ d[25]*u[3]+ d[32]*u[4]+ d[39]*u[5]+ d[46]*u[6]); 623 uik[5] = -(d[5]*u[0]+ d[12]*u[1]+ d[19]*u[2]+ d[26]*u[3]+ d[33]*u[4]+ d[40]*u[5]+ d[47]*u[6]); 624 uik[6] = -(d[6]*u[0]+ d[13]*u[1]+ d[20]*u[2]+ d[27]*u[3]+ d[34]*u[4]+ d[41]*u[5]+ d[48]*u[6]); 625 626 uik[7] = -(d[0]*u[7] + d[7]*u[8]+ d[14]*u[9]+ d[21]*u[10]+ d[28]*u[11]+ d[35]*u[12]+ d[42]*u[13]); 627 uik[8] = -(d[1]*u[7] + d[8]*u[8]+ d[15]*u[9]+ d[22]*u[10]+ d[29]*u[11]+ d[36]*u[12]+ d[43]*u[13]); 628 uik[9] = -(d[2]*u[7] + d[9]*u[8]+ d[16]*u[9]+ d[23]*u[10]+ d[30]*u[11]+ d[37]*u[12]+ d[44]*u[13]); 629 uik[10]= -(d[3]*u[7]+ d[10]*u[8]+ d[17]*u[9]+ d[24]*u[10]+ d[31]*u[11]+ d[38]*u[12]+ d[45]*u[13]); 630 uik[11]= -(d[4]*u[7]+ d[11]*u[8]+ d[18]*u[9]+ d[25]*u[10]+ d[32]*u[11]+ d[39]*u[12]+ d[46]*u[13]); 631 uik[12]= -(d[5]*u[7]+ d[12]*u[8]+ d[19]*u[9]+ d[26]*u[10]+ d[33]*u[11]+ d[40]*u[12]+ d[47]*u[13]); 632 uik[13]= -(d[6]*u[7]+ d[13]*u[8]+ d[20]*u[9]+ d[27]*u[10]+ d[34]*u[11]+ d[41]*u[12]+ d[48]*u[13]); 633 634 uik[14]= -(d[0]*u[14] + d[7]*u[15]+ d[14]*u[16]+ d[21]*u[17]+ d[28]*u[18]+ d[35]*u[19]+ d[42]*u[20]); 635 uik[15]= -(d[1]*u[14] + d[8]*u[15]+ d[15]*u[16]+ d[22]*u[17]+ d[29]*u[18]+ d[36]*u[19]+ d[43]*u[20]); 636 uik[16]= -(d[2]*u[14] + d[9]*u[15]+ d[16]*u[16]+ d[23]*u[17]+ d[30]*u[18]+ d[37]*u[19]+ d[44]*u[20]); 637 uik[17]= -(d[3]*u[14]+ d[10]*u[15]+ d[17]*u[16]+ d[24]*u[17]+ d[31]*u[18]+ d[38]*u[19]+ d[45]*u[20]); 638 uik[18]= -(d[4]*u[14]+ d[11]*u[15]+ d[18]*u[16]+ d[25]*u[17]+ d[32]*u[18]+ d[39]*u[19]+ d[46]*u[20]); 639 uik[19]= -(d[5]*u[14]+ d[12]*u[15]+ d[19]*u[16]+ d[26]*u[17]+ d[33]*u[18]+ d[40]*u[19]+ d[47]*u[20]); 640 uik[20]= -(d[6]*u[14]+ d[13]*u[15]+ d[20]*u[16]+ d[27]*u[17]+ d[34]*u[18]+ d[41]*u[19]+ d[48]*u[20]); 641 642 uik[21]= -(d[0]*u[21] + d[7]*u[22]+ d[14]*u[23]+ d[21]*u[24]+ d[28]*u[25]+ d[35]*u[26]+ d[42]*u[27]); 643 uik[22]= -(d[1]*u[21] + d[8]*u[22]+ d[15]*u[23]+ d[22]*u[24]+ d[29]*u[25]+ d[36]*u[26]+ d[43]*u[27]); 644 uik[23]= -(d[2]*u[21] + d[9]*u[22]+ d[16]*u[23]+ d[23]*u[24]+ d[30]*u[25]+ d[37]*u[26]+ d[44]*u[27]); 645 uik[24]= -(d[3]*u[21]+ d[10]*u[22]+ d[17]*u[23]+ d[24]*u[24]+ d[31]*u[25]+ d[38]*u[26]+ d[45]*u[27]); 646 uik[25]= -(d[4]*u[21]+ d[11]*u[22]+ d[18]*u[23]+ d[25]*u[24]+ d[32]*u[25]+ d[39]*u[26]+ d[46]*u[27]); 647 uik[26]= -(d[5]*u[21]+ d[12]*u[22]+ d[19]*u[23]+ d[26]*u[24]+ d[33]*u[25]+ d[40]*u[26]+ d[47]*u[27]); 648 uik[27]= -(d[6]*u[21]+ d[13]*u[22]+ d[20]*u[23]+ d[27]*u[24]+ d[34]*u[25]+ d[41]*u[26]+ d[48]*u[27]); 649 650 uik[28]= -(d[0]*u[28] + d[7]*u[29]+ d[14]*u[30]+ d[21]*u[31]+ d[28]*u[32]+ d[35]*u[33]+ d[42]*u[34]); 651 uik[29]= -(d[1]*u[28] + d[8]*u[29]+ d[15]*u[30]+ d[22]*u[31]+ d[29]*u[32]+ d[36]*u[33]+ d[43]*u[34]); 652 uik[30]= -(d[2]*u[28] + d[9]*u[29]+ d[16]*u[30]+ d[23]*u[31]+ d[30]*u[32]+ d[37]*u[33]+ d[44]*u[34]); 653 uik[31]= -(d[3]*u[28]+ d[10]*u[29]+ d[17]*u[30]+ d[24]*u[31]+ d[31]*u[32]+ d[38]*u[33]+ d[45]*u[34]); 654 uik[32]= -(d[4]*u[28]+ d[11]*u[29]+ d[18]*u[30]+ d[25]*u[31]+ d[32]*u[32]+ d[39]*u[33]+ d[46]*u[34]); 655 uik[33]= -(d[5]*u[28]+ d[12]*u[29]+ d[19]*u[30]+ d[26]*u[31]+ d[33]*u[32]+ d[40]*u[33]+ d[47]*u[34]); 656 uik[34]= -(d[6]*u[28]+ d[13]*u[29]+ d[20]*u[30]+ d[27]*u[31]+ d[34]*u[32]+ d[41]*u[33]+ d[48]*u[34]); 657 658 uik[35]= -(d[0]*u[35] + d[7]*u[36]+ d[14]*u[37]+ d[21]*u[38]+ d[28]*u[39]+ d[35]*u[40]+ d[42]*u[41]); 659 uik[36]= -(d[1]*u[35] + d[8]*u[36]+ d[15]*u[37]+ d[22]*u[38]+ d[29]*u[39]+ d[36]*u[40]+ d[43]*u[41]); 660 uik[37]= -(d[2]*u[35] + d[9]*u[36]+ d[16]*u[37]+ d[23]*u[38]+ d[30]*u[39]+ d[37]*u[40]+ d[44]*u[41]); 661 uik[38]= -(d[3]*u[35]+ d[10]*u[36]+ d[17]*u[37]+ d[24]*u[38]+ d[31]*u[39]+ d[38]*u[40]+ d[45]*u[41]); 662 uik[39]= -(d[4]*u[35]+ d[11]*u[36]+ d[18]*u[37]+ d[25]*u[38]+ d[32]*u[39]+ d[39]*u[40]+ d[46]*u[41]); 663 uik[40]= -(d[5]*u[35]+ d[12]*u[36]+ d[19]*u[37]+ d[26]*u[38]+ d[33]*u[39]+ d[40]*u[40]+ d[47]*u[41]); 664 uik[41]= -(d[6]*u[35]+ d[13]*u[36]+ d[20]*u[37]+ d[27]*u[38]+ d[34]*u[39]+ d[41]*u[40]+ d[48]*u[41]); 665 666 uik[42]= -(d[0]*u[42] + d[7]*u[43]+ d[14]*u[44]+ d[21]*u[45]+ d[28]*u[46]+ d[35]*u[47]+ d[42]*u[48]); 667 uik[43]= -(d[1]*u[42] + d[8]*u[43]+ d[15]*u[44]+ d[22]*u[45]+ d[29]*u[46]+ d[36]*u[47]+ d[43]*u[48]); 668 uik[44]= -(d[2]*u[42] + d[9]*u[43]+ d[16]*u[44]+ d[23]*u[45]+ d[30]*u[46]+ d[37]*u[47]+ d[44]*u[48]); 669 uik[45]= -(d[3]*u[42]+ d[10]*u[43]+ d[17]*u[44]+ d[24]*u[45]+ d[31]*u[46]+ d[38]*u[47]+ d[45]*u[48]); 670 uik[46]= -(d[4]*u[42]+ d[11]*u[43]+ d[18]*u[44]+ d[25]*u[45]+ d[32]*u[46]+ d[39]*u[47]+ d[46]*u[48]); 671 uik[47]= -(d[5]*u[42]+ d[12]*u[43]+ d[19]*u[44]+ d[26]*u[45]+ d[33]*u[46]+ d[40]*u[47]+ d[47]*u[48]); 672 uik[48]= -(d[6]*u[42]+ d[13]*u[43]+ d[20]*u[44]+ d[27]*u[45]+ d[34]*u[46]+ d[41]*u[47]+ d[48]*u[48]); 673 674 /* update D(k) += -U(i,k)^T * U_bar(i,k) */ 675 dk[0]+= uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6]; 676 dk[1]+= uik[7]*u[0] + uik[8]*u[1] + uik[9]*u[2]+ uik[10]*u[3]+ uik[11]*u[4]+ uik[12]*u[5]+ uik[13]*u[6]; 677 dk[2]+= uik[14]*u[0]+ uik[15]*u[1]+ uik[16]*u[2]+ uik[17]*u[3]+ uik[18]*u[4]+ uik[19]*u[5]+ uik[20]*u[6]; 678 dk[3]+= uik[21]*u[0]+ uik[22]*u[1]+ uik[23]*u[2]+ uik[24]*u[3]+ uik[25]*u[4]+ uik[26]*u[5]+ uik[27]*u[6]; 679 dk[4]+= uik[28]*u[0]+ uik[29]*u[1]+ uik[30]*u[2]+ uik[31]*u[3]+ uik[32]*u[4]+ uik[33]*u[5]+ uik[34]*u[6]; 680 dk[5]+= uik[35]*u[0]+ uik[36]*u[1]+ uik[37]*u[2]+ uik[38]*u[3]+ uik[39]*u[4]+ uik[40]*u[5]+ uik[41]*u[6]; 681 dk[6]+= uik[42]*u[0]+ uik[43]*u[1]+ uik[44]*u[2]+ uik[45]*u[3]+ uik[46]*u[4]+ uik[47]*u[5]+ uik[48]*u[6]; 682 683 dk[7]+= uik[0]*u[7] + uik[1]*u[8] + uik[2]*u[9] + uik[3]*u[10] + uik[4]*u[11] + uik[5]*u[12] + uik[6]*u[13]; 684 dk[8]+= uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13]; 685 dk[9]+= uik[14]*u[7]+ uik[15]*u[8]+ uik[16]*u[9]+ uik[17]*u[10]+ uik[18]*u[11]+ uik[19]*u[12]+ uik[20]*u[13]; 686 dk[10]+=uik[21]*u[7]+ uik[22]*u[8]+ uik[23]*u[9]+ uik[24]*u[10]+ uik[25]*u[11]+ uik[26]*u[12]+ uik[27]*u[13]; 687 dk[11]+=uik[28]*u[7]+ uik[29]*u[8]+ uik[30]*u[9]+ uik[31]*u[10]+ uik[32]*u[11]+ uik[33]*u[12]+ uik[34]*u[13]; 688 dk[12]+=uik[35]*u[7]+ uik[36]*u[8]+ uik[37]*u[9]+ uik[38]*u[10]+ uik[39]*u[11]+ uik[40]*u[12]+ uik[41]*u[13]; 689 dk[13]+=uik[42]*u[7]+ uik[43]*u[8]+ uik[44]*u[9]+ uik[45]*u[10]+ uik[46]*u[11]+ uik[47]*u[12]+ uik[48]*u[13]; 690 691 dk[14]+= uik[0]*u[14] + uik[1]*u[15] + uik[2]*u[16] + uik[3]*u[17] + uik[4]*u[18] + uik[5]*u[19] + uik[6]*u[20]; 692 dk[15]+= uik[7]*u[14] + uik[8]*u[15] + uik[9]*u[16]+ uik[10]*u[17]+ uik[11]*u[18]+ uik[12]*u[19]+ uik[13]*u[20]; 693 dk[16]+= uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20]; 694 dk[17]+= uik[21]*u[14]+ uik[22]*u[15]+ uik[23]*u[16]+ uik[24]*u[17]+ uik[25]*u[18]+ uik[26]*u[19]+ uik[27]*u[20]; 695 dk[18]+= uik[28]*u[14]+ uik[29]*u[15]+ uik[30]*u[16]+ uik[31]*u[17]+ uik[32]*u[18]+ uik[33]*u[19]+ uik[34]*u[20]; 696 dk[19]+= uik[35]*u[14]+ uik[36]*u[15]+ uik[37]*u[16]+ uik[38]*u[17]+ uik[39]*u[18]+ uik[40]*u[19]+ uik[41]*u[20]; 697 dk[20]+= uik[42]*u[14]+ uik[43]*u[15]+ uik[44]*u[16]+ uik[45]*u[17]+ uik[46]*u[18]+ uik[47]*u[19]+ uik[48]*u[20]; 698 699 dk[21]+= uik[0]*u[21] + uik[1]*u[22] + uik[2]*u[23] + uik[3]*u[24] + uik[4]*u[25] + uik[5]*u[26] + uik[6]*u[27]; 700 dk[22]+= uik[7]*u[21] + uik[8]*u[22] + uik[9]*u[23]+ uik[10]*u[24]+ uik[11]*u[25]+ uik[12]*u[26]+ uik[13]*u[27]; 701 dk[23]+= uik[14]*u[21]+ uik[15]*u[22]+ uik[16]*u[23]+ uik[17]*u[24]+ uik[18]*u[25]+ uik[19]*u[26]+ uik[20]*u[27]; 702 dk[24]+= uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27]; 703 dk[25]+= uik[28]*u[21]+ uik[29]*u[22]+ uik[30]*u[23]+ uik[31]*u[24]+ uik[32]*u[25]+ uik[33]*u[26]+ uik[34]*u[27]; 704 dk[26]+= uik[35]*u[21]+ uik[36]*u[22]+ uik[37]*u[23]+ uik[38]*u[24]+ uik[39]*u[25]+ uik[40]*u[26]+ uik[41]*u[27]; 705 dk[27]+= uik[42]*u[21]+ uik[43]*u[22]+ uik[44]*u[23]+ uik[45]*u[24]+ uik[46]*u[25]+ uik[47]*u[26]+ uik[48]*u[27]; 706 707 dk[28]+= uik[0]*u[28] + uik[1]*u[29] + uik[2]*u[30] + uik[3]*u[31] + uik[4]*u[32] + uik[5]*u[33] + uik[6]*u[34]; 708 dk[29]+= uik[7]*u[28] + uik[8]*u[29] + uik[9]*u[30]+ uik[10]*u[31]+ uik[11]*u[32]+ uik[12]*u[33]+ uik[13]*u[34]; 709 dk[30]+= uik[14]*u[28]+ uik[15]*u[29]+ uik[16]*u[30]+ uik[17]*u[31]+ uik[18]*u[32]+ uik[19]*u[33]+ uik[20]*u[34]; 710 dk[31]+= uik[21]*u[28]+ uik[22]*u[29]+ uik[23]*u[30]+ uik[24]*u[31]+ uik[25]*u[32]+ uik[26]*u[33]+ uik[27]*u[34]; 711 dk[32]+= uik[28]*u[28]+ uik[29]*u[29]+ uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34]; 712 dk[33]+= uik[35]*u[28]+ uik[36]*u[29]+ uik[37]*u[30]+ uik[38]*u[31]+ uik[39]*u[32]+ uik[40]*u[33]+ uik[41]*u[34]; 713 dk[34]+= uik[42]*u[28]+ uik[43]*u[29]+ uik[44]*u[30]+ uik[45]*u[31]+ uik[46]*u[32]+ uik[47]*u[33]+ uik[48]*u[34]; 714 715 dk[35]+= uik[0]*u[35] + uik[1]*u[36] + uik[2]*u[37] + uik[3]*u[38] + uik[4]*u[39] + uik[5]*u[40] + uik[6]*u[41]; 716 dk[36]+= uik[7]*u[35] + uik[8]*u[36] + uik[9]*u[37]+ uik[10]*u[38]+ uik[11]*u[39]+ uik[12]*u[40]+ uik[13]*u[41]; 717 dk[37]+= uik[14]*u[35]+ uik[15]*u[36]+ uik[16]*u[37]+ uik[17]*u[38]+ uik[18]*u[39]+ uik[19]*u[40]+ uik[20]*u[41]; 718 dk[38]+= uik[21]*u[35]+ uik[22]*u[36]+ uik[23]*u[37]+ uik[24]*u[38]+ uik[25]*u[39]+ uik[26]*u[40]+ uik[27]*u[41]; 719 dk[39]+= uik[28]*u[35]+ uik[29]*u[36]+ uik[30]*u[37]+ uik[31]*u[38]+ uik[32]*u[39]+ uik[33]*u[40]+ uik[34]*u[41]; 720 dk[40]+= uik[35]*u[35]+ uik[36]*u[36]+ uik[37]*u[37]+ uik[38]*u[38]+ uik[39]*u[39]+ uik[40]*u[40]+ uik[41]*u[41]; 721 dk[41]+= uik[42]*u[35]+ uik[43]*u[36]+ uik[44]*u[37]+ uik[45]*u[38]+ uik[46]*u[39]+ uik[47]*u[40]+ uik[48]*u[41]; 722 723 dk[42]+= uik[0]*u[42] + uik[1]*u[43] + uik[2]*u[44] + uik[3]*u[45] + uik[4]*u[46] + uik[5]*u[47] + uik[6]*u[48]; 724 dk[43]+= uik[7]*u[42] + uik[8]*u[43] + uik[9]*u[44]+ uik[10]*u[45]+ uik[11]*u[46]+ uik[12]*u[47]+ uik[13]*u[48]; 725 dk[44]+= uik[14]*u[42]+ uik[15]*u[43]+ uik[16]*u[44]+ uik[17]*u[45]+ uik[18]*u[46]+ uik[19]*u[47]+ uik[20]*u[48]; 726 dk[45]+= uik[21]*u[42]+ uik[22]*u[43]+ uik[23]*u[44]+ uik[24]*u[45]+ uik[25]*u[46]+ uik[26]*u[47]+ uik[27]*u[48]; 727 dk[46]+= uik[28]*u[42]+ uik[29]*u[43]+ uik[30]*u[44]+ uik[31]*u[45]+ uik[32]*u[46]+ uik[33]*u[47]+ uik[34]*u[48]; 728 dk[47]+= uik[35]*u[42]+ uik[36]*u[43]+ uik[37]*u[44]+ uik[38]*u[45]+ uik[39]*u[46]+ uik[40]*u[47]+ uik[41]*u[48]; 729 dk[48]+= uik[42]*u[42]+ uik[43]*u[43]+ uik[44]*u[44]+ uik[45]*u[45]+ uik[46]*u[46]+ uik[47]*u[47]+ uik[48]*u[48]; 730 731 /* update -U(i,k) */ 732 ierr = PetscMemcpy(ba+ili*49,uik,49*sizeof(MatScalar));CHKERRQ(ierr); 733 734 /* add multiple of row i to k-th row ... */ 735 jmin = ili + 1; jmax = bi[i+1]; 736 if (jmin < jmax){ 737 for (j=jmin; j<jmax; j++) { 738 /* w += -U(i,k)^T * U_bar(i,j) */ 739 wp = w + bj[j]*49; 740 u = ba + j*49; 741 742 wp[0]+= uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6]; 743 wp[1]+= uik[7]*u[0] + uik[8]*u[1] + uik[9]*u[2]+ uik[10]*u[3]+ uik[11]*u[4]+ uik[12]*u[5]+ uik[13]*u[6]; 744 wp[2]+= uik[14]*u[0]+ uik[15]*u[1]+ uik[16]*u[2]+ uik[17]*u[3]+ uik[18]*u[4]+ uik[19]*u[5]+ uik[20]*u[6]; 745 wp[3]+= uik[21]*u[0]+ uik[22]*u[1]+ uik[23]*u[2]+ uik[24]*u[3]+ uik[25]*u[4]+ uik[26]*u[5]+ uik[27]*u[6]; 746 wp[4]+= uik[28]*u[0]+ uik[29]*u[1]+ uik[30]*u[2]+ uik[31]*u[3]+ uik[32]*u[4]+ uik[33]*u[5]+ uik[34]*u[6]; 747 wp[5]+= uik[35]*u[0]+ uik[36]*u[1]+ uik[37]*u[2]+ uik[38]*u[3]+ uik[39]*u[4]+ uik[40]*u[5]+ uik[41]*u[6]; 748 wp[6]+= uik[42]*u[0]+ uik[43]*u[1]+ uik[44]*u[2]+ uik[45]*u[3]+ uik[46]*u[4]+ uik[47]*u[5]+ uik[48]*u[6]; 749 750 wp[7]+= uik[0]*u[7] + uik[1]*u[8] + uik[2]*u[9] + uik[3]*u[10] + uik[4]*u[11] + uik[5]*u[12] + uik[6]*u[13]; 751 wp[8]+= uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13]; 752 wp[9]+= uik[14]*u[7]+ uik[15]*u[8]+ uik[16]*u[9]+ uik[17]*u[10]+ uik[18]*u[11]+ uik[19]*u[12]+ uik[20]*u[13]; 753 wp[10]+=uik[21]*u[7]+ uik[22]*u[8]+ uik[23]*u[9]+ uik[24]*u[10]+ uik[25]*u[11]+ uik[26]*u[12]+ uik[27]*u[13]; 754 wp[11]+=uik[28]*u[7]+ uik[29]*u[8]+ uik[30]*u[9]+ uik[31]*u[10]+ uik[32]*u[11]+ uik[33]*u[12]+ uik[34]*u[13]; 755 wp[12]+=uik[35]*u[7]+ uik[36]*u[8]+ uik[37]*u[9]+ uik[38]*u[10]+ uik[39]*u[11]+ uik[40]*u[12]+ uik[41]*u[13]; 756 wp[13]+=uik[42]*u[7]+ uik[43]*u[8]+ uik[44]*u[9]+ uik[45]*u[10]+ uik[46]*u[11]+ uik[47]*u[12]+ uik[48]*u[13]; 757 758 wp[14]+= uik[0]*u[14] + uik[1]*u[15] + uik[2]*u[16] + uik[3]*u[17] + uik[4]*u[18] + uik[5]*u[19] + uik[6]*u[20]; 759 wp[15]+= uik[7]*u[14] + uik[8]*u[15] + uik[9]*u[16]+ uik[10]*u[17]+ uik[11]*u[18]+ uik[12]*u[19]+ uik[13]*u[20]; 760 wp[16]+= uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20]; 761 wp[17]+= uik[21]*u[14]+ uik[22]*u[15]+ uik[23]*u[16]+ uik[24]*u[17]+ uik[25]*u[18]+ uik[26]*u[19]+ uik[27]*u[20]; 762 wp[18]+= uik[28]*u[14]+ uik[29]*u[15]+ uik[30]*u[16]+ uik[31]*u[17]+ uik[32]*u[18]+ uik[33]*u[19]+ uik[34]*u[20]; 763 wp[19]+= uik[35]*u[14]+ uik[36]*u[15]+ uik[37]*u[16]+ uik[38]*u[17]+ uik[39]*u[18]+ uik[40]*u[19]+ uik[41]*u[20]; 764 wp[20]+= uik[42]*u[14]+ uik[43]*u[15]+ uik[44]*u[16]+ uik[45]*u[17]+ uik[46]*u[18]+ uik[47]*u[19]+ uik[48]*u[20]; 765 766 wp[21]+= uik[0]*u[21] + uik[1]*u[22] + uik[2]*u[23] + uik[3]*u[24] + uik[4]*u[25] + uik[5]*u[26] + uik[6]*u[27]; 767 wp[22]+= uik[7]*u[21] + uik[8]*u[22] + uik[9]*u[23]+ uik[10]*u[24]+ uik[11]*u[25]+ uik[12]*u[26]+ uik[13]*u[27]; 768 wp[23]+= uik[14]*u[21]+ uik[15]*u[22]+ uik[16]*u[23]+ uik[17]*u[24]+ uik[18]*u[25]+ uik[19]*u[26]+ uik[20]*u[27]; 769 wp[24]+= uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27]; 770 wp[25]+= uik[28]*u[21]+ uik[29]*u[22]+ uik[30]*u[23]+ uik[31]*u[24]+ uik[32]*u[25]+ uik[33]*u[26]+ uik[34]*u[27]; 771 wp[26]+= uik[35]*u[21]+ uik[36]*u[22]+ uik[37]*u[23]+ uik[38]*u[24]+ uik[39]*u[25]+ uik[40]*u[26]+ uik[41]*u[27]; 772 wp[27]+= uik[42]*u[21]+ uik[43]*u[22]+ uik[44]*u[23]+ uik[45]*u[24]+ uik[46]*u[25]+ uik[47]*u[26]+ uik[48]*u[27]; 773 774 wp[28]+= uik[0]*u[28] + uik[1]*u[29] + uik[2]*u[30] + uik[3]*u[31] + uik[4]*u[32] + uik[5]*u[33] + uik[6]*u[34]; 775 wp[29]+= uik[7]*u[28] + uik[8]*u[29] + uik[9]*u[30]+ uik[10]*u[31]+ uik[11]*u[32]+ uik[12]*u[33]+ uik[13]*u[34]; 776 wp[30]+= uik[14]*u[28]+ uik[15]*u[29]+ uik[16]*u[30]+ uik[17]*u[31]+ uik[18]*u[32]+ uik[19]*u[33]+ uik[20]*u[34]; 777 wp[31]+= uik[21]*u[28]+ uik[22]*u[29]+ uik[23]*u[30]+ uik[24]*u[31]+ uik[25]*u[32]+ uik[26]*u[33]+ uik[27]*u[34]; 778 wp[32]+= uik[28]*u[28]+ uik[29]*u[29]+ uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34]; 779 wp[33]+= uik[35]*u[28]+ uik[36]*u[29]+ uik[37]*u[30]+ uik[38]*u[31]+ uik[39]*u[32]+ uik[40]*u[33]+ uik[41]*u[34]; 780 wp[34]+= uik[42]*u[28]+ uik[43]*u[29]+ uik[44]*u[30]+ uik[45]*u[31]+ uik[46]*u[32]+ uik[47]*u[33]+ uik[48]*u[34]; 781 782 wp[35]+= uik[0]*u[35] + uik[1]*u[36] + uik[2]*u[37] + uik[3]*u[38] + uik[4]*u[39] + uik[5]*u[40] + uik[6]*u[41]; 783 wp[36]+= uik[7]*u[35] + uik[8]*u[36] + uik[9]*u[37]+ uik[10]*u[38]+ uik[11]*u[39]+ uik[12]*u[40]+ uik[13]*u[41]; 784 wp[37]+= uik[14]*u[35]+ uik[15]*u[36]+ uik[16]*u[37]+ uik[17]*u[38]+ uik[18]*u[39]+ uik[19]*u[40]+ uik[20]*u[41]; 785 wp[38]+= uik[21]*u[35]+ uik[22]*u[36]+ uik[23]*u[37]+ uik[24]*u[38]+ uik[25]*u[39]+ uik[26]*u[40]+ uik[27]*u[41]; 786 wp[39]+= uik[28]*u[35]+ uik[29]*u[36]+ uik[30]*u[37]+ uik[31]*u[38]+ uik[32]*u[39]+ uik[33]*u[40]+ uik[34]*u[41]; 787 wp[40]+= uik[35]*u[35]+ uik[36]*u[36]+ uik[37]*u[37]+ uik[38]*u[38]+ uik[39]*u[39]+ uik[40]*u[40]+ uik[41]*u[41]; 788 wp[41]+= uik[42]*u[35]+ uik[43]*u[36]+ uik[44]*u[37]+ uik[45]*u[38]+ uik[46]*u[39]+ uik[47]*u[40]+ uik[48]*u[41]; 789 790 wp[42]+= uik[0]*u[42] + uik[1]*u[43] + uik[2]*u[44] + uik[3]*u[45] + uik[4]*u[46] + uik[5]*u[47] + uik[6]*u[48]; 791 wp[43]+= uik[7]*u[42] + uik[8]*u[43] + uik[9]*u[44]+ uik[10]*u[45]+ uik[11]*u[46]+ uik[12]*u[47]+ uik[13]*u[48]; 792 wp[44]+= uik[14]*u[42]+ uik[15]*u[43]+ uik[16]*u[44]+ uik[17]*u[45]+ uik[18]*u[46]+ uik[19]*u[47]+ uik[20]*u[48]; 793 wp[45]+= uik[21]*u[42]+ uik[22]*u[43]+ uik[23]*u[44]+ uik[24]*u[45]+ uik[25]*u[46]+ uik[26]*u[47]+ uik[27]*u[48]; 794 wp[46]+= uik[28]*u[42]+ uik[29]*u[43]+ uik[30]*u[44]+ uik[31]*u[45]+ uik[32]*u[46]+ uik[33]*u[47]+ uik[34]*u[48]; 795 wp[47]+= uik[35]*u[42]+ uik[36]*u[43]+ uik[37]*u[44]+ uik[38]*u[45]+ uik[39]*u[46]+ uik[40]*u[47]+ uik[41]*u[48]; 796 wp[48]+= uik[42]*u[42]+ uik[43]*u[43]+ uik[44]*u[44]+ uik[45]*u[45]+ uik[46]*u[46]+ uik[47]*u[47]+ uik[48]*u[48]; 797 } 798 799 /* ... add i to row list for next nonzero entry */ 800 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 801 j = bj[jmin]; 802 jl[i] = jl[j]; jl[j] = i; /* update jl */ 803 } 804 i = nexti; 805 } 806 807 /* save nonzero entries in k-th row of U ... */ 808 809 /* invert diagonal block */ 810 d = ba+k*49; 811 ierr = PetscMemcpy(d,dk,49*sizeof(MatScalar));CHKERRQ(ierr); 812 ierr = Kernel_A_gets_inverse_A_7(d);CHKERRQ(ierr); 813 814 jmin = bi[k]; jmax = bi[k+1]; 815 if (jmin < jmax) { 816 for (j=jmin; j<jmax; j++){ 817 vj = bj[j]; /* block col. index of U */ 818 u = ba + j*49; 819 wp = w + vj*49; 820 for (k1=0; k1<49; k1++){ 821 *u++ = *wp; 822 *wp++ = 0.0; 823 } 824 } 825 826 /* ... add k to row list for first nonzero entry in k-th row */ 827 il[k] = jmin; 828 i = bj[jmin]; 829 jl[k] = jl[i]; jl[i] = k; 830 } 831 } 832 833 ierr = PetscFree(w);CHKERRQ(ierr); 834 ierr = PetscFree(il);CHKERRQ(ierr); 835 ierr = PetscFree(jl);CHKERRQ(ierr); 836 ierr = PetscFree(dk);CHKERRQ(ierr); 837 ierr = PetscFree(uik);CHKERRQ(ierr); 838 if (a->permute){ 839 ierr = PetscFree(aa);CHKERRQ(ierr); 840 } 841 842 ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr); 843 C->factor = FACTOR_CHOLESKY; 844 C->assembled = PETSC_TRUE; 845 C->preallocated = PETSC_TRUE; 846 PLogFlops(1.3333*343*b->mbs); /* from inverting diagonal blocks */ 847 PetscFunctionReturn(0); 848 } 849 850 /* 851 Version for when blocks are 7 by 7 Using natural ordering 852 */ 853 #undef __FUNC__ 854 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering" 855 int MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering(Mat A,Mat *B) 856 { 857 Mat C = *B; 858 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 859 int ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 860 int *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 861 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 862 MatScalar *u,*d,*w,*wp; 863 864 PetscFunctionBegin; 865 866 /* initialization */ 867 w = (MatScalar*)PetscMalloc(49*mbs*sizeof(MatScalar));CHKPTRQ(w); 868 ierr = PetscMemzero(w,49*mbs*sizeof(MatScalar));CHKERRQ(ierr); 869 il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il); 870 jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl); 871 for (i=0; i<mbs; i++) { 872 jl[i] = mbs; il[0] = 0; 873 } 874 dk = (MatScalar*)PetscMalloc(49*sizeof(MatScalar));CHKPTRQ(dk); 875 uik = (MatScalar*)PetscMalloc(49*sizeof(MatScalar));CHKPTRQ(uik); 876 877 ai = a->i; aj = a->j; aa = a->a; 878 879 /* for each row k */ 880 for (k = 0; k<mbs; k++){ 881 882 /*initialize k-th row with elements nonzero in row k of A */ 883 jmin = ai[k]; jmax = ai[k+1]; 884 if (jmin < jmax) { 885 ap = aa + jmin*49; 886 for (j = jmin; j < jmax; j++){ 887 vj = aj[j]; /* block col. index */ 888 wp = w + vj*49; 889 for (i=0; i<49; i++) *wp++ = *ap++; 890 } 891 } 892 893 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 894 ierr = PetscMemcpy(dk,w+k*49,49*sizeof(MatScalar));CHKERRQ(ierr); 895 i = jl[k]; /* first row to be added to k_th row */ 896 897 while (i < mbs){ 898 nexti = jl[i]; /* next row to be added to k_th row */ 899 900 /* compute multiplier */ 901 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 902 903 /* uik = -inv(Di)*U_bar(i,k) */ 904 d = ba + i*49; 905 u = ba + ili*49; 906 907 uik[0] = -(d[0]*u[0] + d[7]*u[1]+ d[14]*u[2]+ d[21]*u[3]+ d[28]*u[4]+ d[35]*u[5]+ d[42]*u[6]); 908 uik[1] = -(d[1]*u[0] + d[8]*u[1]+ d[15]*u[2]+ d[22]*u[3]+ d[29]*u[4]+ d[36]*u[5]+ d[43]*u[6]); 909 uik[2] = -(d[2]*u[0] + d[9]*u[1]+ d[16]*u[2]+ d[23]*u[3]+ d[30]*u[4]+ d[37]*u[5]+ d[44]*u[6]); 910 uik[3] = -(d[3]*u[0]+ d[10]*u[1]+ d[17]*u[2]+ d[24]*u[3]+ d[31]*u[4]+ d[38]*u[5]+ d[45]*u[6]); 911 uik[4] = -(d[4]*u[0]+ d[11]*u[1]+ d[18]*u[2]+ d[25]*u[3]+ d[32]*u[4]+ d[39]*u[5]+ d[46]*u[6]); 912 uik[5] = -(d[5]*u[0]+ d[12]*u[1]+ d[19]*u[2]+ d[26]*u[3]+ d[33]*u[4]+ d[40]*u[5]+ d[47]*u[6]); 913 uik[6] = -(d[6]*u[0]+ d[13]*u[1]+ d[20]*u[2]+ d[27]*u[3]+ d[34]*u[4]+ d[41]*u[5]+ d[48]*u[6]); 914 915 uik[7] = -(d[0]*u[7] + d[7]*u[8]+ d[14]*u[9]+ d[21]*u[10]+ d[28]*u[11]+ d[35]*u[12]+ d[42]*u[13]); 916 uik[8] = -(d[1]*u[7] + d[8]*u[8]+ d[15]*u[9]+ d[22]*u[10]+ d[29]*u[11]+ d[36]*u[12]+ d[43]*u[13]); 917 uik[9] = -(d[2]*u[7] + d[9]*u[8]+ d[16]*u[9]+ d[23]*u[10]+ d[30]*u[11]+ d[37]*u[12]+ d[44]*u[13]); 918 uik[10]= -(d[3]*u[7]+ d[10]*u[8]+ d[17]*u[9]+ d[24]*u[10]+ d[31]*u[11]+ d[38]*u[12]+ d[45]*u[13]); 919 uik[11]= -(d[4]*u[7]+ d[11]*u[8]+ d[18]*u[9]+ d[25]*u[10]+ d[32]*u[11]+ d[39]*u[12]+ d[46]*u[13]); 920 uik[12]= -(d[5]*u[7]+ d[12]*u[8]+ d[19]*u[9]+ d[26]*u[10]+ d[33]*u[11]+ d[40]*u[12]+ d[47]*u[13]); 921 uik[13]= -(d[6]*u[7]+ d[13]*u[8]+ d[20]*u[9]+ d[27]*u[10]+ d[34]*u[11]+ d[41]*u[12]+ d[48]*u[13]); 922 923 uik[14]= -(d[0]*u[14] + d[7]*u[15]+ d[14]*u[16]+ d[21]*u[17]+ d[28]*u[18]+ d[35]*u[19]+ d[42]*u[20]); 924 uik[15]= -(d[1]*u[14] + d[8]*u[15]+ d[15]*u[16]+ d[22]*u[17]+ d[29]*u[18]+ d[36]*u[19]+ d[43]*u[20]); 925 uik[16]= -(d[2]*u[14] + d[9]*u[15]+ d[16]*u[16]+ d[23]*u[17]+ d[30]*u[18]+ d[37]*u[19]+ d[44]*u[20]); 926 uik[17]= -(d[3]*u[14]+ d[10]*u[15]+ d[17]*u[16]+ d[24]*u[17]+ d[31]*u[18]+ d[38]*u[19]+ d[45]*u[20]); 927 uik[18]= -(d[4]*u[14]+ d[11]*u[15]+ d[18]*u[16]+ d[25]*u[17]+ d[32]*u[18]+ d[39]*u[19]+ d[46]*u[20]); 928 uik[19]= -(d[5]*u[14]+ d[12]*u[15]+ d[19]*u[16]+ d[26]*u[17]+ d[33]*u[18]+ d[40]*u[19]+ d[47]*u[20]); 929 uik[20]= -(d[6]*u[14]+ d[13]*u[15]+ d[20]*u[16]+ d[27]*u[17]+ d[34]*u[18]+ d[41]*u[19]+ d[48]*u[20]); 930 931 uik[21]= -(d[0]*u[21] + d[7]*u[22]+ d[14]*u[23]+ d[21]*u[24]+ d[28]*u[25]+ d[35]*u[26]+ d[42]*u[27]); 932 uik[22]= -(d[1]*u[21] + d[8]*u[22]+ d[15]*u[23]+ d[22]*u[24]+ d[29]*u[25]+ d[36]*u[26]+ d[43]*u[27]); 933 uik[23]= -(d[2]*u[21] + d[9]*u[22]+ d[16]*u[23]+ d[23]*u[24]+ d[30]*u[25]+ d[37]*u[26]+ d[44]*u[27]); 934 uik[24]= -(d[3]*u[21]+ d[10]*u[22]+ d[17]*u[23]+ d[24]*u[24]+ d[31]*u[25]+ d[38]*u[26]+ d[45]*u[27]); 935 uik[25]= -(d[4]*u[21]+ d[11]*u[22]+ d[18]*u[23]+ d[25]*u[24]+ d[32]*u[25]+ d[39]*u[26]+ d[46]*u[27]); 936 uik[26]= -(d[5]*u[21]+ d[12]*u[22]+ d[19]*u[23]+ d[26]*u[24]+ d[33]*u[25]+ d[40]*u[26]+ d[47]*u[27]); 937 uik[27]= -(d[6]*u[21]+ d[13]*u[22]+ d[20]*u[23]+ d[27]*u[24]+ d[34]*u[25]+ d[41]*u[26]+ d[48]*u[27]); 938 939 uik[28]= -(d[0]*u[28] + d[7]*u[29]+ d[14]*u[30]+ d[21]*u[31]+ d[28]*u[32]+ d[35]*u[33]+ d[42]*u[34]); 940 uik[29]= -(d[1]*u[28] + d[8]*u[29]+ d[15]*u[30]+ d[22]*u[31]+ d[29]*u[32]+ d[36]*u[33]+ d[43]*u[34]); 941 uik[30]= -(d[2]*u[28] + d[9]*u[29]+ d[16]*u[30]+ d[23]*u[31]+ d[30]*u[32]+ d[37]*u[33]+ d[44]*u[34]); 942 uik[31]= -(d[3]*u[28]+ d[10]*u[29]+ d[17]*u[30]+ d[24]*u[31]+ d[31]*u[32]+ d[38]*u[33]+ d[45]*u[34]); 943 uik[32]= -(d[4]*u[28]+ d[11]*u[29]+ d[18]*u[30]+ d[25]*u[31]+ d[32]*u[32]+ d[39]*u[33]+ d[46]*u[34]); 944 uik[33]= -(d[5]*u[28]+ d[12]*u[29]+ d[19]*u[30]+ d[26]*u[31]+ d[33]*u[32]+ d[40]*u[33]+ d[47]*u[34]); 945 uik[34]= -(d[6]*u[28]+ d[13]*u[29]+ d[20]*u[30]+ d[27]*u[31]+ d[34]*u[32]+ d[41]*u[33]+ d[48]*u[34]); 946 947 uik[35]= -(d[0]*u[35] + d[7]*u[36]+ d[14]*u[37]+ d[21]*u[38]+ d[28]*u[39]+ d[35]*u[40]+ d[42]*u[41]); 948 uik[36]= -(d[1]*u[35] + d[8]*u[36]+ d[15]*u[37]+ d[22]*u[38]+ d[29]*u[39]+ d[36]*u[40]+ d[43]*u[41]); 949 uik[37]= -(d[2]*u[35] + d[9]*u[36]+ d[16]*u[37]+ d[23]*u[38]+ d[30]*u[39]+ d[37]*u[40]+ d[44]*u[41]); 950 uik[38]= -(d[3]*u[35]+ d[10]*u[36]+ d[17]*u[37]+ d[24]*u[38]+ d[31]*u[39]+ d[38]*u[40]+ d[45]*u[41]); 951 uik[39]= -(d[4]*u[35]+ d[11]*u[36]+ d[18]*u[37]+ d[25]*u[38]+ d[32]*u[39]+ d[39]*u[40]+ d[46]*u[41]); 952 uik[40]= -(d[5]*u[35]+ d[12]*u[36]+ d[19]*u[37]+ d[26]*u[38]+ d[33]*u[39]+ d[40]*u[40]+ d[47]*u[41]); 953 uik[41]= -(d[6]*u[35]+ d[13]*u[36]+ d[20]*u[37]+ d[27]*u[38]+ d[34]*u[39]+ d[41]*u[40]+ d[48]*u[41]); 954 955 uik[42]= -(d[0]*u[42] + d[7]*u[43]+ d[14]*u[44]+ d[21]*u[45]+ d[28]*u[46]+ d[35]*u[47]+ d[42]*u[48]); 956 uik[43]= -(d[1]*u[42] + d[8]*u[43]+ d[15]*u[44]+ d[22]*u[45]+ d[29]*u[46]+ d[36]*u[47]+ d[43]*u[48]); 957 uik[44]= -(d[2]*u[42] + d[9]*u[43]+ d[16]*u[44]+ d[23]*u[45]+ d[30]*u[46]+ d[37]*u[47]+ d[44]*u[48]); 958 uik[45]= -(d[3]*u[42]+ d[10]*u[43]+ d[17]*u[44]+ d[24]*u[45]+ d[31]*u[46]+ d[38]*u[47]+ d[45]*u[48]); 959 uik[46]= -(d[4]*u[42]+ d[11]*u[43]+ d[18]*u[44]+ d[25]*u[45]+ d[32]*u[46]+ d[39]*u[47]+ d[46]*u[48]); 960 uik[47]= -(d[5]*u[42]+ d[12]*u[43]+ d[19]*u[44]+ d[26]*u[45]+ d[33]*u[46]+ d[40]*u[47]+ d[47]*u[48]); 961 uik[48]= -(d[6]*u[42]+ d[13]*u[43]+ d[20]*u[44]+ d[27]*u[45]+ d[34]*u[46]+ d[41]*u[47]+ d[48]*u[48]); 962 963 /* update D(k) += -U(i,k)^T * U_bar(i,k) */ 964 dk[0]+= uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6]; 965 dk[1]+= uik[7]*u[0] + uik[8]*u[1] + uik[9]*u[2]+ uik[10]*u[3]+ uik[11]*u[4]+ uik[12]*u[5]+ uik[13]*u[6]; 966 dk[2]+= uik[14]*u[0]+ uik[15]*u[1]+ uik[16]*u[2]+ uik[17]*u[3]+ uik[18]*u[4]+ uik[19]*u[5]+ uik[20]*u[6]; 967 dk[3]+= uik[21]*u[0]+ uik[22]*u[1]+ uik[23]*u[2]+ uik[24]*u[3]+ uik[25]*u[4]+ uik[26]*u[5]+ uik[27]*u[6]; 968 dk[4]+= uik[28]*u[0]+ uik[29]*u[1]+ uik[30]*u[2]+ uik[31]*u[3]+ uik[32]*u[4]+ uik[33]*u[5]+ uik[34]*u[6]; 969 dk[5]+= uik[35]*u[0]+ uik[36]*u[1]+ uik[37]*u[2]+ uik[38]*u[3]+ uik[39]*u[4]+ uik[40]*u[5]+ uik[41]*u[6]; 970 dk[6]+= uik[42]*u[0]+ uik[43]*u[1]+ uik[44]*u[2]+ uik[45]*u[3]+ uik[46]*u[4]+ uik[47]*u[5]+ uik[48]*u[6]; 971 972 dk[7]+= uik[0]*u[7] + uik[1]*u[8] + uik[2]*u[9] + uik[3]*u[10] + uik[4]*u[11] + uik[5]*u[12] + uik[6]*u[13]; 973 dk[8]+= uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13]; 974 dk[9]+= uik[14]*u[7]+ uik[15]*u[8]+ uik[16]*u[9]+ uik[17]*u[10]+ uik[18]*u[11]+ uik[19]*u[12]+ uik[20]*u[13]; 975 dk[10]+=uik[21]*u[7]+ uik[22]*u[8]+ uik[23]*u[9]+ uik[24]*u[10]+ uik[25]*u[11]+ uik[26]*u[12]+ uik[27]*u[13]; 976 dk[11]+=uik[28]*u[7]+ uik[29]*u[8]+ uik[30]*u[9]+ uik[31]*u[10]+ uik[32]*u[11]+ uik[33]*u[12]+ uik[34]*u[13]; 977 dk[12]+=uik[35]*u[7]+ uik[36]*u[8]+ uik[37]*u[9]+ uik[38]*u[10]+ uik[39]*u[11]+ uik[40]*u[12]+ uik[41]*u[13]; 978 dk[13]+=uik[42]*u[7]+ uik[43]*u[8]+ uik[44]*u[9]+ uik[45]*u[10]+ uik[46]*u[11]+ uik[47]*u[12]+ uik[48]*u[13]; 979 980 dk[14]+= uik[0]*u[14] + uik[1]*u[15] + uik[2]*u[16] + uik[3]*u[17] + uik[4]*u[18] + uik[5]*u[19] + uik[6]*u[20]; 981 dk[15]+= uik[7]*u[14] + uik[8]*u[15] + uik[9]*u[16]+ uik[10]*u[17]+ uik[11]*u[18]+ uik[12]*u[19]+ uik[13]*u[20]; 982 dk[16]+= uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20]; 983 dk[17]+= uik[21]*u[14]+ uik[22]*u[15]+ uik[23]*u[16]+ uik[24]*u[17]+ uik[25]*u[18]+ uik[26]*u[19]+ uik[27]*u[20]; 984 dk[18]+= uik[28]*u[14]+ uik[29]*u[15]+ uik[30]*u[16]+ uik[31]*u[17]+ uik[32]*u[18]+ uik[33]*u[19]+ uik[34]*u[20]; 985 dk[19]+= uik[35]*u[14]+ uik[36]*u[15]+ uik[37]*u[16]+ uik[38]*u[17]+ uik[39]*u[18]+ uik[40]*u[19]+ uik[41]*u[20]; 986 dk[20]+= uik[42]*u[14]+ uik[43]*u[15]+ uik[44]*u[16]+ uik[45]*u[17]+ uik[46]*u[18]+ uik[47]*u[19]+ uik[48]*u[20]; 987 988 dk[21]+= uik[0]*u[21] + uik[1]*u[22] + uik[2]*u[23] + uik[3]*u[24] + uik[4]*u[25] + uik[5]*u[26] + uik[6]*u[27]; 989 dk[22]+= uik[7]*u[21] + uik[8]*u[22] + uik[9]*u[23]+ uik[10]*u[24]+ uik[11]*u[25]+ uik[12]*u[26]+ uik[13]*u[27]; 990 dk[23]+= uik[14]*u[21]+ uik[15]*u[22]+ uik[16]*u[23]+ uik[17]*u[24]+ uik[18]*u[25]+ uik[19]*u[26]+ uik[20]*u[27]; 991 dk[24]+= uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27]; 992 dk[25]+= uik[28]*u[21]+ uik[29]*u[22]+ uik[30]*u[23]+ uik[31]*u[24]+ uik[32]*u[25]+ uik[33]*u[26]+ uik[34]*u[27]; 993 dk[26]+= uik[35]*u[21]+ uik[36]*u[22]+ uik[37]*u[23]+ uik[38]*u[24]+ uik[39]*u[25]+ uik[40]*u[26]+ uik[41]*u[27]; 994 dk[27]+= uik[42]*u[21]+ uik[43]*u[22]+ uik[44]*u[23]+ uik[45]*u[24]+ uik[46]*u[25]+ uik[47]*u[26]+ uik[48]*u[27]; 995 996 dk[28]+= uik[0]*u[28] + uik[1]*u[29] + uik[2]*u[30] + uik[3]*u[31] + uik[4]*u[32] + uik[5]*u[33] + uik[6]*u[34]; 997 dk[29]+= uik[7]*u[28] + uik[8]*u[29] + uik[9]*u[30]+ uik[10]*u[31]+ uik[11]*u[32]+ uik[12]*u[33]+ uik[13]*u[34]; 998 dk[30]+= uik[14]*u[28]+ uik[15]*u[29]+ uik[16]*u[30]+ uik[17]*u[31]+ uik[18]*u[32]+ uik[19]*u[33]+ uik[20]*u[34]; 999 dk[31]+= uik[21]*u[28]+ uik[22]*u[29]+ uik[23]*u[30]+ uik[24]*u[31]+ uik[25]*u[32]+ uik[26]*u[33]+ uik[27]*u[34]; 1000 dk[32]+= uik[28]*u[28]+ uik[29]*u[29]+ uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34]; 1001 dk[33]+= uik[35]*u[28]+ uik[36]*u[29]+ uik[37]*u[30]+ uik[38]*u[31]+ uik[39]*u[32]+ uik[40]*u[33]+ uik[41]*u[34]; 1002 dk[34]+= uik[42]*u[28]+ uik[43]*u[29]+ uik[44]*u[30]+ uik[45]*u[31]+ uik[46]*u[32]+ uik[47]*u[33]+ uik[48]*u[34]; 1003 1004 dk[35]+= uik[0]*u[35] + uik[1]*u[36] + uik[2]*u[37] + uik[3]*u[38] + uik[4]*u[39] + uik[5]*u[40] + uik[6]*u[41]; 1005 dk[36]+= uik[7]*u[35] + uik[8]*u[36] + uik[9]*u[37]+ uik[10]*u[38]+ uik[11]*u[39]+ uik[12]*u[40]+ uik[13]*u[41]; 1006 dk[37]+= uik[14]*u[35]+ uik[15]*u[36]+ uik[16]*u[37]+ uik[17]*u[38]+ uik[18]*u[39]+ uik[19]*u[40]+ uik[20]*u[41]; 1007 dk[38]+= uik[21]*u[35]+ uik[22]*u[36]+ uik[23]*u[37]+ uik[24]*u[38]+ uik[25]*u[39]+ uik[26]*u[40]+ uik[27]*u[41]; 1008 dk[39]+= uik[28]*u[35]+ uik[29]*u[36]+ uik[30]*u[37]+ uik[31]*u[38]+ uik[32]*u[39]+ uik[33]*u[40]+ uik[34]*u[41]; 1009 dk[40]+= uik[35]*u[35]+ uik[36]*u[36]+ uik[37]*u[37]+ uik[38]*u[38]+ uik[39]*u[39]+ uik[40]*u[40]+ uik[41]*u[41]; 1010 dk[41]+= uik[42]*u[35]+ uik[43]*u[36]+ uik[44]*u[37]+ uik[45]*u[38]+ uik[46]*u[39]+ uik[47]*u[40]+ uik[48]*u[41]; 1011 1012 dk[42]+= uik[0]*u[42] + uik[1]*u[43] + uik[2]*u[44] + uik[3]*u[45] + uik[4]*u[46] + uik[5]*u[47] + uik[6]*u[48]; 1013 dk[43]+= uik[7]*u[42] + uik[8]*u[43] + uik[9]*u[44]+ uik[10]*u[45]+ uik[11]*u[46]+ uik[12]*u[47]+ uik[13]*u[48]; 1014 dk[44]+= uik[14]*u[42]+ uik[15]*u[43]+ uik[16]*u[44]+ uik[17]*u[45]+ uik[18]*u[46]+ uik[19]*u[47]+ uik[20]*u[48]; 1015 dk[45]+= uik[21]*u[42]+ uik[22]*u[43]+ uik[23]*u[44]+ uik[24]*u[45]+ uik[25]*u[46]+ uik[26]*u[47]+ uik[27]*u[48]; 1016 dk[46]+= uik[28]*u[42]+ uik[29]*u[43]+ uik[30]*u[44]+ uik[31]*u[45]+ uik[32]*u[46]+ uik[33]*u[47]+ uik[34]*u[48]; 1017 dk[47]+= uik[35]*u[42]+ uik[36]*u[43]+ uik[37]*u[44]+ uik[38]*u[45]+ uik[39]*u[46]+ uik[40]*u[47]+ uik[41]*u[48]; 1018 dk[48]+= uik[42]*u[42]+ uik[43]*u[43]+ uik[44]*u[44]+ uik[45]*u[45]+ uik[46]*u[46]+ uik[47]*u[47]+ uik[48]*u[48]; 1019 1020 /* update -U(i,k) */ 1021 ierr = PetscMemcpy(ba+ili*49,uik,49*sizeof(MatScalar));CHKERRQ(ierr); 1022 1023 /* add multiple of row i to k-th row ... */ 1024 jmin = ili + 1; jmax = bi[i+1]; 1025 if (jmin < jmax){ 1026 for (j=jmin; j<jmax; j++) { 1027 /* w += -U(i,k)^T * U_bar(i,j) */ 1028 wp = w + bj[j]*49; 1029 u = ba + j*49; 1030 1031 wp[0]+= uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6]; 1032 wp[1]+= uik[7]*u[0] + uik[8]*u[1] + uik[9]*u[2]+ uik[10]*u[3]+ uik[11]*u[4]+ uik[12]*u[5]+ uik[13]*u[6]; 1033 wp[2]+= uik[14]*u[0]+ uik[15]*u[1]+ uik[16]*u[2]+ uik[17]*u[3]+ uik[18]*u[4]+ uik[19]*u[5]+ uik[20]*u[6]; 1034 wp[3]+= uik[21]*u[0]+ uik[22]*u[1]+ uik[23]*u[2]+ uik[24]*u[3]+ uik[25]*u[4]+ uik[26]*u[5]+ uik[27]*u[6]; 1035 wp[4]+= uik[28]*u[0]+ uik[29]*u[1]+ uik[30]*u[2]+ uik[31]*u[3]+ uik[32]*u[4]+ uik[33]*u[5]+ uik[34]*u[6]; 1036 wp[5]+= uik[35]*u[0]+ uik[36]*u[1]+ uik[37]*u[2]+ uik[38]*u[3]+ uik[39]*u[4]+ uik[40]*u[5]+ uik[41]*u[6]; 1037 wp[6]+= uik[42]*u[0]+ uik[43]*u[1]+ uik[44]*u[2]+ uik[45]*u[3]+ uik[46]*u[4]+ uik[47]*u[5]+ uik[48]*u[6]; 1038 1039 wp[7]+= uik[0]*u[7] + uik[1]*u[8] + uik[2]*u[9] + uik[3]*u[10] + uik[4]*u[11] + uik[5]*u[12] + uik[6]*u[13]; 1040 wp[8]+= uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13]; 1041 wp[9]+= uik[14]*u[7]+ uik[15]*u[8]+ uik[16]*u[9]+ uik[17]*u[10]+ uik[18]*u[11]+ uik[19]*u[12]+ uik[20]*u[13]; 1042 wp[10]+=uik[21]*u[7]+ uik[22]*u[8]+ uik[23]*u[9]+ uik[24]*u[10]+ uik[25]*u[11]+ uik[26]*u[12]+ uik[27]*u[13]; 1043 wp[11]+=uik[28]*u[7]+ uik[29]*u[8]+ uik[30]*u[9]+ uik[31]*u[10]+ uik[32]*u[11]+ uik[33]*u[12]+ uik[34]*u[13]; 1044 wp[12]+=uik[35]*u[7]+ uik[36]*u[8]+ uik[37]*u[9]+ uik[38]*u[10]+ uik[39]*u[11]+ uik[40]*u[12]+ uik[41]*u[13]; 1045 wp[13]+=uik[42]*u[7]+ uik[43]*u[8]+ uik[44]*u[9]+ uik[45]*u[10]+ uik[46]*u[11]+ uik[47]*u[12]+ uik[48]*u[13]; 1046 1047 wp[14]+= uik[0]*u[14] + uik[1]*u[15] + uik[2]*u[16] + uik[3]*u[17] + uik[4]*u[18] + uik[5]*u[19] + uik[6]*u[20]; 1048 wp[15]+= uik[7]*u[14] + uik[8]*u[15] + uik[9]*u[16]+ uik[10]*u[17]+ uik[11]*u[18]+ uik[12]*u[19]+ uik[13]*u[20]; 1049 wp[16]+= uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20]; 1050 wp[17]+= uik[21]*u[14]+ uik[22]*u[15]+ uik[23]*u[16]+ uik[24]*u[17]+ uik[25]*u[18]+ uik[26]*u[19]+ uik[27]*u[20]; 1051 wp[18]+= uik[28]*u[14]+ uik[29]*u[15]+ uik[30]*u[16]+ uik[31]*u[17]+ uik[32]*u[18]+ uik[33]*u[19]+ uik[34]*u[20]; 1052 wp[19]+= uik[35]*u[14]+ uik[36]*u[15]+ uik[37]*u[16]+ uik[38]*u[17]+ uik[39]*u[18]+ uik[40]*u[19]+ uik[41]*u[20]; 1053 wp[20]+= uik[42]*u[14]+ uik[43]*u[15]+ uik[44]*u[16]+ uik[45]*u[17]+ uik[46]*u[18]+ uik[47]*u[19]+ uik[48]*u[20]; 1054 1055 wp[21]+= uik[0]*u[21] + uik[1]*u[22] + uik[2]*u[23] + uik[3]*u[24] + uik[4]*u[25] + uik[5]*u[26] + uik[6]*u[27]; 1056 wp[22]+= uik[7]*u[21] + uik[8]*u[22] + uik[9]*u[23]+ uik[10]*u[24]+ uik[11]*u[25]+ uik[12]*u[26]+ uik[13]*u[27]; 1057 wp[23]+= uik[14]*u[21]+ uik[15]*u[22]+ uik[16]*u[23]+ uik[17]*u[24]+ uik[18]*u[25]+ uik[19]*u[26]+ uik[20]*u[27]; 1058 wp[24]+= uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27]; 1059 wp[25]+= uik[28]*u[21]+ uik[29]*u[22]+ uik[30]*u[23]+ uik[31]*u[24]+ uik[32]*u[25]+ uik[33]*u[26]+ uik[34]*u[27]; 1060 wp[26]+= uik[35]*u[21]+ uik[36]*u[22]+ uik[37]*u[23]+ uik[38]*u[24]+ uik[39]*u[25]+ uik[40]*u[26]+ uik[41]*u[27]; 1061 wp[27]+= uik[42]*u[21]+ uik[43]*u[22]+ uik[44]*u[23]+ uik[45]*u[24]+ uik[46]*u[25]+ uik[47]*u[26]+ uik[48]*u[27]; 1062 1063 wp[28]+= uik[0]*u[28] + uik[1]*u[29] + uik[2]*u[30] + uik[3]*u[31] + uik[4]*u[32] + uik[5]*u[33] + uik[6]*u[34]; 1064 wp[29]+= uik[7]*u[28] + uik[8]*u[29] + uik[9]*u[30]+ uik[10]*u[31]+ uik[11]*u[32]+ uik[12]*u[33]+ uik[13]*u[34]; 1065 wp[30]+= uik[14]*u[28]+ uik[15]*u[29]+ uik[16]*u[30]+ uik[17]*u[31]+ uik[18]*u[32]+ uik[19]*u[33]+ uik[20]*u[34]; 1066 wp[31]+= uik[21]*u[28]+ uik[22]*u[29]+ uik[23]*u[30]+ uik[24]*u[31]+ uik[25]*u[32]+ uik[26]*u[33]+ uik[27]*u[34]; 1067 wp[32]+= uik[28]*u[28]+ uik[29]*u[29]+ uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34]; 1068 wp[33]+= uik[35]*u[28]+ uik[36]*u[29]+ uik[37]*u[30]+ uik[38]*u[31]+ uik[39]*u[32]+ uik[40]*u[33]+ uik[41]*u[34]; 1069 wp[34]+= uik[42]*u[28]+ uik[43]*u[29]+ uik[44]*u[30]+ uik[45]*u[31]+ uik[46]*u[32]+ uik[47]*u[33]+ uik[48]*u[34]; 1070 1071 wp[35]+= uik[0]*u[35] + uik[1]*u[36] + uik[2]*u[37] + uik[3]*u[38] + uik[4]*u[39] + uik[5]*u[40] + uik[6]*u[41]; 1072 wp[36]+= uik[7]*u[35] + uik[8]*u[36] + uik[9]*u[37]+ uik[10]*u[38]+ uik[11]*u[39]+ uik[12]*u[40]+ uik[13]*u[41]; 1073 wp[37]+= uik[14]*u[35]+ uik[15]*u[36]+ uik[16]*u[37]+ uik[17]*u[38]+ uik[18]*u[39]+ uik[19]*u[40]+ uik[20]*u[41]; 1074 wp[38]+= uik[21]*u[35]+ uik[22]*u[36]+ uik[23]*u[37]+ uik[24]*u[38]+ uik[25]*u[39]+ uik[26]*u[40]+ uik[27]*u[41]; 1075 wp[39]+= uik[28]*u[35]+ uik[29]*u[36]+ uik[30]*u[37]+ uik[31]*u[38]+ uik[32]*u[39]+ uik[33]*u[40]+ uik[34]*u[41]; 1076 wp[40]+= uik[35]*u[35]+ uik[36]*u[36]+ uik[37]*u[37]+ uik[38]*u[38]+ uik[39]*u[39]+ uik[40]*u[40]+ uik[41]*u[41]; 1077 wp[41]+= uik[42]*u[35]+ uik[43]*u[36]+ uik[44]*u[37]+ uik[45]*u[38]+ uik[46]*u[39]+ uik[47]*u[40]+ uik[48]*u[41]; 1078 1079 wp[42]+= uik[0]*u[42] + uik[1]*u[43] + uik[2]*u[44] + uik[3]*u[45] + uik[4]*u[46] + uik[5]*u[47] + uik[6]*u[48]; 1080 wp[43]+= uik[7]*u[42] + uik[8]*u[43] + uik[9]*u[44]+ uik[10]*u[45]+ uik[11]*u[46]+ uik[12]*u[47]+ uik[13]*u[48]; 1081 wp[44]+= uik[14]*u[42]+ uik[15]*u[43]+ uik[16]*u[44]+ uik[17]*u[45]+ uik[18]*u[46]+ uik[19]*u[47]+ uik[20]*u[48]; 1082 wp[45]+= uik[21]*u[42]+ uik[22]*u[43]+ uik[23]*u[44]+ uik[24]*u[45]+ uik[25]*u[46]+ uik[26]*u[47]+ uik[27]*u[48]; 1083 wp[46]+= uik[28]*u[42]+ uik[29]*u[43]+ uik[30]*u[44]+ uik[31]*u[45]+ uik[32]*u[46]+ uik[33]*u[47]+ uik[34]*u[48]; 1084 wp[47]+= uik[35]*u[42]+ uik[36]*u[43]+ uik[37]*u[44]+ uik[38]*u[45]+ uik[39]*u[46]+ uik[40]*u[47]+ uik[41]*u[48]; 1085 wp[48]+= uik[42]*u[42]+ uik[43]*u[43]+ uik[44]*u[44]+ uik[45]*u[45]+ uik[46]*u[46]+ uik[47]*u[47]+ uik[48]*u[48]; 1086 } 1087 1088 /* ... add i to row list for next nonzero entry */ 1089 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 1090 j = bj[jmin]; 1091 jl[i] = jl[j]; jl[j] = i; /* update jl */ 1092 } 1093 i = nexti; 1094 } 1095 1096 /* save nonzero entries in k-th row of U ... */ 1097 1098 /* invert diagonal block */ 1099 d = ba+k*49; 1100 ierr = PetscMemcpy(d,dk,49*sizeof(MatScalar));CHKERRQ(ierr); 1101 ierr = Kernel_A_gets_inverse_A_7(d);CHKERRQ(ierr); 1102 1103 jmin = bi[k]; jmax = bi[k+1]; 1104 if (jmin < jmax) { 1105 for (j=jmin; j<jmax; j++){ 1106 vj = bj[j]; /* block col. index of U */ 1107 u = ba + j*49; 1108 wp = w + vj*49; 1109 for (k1=0; k1<49; k1++){ 1110 *u++ = *wp; 1111 *wp++ = 0.0; 1112 } 1113 } 1114 1115 /* ... add k to row list for first nonzero entry in k-th row */ 1116 il[k] = jmin; 1117 i = bj[jmin]; 1118 jl[k] = jl[i]; jl[i] = k; 1119 } 1120 } 1121 1122 ierr = PetscFree(w);CHKERRQ(ierr); 1123 ierr = PetscFree(il);CHKERRQ(ierr); 1124 ierr = PetscFree(jl);CHKERRQ(ierr); 1125 ierr = PetscFree(dk);CHKERRQ(ierr); 1126 ierr = PetscFree(uik);CHKERRQ(ierr); 1127 1128 C->factor = FACTOR_CHOLESKY; 1129 C->assembled = PETSC_TRUE; 1130 C->preallocated = PETSC_TRUE; 1131 PLogFlops(1.3333*343*b->mbs); /* from inverting diagonal blocks */ 1132 PetscFunctionReturn(0); 1133 } 1134 1135 /* Version for when blocks are 6 by 6 */ 1136 #undef __FUNC__ 1137 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_6" 1138 int MatCholeskyFactorNumeric_SeqSBAIJ_6(Mat A,Mat *B) 1139 { 1140 Mat C = *B; 1141 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 1142 IS perm = b->row; 1143 int *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 1144 int *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 1145 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 1146 MatScalar *u,*d,*w,*wp; 1147 1148 PetscFunctionBegin; 1149 /* initialization */ 1150 w = (MatScalar*)PetscMalloc(36*mbs*sizeof(MatScalar));CHKPTRQ(w); 1151 ierr = PetscMemzero(w,36*mbs*sizeof(MatScalar));CHKERRQ(ierr); 1152 il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il); 1153 jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl); 1154 for (i=0; i<mbs; i++) { 1155 jl[i] = mbs; il[0] = 0; 1156 } 1157 dk = (MatScalar*)PetscMalloc(36*sizeof(MatScalar));CHKPTRQ(dk); 1158 uik = (MatScalar*)PetscMalloc(36*sizeof(MatScalar));CHKPTRQ(uik); 1159 ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr); 1160 1161 /* check permutation */ 1162 if (!a->permute){ 1163 ai = a->i; aj = a->j; aa = a->a; 1164 } else { 1165 ai = a->inew; aj = a->jnew; 1166 aa = (MatScalar*)PetscMalloc(36*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa); 1167 ierr = PetscMemcpy(aa,a->a,36*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1168 a2anew = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew); 1169 ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr); 1170 1171 for (i=0; i<mbs; i++){ 1172 jmin = ai[i]; jmax = ai[i+1]; 1173 for (j=jmin; j<jmax; j++){ 1174 while (a2anew[j] != j){ 1175 k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k; 1176 for (k1=0; k1<36; k1++){ 1177 dk[k1] = aa[k*36+k1]; 1178 aa[k*36+k1] = aa[j*36+k1]; 1179 aa[j*36+k1] = dk[k1]; 1180 } 1181 } 1182 /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */ 1183 if (i > aj[j]){ 1184 /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */ 1185 ap = aa + j*36; /* ptr to the beginning of j-th block of aa */ 1186 for (k=0; k<36; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */ 1187 for (k=0; k<6; k++){ /* j-th block of aa <- dk^T */ 1188 for (k1=0; k1<6; k1++) *ap++ = dk[k + 6*k1]; 1189 } 1190 } 1191 } 1192 } 1193 ierr = PetscFree(a2anew);CHKERRA(ierr); 1194 } 1195 1196 /* for each row k */ 1197 for (k = 0; k<mbs; k++){ 1198 1199 /*initialize k-th row with elements nonzero in row perm(k) of A */ 1200 jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1]; 1201 if (jmin < jmax) { 1202 ap = aa + jmin*36; 1203 for (j = jmin; j < jmax; j++){ 1204 vj = perm_ptr[aj[j]]; /* block col. index */ 1205 wp = w + vj*36; 1206 for (i=0; i<36; i++) *wp++ = *ap++; 1207 } 1208 } 1209 1210 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 1211 ierr = PetscMemcpy(dk,w+k*36,36*sizeof(MatScalar));CHKERRQ(ierr); 1212 i = jl[k]; /* first row to be added to k_th row */ 1213 1214 while (i < mbs){ 1215 nexti = jl[i]; /* next row to be added to k_th row */ 1216 1217 /* compute multiplier */ 1218 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 1219 1220 /* uik = -inv(Di)*U_bar(i,k) */ 1221 d = ba + i*36; 1222 u = ba + ili*36; 1223 1224 uik[0] = -(d[0]*u[0] + d[6]*u[1] + d[12]*u[2] + d[18]*u[3] + d[24]*u[4] + d[30]*u[5]); 1225 uik[1] = -(d[1]*u[0] + d[7]*u[1] + d[13]*u[2] + d[19]*u[3] + d[25]*u[4] + d[31]*u[5]); 1226 uik[2] = -(d[2]*u[0] + d[8]*u[1] + d[14]*u[2] + d[20]*u[3] + d[26]*u[4] + d[32]*u[5]); 1227 uik[3] = -(d[3]*u[0] + d[9]*u[1] + d[15]*u[2] + d[21]*u[3] + d[27]*u[4] + d[33]*u[5]); 1228 uik[4] = -(d[4]*u[0]+ d[10]*u[1] + d[16]*u[2] + d[22]*u[3] + d[28]*u[4] + d[34]*u[5]); 1229 uik[5] = -(d[5]*u[0]+ d[11]*u[1] + d[17]*u[2] + d[23]*u[3] + d[29]*u[4] + d[35]*u[5]); 1230 1231 uik[6] = -(d[0]*u[6] + d[6]*u[7] + d[12]*u[8] + d[18]*u[9] + d[24]*u[10] + d[30]*u[11]); 1232 uik[7] = -(d[1]*u[6] + d[7]*u[7] + d[13]*u[8] + d[19]*u[9] + d[25]*u[10] + d[31]*u[11]); 1233 uik[8] = -(d[2]*u[6] + d[8]*u[7] + d[14]*u[8] + d[20]*u[9] + d[26]*u[10] + d[32]*u[11]); 1234 uik[9] = -(d[3]*u[6] + d[9]*u[7] + d[15]*u[8] + d[21]*u[9] + d[27]*u[10] + d[33]*u[11]); 1235 uik[10]= -(d[4]*u[6]+ d[10]*u[7] + d[16]*u[8] + d[22]*u[9] + d[28]*u[10] + d[34]*u[11]); 1236 uik[11]= -(d[5]*u[6]+ d[11]*u[7] + d[17]*u[8] + d[23]*u[9] + d[29]*u[10] + d[35]*u[11]); 1237 1238 uik[12] = -(d[0]*u[12] + d[6]*u[13] + d[12]*u[14] + d[18]*u[15] + d[24]*u[16] + d[30]*u[17]); 1239 uik[13] = -(d[1]*u[12] + d[7]*u[13] + d[13]*u[14] + d[19]*u[15] + d[25]*u[16] + d[31]*u[17]); 1240 uik[14] = -(d[2]*u[12] + d[8]*u[13] + d[14]*u[14] + d[20]*u[15] + d[26]*u[16] + d[32]*u[17]); 1241 uik[15] = -(d[3]*u[12] + d[9]*u[13] + d[15]*u[14] + d[21]*u[15] + d[27]*u[16] + d[33]*u[17]); 1242 uik[16] = -(d[4]*u[12]+ d[10]*u[13] + d[16]*u[14] + d[22]*u[15] + d[28]*u[16] + d[34]*u[17]); 1243 uik[17] = -(d[5]*u[12]+ d[11]*u[13] + d[17]*u[14] + d[23]*u[15] + d[29]*u[16] + d[35]*u[17]); 1244 1245 uik[18] = -(d[0]*u[18] + d[6]*u[19] + d[12]*u[20] + d[18]*u[21] + d[24]*u[22] + d[30]*u[23]); 1246 uik[19] = -(d[1]*u[18] + d[7]*u[19] + d[13]*u[20] + d[19]*u[21] + d[25]*u[22] + d[31]*u[23]); 1247 uik[20] = -(d[2]*u[18] + d[8]*u[19] + d[14]*u[20] + d[20]*u[21] + d[26]*u[22] + d[32]*u[23]); 1248 uik[21] = -(d[3]*u[18] + d[9]*u[19] + d[15]*u[20] + d[21]*u[21] + d[27]*u[22] + d[33]*u[23]); 1249 uik[22] = -(d[4]*u[18]+ d[10]*u[19] + d[16]*u[20] + d[22]*u[21] + d[28]*u[22] + d[34]*u[23]); 1250 uik[23] = -(d[5]*u[18]+ d[11]*u[19] + d[17]*u[20] + d[23]*u[21] + d[29]*u[22] + d[35]*u[23]); 1251 1252 uik[24] = -(d[0]*u[24] + d[6]*u[25] + d[12]*u[26] + d[18]*u[27] + d[24]*u[28] + d[30]*u[29]); 1253 uik[25] = -(d[1]*u[24] + d[7]*u[25] + d[13]*u[26] + d[19]*u[27] + d[25]*u[28] + d[31]*u[29]); 1254 uik[26] = -(d[2]*u[24] + d[8]*u[25] + d[14]*u[26] + d[20]*u[27] + d[26]*u[28] + d[32]*u[29]); 1255 uik[27] = -(d[3]*u[24] + d[9]*u[25] + d[15]*u[26] + d[21]*u[27] + d[27]*u[28] + d[33]*u[29]); 1256 uik[28] = -(d[4]*u[24]+ d[10]*u[25] + d[16]*u[26] + d[22]*u[27] + d[28]*u[28] + d[34]*u[29]); 1257 uik[29] = -(d[5]*u[24]+ d[11]*u[25] + d[17]*u[26] + d[23]*u[27] + d[29]*u[28] + d[35]*u[29]); 1258 1259 uik[30] = -(d[0]*u[30] + d[6]*u[31] + d[12]*u[32] + d[18]*u[33] + d[24]*u[34] + d[30]*u[35]); 1260 uik[31] = -(d[1]*u[30] + d[7]*u[31] + d[13]*u[32] + d[19]*u[33] + d[25]*u[34] + d[31]*u[35]); 1261 uik[32] = -(d[2]*u[30] + d[8]*u[31] + d[14]*u[32] + d[20]*u[33] + d[26]*u[34] + d[32]*u[35]); 1262 uik[33] = -(d[3]*u[30] + d[9]*u[31] + d[15]*u[32] + d[21]*u[33] + d[27]*u[34] + d[33]*u[35]); 1263 uik[34] = -(d[4]*u[30]+ d[10]*u[31] + d[16]*u[32] + d[22]*u[33] + d[28]*u[34] + d[34]*u[35]); 1264 uik[35] = -(d[5]*u[30]+ d[11]*u[31] + d[17]*u[32] + d[23]*u[33] + d[29]*u[34] + d[35]*u[35]); 1265 1266 /* update D(k) += -U(i,k)^T * U_bar(i,k) */ 1267 dk[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5]; 1268 dk[1] += uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2] + uik[9]*u[3]+ uik[10]*u[4]+ uik[11]*u[5]; 1269 dk[2] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3]+ uik[16]*u[4]+ uik[17]*u[5]; 1270 dk[3] += uik[18]*u[0]+ uik[19]*u[1]+ uik[20]*u[2]+ uik[21]*u[3]+ uik[22]*u[4]+ uik[23]*u[5]; 1271 dk[4] += uik[24]*u[0]+ uik[25]*u[1]+ uik[26]*u[2]+ uik[27]*u[3]+ uik[28]*u[4]+ uik[29]*u[5]; 1272 dk[5] += uik[30]*u[0]+ uik[31]*u[1]+ uik[32]*u[2]+ uik[33]*u[3]+ uik[34]*u[4]+ uik[35]*u[5]; 1273 1274 dk[6] += uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8] + uik[3]*u[9] + uik[4]*u[10] + uik[5]*u[11]; 1275 dk[7] += uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11]; 1276 dk[8] += uik[12]*u[6]+ uik[13]*u[7]+ uik[14]*u[8]+ uik[15]*u[9]+ uik[16]*u[10]+ uik[17]*u[11]; 1277 dk[9] += uik[18]*u[6]+ uik[19]*u[7]+ uik[20]*u[8]+ uik[21]*u[9]+ uik[22]*u[10]+ uik[23]*u[11]; 1278 dk[10]+= uik[24]*u[6]+ uik[25]*u[7]+ uik[26]*u[8]+ uik[27]*u[9]+ uik[28]*u[10]+ uik[29]*u[11]; 1279 dk[11]+= uik[30]*u[6]+ uik[31]*u[7]+ uik[32]*u[8]+ uik[33]*u[9]+ uik[34]*u[10]+ uik[35]*u[11]; 1280 1281 dk[12]+= uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15] + uik[4]*u[16] + uik[5]*u[17]; 1282 dk[13]+= uik[6]*u[12] + uik[7]*u[13] + uik[8]*u[14] + uik[9]*u[15]+ uik[10]*u[16]+ uik[11]*u[17]; 1283 dk[14]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]; 1284 dk[15]+= uik[18]*u[12]+ uik[19]*u[13]+ uik[20]*u[14]+ uik[21]*u[15]+ uik[22]*u[16]+ uik[23]*u[17]; 1285 dk[16]+= uik[24]*u[12]+ uik[25]*u[13]+ uik[26]*u[14]+ uik[27]*u[15]+ uik[28]*u[16]+ uik[29]*u[17]; 1286 dk[17]+= uik[30]*u[12]+ uik[31]*u[13]+ uik[32]*u[14]+ uik[33]*u[15]+ uik[34]*u[16]+ uik[35]*u[17]; 1287 1288 dk[18]+= uik[0]*u[18] + uik[1]*u[19] + uik[2]*u[20] + uik[3]*u[21] + uik[4]*u[22] + uik[5]*u[23]; 1289 dk[19]+= uik[6]*u[18] + uik[7]*u[19] + uik[8]*u[20] + uik[9]*u[21]+ uik[10]*u[22]+ uik[11]*u[23]; 1290 dk[20]+= uik[12]*u[18]+ uik[13]*u[19]+ uik[14]*u[20]+ uik[15]*u[21]+ uik[16]*u[22]+ uik[17]*u[23]; 1291 dk[21]+= uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]; 1292 dk[22]+= uik[24]*u[18]+ uik[25]*u[19]+ uik[26]*u[20]+ uik[27]*u[21]+ uik[28]*u[22]+ uik[29]*u[23]; 1293 dk[23]+= uik[30]*u[18]+ uik[31]*u[19]+ uik[32]*u[20]+ uik[33]*u[21]+ uik[34]*u[22]+ uik[35]*u[23]; 1294 1295 dk[24]+= uik[0]*u[24] + uik[1]*u[25] + uik[2]*u[26] + uik[3]*u[27] + uik[4]*u[28] + uik[5]*u[29]; 1296 dk[25]+= uik[6]*u[24] + uik[7]*u[25] + uik[8]*u[26] + uik[9]*u[27]+ uik[10]*u[28]+ uik[11]*u[29]; 1297 dk[26]+= uik[12]*u[24]+ uik[13]*u[25]+ uik[14]*u[26]+ uik[15]*u[27]+ uik[16]*u[28]+ uik[17]*u[29]; 1298 dk[27]+= uik[18]*u[24]+ uik[19]*u[25]+ uik[20]*u[26]+ uik[21]*u[27]+ uik[22]*u[28]+ uik[23]*u[29]; 1299 dk[28]+= uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27]+ uik[28]*u[28]+ uik[29]*u[29]; 1300 dk[29]+= uik[30]*u[24]+ uik[31]*u[25]+ uik[32]*u[26]+ uik[33]*u[27]+ uik[34]*u[28]+ uik[35]*u[29]; 1301 1302 dk[30]+= uik[0]*u[30] + uik[1]*u[31] + uik[2]*u[32] + uik[3]*u[33] + uik[4]*u[34] + uik[5]*u[35]; 1303 dk[31]+= uik[6]*u[30] + uik[7]*u[31] + uik[8]*u[32] + uik[9]*u[33]+ uik[10]*u[34]+ uik[11]*u[35]; 1304 dk[32]+= uik[12]*u[30]+ uik[13]*u[31]+ uik[14]*u[32]+ uik[15]*u[33]+ uik[16]*u[34]+ uik[17]*u[35]; 1305 dk[33]+= uik[18]*u[30]+ uik[19]*u[31]+ uik[20]*u[32]+ uik[21]*u[33]+ uik[22]*u[34]+ uik[23]*u[35]; 1306 dk[34]+= uik[24]*u[30]+ uik[25]*u[31]+ uik[26]*u[32]+ uik[27]*u[33]+ uik[28]*u[34]+ uik[29]*u[35]; 1307 dk[35]+= uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34]+ uik[35]*u[35]; 1308 1309 /* update -U(i,k) */ 1310 ierr = PetscMemcpy(ba+ili*36,uik,36*sizeof(MatScalar));CHKERRQ(ierr); 1311 1312 /* add multiple of row i to k-th row ... */ 1313 jmin = ili + 1; jmax = bi[i+1]; 1314 if (jmin < jmax){ 1315 for (j=jmin; j<jmax; j++) { 1316 /* w += -U(i,k)^T * U_bar(i,j) */ 1317 wp = w + bj[j]*36; 1318 u = ba + j*36; 1319 wp[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5]; 1320 wp[1] += uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2] + uik[9]*u[3]+ uik[10]*u[4]+ uik[11]*u[5]; 1321 wp[2] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3]+ uik[16]*u[4]+ uik[17]*u[5]; 1322 wp[3] += uik[18]*u[0]+ uik[19]*u[1]+ uik[20]*u[2]+ uik[21]*u[3]+ uik[22]*u[4]+ uik[23]*u[5]; 1323 wp[4] += uik[24]*u[0]+ uik[25]*u[1]+ uik[26]*u[2]+ uik[27]*u[3]+ uik[28]*u[4]+ uik[29]*u[5]; 1324 wp[5] += uik[30]*u[0]+ uik[31]*u[1]+ uik[32]*u[2]+ uik[33]*u[3]+ uik[34]*u[4]+ uik[35]*u[5]; 1325 1326 wp[6] += uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8] + uik[3]*u[9] + uik[4]*u[10] + uik[5]*u[11]; 1327 wp[7] += uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11]; 1328 wp[8] += uik[12]*u[6]+ uik[13]*u[7]+ uik[14]*u[8]+ uik[15]*u[9]+ uik[16]*u[10]+ uik[17]*u[11]; 1329 wp[9] += uik[18]*u[6]+ uik[19]*u[7]+ uik[20]*u[8]+ uik[21]*u[9]+ uik[22]*u[10]+ uik[23]*u[11]; 1330 wp[10]+= uik[24]*u[6]+ uik[25]*u[7]+ uik[26]*u[8]+ uik[27]*u[9]+ uik[28]*u[10]+ uik[29]*u[11]; 1331 wp[11]+= uik[30]*u[6]+ uik[31]*u[7]+ uik[32]*u[8]+ uik[33]*u[9]+ uik[34]*u[10]+ uik[35]*u[11]; 1332 1333 wp[12]+= uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15] + uik[4]*u[16] + uik[5]*u[17]; 1334 wp[13]+= uik[6]*u[12] + uik[7]*u[13] + uik[8]*u[14] + uik[9]*u[15]+ uik[10]*u[16]+ uik[11]*u[17]; 1335 wp[14]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]; 1336 wp[15]+= uik[18]*u[12]+ uik[19]*u[13]+ uik[20]*u[14]+ uik[21]*u[15]+ uik[22]*u[16]+ uik[23]*u[17]; 1337 wp[16]+= uik[24]*u[12]+ uik[25]*u[13]+ uik[26]*u[14]+ uik[27]*u[15]+ uik[28]*u[16]+ uik[29]*u[17]; 1338 wp[17]+= uik[30]*u[12]+ uik[31]*u[13]+ uik[32]*u[14]+ uik[33]*u[15]+ uik[34]*u[16]+ uik[35]*u[17]; 1339 1340 wp[18]+= uik[0]*u[18] + uik[1]*u[19] + uik[2]*u[20] + uik[3]*u[21] + uik[4]*u[22] + uik[5]*u[23]; 1341 wp[19]+= uik[6]*u[18] + uik[7]*u[19] + uik[8]*u[20] + uik[9]*u[21]+ uik[10]*u[22]+ uik[11]*u[23]; 1342 wp[20]+= uik[12]*u[18]+ uik[13]*u[19]+ uik[14]*u[20]+ uik[15]*u[21]+ uik[16]*u[22]+ uik[17]*u[23]; 1343 wp[21]+= uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]; 1344 wp[22]+= uik[24]*u[18]+ uik[25]*u[19]+ uik[26]*u[20]+ uik[27]*u[21]+ uik[28]*u[22]+ uik[29]*u[23]; 1345 wp[23]+= uik[30]*u[18]+ uik[31]*u[19]+ uik[32]*u[20]+ uik[33]*u[21]+ uik[34]*u[22]+ uik[35]*u[23]; 1346 1347 wp[24]+= uik[0]*u[24] + uik[1]*u[25] + uik[2]*u[26] + uik[3]*u[27] + uik[4]*u[28] + uik[5]*u[29]; 1348 wp[25]+= uik[6]*u[24] + uik[7]*u[25] + uik[8]*u[26] + uik[9]*u[27]+ uik[10]*u[28]+ uik[11]*u[29]; 1349 wp[26]+= uik[12]*u[24]+ uik[13]*u[25]+ uik[14]*u[26]+ uik[15]*u[27]+ uik[16]*u[28]+ uik[17]*u[29]; 1350 wp[27]+= uik[18]*u[24]+ uik[19]*u[25]+ uik[20]*u[26]+ uik[21]*u[27]+ uik[22]*u[28]+ uik[23]*u[29]; 1351 wp[28]+= uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27]+ uik[28]*u[28]+ uik[29]*u[29]; 1352 wp[29]+= uik[30]*u[24]+ uik[31]*u[25]+ uik[32]*u[26]+ uik[33]*u[27]+ uik[34]*u[28]+ uik[35]*u[29]; 1353 1354 wp[30]+= uik[0]*u[30] + uik[1]*u[31] + uik[2]*u[32] + uik[3]*u[33] + uik[4]*u[34] + uik[5]*u[35]; 1355 wp[31]+= uik[6]*u[30] + uik[7]*u[31] + uik[8]*u[32] + uik[9]*u[33]+ uik[10]*u[34]+ uik[11]*u[35]; 1356 wp[32]+= uik[12]*u[30]+ uik[13]*u[31]+ uik[14]*u[32]+ uik[15]*u[33]+ uik[16]*u[34]+ uik[17]*u[35]; 1357 wp[33]+= uik[18]*u[30]+ uik[19]*u[31]+ uik[20]*u[32]+ uik[21]*u[33]+ uik[22]*u[34]+ uik[23]*u[35]; 1358 wp[34]+= uik[24]*u[30]+ uik[25]*u[31]+ uik[26]*u[32]+ uik[27]*u[33]+ uik[28]*u[34]+ uik[29]*u[35]; 1359 wp[35]+= uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34]+ uik[35]*u[35]; 1360 } 1361 1362 /* ... add i to row list for next nonzero entry */ 1363 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 1364 j = bj[jmin]; 1365 jl[i] = jl[j]; jl[j] = i; /* update jl */ 1366 } 1367 i = nexti; 1368 } 1369 1370 /* save nonzero entries in k-th row of U ... */ 1371 1372 /* invert diagonal block */ 1373 d = ba+k*36; 1374 ierr = PetscMemcpy(d,dk,36*sizeof(MatScalar));CHKERRQ(ierr); 1375 ierr = Kernel_A_gets_inverse_A_6(d);CHKERRQ(ierr); 1376 1377 jmin = bi[k]; jmax = bi[k+1]; 1378 if (jmin < jmax) { 1379 for (j=jmin; j<jmax; j++){ 1380 vj = bj[j]; /* block col. index of U */ 1381 u = ba + j*36; 1382 wp = w + vj*36; 1383 for (k1=0; k1<36; k1++){ 1384 *u++ = *wp; 1385 *wp++ = 0.0; 1386 } 1387 } 1388 1389 /* ... add k to row list for first nonzero entry in k-th row */ 1390 il[k] = jmin; 1391 i = bj[jmin]; 1392 jl[k] = jl[i]; jl[i] = k; 1393 } 1394 } 1395 1396 ierr = PetscFree(w);CHKERRQ(ierr); 1397 ierr = PetscFree(il);CHKERRQ(ierr); 1398 ierr = PetscFree(jl);CHKERRQ(ierr); 1399 ierr = PetscFree(dk);CHKERRQ(ierr); 1400 ierr = PetscFree(uik);CHKERRQ(ierr); 1401 if (a->permute){ 1402 ierr = PetscFree(aa);CHKERRQ(ierr); 1403 } 1404 1405 ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr); 1406 C->factor = FACTOR_CHOLESKY; 1407 C->assembled = PETSC_TRUE; 1408 C->preallocated = PETSC_TRUE; 1409 PLogFlops(1.3333*216*b->mbs); /* from inverting diagonal blocks */ 1410 PetscFunctionReturn(0); 1411 } 1412 1413 /* 1414 Version for when blocks are 6 by 6 Using natural ordering 1415 */ 1416 #undef __FUNC__ 1417 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering" 1418 int MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering(Mat A,Mat *B) 1419 { 1420 Mat C = *B; 1421 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 1422 int ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 1423 int *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 1424 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 1425 MatScalar *u,*d,*w,*wp; 1426 1427 PetscFunctionBegin; 1428 1429 /* initialization */ 1430 w = (MatScalar*)PetscMalloc(36*mbs*sizeof(MatScalar));CHKPTRQ(w); 1431 ierr = PetscMemzero(w,36*mbs*sizeof(MatScalar));CHKERRQ(ierr); 1432 il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il); 1433 jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl); 1434 for (i=0; i<mbs; i++) { 1435 jl[i] = mbs; il[0] = 0; 1436 } 1437 dk = (MatScalar*)PetscMalloc(36*sizeof(MatScalar));CHKPTRQ(dk); 1438 uik = (MatScalar*)PetscMalloc(36*sizeof(MatScalar));CHKPTRQ(uik); 1439 1440 ai = a->i; aj = a->j; aa = a->a; 1441 1442 /* for each row k */ 1443 for (k = 0; k<mbs; k++){ 1444 1445 /*initialize k-th row with elements nonzero in row k of A */ 1446 jmin = ai[k]; jmax = ai[k+1]; 1447 if (jmin < jmax) { 1448 ap = aa + jmin*36; 1449 for (j = jmin; j < jmax; j++){ 1450 vj = aj[j]; /* block col. index */ 1451 wp = w + vj*36; 1452 for (i=0; i<36; i++) *wp++ = *ap++; 1453 } 1454 } 1455 1456 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 1457 ierr = PetscMemcpy(dk,w+k*36,36*sizeof(MatScalar));CHKERRQ(ierr); 1458 i = jl[k]; /* first row to be added to k_th row */ 1459 1460 while (i < mbs){ 1461 nexti = jl[i]; /* next row to be added to k_th row */ 1462 1463 /* compute multiplier */ 1464 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 1465 1466 /* uik = -inv(Di)*U_bar(i,k) */ 1467 d = ba + i*36; 1468 u = ba + ili*36; 1469 1470 uik[0] = -(d[0]*u[0] + d[6]*u[1] + d[12]*u[2] + d[18]*u[3] + d[24]*u[4] + d[30]*u[5]); 1471 uik[1] = -(d[1]*u[0] + d[7]*u[1] + d[13]*u[2] + d[19]*u[3] + d[25]*u[4] + d[31]*u[5]); 1472 uik[2] = -(d[2]*u[0] + d[8]*u[1] + d[14]*u[2] + d[20]*u[3] + d[26]*u[4] + d[32]*u[5]); 1473 uik[3] = -(d[3]*u[0] + d[9]*u[1] + d[15]*u[2] + d[21]*u[3] + d[27]*u[4] + d[33]*u[5]); 1474 uik[4] = -(d[4]*u[0]+ d[10]*u[1] + d[16]*u[2] + d[22]*u[3] + d[28]*u[4] + d[34]*u[5]); 1475 uik[5] = -(d[5]*u[0]+ d[11]*u[1] + d[17]*u[2] + d[23]*u[3] + d[29]*u[4] + d[35]*u[5]); 1476 1477 uik[6] = -(d[0]*u[6] + d[6]*u[7] + d[12]*u[8] + d[18]*u[9] + d[24]*u[10] + d[30]*u[11]); 1478 uik[7] = -(d[1]*u[6] + d[7]*u[7] + d[13]*u[8] + d[19]*u[9] + d[25]*u[10] + d[31]*u[11]); 1479 uik[8] = -(d[2]*u[6] + d[8]*u[7] + d[14]*u[8] + d[20]*u[9] + d[26]*u[10] + d[32]*u[11]); 1480 uik[9] = -(d[3]*u[6] + d[9]*u[7] + d[15]*u[8] + d[21]*u[9] + d[27]*u[10] + d[33]*u[11]); 1481 uik[10]= -(d[4]*u[6]+ d[10]*u[7] + d[16]*u[8] + d[22]*u[9] + d[28]*u[10] + d[34]*u[11]); 1482 uik[11]= -(d[5]*u[6]+ d[11]*u[7] + d[17]*u[8] + d[23]*u[9] + d[29]*u[10] + d[35]*u[11]); 1483 1484 uik[12] = -(d[0]*u[12] + d[6]*u[13] + d[12]*u[14] + d[18]*u[15] + d[24]*u[16] + d[30]*u[17]); 1485 uik[13] = -(d[1]*u[12] + d[7]*u[13] + d[13]*u[14] + d[19]*u[15] + d[25]*u[16] + d[31]*u[17]); 1486 uik[14] = -(d[2]*u[12] + d[8]*u[13] + d[14]*u[14] + d[20]*u[15] + d[26]*u[16] + d[32]*u[17]); 1487 uik[15] = -(d[3]*u[12] + d[9]*u[13] + d[15]*u[14] + d[21]*u[15] + d[27]*u[16] + d[33]*u[17]); 1488 uik[16] = -(d[4]*u[12]+ d[10]*u[13] + d[16]*u[14] + d[22]*u[15] + d[28]*u[16] + d[34]*u[17]); 1489 uik[17] = -(d[5]*u[12]+ d[11]*u[13] + d[17]*u[14] + d[23]*u[15] + d[29]*u[16] + d[35]*u[17]); 1490 1491 uik[18] = -(d[0]*u[18] + d[6]*u[19] + d[12]*u[20] + d[18]*u[21] + d[24]*u[22] + d[30]*u[23]); 1492 uik[19] = -(d[1]*u[18] + d[7]*u[19] + d[13]*u[20] + d[19]*u[21] + d[25]*u[22] + d[31]*u[23]); 1493 uik[20] = -(d[2]*u[18] + d[8]*u[19] + d[14]*u[20] + d[20]*u[21] + d[26]*u[22] + d[32]*u[23]); 1494 uik[21] = -(d[3]*u[18] + d[9]*u[19] + d[15]*u[20] + d[21]*u[21] + d[27]*u[22] + d[33]*u[23]); 1495 uik[22] = -(d[4]*u[18]+ d[10]*u[19] + d[16]*u[20] + d[22]*u[21] + d[28]*u[22] + d[34]*u[23]); 1496 uik[23] = -(d[5]*u[18]+ d[11]*u[19] + d[17]*u[20] + d[23]*u[21] + d[29]*u[22] + d[35]*u[23]); 1497 1498 uik[24] = -(d[0]*u[24] + d[6]*u[25] + d[12]*u[26] + d[18]*u[27] + d[24]*u[28] + d[30]*u[29]); 1499 uik[25] = -(d[1]*u[24] + d[7]*u[25] + d[13]*u[26] + d[19]*u[27] + d[25]*u[28] + d[31]*u[29]); 1500 uik[26] = -(d[2]*u[24] + d[8]*u[25] + d[14]*u[26] + d[20]*u[27] + d[26]*u[28] + d[32]*u[29]); 1501 uik[27] = -(d[3]*u[24] + d[9]*u[25] + d[15]*u[26] + d[21]*u[27] + d[27]*u[28] + d[33]*u[29]); 1502 uik[28] = -(d[4]*u[24]+ d[10]*u[25] + d[16]*u[26] + d[22]*u[27] + d[28]*u[28] + d[34]*u[29]); 1503 uik[29] = -(d[5]*u[24]+ d[11]*u[25] + d[17]*u[26] + d[23]*u[27] + d[29]*u[28] + d[35]*u[29]); 1504 1505 uik[30] = -(d[0]*u[30] + d[6]*u[31] + d[12]*u[32] + d[18]*u[33] + d[24]*u[34] + d[30]*u[35]); 1506 uik[31] = -(d[1]*u[30] + d[7]*u[31] + d[13]*u[32] + d[19]*u[33] + d[25]*u[34] + d[31]*u[35]); 1507 uik[32] = -(d[2]*u[30] + d[8]*u[31] + d[14]*u[32] + d[20]*u[33] + d[26]*u[34] + d[32]*u[35]); 1508 uik[33] = -(d[3]*u[30] + d[9]*u[31] + d[15]*u[32] + d[21]*u[33] + d[27]*u[34] + d[33]*u[35]); 1509 uik[34] = -(d[4]*u[30]+ d[10]*u[31] + d[16]*u[32] + d[22]*u[33] + d[28]*u[34] + d[34]*u[35]); 1510 uik[35] = -(d[5]*u[30]+ d[11]*u[31] + d[17]*u[32] + d[23]*u[33] + d[29]*u[34] + d[35]*u[35]); 1511 1512 /* update D(k) += -U(i,k)^T * U_bar(i,k) */ 1513 dk[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5]; 1514 dk[1] += uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2] + uik[9]*u[3]+ uik[10]*u[4]+ uik[11]*u[5]; 1515 dk[2] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3]+ uik[16]*u[4]+ uik[17]*u[5]; 1516 dk[3] += uik[18]*u[0]+ uik[19]*u[1]+ uik[20]*u[2]+ uik[21]*u[3]+ uik[22]*u[4]+ uik[23]*u[5]; 1517 dk[4] += uik[24]*u[0]+ uik[25]*u[1]+ uik[26]*u[2]+ uik[27]*u[3]+ uik[28]*u[4]+ uik[29]*u[5]; 1518 dk[5] += uik[30]*u[0]+ uik[31]*u[1]+ uik[32]*u[2]+ uik[33]*u[3]+ uik[34]*u[4]+ uik[35]*u[5]; 1519 1520 dk[6] += uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8] + uik[3]*u[9] + uik[4]*u[10] + uik[5]*u[11]; 1521 dk[7] += uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11]; 1522 dk[8] += uik[12]*u[6]+ uik[13]*u[7]+ uik[14]*u[8]+ uik[15]*u[9]+ uik[16]*u[10]+ uik[17]*u[11]; 1523 dk[9] += uik[18]*u[6]+ uik[19]*u[7]+ uik[20]*u[8]+ uik[21]*u[9]+ uik[22]*u[10]+ uik[23]*u[11]; 1524 dk[10]+= uik[24]*u[6]+ uik[25]*u[7]+ uik[26]*u[8]+ uik[27]*u[9]+ uik[28]*u[10]+ uik[29]*u[11]; 1525 dk[11]+= uik[30]*u[6]+ uik[31]*u[7]+ uik[32]*u[8]+ uik[33]*u[9]+ uik[34]*u[10]+ uik[35]*u[11]; 1526 1527 dk[12]+= uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15] + uik[4]*u[16] + uik[5]*u[17]; 1528 dk[13]+= uik[6]*u[12] + uik[7]*u[13] + uik[8]*u[14] + uik[9]*u[15]+ uik[10]*u[16]+ uik[11]*u[17]; 1529 dk[14]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]; 1530 dk[15]+= uik[18]*u[12]+ uik[19]*u[13]+ uik[20]*u[14]+ uik[21]*u[15]+ uik[22]*u[16]+ uik[23]*u[17]; 1531 dk[16]+= uik[24]*u[12]+ uik[25]*u[13]+ uik[26]*u[14]+ uik[27]*u[15]+ uik[28]*u[16]+ uik[29]*u[17]; 1532 dk[17]+= uik[30]*u[12]+ uik[31]*u[13]+ uik[32]*u[14]+ uik[33]*u[15]+ uik[34]*u[16]+ uik[35]*u[17]; 1533 1534 dk[18]+= uik[0]*u[18] + uik[1]*u[19] + uik[2]*u[20] + uik[3]*u[21] + uik[4]*u[22] + uik[5]*u[23]; 1535 dk[19]+= uik[6]*u[18] + uik[7]*u[19] + uik[8]*u[20] + uik[9]*u[21]+ uik[10]*u[22]+ uik[11]*u[23]; 1536 dk[20]+= uik[12]*u[18]+ uik[13]*u[19]+ uik[14]*u[20]+ uik[15]*u[21]+ uik[16]*u[22]+ uik[17]*u[23]; 1537 dk[21]+= uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]; 1538 dk[22]+= uik[24]*u[18]+ uik[25]*u[19]+ uik[26]*u[20]+ uik[27]*u[21]+ uik[28]*u[22]+ uik[29]*u[23]; 1539 dk[23]+= uik[30]*u[18]+ uik[31]*u[19]+ uik[32]*u[20]+ uik[33]*u[21]+ uik[34]*u[22]+ uik[35]*u[23]; 1540 1541 dk[24]+= uik[0]*u[24] + uik[1]*u[25] + uik[2]*u[26] + uik[3]*u[27] + uik[4]*u[28] + uik[5]*u[29]; 1542 dk[25]+= uik[6]*u[24] + uik[7]*u[25] + uik[8]*u[26] + uik[9]*u[27]+ uik[10]*u[28]+ uik[11]*u[29]; 1543 dk[26]+= uik[12]*u[24]+ uik[13]*u[25]+ uik[14]*u[26]+ uik[15]*u[27]+ uik[16]*u[28]+ uik[17]*u[29]; 1544 dk[27]+= uik[18]*u[24]+ uik[19]*u[25]+ uik[20]*u[26]+ uik[21]*u[27]+ uik[22]*u[28]+ uik[23]*u[29]; 1545 dk[28]+= uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27]+ uik[28]*u[28]+ uik[29]*u[29]; 1546 dk[29]+= uik[30]*u[24]+ uik[31]*u[25]+ uik[32]*u[26]+ uik[33]*u[27]+ uik[34]*u[28]+ uik[35]*u[29]; 1547 1548 dk[30]+= uik[0]*u[30] + uik[1]*u[31] + uik[2]*u[32] + uik[3]*u[33] + uik[4]*u[34] + uik[5]*u[35]; 1549 dk[31]+= uik[6]*u[30] + uik[7]*u[31] + uik[8]*u[32] + uik[9]*u[33]+ uik[10]*u[34]+ uik[11]*u[35]; 1550 dk[32]+= uik[12]*u[30]+ uik[13]*u[31]+ uik[14]*u[32]+ uik[15]*u[33]+ uik[16]*u[34]+ uik[17]*u[35]; 1551 dk[33]+= uik[18]*u[30]+ uik[19]*u[31]+ uik[20]*u[32]+ uik[21]*u[33]+ uik[22]*u[34]+ uik[23]*u[35]; 1552 dk[34]+= uik[24]*u[30]+ uik[25]*u[31]+ uik[26]*u[32]+ uik[27]*u[33]+ uik[28]*u[34]+ uik[29]*u[35]; 1553 dk[35]+= uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34]+ uik[35]*u[35]; 1554 1555 /* update -U(i,k) */ 1556 ierr = PetscMemcpy(ba+ili*36,uik,36*sizeof(MatScalar));CHKERRQ(ierr); 1557 1558 /* add multiple of row i to k-th row ... */ 1559 jmin = ili + 1; jmax = bi[i+1]; 1560 if (jmin < jmax){ 1561 for (j=jmin; j<jmax; j++) { 1562 /* w += -U(i,k)^T * U_bar(i,j) */ 1563 wp = w + bj[j]*36; 1564 u = ba + j*36; 1565 wp[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5]; 1566 wp[1] += uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2] + uik[9]*u[3]+ uik[10]*u[4]+ uik[11]*u[5]; 1567 wp[2] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3]+ uik[16]*u[4]+ uik[17]*u[5]; 1568 wp[3] += uik[18]*u[0]+ uik[19]*u[1]+ uik[20]*u[2]+ uik[21]*u[3]+ uik[22]*u[4]+ uik[23]*u[5]; 1569 wp[4] += uik[24]*u[0]+ uik[25]*u[1]+ uik[26]*u[2]+ uik[27]*u[3]+ uik[28]*u[4]+ uik[29]*u[5]; 1570 wp[5] += uik[30]*u[0]+ uik[31]*u[1]+ uik[32]*u[2]+ uik[33]*u[3]+ uik[34]*u[4]+ uik[35]*u[5]; 1571 1572 wp[6] += uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8] + uik[3]*u[9] + uik[4]*u[10] + uik[5]*u[11]; 1573 wp[7] += uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11]; 1574 wp[8] += uik[12]*u[6]+ uik[13]*u[7]+ uik[14]*u[8]+ uik[15]*u[9]+ uik[16]*u[10]+ uik[17]*u[11]; 1575 wp[9] += uik[18]*u[6]+ uik[19]*u[7]+ uik[20]*u[8]+ uik[21]*u[9]+ uik[22]*u[10]+ uik[23]*u[11]; 1576 wp[10]+= uik[24]*u[6]+ uik[25]*u[7]+ uik[26]*u[8]+ uik[27]*u[9]+ uik[28]*u[10]+ uik[29]*u[11]; 1577 wp[11]+= uik[30]*u[6]+ uik[31]*u[7]+ uik[32]*u[8]+ uik[33]*u[9]+ uik[34]*u[10]+ uik[35]*u[11]; 1578 1579 wp[12]+= uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15] + uik[4]*u[16] + uik[5]*u[17]; 1580 wp[13]+= uik[6]*u[12] + uik[7]*u[13] + uik[8]*u[14] + uik[9]*u[15]+ uik[10]*u[16]+ uik[11]*u[17]; 1581 wp[14]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]; 1582 wp[15]+= uik[18]*u[12]+ uik[19]*u[13]+ uik[20]*u[14]+ uik[21]*u[15]+ uik[22]*u[16]+ uik[23]*u[17]; 1583 wp[16]+= uik[24]*u[12]+ uik[25]*u[13]+ uik[26]*u[14]+ uik[27]*u[15]+ uik[28]*u[16]+ uik[29]*u[17]; 1584 wp[17]+= uik[30]*u[12]+ uik[31]*u[13]+ uik[32]*u[14]+ uik[33]*u[15]+ uik[34]*u[16]+ uik[35]*u[17]; 1585 1586 wp[18]+= uik[0]*u[18] + uik[1]*u[19] + uik[2]*u[20] + uik[3]*u[21] + uik[4]*u[22] + uik[5]*u[23]; 1587 wp[19]+= uik[6]*u[18] + uik[7]*u[19] + uik[8]*u[20] + uik[9]*u[21]+ uik[10]*u[22]+ uik[11]*u[23]; 1588 wp[20]+= uik[12]*u[18]+ uik[13]*u[19]+ uik[14]*u[20]+ uik[15]*u[21]+ uik[16]*u[22]+ uik[17]*u[23]; 1589 wp[21]+= uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]; 1590 wp[22]+= uik[24]*u[18]+ uik[25]*u[19]+ uik[26]*u[20]+ uik[27]*u[21]+ uik[28]*u[22]+ uik[29]*u[23]; 1591 wp[23]+= uik[30]*u[18]+ uik[31]*u[19]+ uik[32]*u[20]+ uik[33]*u[21]+ uik[34]*u[22]+ uik[35]*u[23]; 1592 1593 wp[24]+= uik[0]*u[24] + uik[1]*u[25] + uik[2]*u[26] + uik[3]*u[27] + uik[4]*u[28] + uik[5]*u[29]; 1594 wp[25]+= uik[6]*u[24] + uik[7]*u[25] + uik[8]*u[26] + uik[9]*u[27]+ uik[10]*u[28]+ uik[11]*u[29]; 1595 wp[26]+= uik[12]*u[24]+ uik[13]*u[25]+ uik[14]*u[26]+ uik[15]*u[27]+ uik[16]*u[28]+ uik[17]*u[29]; 1596 wp[27]+= uik[18]*u[24]+ uik[19]*u[25]+ uik[20]*u[26]+ uik[21]*u[27]+ uik[22]*u[28]+ uik[23]*u[29]; 1597 wp[28]+= uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27]+ uik[28]*u[28]+ uik[29]*u[29]; 1598 wp[29]+= uik[30]*u[24]+ uik[31]*u[25]+ uik[32]*u[26]+ uik[33]*u[27]+ uik[34]*u[28]+ uik[35]*u[29]; 1599 1600 wp[30]+= uik[0]*u[30] + uik[1]*u[31] + uik[2]*u[32] + uik[3]*u[33] + uik[4]*u[34] + uik[5]*u[35]; 1601 wp[31]+= uik[6]*u[30] + uik[7]*u[31] + uik[8]*u[32] + uik[9]*u[33]+ uik[10]*u[34]+ uik[11]*u[35]; 1602 wp[32]+= uik[12]*u[30]+ uik[13]*u[31]+ uik[14]*u[32]+ uik[15]*u[33]+ uik[16]*u[34]+ uik[17]*u[35]; 1603 wp[33]+= uik[18]*u[30]+ uik[19]*u[31]+ uik[20]*u[32]+ uik[21]*u[33]+ uik[22]*u[34]+ uik[23]*u[35]; 1604 wp[34]+= uik[24]*u[30]+ uik[25]*u[31]+ uik[26]*u[32]+ uik[27]*u[33]+ uik[28]*u[34]+ uik[29]*u[35]; 1605 wp[35]+= uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34]+ uik[35]*u[35]; 1606 } 1607 1608 /* ... add i to row list for next nonzero entry */ 1609 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 1610 j = bj[jmin]; 1611 jl[i] = jl[j]; jl[j] = i; /* update jl */ 1612 } 1613 i = nexti; 1614 } 1615 1616 /* save nonzero entries in k-th row of U ... */ 1617 1618 /* invert diagonal block */ 1619 d = ba+k*36; 1620 ierr = PetscMemcpy(d,dk,36*sizeof(MatScalar));CHKERRQ(ierr); 1621 ierr = Kernel_A_gets_inverse_A_6(d);CHKERRQ(ierr); 1622 1623 jmin = bi[k]; jmax = bi[k+1]; 1624 if (jmin < jmax) { 1625 for (j=jmin; j<jmax; j++){ 1626 vj = bj[j]; /* block col. index of U */ 1627 u = ba + j*36; 1628 wp = w + vj*36; 1629 for (k1=0; k1<36; k1++){ 1630 *u++ = *wp; 1631 *wp++ = 0.0; 1632 } 1633 } 1634 1635 /* ... add k to row list for first nonzero entry in k-th row */ 1636 il[k] = jmin; 1637 i = bj[jmin]; 1638 jl[k] = jl[i]; jl[i] = k; 1639 } 1640 } 1641 1642 ierr = PetscFree(w);CHKERRQ(ierr); 1643 ierr = PetscFree(il);CHKERRQ(ierr); 1644 ierr = PetscFree(jl);CHKERRQ(ierr); 1645 ierr = PetscFree(dk);CHKERRQ(ierr); 1646 ierr = PetscFree(uik);CHKERRQ(ierr); 1647 1648 C->factor = FACTOR_CHOLESKY; 1649 C->assembled = PETSC_TRUE; 1650 C->preallocated = PETSC_TRUE; 1651 PLogFlops(1.3333*216*b->mbs); /* from inverting diagonal blocks */ 1652 PetscFunctionReturn(0); 1653 } 1654 1655 /* Version for when blocks are 5 by 5 */ 1656 #undef __FUNC__ 1657 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_5" 1658 int MatCholeskyFactorNumeric_SeqSBAIJ_5(Mat A,Mat *B) 1659 { 1660 Mat C = *B; 1661 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 1662 IS perm = b->row; 1663 int *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 1664 int *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 1665 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 1666 MatScalar *u,*d,*rtmp,*rtmp_ptr; 1667 1668 PetscFunctionBegin; 1669 /* initialization */ 1670 rtmp = (MatScalar*)PetscMalloc(25*mbs*sizeof(MatScalar));CHKPTRQ(rtmp); 1671 ierr = PetscMemzero(rtmp,25*mbs*sizeof(MatScalar));CHKERRQ(ierr); 1672 il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il); 1673 jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl); 1674 for (i=0; i<mbs; i++) { 1675 jl[i] = mbs; il[0] = 0; 1676 } 1677 dk = (MatScalar*)PetscMalloc(25*sizeof(MatScalar));CHKPTRQ(dk); 1678 uik = (MatScalar*)PetscMalloc(25*sizeof(MatScalar));CHKPTRQ(uik); 1679 ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr); 1680 1681 /* check permutation */ 1682 if (!a->permute){ 1683 ai = a->i; aj = a->j; aa = a->a; 1684 } else { 1685 ai = a->inew; aj = a->jnew; 1686 aa = (MatScalar*)PetscMalloc(25*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa); 1687 ierr = PetscMemcpy(aa,a->a,25*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1688 a2anew = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew); 1689 ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr); 1690 1691 for (i=0; i<mbs; i++){ 1692 jmin = ai[i]; jmax = ai[i+1]; 1693 for (j=jmin; j<jmax; j++){ 1694 while (a2anew[j] != j){ 1695 k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k; 1696 for (k1=0; k1<25; k1++){ 1697 dk[k1] = aa[k*25+k1]; 1698 aa[k*25+k1] = aa[j*25+k1]; 1699 aa[j*25+k1] = dk[k1]; 1700 } 1701 } 1702 /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */ 1703 if (i > aj[j]){ 1704 /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */ 1705 ap = aa + j*25; /* ptr to the beginning of j-th block of aa */ 1706 for (k=0; k<25; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */ 1707 for (k=0; k<5; k++){ /* j-th block of aa <- dk^T */ 1708 for (k1=0; k1<5; k1++) *ap++ = dk[k + 5*k1]; 1709 } 1710 } 1711 } 1712 } 1713 ierr = PetscFree(a2anew);CHKERRA(ierr); 1714 } 1715 1716 /* for each row k */ 1717 for (k = 0; k<mbs; k++){ 1718 1719 /*initialize k-th row with elements nonzero in row perm(k) of A */ 1720 jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1]; 1721 if (jmin < jmax) { 1722 ap = aa + jmin*25; 1723 for (j = jmin; j < jmax; j++){ 1724 vj = perm_ptr[aj[j]]; /* block col. index */ 1725 rtmp_ptr = rtmp + vj*25; 1726 for (i=0; i<25; i++) *rtmp_ptr++ = *ap++; 1727 } 1728 } 1729 1730 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 1731 ierr = PetscMemcpy(dk,rtmp+k*25,25*sizeof(MatScalar));CHKERRQ(ierr); 1732 i = jl[k]; /* first row to be added to k_th row */ 1733 1734 while (i < mbs){ 1735 nexti = jl[i]; /* next row to be added to k_th row */ 1736 1737 /* compute multiplier */ 1738 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 1739 1740 /* uik = -inv(Di)*U_bar(i,k) */ 1741 d = ba + i*25; 1742 u = ba + ili*25; 1743 1744 uik[0] = -(d[0]*u[0] + d[5]*u[1] + d[10]*u[2] + d[15]*u[3] + d[20]*u[4]); 1745 uik[1] = -(d[1]*u[0] + d[6]*u[1] + d[11]*u[2] + d[16]*u[3] + d[21]*u[4]); 1746 uik[2] = -(d[2]*u[0] + d[7]*u[1] + d[12]*u[2] + d[17]*u[3] + d[22]*u[4]); 1747 uik[3] = -(d[3]*u[0] + d[8]*u[1] + d[13]*u[2] + d[18]*u[3] + d[23]*u[4]); 1748 uik[4] = -(d[4]*u[0] + d[9]*u[1] + d[14]*u[2] + d[19]*u[3] + d[24]*u[4]); 1749 1750 uik[5] = -(d[0]*u[5] + d[5]*u[6] + d[10]*u[7] + d[15]*u[8] + d[20]*u[9]); 1751 uik[6] = -(d[1]*u[5] + d[6]*u[6] + d[11]*u[7] + d[16]*u[8] + d[21]*u[9]); 1752 uik[7] = -(d[2]*u[5] + d[7]*u[6] + d[12]*u[7] + d[17]*u[8] + d[22]*u[9]); 1753 uik[8] = -(d[3]*u[5] + d[8]*u[6] + d[13]*u[7] + d[18]*u[8] + d[23]*u[9]); 1754 uik[9] = -(d[4]*u[5] + d[9]*u[6] + d[14]*u[7] + d[19]*u[8] + d[24]*u[9]); 1755 1756 uik[10]= -(d[0]*u[10] + d[5]*u[11] + d[10]*u[12] + d[15]*u[13] + d[20]*u[14]); 1757 uik[11]= -(d[1]*u[10] + d[6]*u[11] + d[11]*u[12] + d[16]*u[13] + d[21]*u[14]); 1758 uik[12]= -(d[2]*u[10] + d[7]*u[11] + d[12]*u[12] + d[17]*u[13] + d[22]*u[14]); 1759 uik[13]= -(d[3]*u[10] + d[8]*u[11] + d[13]*u[12] + d[18]*u[13] + d[23]*u[14]); 1760 uik[14]= -(d[4]*u[10] + d[9]*u[11] + d[14]*u[12] + d[19]*u[13] + d[24]*u[14]); 1761 1762 uik[15]= -(d[0]*u[15] + d[5]*u[16] + d[10]*u[17] + d[15]*u[18] + d[20]*u[19]); 1763 uik[16]= -(d[1]*u[15] + d[6]*u[16] + d[11]*u[17] + d[16]*u[18] + d[21]*u[19]); 1764 uik[17]= -(d[2]*u[15] + d[7]*u[16] + d[12]*u[17] + d[17]*u[18] + d[22]*u[19]); 1765 uik[18]= -(d[3]*u[15] + d[8]*u[16] + d[13]*u[17] + d[18]*u[18] + d[23]*u[19]); 1766 uik[19]= -(d[4]*u[15] + d[9]*u[16] + d[14]*u[17] + d[19]*u[18] + d[24]*u[19]); 1767 1768 uik[20]= -(d[0]*u[20] + d[5]*u[21] + d[10]*u[22] + d[15]*u[23] + d[20]*u[24]); 1769 uik[21]= -(d[1]*u[20] + d[6]*u[21] + d[11]*u[22] + d[16]*u[23] + d[21]*u[24]); 1770 uik[22]= -(d[2]*u[20] + d[7]*u[21] + d[12]*u[22] + d[17]*u[23] + d[22]*u[24]); 1771 uik[23]= -(d[3]*u[20] + d[8]*u[21] + d[13]*u[22] + d[18]*u[23] + d[23]*u[24]); 1772 uik[24]= -(d[4]*u[20] + d[9]*u[21] + d[14]*u[22] + d[19]*u[23] + d[24]*u[24]); 1773 1774 1775 /* update D(k) += -U(i,k)^T * U_bar(i,k) */ 1776 dk[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4]; 1777 dk[1] += uik[5]*u[0] + uik[6]*u[1] + uik[7]*u[2] + uik[8]*u[3] + uik[9]*u[4]; 1778 dk[2] += uik[10]*u[0]+ uik[11]*u[1]+ uik[12]*u[2]+ uik[13]*u[3]+ uik[14]*u[4]; 1779 dk[3] += uik[15]*u[0]+ uik[16]*u[1]+ uik[17]*u[2]+ uik[18]*u[3]+ uik[19]*u[4]; 1780 dk[4] += uik[20]*u[0]+ uik[21]*u[1]+ uik[22]*u[2]+ uik[23]*u[3]+ uik[24]*u[4]; 1781 1782 dk[5] += uik[0]*u[5] + uik[1]*u[6] + uik[2]*u[7] + uik[3]*u[8] + uik[4]*u[9]; 1783 dk[6] += uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]; 1784 dk[7] += uik[10]*u[5]+ uik[11]*u[6]+ uik[12]*u[7]+ uik[13]*u[8]+ uik[14]*u[9]; 1785 dk[8] += uik[15]*u[5]+ uik[16]*u[6]+ uik[17]*u[7]+ uik[18]*u[8]+ uik[19]*u[9]; 1786 dk[9] += uik[20]*u[5]+ uik[21]*u[6]+ uik[22]*u[7]+ uik[23]*u[8]+ uik[24]*u[9]; 1787 1788 dk[10] += uik[0]*u[10] + uik[1]*u[11] + uik[2]*u[12] + uik[3]*u[13] + uik[4]*u[14]; 1789 dk[11] += uik[5]*u[10] + uik[6]*u[11] + uik[7]*u[12] + uik[8]*u[13] + uik[9]*u[14]; 1790 dk[12] += uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]; 1791 dk[13] += uik[15]*u[10]+ uik[16]*u[11]+ uik[17]*u[12]+ uik[18]*u[13]+ uik[19]*u[14]; 1792 dk[14] += uik[20]*u[10]+ uik[21]*u[11]+ uik[22]*u[12]+ uik[23]*u[13]+ uik[24]*u[14]; 1793 1794 dk[15] += uik[0]*u[15] + uik[1]*u[16] + uik[2]*u[17] + uik[3]*u[18] + uik[4]*u[19]; 1795 dk[16] += uik[5]*u[15] + uik[6]*u[16] + uik[7]*u[17] + uik[8]*u[18] + uik[9]*u[19]; 1796 dk[17] += uik[10]*u[15]+ uik[11]*u[16]+ uik[12]*u[17]+ uik[13]*u[18]+ uik[14]*u[19]; 1797 dk[18] += uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19]; 1798 dk[19] += uik[20]*u[15]+ uik[21]*u[16]+ uik[22]*u[17]+ uik[23]*u[18]+ uik[24]*u[19]; 1799 1800 dk[20] += uik[0]*u[20] + uik[1]*u[21] + uik[2]*u[22] + uik[3]*u[23] + uik[4]*u[24]; 1801 dk[21] += uik[5]*u[20] + uik[6]*u[21] + uik[7]*u[22] + uik[8]*u[23] + uik[9]*u[24]; 1802 dk[22] += uik[10]*u[20]+ uik[11]*u[21]+ uik[12]*u[22]+ uik[13]*u[23]+ uik[14]*u[24]; 1803 dk[23] += uik[15]*u[20]+ uik[16]*u[21]+ uik[17]*u[22]+ uik[18]*u[23]+ uik[19]*u[24]; 1804 dk[24] += uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24]; 1805 1806 /* update -U(i,k) */ 1807 ierr = PetscMemcpy(ba+ili*25,uik,25*sizeof(MatScalar));CHKERRQ(ierr); 1808 1809 /* add multiple of row i to k-th row ... */ 1810 jmin = ili + 1; jmax = bi[i+1]; 1811 if (jmin < jmax){ 1812 for (j=jmin; j<jmax; j++) { 1813 /* rtmp += -U(i,k)^T * U_bar(i,j) */ 1814 rtmp_ptr = rtmp + bj[j]*25; 1815 u = ba + j*25; 1816 rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4]; 1817 rtmp_ptr[1] += uik[5]*u[0] + uik[6]*u[1] + uik[7]*u[2] + uik[8]*u[3] + uik[9]*u[4]; 1818 rtmp_ptr[2] += uik[10]*u[0]+ uik[11]*u[1]+ uik[12]*u[2]+ uik[13]*u[3]+ uik[14]*u[4]; 1819 rtmp_ptr[3] += uik[15]*u[0]+ uik[16]*u[1]+ uik[17]*u[2]+ uik[18]*u[3]+ uik[19]*u[4]; 1820 rtmp_ptr[4] += uik[20]*u[0]+ uik[21]*u[1]+ uik[22]*u[2]+ uik[23]*u[3]+ uik[24]*u[4]; 1821 1822 rtmp_ptr[5] += uik[0]*u[5] + uik[1]*u[6] + uik[2]*u[7] + uik[3]*u[8] + uik[4]*u[9]; 1823 rtmp_ptr[6] += uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]; 1824 rtmp_ptr[7] += uik[10]*u[5]+ uik[11]*u[6]+ uik[12]*u[7]+ uik[13]*u[8]+ uik[14]*u[9]; 1825 rtmp_ptr[8] += uik[15]*u[5]+ uik[16]*u[6]+ uik[17]*u[7]+ uik[18]*u[8]+ uik[19]*u[9]; 1826 rtmp_ptr[9] += uik[20]*u[5]+ uik[21]*u[6]+ uik[22]*u[7]+ uik[23]*u[8]+ uik[24]*u[9]; 1827 1828 rtmp_ptr[10] += uik[0]*u[10] + uik[1]*u[11] + uik[2]*u[12] + uik[3]*u[13] + uik[4]*u[14]; 1829 rtmp_ptr[11] += uik[5]*u[10] + uik[6]*u[11] + uik[7]*u[12] + uik[8]*u[13] + uik[9]*u[14]; 1830 rtmp_ptr[12] += uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]; 1831 rtmp_ptr[13] += uik[15]*u[10]+ uik[16]*u[11]+ uik[17]*u[12]+ uik[18]*u[13]+ uik[19]*u[14]; 1832 rtmp_ptr[14] += uik[20]*u[10]+ uik[21]*u[11]+ uik[22]*u[12]+ uik[23]*u[13]+ uik[24]*u[14]; 1833 1834 rtmp_ptr[15] += uik[0]*u[15] + uik[1]*u[16] + uik[2]*u[17] + uik[3]*u[18] + uik[4]*u[19]; 1835 rtmp_ptr[16] += uik[5]*u[15] + uik[6]*u[16] + uik[7]*u[17] + uik[8]*u[18] + uik[9]*u[19]; 1836 rtmp_ptr[17] += uik[10]*u[15]+ uik[11]*u[16]+ uik[12]*u[17]+ uik[13]*u[18]+ uik[14]*u[19]; 1837 rtmp_ptr[18] += uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19]; 1838 rtmp_ptr[19] += uik[20]*u[15]+ uik[21]*u[16]+ uik[22]*u[17]+ uik[23]*u[18]+ uik[24]*u[19]; 1839 1840 rtmp_ptr[20] += uik[0]*u[20] + uik[1]*u[21] + uik[2]*u[22] + uik[3]*u[23] + uik[4]*u[24]; 1841 rtmp_ptr[21] += uik[5]*u[20] + uik[6]*u[21] + uik[7]*u[22] + uik[8]*u[23] + uik[9]*u[24]; 1842 rtmp_ptr[22] += uik[10]*u[20]+ uik[11]*u[21]+ uik[12]*u[22]+ uik[13]*u[23]+ uik[14]*u[24]; 1843 rtmp_ptr[23] += uik[15]*u[20]+ uik[16]*u[21]+ uik[17]*u[22]+ uik[18]*u[23]+ uik[19]*u[24]; 1844 rtmp_ptr[24] += uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24]; 1845 } 1846 1847 /* ... add i to row list for next nonzero entry */ 1848 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 1849 j = bj[jmin]; 1850 jl[i] = jl[j]; jl[j] = i; /* update jl */ 1851 } 1852 i = nexti; 1853 } 1854 1855 /* save nonzero entries in k-th row of U ... */ 1856 1857 /* invert diagonal block */ 1858 d = ba+k*25; 1859 ierr = PetscMemcpy(d,dk,25*sizeof(MatScalar));CHKERRQ(ierr); 1860 ierr = Kernel_A_gets_inverse_A_5(d);CHKERRQ(ierr); 1861 1862 jmin = bi[k]; jmax = bi[k+1]; 1863 if (jmin < jmax) { 1864 for (j=jmin; j<jmax; j++){ 1865 vj = bj[j]; /* block col. index of U */ 1866 u = ba + j*25; 1867 rtmp_ptr = rtmp + vj*25; 1868 for (k1=0; k1<25; k1++){ 1869 *u++ = *rtmp_ptr; 1870 *rtmp_ptr++ = 0.0; 1871 } 1872 } 1873 1874 /* ... add k to row list for first nonzero entry in k-th row */ 1875 il[k] = jmin; 1876 i = bj[jmin]; 1877 jl[k] = jl[i]; jl[i] = k; 1878 } 1879 } 1880 1881 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1882 ierr = PetscFree(il);CHKERRQ(ierr); 1883 ierr = PetscFree(jl);CHKERRQ(ierr); 1884 ierr = PetscFree(dk);CHKERRQ(ierr); 1885 ierr = PetscFree(uik);CHKERRQ(ierr); 1886 if (a->permute){ 1887 ierr = PetscFree(aa);CHKERRQ(ierr); 1888 } 1889 1890 ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr); 1891 C->factor = FACTOR_CHOLESKY; 1892 C->assembled = PETSC_TRUE; 1893 C->preallocated = PETSC_TRUE; 1894 PLogFlops(1.3333*125*b->mbs); /* from inverting diagonal blocks */ 1895 PetscFunctionReturn(0); 1896 } 1897 1898 /* 1899 Version for when blocks are 5 by 5 Using natural ordering 1900 */ 1901 #undef __FUNC__ 1902 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering" 1903 int MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering(Mat A,Mat *B) 1904 { 1905 Mat C = *B; 1906 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 1907 int ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 1908 int *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 1909 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 1910 MatScalar *u,*d,*rtmp,*rtmp_ptr; 1911 1912 PetscFunctionBegin; 1913 1914 /* initialization */ 1915 rtmp = (MatScalar*)PetscMalloc(25*mbs*sizeof(MatScalar));CHKPTRQ(rtmp); 1916 ierr = PetscMemzero(rtmp,25*mbs*sizeof(MatScalar));CHKERRQ(ierr); 1917 il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il); 1918 jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl); 1919 for (i=0; i<mbs; i++) { 1920 jl[i] = mbs; il[0] = 0; 1921 } 1922 dk = (MatScalar*)PetscMalloc(25*sizeof(MatScalar));CHKPTRQ(dk); 1923 uik = (MatScalar*)PetscMalloc(25*sizeof(MatScalar));CHKPTRQ(uik); 1924 1925 ai = a->i; aj = a->j; aa = a->a; 1926 1927 /* for each row k */ 1928 for (k = 0; k<mbs; k++){ 1929 1930 /*initialize k-th row with elements nonzero in row k of A */ 1931 jmin = ai[k]; jmax = ai[k+1]; 1932 if (jmin < jmax) { 1933 ap = aa + jmin*25; 1934 for (j = jmin; j < jmax; j++){ 1935 vj = aj[j]; /* block col. index */ 1936 rtmp_ptr = rtmp + vj*25; 1937 for (i=0; i<25; i++) *rtmp_ptr++ = *ap++; 1938 } 1939 } 1940 1941 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 1942 ierr = PetscMemcpy(dk,rtmp+k*25,25*sizeof(MatScalar));CHKERRQ(ierr); 1943 i = jl[k]; /* first row to be added to k_th row */ 1944 1945 while (i < mbs){ 1946 nexti = jl[i]; /* next row to be added to k_th row */ 1947 1948 /* compute multiplier */ 1949 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 1950 1951 /* uik = -inv(Di)*U_bar(i,k) */ 1952 d = ba + i*25; 1953 u = ba + ili*25; 1954 1955 uik[0] = -(d[0]*u[0] + d[5]*u[1] + d[10]*u[2] + d[15]*u[3] + d[20]*u[4]); 1956 uik[1] = -(d[1]*u[0] + d[6]*u[1] + d[11]*u[2] + d[16]*u[3] + d[21]*u[4]); 1957 uik[2] = -(d[2]*u[0] + d[7]*u[1] + d[12]*u[2] + d[17]*u[3] + d[22]*u[4]); 1958 uik[3] = -(d[3]*u[0] + d[8]*u[1] + d[13]*u[2] + d[18]*u[3] + d[23]*u[4]); 1959 uik[4] = -(d[4]*u[0] + d[9]*u[1] + d[14]*u[2] + d[19]*u[3] + d[24]*u[4]); 1960 1961 uik[5] = -(d[0]*u[5] + d[5]*u[6] + d[10]*u[7] + d[15]*u[8] + d[20]*u[9]); 1962 uik[6] = -(d[1]*u[5] + d[6]*u[6] + d[11]*u[7] + d[16]*u[8] + d[21]*u[9]); 1963 uik[7] = -(d[2]*u[5] + d[7]*u[6] + d[12]*u[7] + d[17]*u[8] + d[22]*u[9]); 1964 uik[8] = -(d[3]*u[5] + d[8]*u[6] + d[13]*u[7] + d[18]*u[8] + d[23]*u[9]); 1965 uik[9] = -(d[4]*u[5] + d[9]*u[6] + d[14]*u[7] + d[19]*u[8] + d[24]*u[9]); 1966 1967 uik[10]= -(d[0]*u[10] + d[5]*u[11] + d[10]*u[12] + d[15]*u[13] + d[20]*u[14]); 1968 uik[11]= -(d[1]*u[10] + d[6]*u[11] + d[11]*u[12] + d[16]*u[13] + d[21]*u[14]); 1969 uik[12]= -(d[2]*u[10] + d[7]*u[11] + d[12]*u[12] + d[17]*u[13] + d[22]*u[14]); 1970 uik[13]= -(d[3]*u[10] + d[8]*u[11] + d[13]*u[12] + d[18]*u[13] + d[23]*u[14]); 1971 uik[14]= -(d[4]*u[10] + d[9]*u[11] + d[14]*u[12] + d[19]*u[13] + d[24]*u[14]); 1972 1973 uik[15]= -(d[0]*u[15] + d[5]*u[16] + d[10]*u[17] + d[15]*u[18] + d[20]*u[19]); 1974 uik[16]= -(d[1]*u[15] + d[6]*u[16] + d[11]*u[17] + d[16]*u[18] + d[21]*u[19]); 1975 uik[17]= -(d[2]*u[15] + d[7]*u[16] + d[12]*u[17] + d[17]*u[18] + d[22]*u[19]); 1976 uik[18]= -(d[3]*u[15] + d[8]*u[16] + d[13]*u[17] + d[18]*u[18] + d[23]*u[19]); 1977 uik[19]= -(d[4]*u[15] + d[9]*u[16] + d[14]*u[17] + d[19]*u[18] + d[24]*u[19]); 1978 1979 uik[20]= -(d[0]*u[20] + d[5]*u[21] + d[10]*u[22] + d[15]*u[23] + d[20]*u[24]); 1980 uik[21]= -(d[1]*u[20] + d[6]*u[21] + d[11]*u[22] + d[16]*u[23] + d[21]*u[24]); 1981 uik[22]= -(d[2]*u[20] + d[7]*u[21] + d[12]*u[22] + d[17]*u[23] + d[22]*u[24]); 1982 uik[23]= -(d[3]*u[20] + d[8]*u[21] + d[13]*u[22] + d[18]*u[23] + d[23]*u[24]); 1983 uik[24]= -(d[4]*u[20] + d[9]*u[21] + d[14]*u[22] + d[19]*u[23] + d[24]*u[24]); 1984 1985 1986 /* update D(k) += -U(i,k)^T * U_bar(i,k) */ 1987 dk[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4]; 1988 dk[1] += uik[5]*u[0] + uik[6]*u[1] + uik[7]*u[2] + uik[8]*u[3] + uik[9]*u[4]; 1989 dk[2] += uik[10]*u[0]+ uik[11]*u[1]+ uik[12]*u[2]+ uik[13]*u[3]+ uik[14]*u[4]; 1990 dk[3] += uik[15]*u[0]+ uik[16]*u[1]+ uik[17]*u[2]+ uik[18]*u[3]+ uik[19]*u[4]; 1991 dk[4] += uik[20]*u[0]+ uik[21]*u[1]+ uik[22]*u[2]+ uik[23]*u[3]+ uik[24]*u[4]; 1992 1993 dk[5] += uik[0]*u[5] + uik[1]*u[6] + uik[2]*u[7] + uik[3]*u[8] + uik[4]*u[9]; 1994 dk[6] += uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]; 1995 dk[7] += uik[10]*u[5]+ uik[11]*u[6]+ uik[12]*u[7]+ uik[13]*u[8]+ uik[14]*u[9]; 1996 dk[8] += uik[15]*u[5]+ uik[16]*u[6]+ uik[17]*u[7]+ uik[18]*u[8]+ uik[19]*u[9]; 1997 dk[9] += uik[20]*u[5]+ uik[21]*u[6]+ uik[22]*u[7]+ uik[23]*u[8]+ uik[24]*u[9]; 1998 1999 dk[10] += uik[0]*u[10] + uik[1]*u[11] + uik[2]*u[12] + uik[3]*u[13] + uik[4]*u[14]; 2000 dk[11] += uik[5]*u[10] + uik[6]*u[11] + uik[7]*u[12] + uik[8]*u[13] + uik[9]*u[14]; 2001 dk[12] += uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]; 2002 dk[13] += uik[15]*u[10]+ uik[16]*u[11]+ uik[17]*u[12]+ uik[18]*u[13]+ uik[19]*u[14]; 2003 dk[14] += uik[20]*u[10]+ uik[21]*u[11]+ uik[22]*u[12]+ uik[23]*u[13]+ uik[24]*u[14]; 2004 2005 dk[15] += uik[0]*u[15] + uik[1]*u[16] + uik[2]*u[17] + uik[3]*u[18] + uik[4]*u[19]; 2006 dk[16] += uik[5]*u[15] + uik[6]*u[16] + uik[7]*u[17] + uik[8]*u[18] + uik[9]*u[19]; 2007 dk[17] += uik[10]*u[15]+ uik[11]*u[16]+ uik[12]*u[17]+ uik[13]*u[18]+ uik[14]*u[19]; 2008 dk[18] += uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19]; 2009 dk[19] += uik[20]*u[15]+ uik[21]*u[16]+ uik[22]*u[17]+ uik[23]*u[18]+ uik[24]*u[19]; 2010 2011 dk[20] += uik[0]*u[20] + uik[1]*u[21] + uik[2]*u[22] + uik[3]*u[23] + uik[4]*u[24]; 2012 dk[21] += uik[5]*u[20] + uik[6]*u[21] + uik[7]*u[22] + uik[8]*u[23] + uik[9]*u[24]; 2013 dk[22] += uik[10]*u[20]+ uik[11]*u[21]+ uik[12]*u[22]+ uik[13]*u[23]+ uik[14]*u[24]; 2014 dk[23] += uik[15]*u[20]+ uik[16]*u[21]+ uik[17]*u[22]+ uik[18]*u[23]+ uik[19]*u[24]; 2015 dk[24] += uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24]; 2016 2017 /* update -U(i,k) */ 2018 ierr = PetscMemcpy(ba+ili*25,uik,25*sizeof(MatScalar));CHKERRQ(ierr); 2019 2020 /* add multiple of row i to k-th row ... */ 2021 jmin = ili + 1; jmax = bi[i+1]; 2022 if (jmin < jmax){ 2023 for (j=jmin; j<jmax; j++) { 2024 /* rtmp += -U(i,k)^T * U_bar(i,j) */ 2025 rtmp_ptr = rtmp + bj[j]*25; 2026 u = ba + j*25; 2027 rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4]; 2028 rtmp_ptr[1] += uik[5]*u[0] + uik[6]*u[1] + uik[7]*u[2] + uik[8]*u[3] + uik[9]*u[4]; 2029 rtmp_ptr[2] += uik[10]*u[0]+ uik[11]*u[1]+ uik[12]*u[2]+ uik[13]*u[3]+ uik[14]*u[4]; 2030 rtmp_ptr[3] += uik[15]*u[0]+ uik[16]*u[1]+ uik[17]*u[2]+ uik[18]*u[3]+ uik[19]*u[4]; 2031 rtmp_ptr[4] += uik[20]*u[0]+ uik[21]*u[1]+ uik[22]*u[2]+ uik[23]*u[3]+ uik[24]*u[4]; 2032 2033 rtmp_ptr[5] += uik[0]*u[5] + uik[1]*u[6] + uik[2]*u[7] + uik[3]*u[8] + uik[4]*u[9]; 2034 rtmp_ptr[6] += uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]; 2035 rtmp_ptr[7] += uik[10]*u[5]+ uik[11]*u[6]+ uik[12]*u[7]+ uik[13]*u[8]+ uik[14]*u[9]; 2036 rtmp_ptr[8] += uik[15]*u[5]+ uik[16]*u[6]+ uik[17]*u[7]+ uik[18]*u[8]+ uik[19]*u[9]; 2037 rtmp_ptr[9] += uik[20]*u[5]+ uik[21]*u[6]+ uik[22]*u[7]+ uik[23]*u[8]+ uik[24]*u[9]; 2038 2039 rtmp_ptr[10] += uik[0]*u[10] + uik[1]*u[11] + uik[2]*u[12] + uik[3]*u[13] + uik[4]*u[14]; 2040 rtmp_ptr[11] += uik[5]*u[10] + uik[6]*u[11] + uik[7]*u[12] + uik[8]*u[13] + uik[9]*u[14]; 2041 rtmp_ptr[12] += uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]; 2042 rtmp_ptr[13] += uik[15]*u[10]+ uik[16]*u[11]+ uik[17]*u[12]+ uik[18]*u[13]+ uik[19]*u[14]; 2043 rtmp_ptr[14] += uik[20]*u[10]+ uik[21]*u[11]+ uik[22]*u[12]+ uik[23]*u[13]+ uik[24]*u[14]; 2044 2045 rtmp_ptr[15] += uik[0]*u[15] + uik[1]*u[16] + uik[2]*u[17] + uik[3]*u[18] + uik[4]*u[19]; 2046 rtmp_ptr[16] += uik[5]*u[15] + uik[6]*u[16] + uik[7]*u[17] + uik[8]*u[18] + uik[9]*u[19]; 2047 rtmp_ptr[17] += uik[10]*u[15]+ uik[11]*u[16]+ uik[12]*u[17]+ uik[13]*u[18]+ uik[14]*u[19]; 2048 rtmp_ptr[18] += uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19]; 2049 rtmp_ptr[19] += uik[20]*u[15]+ uik[21]*u[16]+ uik[22]*u[17]+ uik[23]*u[18]+ uik[24]*u[19]; 2050 2051 rtmp_ptr[20] += uik[0]*u[20] + uik[1]*u[21] + uik[2]*u[22] + uik[3]*u[23] + uik[4]*u[24]; 2052 rtmp_ptr[21] += uik[5]*u[20] + uik[6]*u[21] + uik[7]*u[22] + uik[8]*u[23] + uik[9]*u[24]; 2053 rtmp_ptr[22] += uik[10]*u[20]+ uik[11]*u[21]+ uik[12]*u[22]+ uik[13]*u[23]+ uik[14]*u[24]; 2054 rtmp_ptr[23] += uik[15]*u[20]+ uik[16]*u[21]+ uik[17]*u[22]+ uik[18]*u[23]+ uik[19]*u[24]; 2055 rtmp_ptr[24] += uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24]; 2056 } 2057 2058 /* ... add i to row list for next nonzero entry */ 2059 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 2060 j = bj[jmin]; 2061 jl[i] = jl[j]; jl[j] = i; /* update jl */ 2062 } 2063 i = nexti; 2064 } 2065 2066 /* save nonzero entries in k-th row of U ... */ 2067 2068 /* invert diagonal block */ 2069 d = ba+k*25; 2070 ierr = PetscMemcpy(d,dk,25*sizeof(MatScalar));CHKERRQ(ierr); 2071 ierr = Kernel_A_gets_inverse_A_5(d);CHKERRQ(ierr); 2072 2073 jmin = bi[k]; jmax = bi[k+1]; 2074 if (jmin < jmax) { 2075 for (j=jmin; j<jmax; j++){ 2076 vj = bj[j]; /* block col. index of U */ 2077 u = ba + j*25; 2078 rtmp_ptr = rtmp + vj*25; 2079 for (k1=0; k1<25; k1++){ 2080 *u++ = *rtmp_ptr; 2081 *rtmp_ptr++ = 0.0; 2082 } 2083 } 2084 2085 /* ... add k to row list for first nonzero entry in k-th row */ 2086 il[k] = jmin; 2087 i = bj[jmin]; 2088 jl[k] = jl[i]; jl[i] = k; 2089 } 2090 } 2091 2092 ierr = PetscFree(rtmp);CHKERRQ(ierr); 2093 ierr = PetscFree(il);CHKERRQ(ierr); 2094 ierr = PetscFree(jl);CHKERRQ(ierr); 2095 ierr = PetscFree(dk);CHKERRQ(ierr); 2096 ierr = PetscFree(uik);CHKERRQ(ierr); 2097 2098 C->factor = FACTOR_CHOLESKY; 2099 C->assembled = PETSC_TRUE; 2100 C->preallocated = PETSC_TRUE; 2101 PLogFlops(1.3333*125*b->mbs); /* from inverting diagonal blocks */ 2102 PetscFunctionReturn(0); 2103 } 2104 2105 /* 2106 Version for when blocks are 4 by 4 Using natural ordering 2107 */ 2108 #undef __FUNC__ 2109 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering" 2110 int MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering(Mat A,Mat *B) 2111 { 2112 Mat C = *B; 2113 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 2114 int ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 2115 int *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 2116 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 2117 MatScalar *u,*diag,*rtmp,*rtmp_ptr; 2118 2119 PetscFunctionBegin; 2120 2121 /* initialization */ 2122 rtmp = (MatScalar*)PetscMalloc(16*mbs*sizeof(MatScalar));CHKPTRQ(rtmp); 2123 ierr = PetscMemzero(rtmp,16*mbs*sizeof(MatScalar));CHKERRQ(ierr); 2124 il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il); 2125 jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl); 2126 for (i=0; i<mbs; i++) { 2127 jl[i] = mbs; il[0] = 0; 2128 } 2129 dk = (MatScalar*)PetscMalloc(16*sizeof(MatScalar));CHKPTRQ(dk); 2130 uik = (MatScalar*)PetscMalloc(16*sizeof(MatScalar));CHKPTRQ(uik); 2131 2132 ai = a->i; aj = a->j; aa = a->a; 2133 2134 /* for each row k */ 2135 for (k = 0; k<mbs; k++){ 2136 2137 /*initialize k-th row with elements nonzero in row k of A */ 2138 jmin = ai[k]; jmax = ai[k+1]; 2139 if (jmin < jmax) { 2140 ap = aa + jmin*16; 2141 for (j = jmin; j < jmax; j++){ 2142 vj = aj[j]; /* block col. index */ 2143 rtmp_ptr = rtmp + vj*16; 2144 for (i=0; i<16; i++) *rtmp_ptr++ = *ap++; 2145 } 2146 } 2147 2148 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 2149 ierr = PetscMemcpy(dk,rtmp+k*16,16*sizeof(MatScalar));CHKERRQ(ierr); 2150 i = jl[k]; /* first row to be added to k_th row */ 2151 2152 while (i < mbs){ 2153 nexti = jl[i]; /* next row to be added to k_th row */ 2154 2155 /* compute multiplier */ 2156 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 2157 2158 /* uik = -inv(Di)*U_bar(i,k) */ 2159 diag = ba + i*16; 2160 u = ba + ili*16; 2161 2162 uik[0] = -(diag[0]*u[0] + diag[4]*u[1] + diag[8]*u[2] + diag[12]*u[3]); 2163 uik[1] = -(diag[1]*u[0] + diag[5]*u[1] + diag[9]*u[2] + diag[13]*u[3]); 2164 uik[2] = -(diag[2]*u[0] + diag[6]*u[1] + diag[10]*u[2]+ diag[14]*u[3]); 2165 uik[3] = -(diag[3]*u[0] + diag[7]*u[1] + diag[11]*u[2]+ diag[15]*u[3]); 2166 2167 uik[4] = -(diag[0]*u[4] + diag[4]*u[5] + diag[8]*u[6] + diag[12]*u[7]); 2168 uik[5] = -(diag[1]*u[4] + diag[5]*u[5] + diag[9]*u[6] + diag[13]*u[7]); 2169 uik[6] = -(diag[2]*u[4] + diag[6]*u[5] + diag[10]*u[6]+ diag[14]*u[7]); 2170 uik[7] = -(diag[3]*u[4] + diag[7]*u[5] + diag[11]*u[6]+ diag[15]*u[7]); 2171 2172 uik[8] = -(diag[0]*u[8] + diag[4]*u[9] + diag[8]*u[10] + diag[12]*u[11]); 2173 uik[9] = -(diag[1]*u[8] + diag[5]*u[9] + diag[9]*u[10] + diag[13]*u[11]); 2174 uik[10]= -(diag[2]*u[8] + diag[6]*u[9] + diag[10]*u[10]+ diag[14]*u[11]); 2175 uik[11]= -(diag[3]*u[8] + diag[7]*u[9] + diag[11]*u[10]+ diag[15]*u[11]); 2176 2177 uik[12]= -(diag[0]*u[12] + diag[4]*u[13] + diag[8]*u[14] + diag[12]*u[15]); 2178 uik[13]= -(diag[1]*u[12] + diag[5]*u[13] + diag[9]*u[14] + diag[13]*u[15]); 2179 uik[14]= -(diag[2]*u[12] + diag[6]*u[13] + diag[10]*u[14]+ diag[14]*u[15]); 2180 uik[15]= -(diag[3]*u[12] + diag[7]*u[13] + diag[11]*u[14]+ diag[15]*u[15]); 2181 2182 /* update D(k) += -U(i,k)^T * U_bar(i,k) */ 2183 dk[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3]; 2184 dk[1] += uik[4]*u[0] + uik[5]*u[1] + uik[6]*u[2] + uik[7]*u[3]; 2185 dk[2] += uik[8]*u[0] + uik[9]*u[1] + uik[10]*u[2]+ uik[11]*u[3]; 2186 dk[3] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3]; 2187 2188 dk[4] += uik[0]*u[4] + uik[1]*u[5] + uik[2]*u[6] + uik[3]*u[7]; 2189 dk[5] += uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7]; 2190 dk[6] += uik[8]*u[4] + uik[9]*u[5] + uik[10]*u[6]+ uik[11]*u[7]; 2191 dk[7] += uik[12]*u[4]+ uik[13]*u[5]+ uik[14]*u[6]+ uik[15]*u[7]; 2192 2193 dk[8] += uik[0]*u[8] + uik[1]*u[9] + uik[2]*u[10] + uik[3]*u[11]; 2194 dk[9] += uik[4]*u[8] + uik[5]*u[9] + uik[6]*u[10] + uik[7]*u[11]; 2195 dk[10]+= uik[8]*u[8] + uik[9]*u[9] + uik[10]*u[10]+ uik[11]*u[11]; 2196 dk[11]+= uik[12]*u[8]+ uik[13]*u[9]+ uik[14]*u[10]+ uik[15]*u[11]; 2197 2198 dk[12]+= uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15]; 2199 dk[13]+= uik[4]*u[12] + uik[5]*u[13] + uik[6]*u[14] + uik[7]*u[15]; 2200 dk[14]+= uik[8]*u[12] + uik[9]*u[13] + uik[10]*u[14]+ uik[11]*u[15]; 2201 dk[15]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15]; 2202 2203 /* update -U(i,k) */ 2204 ierr = PetscMemcpy(ba+ili*16,uik,16*sizeof(MatScalar));CHKERRQ(ierr); 2205 2206 /* add multiple of row i to k-th row ... */ 2207 jmin = ili + 1; jmax = bi[i+1]; 2208 if (jmin < jmax){ 2209 for (j=jmin; j<jmax; j++) { 2210 /* rtmp += -U(i,k)^T * U_bar(i,j) */ 2211 rtmp_ptr = rtmp + bj[j]*16; 2212 u = ba + j*16; 2213 rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3]; 2214 rtmp_ptr[1] += uik[4]*u[0] + uik[5]*u[1] + uik[6]*u[2] + uik[7]*u[3]; 2215 rtmp_ptr[2] += uik[8]*u[0] + uik[9]*u[1] + uik[10]*u[2]+ uik[11]*u[3]; 2216 rtmp_ptr[3] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3]; 2217 2218 rtmp_ptr[4] += uik[0]*u[4] + uik[1]*u[5] + uik[2]*u[6] + uik[3]*u[7]; 2219 rtmp_ptr[5] += uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7]; 2220 rtmp_ptr[6] += uik[8]*u[4] + uik[9]*u[5] + uik[10]*u[6]+ uik[11]*u[7]; 2221 rtmp_ptr[7] += uik[12]*u[4]+ uik[13]*u[5]+ uik[14]*u[6]+ uik[15]*u[7]; 2222 2223 rtmp_ptr[8] += uik[0]*u[8] + uik[1]*u[9] + uik[2]*u[10] + uik[3]*u[11]; 2224 rtmp_ptr[9] += uik[4]*u[8] + uik[5]*u[9] + uik[6]*u[10] + uik[7]*u[11]; 2225 rtmp_ptr[10]+= uik[8]*u[8] + uik[9]*u[9] + uik[10]*u[10]+ uik[11]*u[11]; 2226 rtmp_ptr[11]+= uik[12]*u[8]+ uik[13]*u[9]+ uik[14]*u[10]+ uik[15]*u[11]; 2227 2228 rtmp_ptr[12]+= uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15]; 2229 rtmp_ptr[13]+= uik[4]*u[12] + uik[5]*u[13] + uik[6]*u[14] + uik[7]*u[15]; 2230 rtmp_ptr[14]+= uik[8]*u[12] + uik[9]*u[13] + uik[10]*u[14]+ uik[11]*u[15]; 2231 rtmp_ptr[15]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15]; 2232 } 2233 2234 /* ... add i to row list for next nonzero entry */ 2235 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 2236 j = bj[jmin]; 2237 jl[i] = jl[j]; jl[j] = i; /* update jl */ 2238 } 2239 i = nexti; 2240 } 2241 2242 /* save nonzero entries in k-th row of U ... */ 2243 2244 /* invert diagonal block */ 2245 diag = ba+k*16; 2246 ierr = PetscMemcpy(diag,dk,16*sizeof(MatScalar));CHKERRQ(ierr); 2247 ierr = Kernel_A_gets_inverse_A_4(diag);CHKERRQ(ierr); 2248 2249 jmin = bi[k]; jmax = bi[k+1]; 2250 if (jmin < jmax) { 2251 for (j=jmin; j<jmax; j++){ 2252 vj = bj[j]; /* block col. index of U */ 2253 u = ba + j*16; 2254 rtmp_ptr = rtmp + vj*16; 2255 for (k1=0; k1<16; k1++){ 2256 *u++ = *rtmp_ptr; 2257 *rtmp_ptr++ = 0.0; 2258 } 2259 } 2260 2261 /* ... add k to row list for first nonzero entry in k-th row */ 2262 il[k] = jmin; 2263 i = bj[jmin]; 2264 jl[k] = jl[i]; jl[i] = k; 2265 } 2266 } 2267 2268 ierr = PetscFree(rtmp);CHKERRQ(ierr); 2269 ierr = PetscFree(il);CHKERRQ(ierr); 2270 ierr = PetscFree(jl);CHKERRQ(ierr); 2271 ierr = PetscFree(dk);CHKERRQ(ierr); 2272 ierr = PetscFree(uik);CHKERRQ(ierr); 2273 2274 C->factor = FACTOR_CHOLESKY; 2275 C->assembled = PETSC_TRUE; 2276 C->preallocated = PETSC_TRUE; 2277 PLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */ 2278 PetscFunctionReturn(0); 2279 } 2280 2281 /* Version for when blocks are 4 by 4 */ 2282 #undef __FUNC__ 2283 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_4" 2284 int MatCholeskyFactorNumeric_SeqSBAIJ_4(Mat A,Mat *B) 2285 { 2286 Mat C = *B; 2287 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 2288 IS perm = b->row; 2289 int *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 2290 int *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 2291 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 2292 MatScalar *u,*diag,*rtmp,*rtmp_ptr; 2293 2294 PetscFunctionBegin; 2295 /* initialization */ 2296 rtmp = (MatScalar*)PetscMalloc(16*mbs*sizeof(MatScalar));CHKPTRQ(rtmp); 2297 ierr = PetscMemzero(rtmp,16*mbs*sizeof(MatScalar));CHKERRQ(ierr); 2298 il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il); 2299 jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl); 2300 for (i=0; i<mbs; i++) { 2301 jl[i] = mbs; il[0] = 0; 2302 } 2303 dk = (MatScalar*)PetscMalloc(16*sizeof(MatScalar));CHKPTRQ(dk); 2304 uik = (MatScalar*)PetscMalloc(16*sizeof(MatScalar));CHKPTRQ(uik); 2305 ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr); 2306 2307 /* check permutation */ 2308 if (!a->permute){ 2309 ai = a->i; aj = a->j; aa = a->a; 2310 } else { 2311 ai = a->inew; aj = a->jnew; 2312 aa = (MatScalar*)PetscMalloc(16*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa); 2313 ierr = PetscMemcpy(aa,a->a,16*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2314 a2anew = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew); 2315 ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr); 2316 2317 for (i=0; i<mbs; i++){ 2318 jmin = ai[i]; jmax = ai[i+1]; 2319 for (j=jmin; j<jmax; j++){ 2320 while (a2anew[j] != j){ 2321 k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k; 2322 for (k1=0; k1<16; k1++){ 2323 dk[k1] = aa[k*16+k1]; 2324 aa[k*16+k1] = aa[j*16+k1]; 2325 aa[j*16+k1] = dk[k1]; 2326 } 2327 } 2328 /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */ 2329 if (i > aj[j]){ 2330 /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */ 2331 ap = aa + j*16; /* ptr to the beginning of j-th block of aa */ 2332 for (k=0; k<16; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */ 2333 for (k=0; k<4; k++){ /* j-th block of aa <- dk^T */ 2334 for (k1=0; k1<4; k1++) *ap++ = dk[k + 4*k1]; 2335 } 2336 } 2337 } 2338 } 2339 ierr = PetscFree(a2anew);CHKERRA(ierr); 2340 } 2341 2342 /* for each row k */ 2343 for (k = 0; k<mbs; k++){ 2344 2345 /*initialize k-th row with elements nonzero in row perm(k) of A */ 2346 jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1]; 2347 if (jmin < jmax) { 2348 ap = aa + jmin*16; 2349 for (j = jmin; j < jmax; j++){ 2350 vj = perm_ptr[aj[j]]; /* block col. index */ 2351 rtmp_ptr = rtmp + vj*16; 2352 for (i=0; i<16; i++) *rtmp_ptr++ = *ap++; 2353 } 2354 } 2355 2356 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 2357 ierr = PetscMemcpy(dk,rtmp+k*16,16*sizeof(MatScalar));CHKERRQ(ierr); 2358 i = jl[k]; /* first row to be added to k_th row */ 2359 2360 while (i < mbs){ 2361 nexti = jl[i]; /* next row to be added to k_th row */ 2362 2363 /* compute multiplier */ 2364 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 2365 2366 /* uik = -inv(Di)*U_bar(i,k) */ 2367 diag = ba + i*16; 2368 u = ba + ili*16; 2369 2370 uik[0] = -(diag[0]*u[0] + diag[4]*u[1] + diag[8]*u[2] + diag[12]*u[3]); 2371 uik[1] = -(diag[1]*u[0] + diag[5]*u[1] + diag[9]*u[2] + diag[13]*u[3]); 2372 uik[2] = -(diag[2]*u[0] + diag[6]*u[1] + diag[10]*u[2]+ diag[14]*u[3]); 2373 uik[3] = -(diag[3]*u[0] + diag[7]*u[1] + diag[11]*u[2]+ diag[15]*u[3]); 2374 2375 uik[4] = -(diag[0]*u[4] + diag[4]*u[5] + diag[8]*u[6] + diag[12]*u[7]); 2376 uik[5] = -(diag[1]*u[4] + diag[5]*u[5] + diag[9]*u[6] + diag[13]*u[7]); 2377 uik[6] = -(diag[2]*u[4] + diag[6]*u[5] + diag[10]*u[6]+ diag[14]*u[7]); 2378 uik[7] = -(diag[3]*u[4] + diag[7]*u[5] + diag[11]*u[6]+ diag[15]*u[7]); 2379 2380 uik[8] = -(diag[0]*u[8] + diag[4]*u[9] + diag[8]*u[10] + diag[12]*u[11]); 2381 uik[9] = -(diag[1]*u[8] + diag[5]*u[9] + diag[9]*u[10] + diag[13]*u[11]); 2382 uik[10]= -(diag[2]*u[8] + diag[6]*u[9] + diag[10]*u[10]+ diag[14]*u[11]); 2383 uik[11]= -(diag[3]*u[8] + diag[7]*u[9] + diag[11]*u[10]+ diag[15]*u[11]); 2384 2385 uik[12]= -(diag[0]*u[12] + diag[4]*u[13] + diag[8]*u[14] + diag[12]*u[15]); 2386 uik[13]= -(diag[1]*u[12] + diag[5]*u[13] + diag[9]*u[14] + diag[13]*u[15]); 2387 uik[14]= -(diag[2]*u[12] + diag[6]*u[13] + diag[10]*u[14]+ diag[14]*u[15]); 2388 uik[15]= -(diag[3]*u[12] + diag[7]*u[13] + diag[11]*u[14]+ diag[15]*u[15]); 2389 2390 /* update D(k) += -U(i,k)^T * U_bar(i,k) */ 2391 dk[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3]; 2392 dk[1] += uik[4]*u[0] + uik[5]*u[1] + uik[6]*u[2] + uik[7]*u[3]; 2393 dk[2] += uik[8]*u[0] + uik[9]*u[1] + uik[10]*u[2]+ uik[11]*u[3]; 2394 dk[3] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3]; 2395 2396 dk[4] += uik[0]*u[4] + uik[1]*u[5] + uik[2]*u[6] + uik[3]*u[7]; 2397 dk[5] += uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7]; 2398 dk[6] += uik[8]*u[4] + uik[9]*u[5] + uik[10]*u[6]+ uik[11]*u[7]; 2399 dk[7] += uik[12]*u[4]+ uik[13]*u[5]+ uik[14]*u[6]+ uik[15]*u[7]; 2400 2401 dk[8] += uik[0]*u[8] + uik[1]*u[9] + uik[2]*u[10] + uik[3]*u[11]; 2402 dk[9] += uik[4]*u[8] + uik[5]*u[9] + uik[6]*u[10] + uik[7]*u[11]; 2403 dk[10]+= uik[8]*u[8] + uik[9]*u[9] + uik[10]*u[10]+ uik[11]*u[11]; 2404 dk[11]+= uik[12]*u[8]+ uik[13]*u[9]+ uik[14]*u[10]+ uik[15]*u[11]; 2405 2406 dk[12]+= uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15]; 2407 dk[13]+= uik[4]*u[12] + uik[5]*u[13] + uik[6]*u[14] + uik[7]*u[15]; 2408 dk[14]+= uik[8]*u[12] + uik[9]*u[13] + uik[10]*u[14]+ uik[11]*u[15]; 2409 dk[15]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15]; 2410 2411 /* update -U(i,k) */ 2412 ierr = PetscMemcpy(ba+ili*16,uik,16*sizeof(MatScalar));CHKERRQ(ierr); 2413 2414 /* add multiple of row i to k-th row ... */ 2415 jmin = ili + 1; jmax = bi[i+1]; 2416 if (jmin < jmax){ 2417 for (j=jmin; j<jmax; j++) { 2418 /* rtmp += -U(i,k)^T * U_bar(i,j) */ 2419 rtmp_ptr = rtmp + bj[j]*16; 2420 u = ba + j*16; 2421 rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3]; 2422 rtmp_ptr[1] += uik[4]*u[0] + uik[5]*u[1] + uik[6]*u[2] + uik[7]*u[3]; 2423 rtmp_ptr[2] += uik[8]*u[0] + uik[9]*u[1] + uik[10]*u[2]+ uik[11]*u[3]; 2424 rtmp_ptr[3] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3]; 2425 2426 rtmp_ptr[4] += uik[0]*u[4] + uik[1]*u[5] + uik[2]*u[6] + uik[3]*u[7]; 2427 rtmp_ptr[5] += uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7]; 2428 rtmp_ptr[6] += uik[8]*u[4] + uik[9]*u[5] + uik[10]*u[6]+ uik[11]*u[7]; 2429 rtmp_ptr[7] += uik[12]*u[4]+ uik[13]*u[5]+ uik[14]*u[6]+ uik[15]*u[7]; 2430 2431 rtmp_ptr[8] += uik[0]*u[8] + uik[1]*u[9] + uik[2]*u[10] + uik[3]*u[11]; 2432 rtmp_ptr[9] += uik[4]*u[8] + uik[5]*u[9] + uik[6]*u[10] + uik[7]*u[11]; 2433 rtmp_ptr[10]+= uik[8]*u[8] + uik[9]*u[9] + uik[10]*u[10]+ uik[11]*u[11]; 2434 rtmp_ptr[11]+= uik[12]*u[8]+ uik[13]*u[9]+ uik[14]*u[10]+ uik[15]*u[11]; 2435 2436 rtmp_ptr[12]+= uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15]; 2437 rtmp_ptr[13]+= uik[4]*u[12] + uik[5]*u[13] + uik[6]*u[14] + uik[7]*u[15]; 2438 rtmp_ptr[14]+= uik[8]*u[12] + uik[9]*u[13] + uik[10]*u[14]+ uik[11]*u[15]; 2439 rtmp_ptr[15]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15]; 2440 } 2441 2442 /* ... add i to row list for next nonzero entry */ 2443 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 2444 j = bj[jmin]; 2445 jl[i] = jl[j]; jl[j] = i; /* update jl */ 2446 } 2447 i = nexti; 2448 } 2449 2450 /* save nonzero entries in k-th row of U ... */ 2451 2452 /* invert diagonal block */ 2453 diag = ba+k*16; 2454 ierr = PetscMemcpy(diag,dk,16*sizeof(MatScalar));CHKERRQ(ierr); 2455 ierr = Kernel_A_gets_inverse_A_4(diag);CHKERRQ(ierr); 2456 2457 jmin = bi[k]; jmax = bi[k+1]; 2458 if (jmin < jmax) { 2459 for (j=jmin; j<jmax; j++){ 2460 vj = bj[j]; /* block col. index of U */ 2461 u = ba + j*16; 2462 rtmp_ptr = rtmp + vj*16; 2463 for (k1=0; k1<16; k1++){ 2464 *u++ = *rtmp_ptr; 2465 *rtmp_ptr++ = 0.0; 2466 } 2467 } 2468 2469 /* ... add k to row list for first nonzero entry in k-th row */ 2470 il[k] = jmin; 2471 i = bj[jmin]; 2472 jl[k] = jl[i]; jl[i] = k; 2473 } 2474 } 2475 2476 ierr = PetscFree(rtmp);CHKERRQ(ierr); 2477 ierr = PetscFree(il);CHKERRQ(ierr); 2478 ierr = PetscFree(jl);CHKERRQ(ierr); 2479 ierr = PetscFree(dk);CHKERRQ(ierr); 2480 ierr = PetscFree(uik);CHKERRQ(ierr); 2481 if (a->permute){ 2482 ierr = PetscFree(aa);CHKERRQ(ierr); 2483 } 2484 2485 ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr); 2486 C->factor = FACTOR_CHOLESKY; 2487 C->assembled = PETSC_TRUE; 2488 C->preallocated = PETSC_TRUE; 2489 PLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */ 2490 PetscFunctionReturn(0); 2491 } 2492 2493 /* Version for when blocks are 3 by 3 */ 2494 #undef __FUNC__ 2495 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_3" 2496 int MatCholeskyFactorNumeric_SeqSBAIJ_3(Mat A,Mat *B) 2497 { 2498 Mat C = *B; 2499 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 2500 IS perm = b->row; 2501 int *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 2502 int *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 2503 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 2504 MatScalar *u,*diag,*rtmp,*rtmp_ptr; 2505 2506 PetscFunctionBegin; 2507 2508 /* initialization */ 2509 rtmp = (MatScalar*)PetscMalloc(9*mbs*sizeof(MatScalar));CHKPTRQ(rtmp); 2510 ierr = PetscMemzero(rtmp,9*mbs*sizeof(MatScalar));CHKERRQ(ierr); 2511 il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il); 2512 jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl); 2513 for (i=0; i<mbs; i++) { 2514 jl[i] = mbs; il[0] = 0; 2515 } 2516 dk = (MatScalar*)PetscMalloc(9*sizeof(MatScalar));CHKPTRQ(dk); 2517 uik = (MatScalar*)PetscMalloc(9*sizeof(MatScalar));CHKPTRQ(uik); 2518 ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr); 2519 2520 /* check permutation */ 2521 if (!a->permute){ 2522 ai = a->i; aj = a->j; aa = a->a; 2523 } else { 2524 ai = a->inew; aj = a->jnew; 2525 aa = (MatScalar*)PetscMalloc(9*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa); 2526 ierr = PetscMemcpy(aa,a->a,9*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2527 a2anew = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew); 2528 ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr); 2529 2530 for (i=0; i<mbs; i++){ 2531 jmin = ai[i]; jmax = ai[i+1]; 2532 for (j=jmin; j<jmax; j++){ 2533 while (a2anew[j] != j){ 2534 k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k; 2535 for (k1=0; k1<9; k1++){ 2536 dk[k1] = aa[k*9+k1]; 2537 aa[k*9+k1] = aa[j*9+k1]; 2538 aa[j*9+k1] = dk[k1]; 2539 } 2540 } 2541 /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */ 2542 if (i > aj[j]){ 2543 /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */ 2544 ap = aa + j*9; /* ptr to the beginning of j-th block of aa */ 2545 for (k=0; k<9; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */ 2546 for (k=0; k<3; k++){ /* j-th block of aa <- dk^T */ 2547 for (k1=0; k1<3; k1++) *ap++ = dk[k + 3*k1]; 2548 } 2549 } 2550 } 2551 } 2552 ierr = PetscFree(a2anew);CHKERRA(ierr); 2553 } 2554 2555 /* for each row k */ 2556 for (k = 0; k<mbs; k++){ 2557 2558 /*initialize k-th row with elements nonzero in row perm(k) of A */ 2559 jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1]; 2560 if (jmin < jmax) { 2561 ap = aa + jmin*9; 2562 for (j = jmin; j < jmax; j++){ 2563 vj = perm_ptr[aj[j]]; /* block col. index */ 2564 rtmp_ptr = rtmp + vj*9; 2565 for (i=0; i<9; i++) *rtmp_ptr++ = *ap++; 2566 } 2567 } 2568 2569 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 2570 ierr = PetscMemcpy(dk,rtmp+k*9,9*sizeof(MatScalar));CHKERRQ(ierr); 2571 i = jl[k]; /* first row to be added to k_th row */ 2572 2573 while (i < mbs){ 2574 nexti = jl[i]; /* next row to be added to k_th row */ 2575 2576 /* compute multiplier */ 2577 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 2578 2579 /* uik = -inv(Di)*U_bar(i,k) */ 2580 diag = ba + i*9; 2581 u = ba + ili*9; 2582 2583 uik[0] = -(diag[0]*u[0] + diag[3]*u[1] + diag[6]*u[2]); 2584 uik[1] = -(diag[1]*u[0] + diag[4]*u[1] + diag[7]*u[2]); 2585 uik[2] = -(diag[2]*u[0] + diag[5]*u[1] + diag[8]*u[2]); 2586 2587 uik[3] = -(diag[0]*u[3] + diag[3]*u[4] + diag[6]*u[5]); 2588 uik[4] = -(diag[1]*u[3] + diag[4]*u[4] + diag[7]*u[5]); 2589 uik[5] = -(diag[2]*u[3] + diag[5]*u[4] + diag[8]*u[5]); 2590 2591 uik[6] = -(diag[0]*u[6] + diag[3]*u[7] + diag[6]*u[8]); 2592 uik[7] = -(diag[1]*u[6] + diag[4]*u[7] + diag[7]*u[8]); 2593 uik[8] = -(diag[2]*u[6] + diag[5]*u[7] + diag[8]*u[8]); 2594 2595 /* update D(k) += -U(i,k)^T * U_bar(i,k) */ 2596 dk[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2]; 2597 dk[1] += uik[3]*u[0] + uik[4]*u[1] + uik[5]*u[2]; 2598 dk[2] += uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2]; 2599 2600 dk[3] += uik[0]*u[3] + uik[1]*u[4] + uik[2]*u[5]; 2601 dk[4] += uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5]; 2602 dk[5] += uik[6]*u[3] + uik[7]*u[4] + uik[8]*u[5]; 2603 2604 dk[6] += uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8]; 2605 dk[7] += uik[3]*u[6] + uik[4]*u[7] + uik[5]*u[8]; 2606 dk[8] += uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8]; 2607 2608 /* update -U(i,k) */ 2609 ierr = PetscMemcpy(ba+ili*9,uik,9*sizeof(MatScalar));CHKERRQ(ierr); 2610 2611 /* add multiple of row i to k-th row ... */ 2612 jmin = ili + 1; jmax = bi[i+1]; 2613 if (jmin < jmax){ 2614 for (j=jmin; j<jmax; j++) { 2615 /* rtmp += -U(i,k)^T * U_bar(i,j) */ 2616 rtmp_ptr = rtmp + bj[j]*9; 2617 u = ba + j*9; 2618 rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2]; 2619 rtmp_ptr[1] += uik[3]*u[0] + uik[4]*u[1] + uik[5]*u[2]; 2620 rtmp_ptr[2] += uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2]; 2621 2622 rtmp_ptr[3] += uik[0]*u[3] + uik[1]*u[4] + uik[2]*u[5]; 2623 rtmp_ptr[4] += uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5]; 2624 rtmp_ptr[5] += uik[6]*u[3] + uik[7]*u[4] + uik[8]*u[5]; 2625 2626 rtmp_ptr[6] += uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8]; 2627 rtmp_ptr[7] += uik[3]*u[6] + uik[4]*u[7] + uik[5]*u[8]; 2628 rtmp_ptr[8] += uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8]; 2629 } 2630 2631 /* ... add i to row list for next nonzero entry */ 2632 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 2633 j = bj[jmin]; 2634 jl[i] = jl[j]; jl[j] = i; /* update jl */ 2635 } 2636 i = nexti; 2637 } 2638 2639 /* save nonzero entries in k-th row of U ... */ 2640 2641 /* invert diagonal block */ 2642 diag = ba+k*9; 2643 ierr = PetscMemcpy(diag,dk,9*sizeof(MatScalar));CHKERRQ(ierr); 2644 ierr = Kernel_A_gets_inverse_A_3(diag);CHKERRQ(ierr); 2645 2646 jmin = bi[k]; jmax = bi[k+1]; 2647 if (jmin < jmax) { 2648 for (j=jmin; j<jmax; j++){ 2649 vj = bj[j]; /* block col. index of U */ 2650 u = ba + j*9; 2651 rtmp_ptr = rtmp + vj*9; 2652 for (k1=0; k1<9; k1++){ 2653 *u++ = *rtmp_ptr; 2654 *rtmp_ptr++ = 0.0; 2655 } 2656 } 2657 2658 /* ... add k to row list for first nonzero entry in k-th row */ 2659 il[k] = jmin; 2660 i = bj[jmin]; 2661 jl[k] = jl[i]; jl[i] = k; 2662 } 2663 } 2664 2665 ierr = PetscFree(rtmp);CHKERRQ(ierr); 2666 ierr = PetscFree(il);CHKERRQ(ierr); 2667 ierr = PetscFree(jl);CHKERRQ(ierr); 2668 ierr = PetscFree(dk);CHKERRQ(ierr); 2669 ierr = PetscFree(uik);CHKERRQ(ierr); 2670 if (a->permute){ 2671 ierr = PetscFree(aa);CHKERRQ(ierr); 2672 } 2673 2674 ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr); 2675 C->factor = FACTOR_CHOLESKY; 2676 C->assembled = PETSC_TRUE; 2677 C->preallocated = PETSC_TRUE; 2678 PLogFlops(1.3333*27*b->mbs); /* from inverting diagonal blocks */ 2679 PetscFunctionReturn(0); 2680 } 2681 2682 /* 2683 Version for when blocks are 3 by 3 Using natural ordering 2684 */ 2685 #undef __FUNC__ 2686 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering" 2687 int MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering(Mat A,Mat *B) 2688 { 2689 Mat C = *B; 2690 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 2691 int ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 2692 int *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 2693 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 2694 MatScalar *u,*diag,*rtmp,*rtmp_ptr; 2695 2696 PetscFunctionBegin; 2697 2698 /* initialization */ 2699 rtmp = (MatScalar*)PetscMalloc(9*mbs*sizeof(MatScalar));CHKPTRQ(rtmp); 2700 ierr = PetscMemzero(rtmp,9*mbs*sizeof(MatScalar));CHKERRQ(ierr); 2701 il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il); 2702 jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl); 2703 for (i=0; i<mbs; i++) { 2704 jl[i] = mbs; il[0] = 0; 2705 } 2706 dk = (MatScalar*)PetscMalloc(9*sizeof(MatScalar));CHKPTRQ(dk); 2707 uik = (MatScalar*)PetscMalloc(9*sizeof(MatScalar));CHKPTRQ(uik); 2708 2709 ai = a->i; aj = a->j; aa = a->a; 2710 2711 /* for each row k */ 2712 for (k = 0; k<mbs; k++){ 2713 2714 /*initialize k-th row with elements nonzero in row k of A */ 2715 jmin = ai[k]; jmax = ai[k+1]; 2716 if (jmin < jmax) { 2717 ap = aa + jmin*9; 2718 for (j = jmin; j < jmax; j++){ 2719 vj = aj[j]; /* block col. index */ 2720 rtmp_ptr = rtmp + vj*9; 2721 for (i=0; i<9; i++) *rtmp_ptr++ = *ap++; 2722 } 2723 } 2724 2725 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 2726 ierr = PetscMemcpy(dk,rtmp+k*9,9*sizeof(MatScalar));CHKERRQ(ierr); 2727 i = jl[k]; /* first row to be added to k_th row */ 2728 2729 while (i < mbs){ 2730 nexti = jl[i]; /* next row to be added to k_th row */ 2731 2732 /* compute multiplier */ 2733 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 2734 2735 /* uik = -inv(Di)*U_bar(i,k) */ 2736 diag = ba + i*9; 2737 u = ba + ili*9; 2738 2739 uik[0] = -(diag[0]*u[0] + diag[3]*u[1] + diag[6]*u[2]); 2740 uik[1] = -(diag[1]*u[0] + diag[4]*u[1] + diag[7]*u[2]); 2741 uik[2] = -(diag[2]*u[0] + diag[5]*u[1] + diag[8]*u[2]); 2742 2743 uik[3] = -(diag[0]*u[3] + diag[3]*u[4] + diag[6]*u[5]); 2744 uik[4] = -(diag[1]*u[3] + diag[4]*u[4] + diag[7]*u[5]); 2745 uik[5] = -(diag[2]*u[3] + diag[5]*u[4] + diag[8]*u[5]); 2746 2747 uik[6] = -(diag[0]*u[6] + diag[3]*u[7] + diag[6]*u[8]); 2748 uik[7] = -(diag[1]*u[6] + diag[4]*u[7] + diag[7]*u[8]); 2749 uik[8] = -(diag[2]*u[6] + diag[5]*u[7] + diag[8]*u[8]); 2750 2751 /* update D(k) += -U(i,k)^T * U_bar(i,k) */ 2752 dk[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2]; 2753 dk[1] += uik[3]*u[0] + uik[4]*u[1] + uik[5]*u[2]; 2754 dk[2] += uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2]; 2755 2756 dk[3] += uik[0]*u[3] + uik[1]*u[4] + uik[2]*u[5]; 2757 dk[4] += uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5]; 2758 dk[5] += uik[6]*u[3] + uik[7]*u[4] + uik[8]*u[5]; 2759 2760 dk[6] += uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8]; 2761 dk[7] += uik[3]*u[6] + uik[4]*u[7] + uik[5]*u[8]; 2762 dk[8] += uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8]; 2763 2764 /* update -U(i,k) */ 2765 ierr = PetscMemcpy(ba+ili*9,uik,9*sizeof(MatScalar));CHKERRQ(ierr); 2766 2767 /* add multiple of row i to k-th row ... */ 2768 jmin = ili + 1; jmax = bi[i+1]; 2769 if (jmin < jmax){ 2770 for (j=jmin; j<jmax; j++) { 2771 /* rtmp += -U(i,k)^T * U_bar(i,j) */ 2772 rtmp_ptr = rtmp + bj[j]*9; 2773 u = ba + j*9; 2774 rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2]; 2775 rtmp_ptr[1] += uik[3]*u[0] + uik[4]*u[1] + uik[5]*u[2]; 2776 rtmp_ptr[2] += uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2]; 2777 2778 rtmp_ptr[3] += uik[0]*u[3] + uik[1]*u[4] + uik[2]*u[5]; 2779 rtmp_ptr[4] += uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5]; 2780 rtmp_ptr[5] += uik[6]*u[3] + uik[7]*u[4] + uik[8]*u[5]; 2781 2782 rtmp_ptr[6] += uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8]; 2783 rtmp_ptr[7] += uik[3]*u[6] + uik[4]*u[7] + uik[5]*u[8]; 2784 rtmp_ptr[8] += uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8]; 2785 } 2786 2787 /* ... add i to row list for next nonzero entry */ 2788 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 2789 j = bj[jmin]; 2790 jl[i] = jl[j]; jl[j] = i; /* update jl */ 2791 } 2792 i = nexti; 2793 } 2794 2795 /* save nonzero entries in k-th row of U ... */ 2796 2797 /* invert diagonal block */ 2798 diag = ba+k*9; 2799 ierr = PetscMemcpy(diag,dk,9*sizeof(MatScalar));CHKERRQ(ierr); 2800 ierr = Kernel_A_gets_inverse_A_3(diag);CHKERRQ(ierr); 2801 2802 jmin = bi[k]; jmax = bi[k+1]; 2803 if (jmin < jmax) { 2804 for (j=jmin; j<jmax; j++){ 2805 vj = bj[j]; /* block col. index of U */ 2806 u = ba + j*9; 2807 rtmp_ptr = rtmp + vj*9; 2808 for (k1=0; k1<9; k1++){ 2809 *u++ = *rtmp_ptr; 2810 *rtmp_ptr++ = 0.0; 2811 } 2812 } 2813 2814 /* ... add k to row list for first nonzero entry in k-th row */ 2815 il[k] = jmin; 2816 i = bj[jmin]; 2817 jl[k] = jl[i]; jl[i] = k; 2818 } 2819 } 2820 2821 ierr = PetscFree(rtmp);CHKERRQ(ierr); 2822 ierr = PetscFree(il);CHKERRQ(ierr); 2823 ierr = PetscFree(jl);CHKERRQ(ierr); 2824 ierr = PetscFree(dk);CHKERRQ(ierr); 2825 ierr = PetscFree(uik);CHKERRQ(ierr); 2826 2827 C->factor = FACTOR_CHOLESKY; 2828 C->assembled = PETSC_TRUE; 2829 C->preallocated = PETSC_TRUE; 2830 PLogFlops(1.3333*27*b->mbs); /* from inverting diagonal blocks */ 2831 PetscFunctionReturn(0); 2832 } 2833 2834 /* 2835 Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP. 2836 Version for blocks 2 by 2. 2837 */ 2838 #undef __FUNC__ 2839 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_2" 2840 int MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,Mat *B) 2841 { 2842 Mat C = *B; 2843 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 2844 IS perm = b->row; 2845 int *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 2846 int *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 2847 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 2848 MatScalar *u,*diag,*rtmp,*rtmp_ptr; 2849 2850 PetscFunctionBegin; 2851 2852 /* initialization */ 2853 /* il and jl record the first nonzero element in each row of the accessing 2854 window U(0:k, k:mbs-1). 2855 jl: list of rows to be added to uneliminated rows 2856 i>= k: jl(i) is the first row to be added to row i 2857 i< k: jl(i) is the row following row i in some list of rows 2858 jl(i) = mbs indicates the end of a list 2859 il(i): points to the first nonzero element in columns k,...,mbs-1 of 2860 row i of U */ 2861 rtmp = (MatScalar*)PetscMalloc(4*mbs*sizeof(MatScalar));CHKPTRQ(rtmp); 2862 ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr); 2863 il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il); 2864 jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl); 2865 for (i=0; i<mbs; i++) { 2866 jl[i] = mbs; il[0] = 0; 2867 } 2868 dk = (MatScalar*)PetscMalloc(4*sizeof(MatScalar));CHKPTRQ(dk); 2869 uik = (MatScalar*)PetscMalloc(4*sizeof(MatScalar));CHKPTRQ(uik); 2870 ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr); 2871 2872 /* check permutation */ 2873 if (!a->permute){ 2874 ai = a->i; aj = a->j; aa = a->a; 2875 } else { 2876 ai = a->inew; aj = a->jnew; 2877 aa = (MatScalar*)PetscMalloc(4*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa); 2878 ierr = PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2879 a2anew = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew); 2880 ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr); 2881 2882 for (i=0; i<mbs; i++){ 2883 jmin = ai[i]; jmax = ai[i+1]; 2884 for (j=jmin; j<jmax; j++){ 2885 while (a2anew[j] != j){ 2886 k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k; 2887 for (k1=0; k1<4; k1++){ 2888 dk[k1] = aa[k*4+k1]; 2889 aa[k*4+k1] = aa[j*4+k1]; 2890 aa[j*4+k1] = dk[k1]; 2891 } 2892 } 2893 /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */ 2894 if (i > aj[j]){ 2895 /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */ 2896 ap = aa + j*4; /* ptr to the beginning of the block */ 2897 dk[1] = ap[1]; /* swap ap[1] and ap[2] */ 2898 ap[1] = ap[2]; 2899 ap[2] = dk[1]; 2900 } 2901 } 2902 } 2903 ierr = PetscFree(a2anew);CHKERRA(ierr); 2904 } 2905 2906 /* for each row k */ 2907 for (k = 0; k<mbs; k++){ 2908 2909 /*initialize k-th row with elements nonzero in row perm(k) of A */ 2910 jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1]; 2911 if (jmin < jmax) { 2912 ap = aa + jmin*4; 2913 for (j = jmin; j < jmax; j++){ 2914 vj = perm_ptr[aj[j]]; /* block col. index */ 2915 rtmp_ptr = rtmp + vj*4; 2916 for (i=0; i<4; i++) *rtmp_ptr++ = *ap++; 2917 } 2918 } 2919 2920 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 2921 ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr); 2922 i = jl[k]; /* first row to be added to k_th row */ 2923 2924 while (i < mbs){ 2925 nexti = jl[i]; /* next row to be added to k_th row */ 2926 2927 /* compute multiplier */ 2928 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 2929 2930 /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */ 2931 diag = ba + i*4; 2932 u = ba + ili*4; 2933 uik[0] = -(diag[0]*u[0] + diag[2]*u[1]); 2934 uik[1] = -(diag[1]*u[0] + diag[3]*u[1]); 2935 uik[2] = -(diag[0]*u[2] + diag[2]*u[3]); 2936 uik[3] = -(diag[1]*u[2] + diag[3]*u[3]); 2937 2938 /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */ 2939 dk[0] += uik[0]*u[0] + uik[1]*u[1]; 2940 dk[1] += uik[2]*u[0] + uik[3]*u[1]; 2941 dk[2] += uik[0]*u[2] + uik[1]*u[3]; 2942 dk[3] += uik[2]*u[2] + uik[3]*u[3]; 2943 2944 /* update -U(i,k): ba[ili] = uik */ 2945 ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr); 2946 2947 /* add multiple of row i to k-th row ... */ 2948 jmin = ili + 1; jmax = bi[i+1]; 2949 if (jmin < jmax){ 2950 for (j=jmin; j<jmax; j++) { 2951 /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */ 2952 rtmp_ptr = rtmp + bj[j]*4; 2953 u = ba + j*4; 2954 rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1]; 2955 rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1]; 2956 rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3]; 2957 rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3]; 2958 } 2959 2960 /* ... add i to row list for next nonzero entry */ 2961 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 2962 j = bj[jmin]; 2963 jl[i] = jl[j]; jl[j] = i; /* update jl */ 2964 } 2965 i = nexti; 2966 } 2967 2968 /* save nonzero entries in k-th row of U ... */ 2969 2970 /* invert diagonal block */ 2971 diag = ba+k*4; 2972 ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr); 2973 ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr); 2974 2975 jmin = bi[k]; jmax = bi[k+1]; 2976 if (jmin < jmax) { 2977 for (j=jmin; j<jmax; j++){ 2978 vj = bj[j]; /* block col. index of U */ 2979 u = ba + j*4; 2980 rtmp_ptr = rtmp + vj*4; 2981 for (k1=0; k1<4; k1++){ 2982 *u++ = *rtmp_ptr; 2983 *rtmp_ptr++ = 0.0; 2984 } 2985 } 2986 2987 /* ... add k to row list for first nonzero entry in k-th row */ 2988 il[k] = jmin; 2989 i = bj[jmin]; 2990 jl[k] = jl[i]; jl[i] = k; 2991 } 2992 } 2993 2994 ierr = PetscFree(rtmp);CHKERRQ(ierr); 2995 ierr = PetscFree(il);CHKERRQ(ierr); 2996 ierr = PetscFree(jl);CHKERRQ(ierr); 2997 ierr = PetscFree(dk);CHKERRQ(ierr); 2998 ierr = PetscFree(uik);CHKERRQ(ierr); 2999 if (a->permute) { 3000 ierr = PetscFree(aa);CHKERRQ(ierr); 3001 } 3002 ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr); 3003 C->factor = FACTOR_CHOLESKY; 3004 C->assembled = PETSC_TRUE; 3005 C->preallocated = PETSC_TRUE; 3006 PLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */ 3007 PetscFunctionReturn(0); 3008 } 3009 3010 /* 3011 Version for when blocks are 2 by 2 Using natural ordering 3012 */ 3013 #undef __FUNC__ 3014 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering" 3015 int MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,Mat *B) 3016 { 3017 Mat C = *B; 3018 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 3019 int ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 3020 int *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 3021 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 3022 MatScalar *u,*diag,*rtmp,*rtmp_ptr; 3023 3024 PetscFunctionBegin; 3025 3026 /* initialization */ 3027 /* il and jl record the first nonzero element in each row of the accessing 3028 window U(0:k, k:mbs-1). 3029 jl: list of rows to be added to uneliminated rows 3030 i>= k: jl(i) is the first row to be added to row i 3031 i< k: jl(i) is the row following row i in some list of rows 3032 jl(i) = mbs indicates the end of a list 3033 il(i): points to the first nonzero element in columns k,...,mbs-1 of 3034 row i of U */ 3035 rtmp = (MatScalar*)PetscMalloc(4*mbs*sizeof(MatScalar));CHKPTRQ(rtmp); 3036 ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr); 3037 il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il); 3038 jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl); 3039 for (i=0; i<mbs; i++) { 3040 jl[i] = mbs; il[0] = 0; 3041 } 3042 dk = (MatScalar*)PetscMalloc(4*sizeof(MatScalar));CHKPTRQ(dk); 3043 uik = (MatScalar*)PetscMalloc(4*sizeof(MatScalar));CHKPTRQ(uik); 3044 3045 ai = a->i; aj = a->j; aa = a->a; 3046 3047 /* for each row k */ 3048 for (k = 0; k<mbs; k++){ 3049 3050 /*initialize k-th row with elements nonzero in row k of A */ 3051 jmin = ai[k]; jmax = ai[k+1]; 3052 if (jmin < jmax) { 3053 ap = aa + jmin*4; 3054 for (j = jmin; j < jmax; j++){ 3055 vj = aj[j]; /* block col. index */ 3056 rtmp_ptr = rtmp + vj*4; 3057 for (i=0; i<4; i++) *rtmp_ptr++ = *ap++; 3058 } 3059 } 3060 3061 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 3062 ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr); 3063 i = jl[k]; /* first row to be added to k_th row */ 3064 3065 while (i < mbs){ 3066 nexti = jl[i]; /* next row to be added to k_th row */ 3067 3068 /* compute multiplier */ 3069 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 3070 3071 /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */ 3072 diag = ba + i*4; 3073 u = ba + ili*4; 3074 uik[0] = -(diag[0]*u[0] + diag[2]*u[1]); 3075 uik[1] = -(diag[1]*u[0] + diag[3]*u[1]); 3076 uik[2] = -(diag[0]*u[2] + diag[2]*u[3]); 3077 uik[3] = -(diag[1]*u[2] + diag[3]*u[3]); 3078 3079 /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */ 3080 dk[0] += uik[0]*u[0] + uik[1]*u[1]; 3081 dk[1] += uik[2]*u[0] + uik[3]*u[1]; 3082 dk[2] += uik[0]*u[2] + uik[1]*u[3]; 3083 dk[3] += uik[2]*u[2] + uik[3]*u[3]; 3084 3085 /* update -U(i,k): ba[ili] = uik */ 3086 ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr); 3087 3088 /* add multiple of row i to k-th row ... */ 3089 jmin = ili + 1; jmax = bi[i+1]; 3090 if (jmin < jmax){ 3091 for (j=jmin; j<jmax; j++) { 3092 /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */ 3093 rtmp_ptr = rtmp + bj[j]*4; 3094 u = ba + j*4; 3095 rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1]; 3096 rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1]; 3097 rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3]; 3098 rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3]; 3099 } 3100 3101 /* ... add i to row list for next nonzero entry */ 3102 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 3103 j = bj[jmin]; 3104 jl[i] = jl[j]; jl[j] = i; /* update jl */ 3105 } 3106 i = nexti; 3107 } 3108 3109 /* save nonzero entries in k-th row of U ... */ 3110 3111 /* invert diagonal block */ 3112 diag = ba+k*4; 3113 ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr); 3114 ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr); 3115 3116 jmin = bi[k]; jmax = bi[k+1]; 3117 if (jmin < jmax) { 3118 for (j=jmin; j<jmax; j++){ 3119 vj = bj[j]; /* block col. index of U */ 3120 u = ba + j*4; 3121 rtmp_ptr = rtmp + vj*4; 3122 for (k1=0; k1<4; k1++){ 3123 *u++ = *rtmp_ptr; 3124 *rtmp_ptr++ = 0.0; 3125 } 3126 } 3127 3128 /* ... add k to row list for first nonzero entry in k-th row */ 3129 il[k] = jmin; 3130 i = bj[jmin]; 3131 jl[k] = jl[i]; jl[i] = k; 3132 } 3133 } 3134 3135 ierr = PetscFree(rtmp);CHKERRQ(ierr); 3136 ierr = PetscFree(il);CHKERRQ(ierr); 3137 ierr = PetscFree(jl);CHKERRQ(ierr); 3138 ierr = PetscFree(dk);CHKERRQ(ierr); 3139 ierr = PetscFree(uik);CHKERRQ(ierr); 3140 3141 C->factor = FACTOR_CHOLESKY; 3142 C->assembled = PETSC_TRUE; 3143 C->preallocated = PETSC_TRUE; 3144 PLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */ 3145 PetscFunctionReturn(0); 3146 } 3147 3148 /* 3149 Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP. 3150 Version for blocks are 1 by 1. 3151 */ 3152 #undef __FUNC__ 3153 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_1" 3154 int MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,Mat *B) 3155 { 3156 Mat C = *B; 3157 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 3158 IS ip = b->row; 3159 int *rip,ierr,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j; 3160 int *ai,*aj,*r; 3161 int k,jmin,jmax,*jl,*il,vj,nexti,ili; 3162 MatScalar *rtmp; 3163 MatScalar *ba = b->a,*aa,ak; 3164 MatScalar dk,uikdi; 3165 3166 PetscFunctionBegin; 3167 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 3168 if (!a->permute){ 3169 ai = a->i; aj = a->j; aa = a->a; 3170 } else { 3171 ai = a->inew; aj = a->jnew; 3172 aa = (MatScalar*)PetscMalloc(ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa); 3173 ierr = PetscMemcpy(aa,a->a,ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr); 3174 r = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(r); 3175 ierr= PetscMemcpy(r,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr); 3176 3177 jmin = ai[0]; jmax = ai[mbs]; 3178 for (j=jmin; j<jmax; j++){ 3179 while (r[j] != j){ 3180 k = r[j]; r[j] = r[k]; r[k] = k; 3181 ak = aa[k]; aa[k] = aa[j]; aa[j] = ak; 3182 } 3183 } 3184 ierr = PetscFree(r);CHKERRA(ierr); 3185 } 3186 3187 /* initialization */ 3188 /* il and jl record the first nonzero element in each row of the accessing 3189 window U(0:k, k:mbs-1). 3190 jl: list of rows to be added to uneliminated rows 3191 i>= k: jl(i) is the first row to be added to row i 3192 i< k: jl(i) is the row following row i in some list of rows 3193 jl(i) = mbs indicates the end of a list 3194 il(i): points to the first nonzero element in columns k,...,mbs-1 of 3195 row i of U */ 3196 rtmp = (MatScalar*)PetscMalloc(mbs*sizeof(MatScalar));CHKPTRQ(rtmp); 3197 il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il); 3198 jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl); 3199 for (i=0; i<mbs; i++) { 3200 rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0; 3201 } 3202 3203 /* for each row k */ 3204 for (k = 0; k<mbs; k++){ 3205 3206 /*initialize k-th row with elements nonzero in row perm(k) of A */ 3207 jmin = ai[rip[k]]; jmax = ai[rip[k]+1]; 3208 if (jmin < jmax) { 3209 for (j = jmin; j < jmax; j++){ 3210 vj = rip[aj[j]]; 3211 rtmp[vj] = aa[j]; 3212 } 3213 } 3214 3215 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 3216 dk = rtmp[k]; 3217 i = jl[k]; /* first row to be added to k_th row */ 3218 3219 while (i < mbs){ 3220 nexti = jl[i]; /* next row to be added to k_th row */ 3221 3222 /* compute multiplier, update D(k) and U(i,k) */ 3223 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 3224 uikdi = - ba[ili]*ba[i]; 3225 dk += uikdi*ba[ili]; 3226 ba[ili] = uikdi; /* -U(i,k) */ 3227 3228 /* add multiple of row i to k-th row ... */ 3229 jmin = ili + 1; jmax = bi[i+1]; 3230 if (jmin < jmax){ 3231 for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j]; 3232 /* ... add i to row list for next nonzero entry */ 3233 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 3234 j = bj[jmin]; 3235 jl[i] = jl[j]; jl[j] = i; /* update jl */ 3236 } 3237 i = nexti; 3238 } 3239 3240 /* check for zero pivot and save diagoanl element */ 3241 if (dk == 0.0){ 3242 SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot"); 3243 /* 3244 }else if (PetscRealPart(dk) < 0){ 3245 ierr = PetscPrintf(PETSC_COMM_SELF,"Negative pivot: d[%d] = %g\n",k,dk); 3246 */ 3247 } 3248 3249 /* save nonzero entries in k-th row of U ... */ 3250 ba[k] = 1.0/dk; 3251 jmin = bi[k]; jmax = bi[k+1]; 3252 if (jmin < jmax) { 3253 for (j=jmin; j<jmax; j++){ 3254 vj = bj[j]; ba[j] = rtmp[vj]; rtmp[vj] = 0.0; 3255 } 3256 /* ... add k to row list for first nonzero entry in k-th row */ 3257 il[k] = jmin; 3258 i = bj[jmin]; 3259 jl[k] = jl[i]; jl[i] = k; 3260 } 3261 } 3262 3263 ierr = PetscFree(rtmp);CHKERRQ(ierr); 3264 ierr = PetscFree(il);CHKERRQ(ierr); 3265 ierr = PetscFree(jl);CHKERRQ(ierr); 3266 if (a->permute){ 3267 ierr = PetscFree(aa);CHKERRQ(ierr); 3268 } 3269 3270 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 3271 C->factor = FACTOR_CHOLESKY; 3272 C->assembled = PETSC_TRUE; 3273 C->preallocated = PETSC_TRUE; 3274 PLogFlops(b->mbs); 3275 PetscFunctionReturn(0); 3276 } 3277 3278 /* 3279 Version for when blocks are 1 by 1 Using natural ordering 3280 */ 3281 #undef __FUNC__ 3282 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering" 3283 int MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat *B) 3284 { 3285 Mat C = *B; 3286 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 3287 int ierr,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j; 3288 int *ai,*aj,*r; 3289 int k,jmin,jmax,*jl,*il,vj,nexti,ili; 3290 MatScalar *rtmp; 3291 MatScalar *ba = b->a,*aa,ak; 3292 MatScalar dk,uikdi; 3293 3294 PetscFunctionBegin; 3295 3296 /* initialization */ 3297 /* il and jl record the first nonzero element in each row of the accessing 3298 window U(0:k, k:mbs-1). 3299 jl: list of rows to be added to uneliminated rows 3300 i>= k: jl(i) is the first row to be added to row i 3301 i< k: jl(i) is the row following row i in some list of rows 3302 jl(i) = mbs indicates the end of a list 3303 il(i): points to the first nonzero element in columns k,...,mbs-1 of 3304 row i of U */ 3305 rtmp = (MatScalar*)PetscMalloc(mbs*sizeof(MatScalar));CHKPTRQ(rtmp); 3306 il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il); 3307 jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl); 3308 for (i=0; i<mbs; i++) { 3309 rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0; 3310 } 3311 3312 ai = a->i; aj = a->j; aa = a->a; 3313 3314 /* for each row k */ 3315 for (k = 0; k<mbs; k++){ 3316 3317 /*initialize k-th row with elements nonzero in row perm(k) of A */ 3318 jmin = ai[k]; jmax = ai[k+1]; 3319 if (jmin < jmax) { 3320 for (j = jmin; j < jmax; j++){ 3321 vj = aj[j]; 3322 rtmp[vj] = aa[j]; 3323 } 3324 } 3325 3326 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 3327 dk = rtmp[k]; 3328 i = jl[k]; /* first row to be added to k_th row */ 3329 3330 while (i < mbs){ 3331 nexti = jl[i]; /* next row to be added to k_th row */ 3332 3333 /* compute multiplier, update D(k) and U(i,k) */ 3334 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 3335 uikdi = - ba[ili]*ba[i]; 3336 dk += uikdi*ba[ili]; 3337 ba[ili] = uikdi; /* -U(i,k) */ 3338 3339 /* add multiple of row i to k-th row ... */ 3340 jmin = ili + 1; jmax = bi[i+1]; 3341 if (jmin < jmax){ 3342 for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j]; 3343 /* ... add i to row list for next nonzero entry */ 3344 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 3345 j = bj[jmin]; 3346 jl[i] = jl[j]; jl[j] = i; /* update jl */ 3347 } 3348 i = nexti; 3349 } 3350 3351 /* check for zero pivot and save diagoanl element */ 3352 if (dk == 0.0){ 3353 SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot"); 3354 /* 3355 }else if (PetscRealPart(dk) < 0){ 3356 ierr = PetscPrintf(PETSC_COMM_SELF,"Negative pivot: d[%d] = %g\n",k,dk); 3357 */ 3358 } 3359 3360 /* save nonzero entries in k-th row of U ... */ 3361 ba[k] = 1.0/dk; 3362 jmin = bi[k]; jmax = bi[k+1]; 3363 if (jmin < jmax) { 3364 for (j=jmin; j<jmax; j++){ 3365 vj = bj[j]; ba[j] = rtmp[vj]; rtmp[vj] = 0.0; 3366 } 3367 /* ... add k to row list for first nonzero entry in k-th row */ 3368 il[k] = jmin; 3369 i = bj[jmin]; 3370 jl[k] = jl[i]; jl[i] = k; 3371 } 3372 } 3373 3374 ierr = PetscFree(rtmp);CHKERRQ(ierr); 3375 ierr = PetscFree(il);CHKERRQ(ierr); 3376 ierr = PetscFree(jl);CHKERRQ(ierr); 3377 3378 C->factor = FACTOR_CHOLESKY; 3379 C->assembled = PETSC_TRUE; 3380 C->preallocated = PETSC_TRUE; 3381 PLogFlops(b->mbs); 3382 PetscFunctionReturn(0); 3383 } 3384 3385 #undef __FUNC__ 3386 #define __FUNC__ "MatCholeskyFactor_SeqSBAIJ" 3387 int MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,PetscReal f) 3388 { 3389 int ierr; 3390 Mat C; 3391 3392 PetscFunctionBegin; 3393 ierr = MatCholeskyFactorSymbolic(A,perm,f,&C);CHKERRQ(ierr); 3394 ierr = MatCholeskyFactorNumeric(A,&C);CHKERRQ(ierr); 3395 ierr = MatHeaderCopy(A,C);CHKERRQ(ierr); 3396 PetscFunctionReturn(0); 3397 } 3398 3399 3400