1 /* Using Modified Sparse Row (MSR) storage. 2 See page 85, "Iterative Methods ..." by Saad. */ 3 4 /*$Id: sbaijfact.c,v 1.13 2000/09/07 16:13:53 hzhang Exp hzhang $*/ 5 /* 6 Factorization code for SBAIJ format. 7 */ 8 #include "sbaij.h" 9 #include "src/mat/impls/baij/seq/baij.h" 10 #include "src/vec/vecimpl.h" 11 #include "src/inline/ilu.h" 12 #include "include/petscis.h" 13 14 #undef __FUNC__ 15 #define __FUNC__ "MatCholeskyFactorSymbolic_SeqSBAIJ" 16 int MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS perm,PetscReal f,Mat *B) 17 { 18 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 19 IS iperm; 20 int *rip,*riip,ierr,i,mbs = a->mbs,*ai = a->i,*aj = a->j; 21 int *jutmp,bs = a->bs,bs2=a->bs2; 22 int m,nzi,realloc = 0; 23 int *jl,*q,jumin,jmin,jmax,juptr,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd; 24 PetscTruth *ident; 25 26 PetscFunctionBegin; 27 PetscValidHeaderSpecific(perm,IS_COOKIE); 28 if (A->M != A->N) SETERRQ(PETSC_ERR_ARG_WRONG,0,"matrix must be square"); 29 30 ierr = ISIdentity(perm,ident);CHKERRQ(ierr); 31 if (!*ident) { /* for a non-trivial perm, the matrix A in SBAIJ format needs to be 32 re-indexed so that A(perm(i),iperm(k)) is stored in the upper 33 triangle. */ 34 SETERRQ(PETSC_ERR_ARG_CORRUPT,0,"Call MatReIndexingSeqSBAIJ() to re-indexing (ai,aj,a)"); 35 } 36 37 ierr = ISInvertPermutation(perm,PETSC_DECIDE,&iperm);CHKERRQ(ierr); 38 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 39 ierr = ISGetIndices(iperm,&riip);CHKERRQ(ierr); 40 41 /* initialization */ 42 /* Don't know how many column pointers are needed so estimate. 43 Use Modified Sparse Row storage for u and ju, see Sasd pp.85 */ 44 iu = (int*)PetscMalloc((mbs+1)*sizeof(int));CHKPTRQ(iu); 45 umax = (int)(f*ai[mbs] + 1); umax += mbs + 1; 46 ju = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(ju); 47 iu[0] = mbs+1; 48 juptr = mbs; 49 jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl); 50 q = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(q); 51 for (i=0; i<mbs; i++){ 52 jl[i] = mbs; q[i] = 0; 53 } 54 55 /* for each row k */ 56 for (k=0; k<mbs; k++){ 57 nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */ 58 q[k] = mbs; 59 /* initialize nonzero structure of k-th row to row rip[k] of A */ 60 jmin = ai[rip[k]]; 61 jmax = ai[rip[k]+1]; 62 for (j=jmin; j<jmax; j++){ 63 vj = riip[aj[j]]; /* col. value */ 64 if(vj > k){ 65 qm = k; 66 do { 67 m = qm; qm = q[m]; 68 } while(qm < vj); 69 if (qm == vj) { 70 printf(" error: duplicate entry in A\n"); break; 71 } 72 nzk++; 73 q[m] = vj; 74 q[vj] = qm; 75 } /* if(vj > k) */ 76 } /* for (j=jmin; j<jmax; j++) */ 77 78 /* modify nonzero structure of k-th row by computing fill-in 79 for each row i to be merged in */ 80 i = k; 81 i = jl[i]; /* next pivot row (== mbs for symbolic factorization) */ 82 /* printf(" next pivot row i=%d\n",i); */ 83 while (i < mbs){ 84 /* merge row i into k-th row */ 85 nzi = iu[i+1] - (iu[i]+1); 86 jmin = iu[i] + 1; jmax = iu[i] + nzi; 87 qm = k; 88 for (j=jmin; j<jmax+1; j++){ 89 vj = ju[j]; 90 do { 91 m = qm; qm = q[m]; 92 } while (qm < vj); 93 if (qm != vj){ 94 nzk++; q[m] = vj; q[vj] = qm; qm = vj; 95 } 96 } 97 i = jl[i]; /* next pivot row */ 98 } 99 100 /* add k to row list for first nonzero element in k-th row */ 101 if (nzk > 0){ 102 i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */ 103 jl[k] = jl[i]; jl[i] = k; 104 } 105 iu[k+1] = iu[k] + nzk; /* printf(" iu[%d]=%d, umax=%d\n", k+1, iu[k+1],umax);*/ 106 107 /* allocate more space to ju if needed */ 108 if (iu[k+1] > umax) { printf("allocate more space, iu[%d]=%d > umax=%d\n",k+1, iu[k+1],umax); 109 /* estimate how much additional space we will need */ 110 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 111 /* just double the memory each time */ 112 maxadd = umax; 113 if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2; 114 umax += maxadd; 115 116 /* allocate a longer ju */ 117 jutmp = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(jutmp); 118 ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));CHKERRQ(ierr); 119 ierr = PetscFree(ju);CHKERRQ(ierr); 120 ju = jutmp; 121 realloc++; /* count how many times we realloc */ 122 } 123 124 /* save nonzero structure of k-th row in ju */ 125 i=k; 126 jumin = juptr + 1; juptr += nzk; 127 for (j=jumin; j<juptr+1; j++){ 128 i=q[i]; 129 ju[j]=i; 130 } 131 } 132 133 if (ai[mbs] != 0) { 134 PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 135 PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af); 136 PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_lu_fill %g or use \n",af); 137 PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCLUSetFill(pc,%g);\n",af); 138 PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n"); 139 } else { 140 PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n"); 141 } 142 143 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 144 ierr = ISRestoreIndices(iperm,&riip);CHKERRQ(ierr); 145 146 ierr = PetscFree(q);CHKERRQ(ierr); 147 ierr = PetscFree(jl);CHKERRQ(ierr); 148 149 /* put together the new matrix */ 150 ierr = MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);CHKERRQ(ierr); 151 PLogObjectParent(*B,iperm); 152 b = (Mat_SeqSBAIJ*)(*B)->data; 153 ierr = PetscFree(b->imax);CHKERRQ(ierr); 154 b->singlemalloc = PETSC_FALSE; 155 /* the next line frees the default space generated by the Create() */ 156 ierr = PetscFree(b->a);CHKERRQ(ierr); 157 ierr = PetscFree(b->ilen);CHKERRQ(ierr); 158 b->a = (MatScalar*)PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2);CHKPTRQ(b->a); 159 b->j = ju; 160 b->i = iu; 161 b->diag = 0; 162 b->ilen = 0; 163 b->imax = 0; 164 b->row = perm; 165 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 166 b->icol = iperm; 167 b->solve_work = (Scalar*)PetscMalloc((bs*mbs+bs)*sizeof(Scalar));CHKPTRQ(b->solve_work); 168 /* In b structure: Free imax, ilen, old a, old j. 169 Allocate idnew, solve_work, new a, new j */ 170 PLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar))); 171 b->s_maxnz = b->s_nz = iu[mbs]; 172 173 (*B)->factor = FACTOR_LU; 174 (*B)->info.factor_mallocs = realloc; 175 (*B)->info.fill_ratio_given = f; 176 if (ai[mbs] != 0) { 177 (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 178 } else { 179 (*B)->info.fill_ratio_needed = 0.0; 180 } 181 182 183 PetscFunctionReturn(0); 184 } 185 186 #undef __FUNC__ 187 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_N" 188 int MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,Mat *B) 189 { 190 Mat C = *B; 191 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 192 IS isrow = b->row,isicol = b->icol; 193 int *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 194 int *ajtmpold,*ajtmp,nz,row,bslog,*ai=a->i,*aj=a->j,k,flg; 195 int *diag_offset=b->diag,diag,bs=a->bs,bs2 = a->bs2,*v_pivots,*pj; 196 MatScalar *ba = b->a,*aa = a->a,*pv,*v,*rtmp,*multiplier,*v_work,*pc,*w; 197 198 PetscFunctionBegin; 199 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 200 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 201 rtmp = (MatScalar*)PetscMalloc(bs2*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 202 ierr = PetscMemzero(rtmp,bs2*(n+1)*sizeof(MatScalar));CHKERRQ(ierr); 203 /* generate work space needed by dense LU factorization */ 204 v_work = (MatScalar*)PetscMalloc(bs*sizeof(int) + (bs+bs2)*sizeof(MatScalar));CHKPTRQ(v_work); 205 multiplier = v_work + bs; 206 v_pivots = (int*)(multiplier + bs2); 207 208 /* flops in while loop */ 209 bslog = 2*bs*bs2; 210 211 for (i=0; i<n; i++) { 212 nz = bi[i+1] - bi[i]; 213 ajtmp = bj + bi[i]; 214 for (j=0; j<nz; j++) { 215 ierr = PetscMemzero(rtmp+bs2*ajtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 216 } 217 /* load in initial (unfactored row) */ 218 nz = ai[r[i]+1] - ai[r[i]]; 219 ajtmpold = aj + ai[r[i]]; 220 v = aa + bs2*ai[r[i]]; 221 for (j=0; j<nz; j++) { 222 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmpold[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 223 } 224 row = *ajtmp++; 225 while (row < i) { 226 pc = rtmp + bs2*row; 227 /* if (*pc) { */ 228 for (flg=0,k=0; k<bs2; k++) { if (pc[k]!=0.0) { flg =1; break; }} 229 if (flg) { 230 pv = ba + bs2*diag_offset[row]; 231 pj = bj + diag_offset[row] + 1; 232 Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); 233 nz = bi[row+1] - diag_offset[row] - 1; 234 pv += bs2; 235 for (j=0; j<nz; j++) { 236 Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 237 } 238 PLogFlops(bslog*(nz+1)-bs); 239 } 240 row = *ajtmp++; 241 } 242 /* finished row so stick it into b->a */ 243 pv = ba + bs2*bi[i]; 244 pj = bj + bi[i]; 245 nz = bi[i+1] - bi[i]; 246 for (j=0; j<nz; j++) { 247 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 248 } 249 diag = diag_offset[i] - bi[i]; 250 /* invert diagonal block */ 251 w = pv + bs2*diag; 252 Kernel_A_gets_inverse_A(bs,w,v_pivots,v_work); 253 } 254 255 ierr = PetscFree(rtmp);CHKERRQ(ierr); 256 ierr = PetscFree(v_work);CHKERRQ(ierr); 257 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 258 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 259 C->factor = FACTOR_LU; 260 C->assembled = PETSC_TRUE; 261 PLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */ 262 PetscFunctionReturn(0); 263 } 264 265 /* 266 Version for when blocks are 7 by 7 267 */ 268 #undef __FUNC__ 269 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_7" 270 int MatCholeskyFactorNumeric_SeqSBAIJ_7(Mat A,Mat *B) 271 { 272 Mat C = *B; 273 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 274 IS isrow = b->row,isicol = b->icol; 275 int *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 276 int *ajtmpold,*ajtmp,nz,row; 277 int *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj; 278 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 279 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 280 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 281 MatScalar x17,x18,x19,x20,x21,x22,x23,x24,x25,p10,p11,p12,p13,p14; 282 MatScalar p15,p16,p17,p18,p19,p20,p21,p22,p23,p24,p25,m10,m11,m12; 283 MatScalar m13,m14,m15,m16,m17,m18,m19,m20,m21,m22,m23,m24,m25; 284 MatScalar p26,p27,p28,p29,p30,p31,p32,p33,p34,p35,p36; 285 MatScalar p37,p38,p39,p40,p41,p42,p43,p44,p45,p46,p47,p48,p49; 286 MatScalar x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36; 287 MatScalar x37,x38,x39,x40,x41,x42,x43,x44,x45,x46,x47,x48,x49; 288 MatScalar m26,m27,m28,m29,m30,m31,m32,m33,m34,m35,m36; 289 MatScalar m37,m38,m39,m40,m41,m42,m43,m44,m45,m46,m47,m48,m49; 290 MatScalar *ba = b->a,*aa = a->a; 291 292 PetscFunctionBegin; 293 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 294 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 295 rtmp = (MatScalar*)PetscMalloc(49*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 296 297 for (i=0; i<n; i++) { 298 nz = bi[i+1] - bi[i]; 299 ajtmp = bj + bi[i]; 300 for (j=0; j<nz; j++) { 301 x = rtmp+49*ajtmp[j]; 302 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 303 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0; 304 x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = x[25] = 0.0 ; 305 x[26] = x[27] = x[28] = x[29] = x[30] = x[31] = x[32] = x[33] = 0.0 ; 306 x[34] = x[35] = x[36] = x[37] = x[38] = x[39] = x[40] = x[41] = 0.0 ; 307 x[42] = x[43] = x[44] = x[45] = x[46] = x[47] = x[48] = 0.0 ; 308 } 309 /* load in initial (unfactored row) */ 310 idx = r[i]; 311 nz = ai[idx+1] - ai[idx]; 312 ajtmpold = aj + ai[idx]; 313 v = aa + 49*ai[idx]; 314 for (j=0; j<nz; j++) { 315 x = rtmp+49*ic[ajtmpold[j]]; 316 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 317 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; 318 x[8] = v[8]; x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; 319 x[12] = v[12]; x[13] = v[13]; x[14] = v[14]; x[15] = v[15]; 320 x[16] = v[16]; x[17] = v[17]; x[18] = v[18]; x[19] = v[19]; 321 x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23]; 322 x[24] = v[24]; x[25] = v[25]; x[26] = v[26]; x[27] = v[27]; 323 x[28] = v[28]; x[29] = v[29]; x[30] = v[30]; x[31] = v[31]; 324 x[32] = v[32]; x[33] = v[33]; x[34] = v[34]; x[35] = v[35]; 325 x[36] = v[36]; x[37] = v[37]; x[38] = v[38]; x[39] = v[39]; 326 x[40] = v[40]; x[41] = v[41]; x[42] = v[42]; x[43] = v[43]; 327 x[44] = v[44]; x[45] = v[45]; x[46] = v[46]; x[47] = v[47]; 328 x[48] = v[48]; 329 v += 49; 330 } 331 row = *ajtmp++; 332 while (row < i) { 333 pc = rtmp + 49*row; 334 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 335 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; 336 p9 = pc[8]; p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; 337 p13 = pc[12]; p14 = pc[13]; p15 = pc[14]; p16 = pc[15]; 338 p17 = pc[16]; p18 = pc[17]; p19 = pc[18]; p20 = pc[19]; 339 p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23]; 340 p25 = pc[24]; p26 = pc[25]; p27 = pc[26]; p28 = pc[27]; 341 p29 = pc[28]; p30 = pc[29]; p31 = pc[30]; p32 = pc[31]; 342 p33 = pc[32]; p34 = pc[33]; p35 = pc[34]; p36 = pc[35]; 343 p37 = pc[36]; p38 = pc[37]; p39 = pc[38]; p40 = pc[39]; 344 p41 = pc[40]; p42 = pc[41]; p43 = pc[42]; p44 = pc[43]; 345 p45 = pc[44]; p46 = pc[45]; p47 = pc[46]; p48 = pc[47]; 346 p49 = pc[48]; 347 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || 348 p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || 349 p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || 350 p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || 351 p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || 352 p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || 353 p25 != 0.0 || p26 != 0.0 || p27 != 0.0 || p28 != 0.0 || 354 p29 != 0.0 || p30 != 0.0 || p31 != 0.0 || p32 != 0.0 || 355 p33 != 0.0 || p34 != 0.0 || p35 != 0.0 || p36 != 0.0 || 356 p37 != 0.0 || p38 != 0.0 || p39 != 0.0 || p40 != 0.0 || 357 p41 != 0.0 || p42 != 0.0 || p43 != 0.0 || p44 != 0.0 || 358 p45 != 0.0 || p46 != 0.0 || p47 != 0.0 || p48 != 0.0 || 359 p49 != 0.0) { 360 pv = ba + 49*diag_offset[row]; 361 pj = bj + diag_offset[row] + 1; 362 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 363 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; 364 x9 = pv[8]; x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; 365 x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 366 x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; 367 x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 368 x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27]; 369 x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31]; 370 x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35]; 371 x37 = pv[36]; x38 = pv[37]; x39 = pv[38]; x40 = pv[39]; 372 x41 = pv[40]; x42 = pv[41]; x43 = pv[42]; x44 = pv[43]; 373 x45 = pv[44]; x46 = pv[45]; x47 = pv[46]; x48 = pv[47]; 374 x49 = pv[48]; 375 pc[0] = m1 = p1*x1 + p8*x2 + p15*x3 + p22*x4 + p29*x5 + p36*x6 + p43*x7; 376 pc[1] = m2 = p2*x1 + p9*x2 + p16*x3 + p23*x4 + p30*x5 + p37*x6 + p44*x7; 377 pc[2] = m3 = p3*x1 + p10*x2 + p17*x3 + p24*x4 + p31*x5 + p38*x6 + p45*x7; 378 pc[3] = m4 = p4*x1 + p11*x2 + p18*x3 + p25*x4 + p32*x5 + p39*x6 + p46*x7; 379 pc[4] = m5 = p5*x1 + p12*x2 + p19*x3 + p26*x4 + p33*x5 + p40*x6 + p47*x7; 380 pc[5] = m6 = p6*x1 + p13*x2 + p20*x3 + p27*x4 + p34*x5 + p41*x6 + p48*x7; 381 pc[6] = m7 = p7*x1 + p14*x2 + p21*x3 + p28*x4 + p35*x5 + p42*x6 + p49*x7; 382 383 pc[7] = m8 = p1*x8 + p8*x9 + p15*x10 + p22*x11 + p29*x12 + p36*x13 + p43*x14; 384 pc[8] = m9 = p2*x8 + p9*x9 + p16*x10 + p23*x11 + p30*x12 + p37*x13 + p44*x14; 385 pc[9] = m10 = p3*x8 + p10*x9 + p17*x10 + p24*x11 + p31*x12 + p38*x13 + p45*x14; 386 pc[10] = m11 = p4*x8 + p11*x9 + p18*x10 + p25*x11 + p32*x12 + p39*x13 + p46*x14; 387 pc[11] = m12 = p5*x8 + p12*x9 + p19*x10 + p26*x11 + p33*x12 + p40*x13 + p47*x14; 388 pc[12] = m13 = p6*x8 + p13*x9 + p20*x10 + p27*x11 + p34*x12 + p41*x13 + p48*x14; 389 pc[13] = m14 = p7*x8 + p14*x9 + p21*x10 + p28*x11 + p35*x12 + p42*x13 + p49*x14; 390 391 pc[14] = m15 = p1*x15 + p8*x16 + p15*x17 + p22*x18 + p29*x19 + p36*x20 + p43*x21; 392 pc[15] = m16 = p2*x15 + p9*x16 + p16*x17 + p23*x18 + p30*x19 + p37*x20 + p44*x21; 393 pc[16] = m17 = p3*x15 + p10*x16 + p17*x17 + p24*x18 + p31*x19 + p38*x20 + p45*x21; 394 pc[17] = m18 = p4*x15 + p11*x16 + p18*x17 + p25*x18 + p32*x19 + p39*x20 + p46*x21; 395 pc[18] = m19 = p5*x15 + p12*x16 + p19*x17 + p26*x18 + p33*x19 + p40*x20 + p47*x21; 396 pc[19] = m20 = p6*x15 + p13*x16 + p20*x17 + p27*x18 + p34*x19 + p41*x20 + p48*x21; 397 pc[20] = m21 = p7*x15 + p14*x16 + p21*x17 + p28*x18 + p35*x19 + p42*x20 + p49*x21; 398 399 pc[21] = m22 = p1*x22 + p8*x23 + p15*x24 + p22*x25 + p29*x26 + p36*x27 + p43*x28; 400 pc[22] = m23 = p2*x22 + p9*x23 + p16*x24 + p23*x25 + p30*x26 + p37*x27 + p44*x28; 401 pc[23] = m24 = p3*x22 + p10*x23 + p17*x24 + p24*x25 + p31*x26 + p38*x27 + p45*x28; 402 pc[24] = m25 = p4*x22 + p11*x23 + p18*x24 + p25*x25 + p32*x26 + p39*x27 + p46*x28; 403 pc[25] = m26 = p5*x22 + p12*x23 + p19*x24 + p26*x25 + p33*x26 + p40*x27 + p47*x28; 404 pc[26] = m27 = p6*x22 + p13*x23 + p20*x24 + p27*x25 + p34*x26 + p41*x27 + p48*x28; 405 pc[27] = m28 = p7*x22 + p14*x23 + p21*x24 + p28*x25 + p35*x26 + p42*x27 + p49*x28; 406 407 pc[28] = m29 = p1*x29 + p8*x30 + p15*x31 + p22*x32 + p29*x33 + p36*x34 + p43*x35; 408 pc[29] = m30 = p2*x29 + p9*x30 + p16*x31 + p23*x32 + p30*x33 + p37*x34 + p44*x35; 409 pc[30] = m31 = p3*x29 + p10*x30 + p17*x31 + p24*x32 + p31*x33 + p38*x34 + p45*x35; 410 pc[31] = m32 = p4*x29 + p11*x30 + p18*x31 + p25*x32 + p32*x33 + p39*x34 + p46*x35; 411 pc[32] = m33 = p5*x29 + p12*x30 + p19*x31 + p26*x32 + p33*x33 + p40*x34 + p47*x35; 412 pc[33] = m34 = p6*x29 + p13*x30 + p20*x31 + p27*x32 + p34*x33 + p41*x34 + p48*x35; 413 pc[34] = m35 = p7*x29 + p14*x30 + p21*x31 + p28*x32 + p35*x33 + p42*x34 + p49*x35; 414 415 pc[35] = m36 = p1*x36 + p8*x37 + p15*x38 + p22*x39 + p29*x40 + p36*x41 + p43*x42; 416 pc[36] = m37 = p2*x36 + p9*x37 + p16*x38 + p23*x39 + p30*x40 + p37*x41 + p44*x42; 417 pc[37] = m38 = p3*x36 + p10*x37 + p17*x38 + p24*x39 + p31*x40 + p38*x41 + p45*x42; 418 pc[38] = m39 = p4*x36 + p11*x37 + p18*x38 + p25*x39 + p32*x40 + p39*x41 + p46*x42; 419 pc[39] = m40 = p5*x36 + p12*x37 + p19*x38 + p26*x39 + p33*x40 + p40*x41 + p47*x42; 420 pc[40] = m41 = p6*x36 + p13*x37 + p20*x38 + p27*x39 + p34*x40 + p41*x41 + p48*x42; 421 pc[41] = m42 = p7*x36 + p14*x37 + p21*x38 + p28*x39 + p35*x40 + p42*x41 + p49*x42; 422 423 pc[42] = m43 = p1*x43 + p8*x44 + p15*x45 + p22*x46 + p29*x47 + p36*x48 + p43*x49; 424 pc[43] = m44 = p2*x43 + p9*x44 + p16*x45 + p23*x46 + p30*x47 + p37*x48 + p44*x49; 425 pc[44] = m45 = p3*x43 + p10*x44 + p17*x45 + p24*x46 + p31*x47 + p38*x48 + p45*x49; 426 pc[45] = m46 = p4*x43 + p11*x44 + p18*x45 + p25*x46 + p32*x47 + p39*x48 + p46*x49; 427 pc[46] = m47 = p5*x43 + p12*x44 + p19*x45 + p26*x46 + p33*x47 + p40*x48 + p47*x49; 428 pc[47] = m48 = p6*x43 + p13*x44 + p20*x45 + p27*x46 + p34*x47 + p41*x48 + p48*x49; 429 pc[48] = m49 = p7*x43 + p14*x44 + p21*x45 + p28*x46 + p35*x47 + p42*x48 + p49*x49; 430 431 nz = bi[row+1] - diag_offset[row] - 1; 432 pv += 49; 433 for (j=0; j<nz; j++) { 434 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 435 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; 436 x9 = pv[8]; x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; 437 x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 438 x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; 439 x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 440 x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27]; 441 x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31]; 442 x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35]; 443 x37 = pv[36]; x38 = pv[37]; x39 = pv[38]; x40 = pv[39]; 444 x41 = pv[40]; x42 = pv[41]; x43 = pv[42]; x44 = pv[43]; 445 x45 = pv[44]; x46 = pv[45]; x47 = pv[46]; x48 = pv[47]; 446 x49 = pv[48]; 447 x = rtmp + 49*pj[j]; 448 x[0] -= m1*x1 + m8*x2 + m15*x3 + m22*x4 + m29*x5 + m36*x6 + m43*x7; 449 x[1] -= m2*x1 + m9*x2 + m16*x3 + m23*x4 + m30*x5 + m37*x6 + m44*x7; 450 x[2] -= m3*x1 + m10*x2 + m17*x3 + m24*x4 + m31*x5 + m38*x6 + m45*x7; 451 x[3] -= m4*x1 + m11*x2 + m18*x3 + m25*x4 + m32*x5 + m39*x6 + m46*x7; 452 x[4] -= m5*x1 + m12*x2 + m19*x3 + m26*x4 + m33*x5 + m40*x6 + m47*x7; 453 x[5] -= m6*x1 + m13*x2 + m20*x3 + m27*x4 + m34*x5 + m41*x6 + m48*x7; 454 x[6] -= m7*x1 + m14*x2 + m21*x3 + m28*x4 + m35*x5 + m42*x6 + m49*x7; 455 456 x[7] -= m1*x8 + m8*x9 + m15*x10 + m22*x11 + m29*x12 + m36*x13 + m43*x14; 457 x[8] -= m2*x8 + m9*x9 + m16*x10 + m23*x11 + m30*x12 + m37*x13 + m44*x14; 458 x[9] -= m3*x8 + m10*x9 + m17*x10 + m24*x11 + m31*x12 + m38*x13 + m45*x14; 459 x[10] -= m4*x8 + m11*x9 + m18*x10 + m25*x11 + m32*x12 + m39*x13 + m46*x14; 460 x[11] -= m5*x8 + m12*x9 + m19*x10 + m26*x11 + m33*x12 + m40*x13 + m47*x14; 461 x[12] -= m6*x8 + m13*x9 + m20*x10 + m27*x11 + m34*x12 + m41*x13 + m48*x14; 462 x[13] -= m7*x8 + m14*x9 + m21*x10 + m28*x11 + m35*x12 + m42*x13 + m49*x14; 463 464 x[14] -= m1*x15 + m8*x16 + m15*x17 + m22*x18 + m29*x19 + m36*x20 + m43*x21; 465 x[15] -= m2*x15 + m9*x16 + m16*x17 + m23*x18 + m30*x19 + m37*x20 + m44*x21; 466 x[16] -= m3*x15 + m10*x16 + m17*x17 + m24*x18 + m31*x19 + m38*x20 + m45*x21; 467 x[17] -= m4*x15 + m11*x16 + m18*x17 + m25*x18 + m32*x19 + m39*x20 + m46*x21; 468 x[18] -= m5*x15 + m12*x16 + m19*x17 + m26*x18 + m33*x19 + m40*x20 + m47*x21; 469 x[19] -= m6*x15 + m13*x16 + m20*x17 + m27*x18 + m34*x19 + m41*x20 + m48*x21; 470 x[20] -= m7*x15 + m14*x16 + m21*x17 + m28*x18 + m35*x19 + m42*x20 + m49*x21; 471 472 x[21] -= m1*x22 + m8*x23 + m15*x24 + m22*x25 + m29*x26 + m36*x27 + m43*x28; 473 x[22] -= m2*x22 + m9*x23 + m16*x24 + m23*x25 + m30*x26 + m37*x27 + m44*x28; 474 x[23] -= m3*x22 + m10*x23 + m17*x24 + m24*x25 + m31*x26 + m38*x27 + m45*x28; 475 x[24] -= m4*x22 + m11*x23 + m18*x24 + m25*x25 + m32*x26 + m39*x27 + m46*x28; 476 x[25] -= m5*x22 + m12*x23 + m19*x24 + m26*x25 + m33*x26 + m40*x27 + m47*x28; 477 x[26] -= m6*x22 + m13*x23 + m20*x24 + m27*x25 + m34*x26 + m41*x27 + m48*x28; 478 x[27] -= m7*x22 + m14*x23 + m21*x24 + m28*x25 + m35*x26 + m42*x27 + m49*x28; 479 480 x[28] -= m1*x29 + m8*x30 + m15*x31 + m22*x32 + m29*x33 + m36*x34 + m43*x35; 481 x[29] -= m2*x29 + m9*x30 + m16*x31 + m23*x32 + m30*x33 + m37*x34 + m44*x35; 482 x[30] -= m3*x29 + m10*x30 + m17*x31 + m24*x32 + m31*x33 + m38*x34 + m45*x35; 483 x[31] -= m4*x29 + m11*x30 + m18*x31 + m25*x32 + m32*x33 + m39*x34 + m46*x35; 484 x[32] -= m5*x29 + m12*x30 + m19*x31 + m26*x32 + m33*x33 + m40*x34 + m47*x35; 485 x[33] -= m6*x29 + m13*x30 + m20*x31 + m27*x32 + m34*x33 + m41*x34 + m48*x35; 486 x[34] -= m7*x29 + m14*x30 + m21*x31 + m28*x32 + m35*x33 + m42*x34 + m49*x35; 487 488 x[35] -= m1*x36 + m8*x37 + m15*x38 + m22*x39 + m29*x40 + m36*x41 + m43*x42; 489 x[36] -= m2*x36 + m9*x37 + m16*x38 + m23*x39 + m30*x40 + m37*x41 + m44*x42; 490 x[37] -= m3*x36 + m10*x37 + m17*x38 + m24*x39 + m31*x40 + m38*x41 + m45*x42; 491 x[38] -= m4*x36 + m11*x37 + m18*x38 + m25*x39 + m32*x40 + m39*x41 + m46*x42; 492 x[39] -= m5*x36 + m12*x37 + m19*x38 + m26*x39 + m33*x40 + m40*x41 + m47*x42; 493 x[40] -= m6*x36 + m13*x37 + m20*x38 + m27*x39 + m34*x40 + m41*x41 + m48*x42; 494 x[41] -= m7*x36 + m14*x37 + m21*x38 + m28*x39 + m35*x40 + m42*x41 + m49*x42; 495 496 x[42] -= m1*x43 + m8*x44 + m15*x45 + m22*x46 + m29*x47 + m36*x48 + m43*x49; 497 x[43] -= m2*x43 + m9*x44 + m16*x45 + m23*x46 + m30*x47 + m37*x48 + m44*x49; 498 x[44] -= m3*x43 + m10*x44 + m17*x45 + m24*x46 + m31*x47 + m38*x48 + m45*x49; 499 x[45] -= m4*x43 + m11*x44 + m18*x45 + m25*x46 + m32*x47 + m39*x48 + m46*x49; 500 x[46] -= m5*x43 + m12*x44 + m19*x45 + m26*x46 + m33*x47 + m40*x48 + m47*x49; 501 x[47] -= m6*x43 + m13*x44 + m20*x45 + m27*x46 + m34*x47 + m41*x48 + m48*x49; 502 x[48] -= m7*x43 + m14*x44 + m21*x45 + m28*x46 + m35*x47 + m42*x48 + m49*x49; 503 pv += 49; 504 } 505 PLogFlops(686*nz+637); 506 } 507 row = *ajtmp++; 508 } 509 /* finished row so stick it into b->a */ 510 pv = ba + 49*bi[i]; 511 pj = bj + bi[i]; 512 nz = bi[i+1] - bi[i]; 513 for (j=0; j<nz; j++) { 514 x = rtmp+49*pj[j]; 515 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 516 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; 517 pv[8] = x[8]; pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; 518 pv[12] = x[12]; pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 519 pv[16] = x[16]; pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19]; 520 pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23]; 521 pv[24] = x[24]; pv[25] = x[25]; pv[26] = x[26]; pv[27] = x[27]; 522 pv[28] = x[28]; pv[29] = x[29]; pv[30] = x[30]; pv[31] = x[31]; 523 pv[32] = x[32]; pv[33] = x[33]; pv[34] = x[34]; pv[35] = x[35]; 524 pv[36] = x[36]; pv[37] = x[37]; pv[38] = x[38]; pv[39] = x[39]; 525 pv[40] = x[40]; pv[41] = x[41]; pv[42] = x[42]; pv[43] = x[43]; 526 pv[44] = x[44]; pv[45] = x[45]; pv[46] = x[46]; pv[47] = x[47]; 527 pv[48] = x[48]; 528 pv += 49; 529 } 530 /* invert diagonal block */ 531 w = ba + 49*diag_offset[i]; 532 ierr = Kernel_A_gets_inverse_A_7(w);CHKERRQ(ierr); 533 } 534 535 ierr = PetscFree(rtmp);CHKERRQ(ierr); 536 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 537 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 538 C->factor = FACTOR_LU; 539 C->assembled = PETSC_TRUE; 540 PLogFlops(1.3333*343*b->mbs); /* from inverting diagonal blocks */ 541 PetscFunctionReturn(0); 542 } 543 544 /* 545 Version for when blocks are 7 by 7 Using natural ordering 546 */ 547 #undef __FUNC__ 548 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering" 549 int MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering(Mat A,Mat *B) 550 { 551 Mat C = *B; 552 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 553 int ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 554 int *ajtmpold,*ajtmp,nz,row; 555 int *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; 556 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 557 MatScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 558 MatScalar x16,x17,x18,x19,x20,x21,x22,x23,x24,x25; 559 MatScalar p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15; 560 MatScalar p16,p17,p18,p19,p20,p21,p22,p23,p24,p25; 561 MatScalar m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14,m15; 562 MatScalar m16,m17,m18,m19,m20,m21,m22,m23,m24,m25; 563 MatScalar p26,p27,p28,p29,p30,p31,p32,p33,p34,p35,p36; 564 MatScalar p37,p38,p39,p40,p41,p42,p43,p44,p45,p46,p47,p48,p49; 565 MatScalar x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36; 566 MatScalar x37,x38,x39,x40,x41,x42,x43,x44,x45,x46,x47,x48,x49; 567 MatScalar m26,m27,m28,m29,m30,m31,m32,m33,m34,m35,m36; 568 MatScalar m37,m38,m39,m40,m41,m42,m43,m44,m45,m46,m47,m48,m49; 569 MatScalar *ba = b->a,*aa = a->a; 570 571 PetscFunctionBegin; 572 rtmp = (MatScalar*)PetscMalloc(49*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 573 for (i=0; i<n; i++) { 574 nz = bi[i+1] - bi[i]; 575 ajtmp = bj + bi[i]; 576 for (j=0; j<nz; j++) { 577 x = rtmp+49*ajtmp[j]; 578 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 579 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0; 580 x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = x[25] = 0.0 ; 581 x[26] = x[27] = x[28] = x[29] = x[30] = x[31] = x[32] = x[33] = 0.0 ; 582 x[34] = x[35] = x[36] = x[37] = x[38] = x[39] = x[40] = x[41] = 0.0 ; 583 x[42] = x[43] = x[44] = x[45] = x[46] = x[47] = x[48] = 0.0 ; 584 } 585 /* load in initial (unfactored row) */ 586 nz = ai[i+1] - ai[i]; 587 ajtmpold = aj + ai[i]; 588 v = aa + 49*ai[i]; 589 for (j=0; j<nz; j++) { 590 x = rtmp+49*ajtmpold[j]; 591 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 592 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; 593 x[8] = v[8]; x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; 594 x[12] = v[12]; x[13] = v[13]; x[14] = v[14]; x[15] = v[15]; 595 x[16] = v[16]; x[17] = v[17]; x[18] = v[18]; x[19] = v[19]; 596 x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23]; 597 x[24] = v[24]; x[25] = v[25]; x[26] = v[26]; x[27] = v[27]; 598 x[28] = v[28]; x[29] = v[29]; x[30] = v[30]; x[31] = v[31]; 599 x[32] = v[32]; x[33] = v[33]; x[34] = v[34]; x[35] = v[35]; 600 x[36] = v[36]; x[37] = v[37]; x[38] = v[38]; x[39] = v[39]; 601 x[40] = v[40]; x[41] = v[41]; x[42] = v[42]; x[43] = v[43]; 602 x[44] = v[44]; x[45] = v[45]; x[46] = v[46]; x[47] = v[47]; 603 x[48] = v[48]; 604 v += 49; 605 } 606 row = *ajtmp++; 607 while (row < i) { 608 pc = rtmp + 49*row; 609 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 610 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; 611 p9 = pc[8]; p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; 612 p13 = pc[12]; p14 = pc[13]; p15 = pc[14]; p16 = pc[15]; 613 p17 = pc[16]; p18 = pc[17]; p19 = pc[18]; p20 = pc[19]; 614 p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23]; 615 p25 = pc[24]; p26 = pc[25]; p27 = pc[26]; p28 = pc[27]; 616 p29 = pc[28]; p30 = pc[29]; p31 = pc[30]; p32 = pc[31]; 617 p33 = pc[32]; p34 = pc[33]; p35 = pc[34]; p36 = pc[35]; 618 p37 = pc[36]; p38 = pc[37]; p39 = pc[38]; p40 = pc[39]; 619 p41 = pc[40]; p42 = pc[41]; p43 = pc[42]; p44 = pc[43]; 620 p45 = pc[44]; p46 = pc[45]; p47 = pc[46]; p48 = pc[47]; 621 p49 = pc[48]; 622 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || 623 p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || 624 p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || 625 p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || 626 p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || 627 p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || 628 p25 != 0.0 || p26 != 0.0 || p27 != 0.0 || p28 != 0.0 || 629 p29 != 0.0 || p30 != 0.0 || p31 != 0.0 || p32 != 0.0 || 630 p33 != 0.0 || p34 != 0.0 || p35 != 0.0 || p36 != 0.0 || 631 p37 != 0.0 || p38 != 0.0 || p39 != 0.0 || p40 != 0.0 || 632 p41 != 0.0 || p42 != 0.0 || p43 != 0.0 || p44 != 0.0 || 633 p45 != 0.0 || p46 != 0.0 || p47 != 0.0 || p48 != 0.0 || 634 p49 != 0.0) { 635 pv = ba + 49*diag_offset[row]; 636 pj = bj + diag_offset[row] + 1; 637 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 638 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; 639 x9 = pv[8]; x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; 640 x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 641 x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; 642 x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 643 x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27]; 644 x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31]; 645 x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35]; 646 x37 = pv[36]; x38 = pv[37]; x39 = pv[38]; x40 = pv[39]; 647 x41 = pv[40]; x42 = pv[41]; x43 = pv[42]; x44 = pv[43]; 648 x45 = pv[44]; x46 = pv[45]; x47 = pv[46]; x48 = pv[47]; 649 x49 = pv[48]; 650 pc[0] = m1 = p1*x1 + p8*x2 + p15*x3 + p22*x4 + p29*x5 + p36*x6 + p43*x7; 651 pc[1] = m2 = p2*x1 + p9*x2 + p16*x3 + p23*x4 + p30*x5 + p37*x6 + p44*x7; 652 pc[2] = m3 = p3*x1 + p10*x2 + p17*x3 + p24*x4 + p31*x5 + p38*x6 + p45*x7; 653 pc[3] = m4 = p4*x1 + p11*x2 + p18*x3 + p25*x4 + p32*x5 + p39*x6 + p46*x7; 654 pc[4] = m5 = p5*x1 + p12*x2 + p19*x3 + p26*x4 + p33*x5 + p40*x6 + p47*x7; 655 pc[5] = m6 = p6*x1 + p13*x2 + p20*x3 + p27*x4 + p34*x5 + p41*x6 + p48*x7; 656 pc[6] = m7 = p7*x1 + p14*x2 + p21*x3 + p28*x4 + p35*x5 + p42*x6 + p49*x7; 657 658 pc[7] = m8 = p1*x8 + p8*x9 + p15*x10 + p22*x11 + p29*x12 + p36*x13 + p43*x14; 659 pc[8] = m9 = p2*x8 + p9*x9 + p16*x10 + p23*x11 + p30*x12 + p37*x13 + p44*x14; 660 pc[9] = m10 = p3*x8 + p10*x9 + p17*x10 + p24*x11 + p31*x12 + p38*x13 + p45*x14; 661 pc[10] = m11 = p4*x8 + p11*x9 + p18*x10 + p25*x11 + p32*x12 + p39*x13 + p46*x14; 662 pc[11] = m12 = p5*x8 + p12*x9 + p19*x10 + p26*x11 + p33*x12 + p40*x13 + p47*x14; 663 pc[12] = m13 = p6*x8 + p13*x9 + p20*x10 + p27*x11 + p34*x12 + p41*x13 + p48*x14; 664 pc[13] = m14 = p7*x8 + p14*x9 + p21*x10 + p28*x11 + p35*x12 + p42*x13 + p49*x14; 665 666 pc[14] = m15 = p1*x15 + p8*x16 + p15*x17 + p22*x18 + p29*x19 + p36*x20 + p43*x21; 667 pc[15] = m16 = p2*x15 + p9*x16 + p16*x17 + p23*x18 + p30*x19 + p37*x20 + p44*x21; 668 pc[16] = m17 = p3*x15 + p10*x16 + p17*x17 + p24*x18 + p31*x19 + p38*x20 + p45*x21; 669 pc[17] = m18 = p4*x15 + p11*x16 + p18*x17 + p25*x18 + p32*x19 + p39*x20 + p46*x21; 670 pc[18] = m19 = p5*x15 + p12*x16 + p19*x17 + p26*x18 + p33*x19 + p40*x20 + p47*x21; 671 pc[19] = m20 = p6*x15 + p13*x16 + p20*x17 + p27*x18 + p34*x19 + p41*x20 + p48*x21; 672 pc[20] = m21 = p7*x15 + p14*x16 + p21*x17 + p28*x18 + p35*x19 + p42*x20 + p49*x21; 673 674 pc[21] = m22 = p1*x22 + p8*x23 + p15*x24 + p22*x25 + p29*x26 + p36*x27 + p43*x28; 675 pc[22] = m23 = p2*x22 + p9*x23 + p16*x24 + p23*x25 + p30*x26 + p37*x27 + p44*x28; 676 pc[23] = m24 = p3*x22 + p10*x23 + p17*x24 + p24*x25 + p31*x26 + p38*x27 + p45*x28; 677 pc[24] = m25 = p4*x22 + p11*x23 + p18*x24 + p25*x25 + p32*x26 + p39*x27 + p46*x28; 678 pc[25] = m26 = p5*x22 + p12*x23 + p19*x24 + p26*x25 + p33*x26 + p40*x27 + p47*x28; 679 pc[26] = m27 = p6*x22 + p13*x23 + p20*x24 + p27*x25 + p34*x26 + p41*x27 + p48*x28; 680 pc[27] = m28 = p7*x22 + p14*x23 + p21*x24 + p28*x25 + p35*x26 + p42*x27 + p49*x28; 681 682 pc[28] = m29 = p1*x29 + p8*x30 + p15*x31 + p22*x32 + p29*x33 + p36*x34 + p43*x35; 683 pc[29] = m30 = p2*x29 + p9*x30 + p16*x31 + p23*x32 + p30*x33 + p37*x34 + p44*x35; 684 pc[30] = m31 = p3*x29 + p10*x30 + p17*x31 + p24*x32 + p31*x33 + p38*x34 + p45*x35; 685 pc[31] = m32 = p4*x29 + p11*x30 + p18*x31 + p25*x32 + p32*x33 + p39*x34 + p46*x35; 686 pc[32] = m33 = p5*x29 + p12*x30 + p19*x31 + p26*x32 + p33*x33 + p40*x34 + p47*x35; 687 pc[33] = m34 = p6*x29 + p13*x30 + p20*x31 + p27*x32 + p34*x33 + p41*x34 + p48*x35; 688 pc[34] = m35 = p7*x29 + p14*x30 + p21*x31 + p28*x32 + p35*x33 + p42*x34 + p49*x35; 689 690 pc[35] = m36 = p1*x36 + p8*x37 + p15*x38 + p22*x39 + p29*x40 + p36*x41 + p43*x42; 691 pc[36] = m37 = p2*x36 + p9*x37 + p16*x38 + p23*x39 + p30*x40 + p37*x41 + p44*x42; 692 pc[37] = m38 = p3*x36 + p10*x37 + p17*x38 + p24*x39 + p31*x40 + p38*x41 + p45*x42; 693 pc[38] = m39 = p4*x36 + p11*x37 + p18*x38 + p25*x39 + p32*x40 + p39*x41 + p46*x42; 694 pc[39] = m40 = p5*x36 + p12*x37 + p19*x38 + p26*x39 + p33*x40 + p40*x41 + p47*x42; 695 pc[40] = m41 = p6*x36 + p13*x37 + p20*x38 + p27*x39 + p34*x40 + p41*x41 + p48*x42; 696 pc[41] = m42 = p7*x36 + p14*x37 + p21*x38 + p28*x39 + p35*x40 + p42*x41 + p49*x42; 697 698 pc[42] = m43 = p1*x43 + p8*x44 + p15*x45 + p22*x46 + p29*x47 + p36*x48 + p43*x49; 699 pc[43] = m44 = p2*x43 + p9*x44 + p16*x45 + p23*x46 + p30*x47 + p37*x48 + p44*x49; 700 pc[44] = m45 = p3*x43 + p10*x44 + p17*x45 + p24*x46 + p31*x47 + p38*x48 + p45*x49; 701 pc[45] = m46 = p4*x43 + p11*x44 + p18*x45 + p25*x46 + p32*x47 + p39*x48 + p46*x49; 702 pc[46] = m47 = p5*x43 + p12*x44 + p19*x45 + p26*x46 + p33*x47 + p40*x48 + p47*x49; 703 pc[47] = m48 = p6*x43 + p13*x44 + p20*x45 + p27*x46 + p34*x47 + p41*x48 + p48*x49; 704 pc[48] = m49 = p7*x43 + p14*x44 + p21*x45 + p28*x46 + p35*x47 + p42*x48 + p49*x49; 705 706 nz = bi[row+1] - diag_offset[row] - 1; 707 pv += 49; 708 for (j=0; j<nz; j++) { 709 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 710 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; 711 x9 = pv[8]; x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; 712 x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 713 x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; 714 x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 715 x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27]; 716 x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31]; 717 x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35]; 718 x37 = pv[36]; x38 = pv[37]; x39 = pv[38]; x40 = pv[39]; 719 x41 = pv[40]; x42 = pv[41]; x43 = pv[42]; x44 = pv[43]; 720 x45 = pv[44]; x46 = pv[45]; x47 = pv[46]; x48 = pv[47]; 721 x49 = pv[48]; 722 x = rtmp + 49*pj[j]; 723 x[0] -= m1*x1 + m8*x2 + m15*x3 + m22*x4 + m29*x5 + m36*x6 + m43*x7; 724 x[1] -= m2*x1 + m9*x2 + m16*x3 + m23*x4 + m30*x5 + m37*x6 + m44*x7; 725 x[2] -= m3*x1 + m10*x2 + m17*x3 + m24*x4 + m31*x5 + m38*x6 + m45*x7; 726 x[3] -= m4*x1 + m11*x2 + m18*x3 + m25*x4 + m32*x5 + m39*x6 + m46*x7; 727 x[4] -= m5*x1 + m12*x2 + m19*x3 + m26*x4 + m33*x5 + m40*x6 + m47*x7; 728 x[5] -= m6*x1 + m13*x2 + m20*x3 + m27*x4 + m34*x5 + m41*x6 + m48*x7; 729 x[6] -= m7*x1 + m14*x2 + m21*x3 + m28*x4 + m35*x5 + m42*x6 + m49*x7; 730 731 x[7] -= m1*x8 + m8*x9 + m15*x10 + m22*x11 + m29*x12 + m36*x13 + m43*x14; 732 x[8] -= m2*x8 + m9*x9 + m16*x10 + m23*x11 + m30*x12 + m37*x13 + m44*x14; 733 x[9] -= m3*x8 + m10*x9 + m17*x10 + m24*x11 + m31*x12 + m38*x13 + m45*x14; 734 x[10] -= m4*x8 + m11*x9 + m18*x10 + m25*x11 + m32*x12 + m39*x13 + m46*x14; 735 x[11] -= m5*x8 + m12*x9 + m19*x10 + m26*x11 + m33*x12 + m40*x13 + m47*x14; 736 x[12] -= m6*x8 + m13*x9 + m20*x10 + m27*x11 + m34*x12 + m41*x13 + m48*x14; 737 x[13] -= m7*x8 + m14*x9 + m21*x10 + m28*x11 + m35*x12 + m42*x13 + m49*x14; 738 739 x[14] -= m1*x15 + m8*x16 + m15*x17 + m22*x18 + m29*x19 + m36*x20 + m43*x21; 740 x[15] -= m2*x15 + m9*x16 + m16*x17 + m23*x18 + m30*x19 + m37*x20 + m44*x21; 741 x[16] -= m3*x15 + m10*x16 + m17*x17 + m24*x18 + m31*x19 + m38*x20 + m45*x21; 742 x[17] -= m4*x15 + m11*x16 + m18*x17 + m25*x18 + m32*x19 + m39*x20 + m46*x21; 743 x[18] -= m5*x15 + m12*x16 + m19*x17 + m26*x18 + m33*x19 + m40*x20 + m47*x21; 744 x[19] -= m6*x15 + m13*x16 + m20*x17 + m27*x18 + m34*x19 + m41*x20 + m48*x21; 745 x[20] -= m7*x15 + m14*x16 + m21*x17 + m28*x18 + m35*x19 + m42*x20 + m49*x21; 746 747 x[21] -= m1*x22 + m8*x23 + m15*x24 + m22*x25 + m29*x26 + m36*x27 + m43*x28; 748 x[22] -= m2*x22 + m9*x23 + m16*x24 + m23*x25 + m30*x26 + m37*x27 + m44*x28; 749 x[23] -= m3*x22 + m10*x23 + m17*x24 + m24*x25 + m31*x26 + m38*x27 + m45*x28; 750 x[24] -= m4*x22 + m11*x23 + m18*x24 + m25*x25 + m32*x26 + m39*x27 + m46*x28; 751 x[25] -= m5*x22 + m12*x23 + m19*x24 + m26*x25 + m33*x26 + m40*x27 + m47*x28; 752 x[26] -= m6*x22 + m13*x23 + m20*x24 + m27*x25 + m34*x26 + m41*x27 + m48*x28; 753 x[27] -= m7*x22 + m14*x23 + m21*x24 + m28*x25 + m35*x26 + m42*x27 + m49*x28; 754 755 x[28] -= m1*x29 + m8*x30 + m15*x31 + m22*x32 + m29*x33 + m36*x34 + m43*x35; 756 x[29] -= m2*x29 + m9*x30 + m16*x31 + m23*x32 + m30*x33 + m37*x34 + m44*x35; 757 x[30] -= m3*x29 + m10*x30 + m17*x31 + m24*x32 + m31*x33 + m38*x34 + m45*x35; 758 x[31] -= m4*x29 + m11*x30 + m18*x31 + m25*x32 + m32*x33 + m39*x34 + m46*x35; 759 x[32] -= m5*x29 + m12*x30 + m19*x31 + m26*x32 + m33*x33 + m40*x34 + m47*x35; 760 x[33] -= m6*x29 + m13*x30 + m20*x31 + m27*x32 + m34*x33 + m41*x34 + m48*x35; 761 x[34] -= m7*x29 + m14*x30 + m21*x31 + m28*x32 + m35*x33 + m42*x34 + m49*x35; 762 763 x[35] -= m1*x36 + m8*x37 + m15*x38 + m22*x39 + m29*x40 + m36*x41 + m43*x42; 764 x[36] -= m2*x36 + m9*x37 + m16*x38 + m23*x39 + m30*x40 + m37*x41 + m44*x42; 765 x[37] -= m3*x36 + m10*x37 + m17*x38 + m24*x39 + m31*x40 + m38*x41 + m45*x42; 766 x[38] -= m4*x36 + m11*x37 + m18*x38 + m25*x39 + m32*x40 + m39*x41 + m46*x42; 767 x[39] -= m5*x36 + m12*x37 + m19*x38 + m26*x39 + m33*x40 + m40*x41 + m47*x42; 768 x[40] -= m6*x36 + m13*x37 + m20*x38 + m27*x39 + m34*x40 + m41*x41 + m48*x42; 769 x[41] -= m7*x36 + m14*x37 + m21*x38 + m28*x39 + m35*x40 + m42*x41 + m49*x42; 770 771 x[42] -= m1*x43 + m8*x44 + m15*x45 + m22*x46 + m29*x47 + m36*x48 + m43*x49; 772 x[43] -= m2*x43 + m9*x44 + m16*x45 + m23*x46 + m30*x47 + m37*x48 + m44*x49; 773 x[44] -= m3*x43 + m10*x44 + m17*x45 + m24*x46 + m31*x47 + m38*x48 + m45*x49; 774 x[45] -= m4*x43 + m11*x44 + m18*x45 + m25*x46 + m32*x47 + m39*x48 + m46*x49; 775 x[46] -= m5*x43 + m12*x44 + m19*x45 + m26*x46 + m33*x47 + m40*x48 + m47*x49; 776 x[47] -= m6*x43 + m13*x44 + m20*x45 + m27*x46 + m34*x47 + m41*x48 + m48*x49; 777 x[48] -= m7*x43 + m14*x44 + m21*x45 + m28*x46 + m35*x47 + m42*x48 + m49*x49; 778 pv += 49; 779 } 780 PLogFlops(686*nz+637); 781 } 782 row = *ajtmp++; 783 } 784 /* finished row so stick it into b->a */ 785 pv = ba + 49*bi[i]; 786 pj = bj + bi[i]; 787 nz = bi[i+1] - bi[i]; 788 for (j=0; j<nz; j++) { 789 x = rtmp+49*pj[j]; 790 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 791 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; 792 pv[8] = x[8]; pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; 793 pv[12] = x[12]; pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 794 pv[16] = x[16]; pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19]; 795 pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23]; 796 pv[24] = x[24]; pv[25] = x[25]; pv[26] = x[26]; pv[27] = x[27]; 797 pv[28] = x[28]; pv[29] = x[29]; pv[30] = x[30]; pv[31] = x[31]; 798 pv[32] = x[32]; pv[33] = x[33]; pv[34] = x[34]; pv[35] = x[35]; 799 pv[36] = x[36]; pv[37] = x[37]; pv[38] = x[38]; pv[39] = x[39]; 800 pv[40] = x[40]; pv[41] = x[41]; pv[42] = x[42]; pv[43] = x[43]; 801 pv[44] = x[44]; pv[45] = x[45]; pv[46] = x[46]; pv[47] = x[47]; 802 pv[48] = x[48]; 803 pv += 49; 804 } 805 /* invert diagonal block */ 806 w = ba + 49*diag_offset[i]; 807 ierr = Kernel_A_gets_inverse_A_7(w);CHKERRQ(ierr); 808 } 809 810 ierr = PetscFree(rtmp);CHKERRQ(ierr); 811 C->factor = FACTOR_LU; 812 C->assembled = PETSC_TRUE; 813 PLogFlops(1.3333*343*b->mbs); /* from inverting diagonal blocks */ 814 PetscFunctionReturn(0); 815 } 816 817 /* ------------------------------------------------------------*/ 818 /* 819 Version for when blocks are 6 by 6 820 */ 821 #undef __FUNC__ 822 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_6" 823 int MatCholeskyFactorNumeric_SeqSBAIJ_6(Mat A,Mat *B) 824 { 825 Mat C = *B; 826 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 827 IS isrow = b->row,isicol = b->icol; 828 int *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 829 int *ajtmpold,*ajtmp,nz,row; 830 int *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj; 831 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 832 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 833 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 834 MatScalar x17,x18,x19,x20,x21,x22,x23,x24,x25,p10,p11,p12,p13,p14; 835 MatScalar p15,p16,p17,p18,p19,p20,p21,p22,p23,p24,p25,m10,m11,m12; 836 MatScalar m13,m14,m15,m16,m17,m18,m19,m20,m21,m22,m23,m24,m25; 837 MatScalar p26,p27,p28,p29,p30,p31,p32,p33,p34,p35,p36; 838 MatScalar x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36; 839 MatScalar m26,m27,m28,m29,m30,m31,m32,m33,m34,m35,m36; 840 MatScalar *ba = b->a,*aa = a->a; 841 842 PetscFunctionBegin; 843 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 844 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 845 rtmp = (MatScalar*)PetscMalloc(36*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 846 847 for (i=0; i<n; i++) { 848 nz = bi[i+1] - bi[i]; 849 ajtmp = bj + bi[i]; 850 for (j=0; j<nz; j++) { 851 x = rtmp+36*ajtmp[j]; 852 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 853 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0; 854 x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = x[25] = 0.0 ; 855 x[26] = x[27] = x[28] = x[29] = x[30] = x[31] = x[32] = x[33] = 0.0 ; 856 x[34] = x[35] = 0.0 ; 857 } 858 /* load in initial (unfactored row) */ 859 idx = r[i]; 860 nz = ai[idx+1] - ai[idx]; 861 ajtmpold = aj + ai[idx]; 862 v = aa + 36*ai[idx]; 863 for (j=0; j<nz; j++) { 864 x = rtmp+36*ic[ajtmpold[j]]; 865 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 866 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; 867 x[8] = v[8]; x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; 868 x[12] = v[12]; x[13] = v[13]; x[14] = v[14]; x[15] = v[15]; 869 x[16] = v[16]; x[17] = v[17]; x[18] = v[18]; x[19] = v[19]; 870 x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23]; 871 x[24] = v[24]; x[25] = v[25]; x[26] = v[26]; x[27] = v[27]; 872 x[28] = v[28]; x[29] = v[29]; x[30] = v[30]; x[31] = v[31]; 873 x[32] = v[32]; x[33] = v[33]; x[34] = v[34]; x[35] = v[35]; 874 v += 36; 875 } 876 row = *ajtmp++; 877 while (row < i) { 878 pc = rtmp + 36*row; 879 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 880 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; 881 p9 = pc[8]; p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; 882 p13 = pc[12]; p14 = pc[13]; p15 = pc[14]; p16 = pc[15]; 883 p17 = pc[16]; p18 = pc[17]; p19 = pc[18]; p20 = pc[19]; 884 p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23]; 885 p25 = pc[24]; p26 = pc[25]; p27 = pc[26]; p28 = pc[27]; 886 p29 = pc[28]; p30 = pc[29]; p31 = pc[30]; p32 = pc[31]; 887 p33 = pc[32]; p34 = pc[33]; p35 = pc[34]; p36 = pc[35]; 888 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || 889 p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || 890 p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || 891 p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || 892 p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || 893 p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || 894 p25 != 0.0 || p26 != 0.0 || p27 != 0.0 || p28 != 0.0 || 895 p29 != 0.0 || p30 != 0.0 || p31 != 0.0 || p32 != 0.0 || 896 p33 != 0.0 || p34 != 0.0 || p35 != 0.0 || p36 != 0.0) { 897 pv = ba + 36*diag_offset[row]; 898 pj = bj + diag_offset[row] + 1; 899 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 900 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; 901 x9 = pv[8]; x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; 902 x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 903 x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; 904 x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 905 x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27]; 906 x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31]; 907 x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35]; 908 pc[0] = m1 = p1*x1 + p7*x2 + p13*x3 + p19*x4 + p25*x5 + p31*x6; 909 pc[1] = m2 = p2*x1 + p8*x2 + p14*x3 + p20*x4 + p26*x5 + p32*x6; 910 pc[2] = m3 = p3*x1 + p9*x2 + p15*x3 + p21*x4 + p27*x5 + p33*x6; 911 pc[3] = m4 = p4*x1 + p10*x2 + p16*x3 + p22*x4 + p28*x5 + p34*x6; 912 pc[4] = m5 = p5*x1 + p11*x2 + p17*x3 + p23*x4 + p29*x5 + p35*x6; 913 pc[5] = m6 = p6*x1 + p12*x2 + p18*x3 + p24*x4 + p30*x5 + p36*x6; 914 915 pc[6] = m7 = p1*x7 + p7*x8 + p13*x9 + p19*x10 + p25*x11 + p31*x12; 916 pc[7] = m8 = p2*x7 + p8*x8 + p14*x9 + p20*x10 + p26*x11 + p32*x12; 917 pc[8] = m9 = p3*x7 + p9*x8 + p15*x9 + p21*x10 + p27*x11 + p33*x12; 918 pc[9] = m10 = p4*x7 + p10*x8 + p16*x9 + p22*x10 + p28*x11 + p34*x12; 919 pc[10] = m11 = p5*x7 + p11*x8 + p17*x9 + p23*x10 + p29*x11 + p35*x12; 920 pc[11] = m12 = p6*x7 + p12*x8 + p18*x9 + p24*x10 + p30*x11 + p36*x12; 921 922 pc[12] = m13 = p1*x13 + p7*x14 + p13*x15 + p19*x16 + p25*x17 + p31*x18; 923 pc[13] = m14 = p2*x13 + p8*x14 + p14*x15 + p20*x16 + p26*x17 + p32*x18; 924 pc[14] = m15 = p3*x13 + p9*x14 + p15*x15 + p21*x16 + p27*x17 + p33*x18; 925 pc[15] = m16 = p4*x13 + p10*x14 + p16*x15 + p22*x16 + p28*x17 + p34*x18; 926 pc[16] = m17 = p5*x13 + p11*x14 + p17*x15 + p23*x16 + p29*x17 + p35*x18; 927 pc[17] = m18 = p6*x13 + p12*x14 + p18*x15 + p24*x16 + p30*x17 + p36*x18; 928 929 pc[18] = m19 = p1*x19 + p7*x20 + p13*x21 + p19*x22 + p25*x23 + p31*x24; 930 pc[19] = m20 = p2*x19 + p8*x20 + p14*x21 + p20*x22 + p26*x23 + p32*x24; 931 pc[20] = m21 = p3*x19 + p9*x20 + p15*x21 + p21*x22 + p27*x23 + p33*x24; 932 pc[21] = m22 = p4*x19 + p10*x20 + p16*x21 + p22*x22 + p28*x23 + p34*x24; 933 pc[22] = m23 = p5*x19 + p11*x20 + p17*x21 + p23*x22 + p29*x23 + p35*x24; 934 pc[23] = m24 = p6*x19 + p12*x20 + p18*x21 + p24*x22 + p30*x23 + p36*x24; 935 936 pc[24] = m25 = p1*x25 + p7*x26 + p13*x27 + p19*x28 + p25*x29 + p31*x30; 937 pc[25] = m26 = p2*x25 + p8*x26 + p14*x27 + p20*x28 + p26*x29 + p32*x30; 938 pc[26] = m27 = p3*x25 + p9*x26 + p15*x27 + p21*x28 + p27*x29 + p33*x30; 939 pc[27] = m28 = p4*x25 + p10*x26 + p16*x27 + p22*x28 + p28*x29 + p34*x30; 940 pc[28] = m29 = p5*x25 + p11*x26 + p17*x27 + p23*x28 + p29*x29 + p35*x30; 941 pc[29] = m30 = p6*x25 + p12*x26 + p18*x27 + p24*x28 + p30*x29 + p36*x30; 942 943 pc[30] = m31 = p1*x31 + p7*x32 + p13*x33 + p19*x34 + p25*x35 + p31*x36; 944 pc[31] = m32 = p2*x31 + p8*x32 + p14*x33 + p20*x34 + p26*x35 + p32*x36; 945 pc[32] = m33 = p3*x31 + p9*x32 + p15*x33 + p21*x34 + p27*x35 + p33*x36; 946 pc[33] = m34 = p4*x31 + p10*x32 + p16*x33 + p22*x34 + p28*x35 + p34*x36; 947 pc[34] = m35 = p5*x31 + p11*x32 + p17*x33 + p23*x34 + p29*x35 + p35*x36; 948 pc[35] = m36 = p6*x31 + p12*x32 + p18*x33 + p24*x34 + p30*x35 + p36*x36; 949 950 nz = bi[row+1] - diag_offset[row] - 1; 951 pv += 36; 952 for (j=0; j<nz; j++) { 953 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 954 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; 955 x9 = pv[8]; x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; 956 x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 957 x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; 958 x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 959 x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27]; 960 x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31]; 961 x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35]; 962 x = rtmp + 36*pj[j]; 963 x[0] -= m1*x1 + m7*x2 + m13*x3 + m19*x4 + m25*x5 + m31*x6; 964 x[1] -= m2*x1 + m8*x2 + m14*x3 + m20*x4 + m26*x5 + m32*x6; 965 x[2] -= m3*x1 + m9*x2 + m15*x3 + m21*x4 + m27*x5 + m33*x6; 966 x[3] -= m4*x1 + m10*x2 + m16*x3 + m22*x4 + m28*x5 + m34*x6; 967 x[4] -= m5*x1 + m11*x2 + m17*x3 + m23*x4 + m29*x5 + m35*x6; 968 x[5] -= m6*x1 + m12*x2 + m18*x3 + m24*x4 + m30*x5 + m36*x6; 969 970 x[6] -= m1*x7 + m7*x8 + m13*x9 + m19*x10 + m25*x11 + m31*x12; 971 x[7] -= m2*x7 + m8*x8 + m14*x9 + m20*x10 + m26*x11 + m32*x12; 972 x[8] -= m3*x7 + m9*x8 + m15*x9 + m21*x10 + m27*x11 + m33*x12; 973 x[9] -= m4*x7 + m10*x8 + m16*x9 + m22*x10 + m28*x11 + m34*x12; 974 x[10] -= m5*x7 + m11*x8 + m17*x9 + m23*x10 + m29*x11 + m35*x12; 975 x[11] -= m6*x7 + m12*x8 + m18*x9 + m24*x10 + m30*x11 + m36*x12; 976 977 x[12] -= m1*x13 + m7*x14 + m13*x15 + m19*x16 + m25*x17 + m31*x18; 978 x[13] -= m2*x13 + m8*x14 + m14*x15 + m20*x16 + m26*x17 + m32*x18; 979 x[14] -= m3*x13 + m9*x14 + m15*x15 + m21*x16 + m27*x17 + m33*x18; 980 x[15] -= m4*x13 + m10*x14 + m16*x15 + m22*x16 + m28*x17 + m34*x18; 981 x[16] -= m5*x13 + m11*x14 + m17*x15 + m23*x16 + m29*x17 + m35*x18; 982 x[17] -= m6*x13 + m12*x14 + m18*x15 + m24*x16 + m30*x17 + m36*x18; 983 984 x[18] -= m1*x19 + m7*x20 + m13*x21 + m19*x22 + m25*x23 + m31*x24; 985 x[19] -= m2*x19 + m8*x20 + m14*x21 + m20*x22 + m26*x23 + m32*x24; 986 x[20] -= m3*x19 + m9*x20 + m15*x21 + m21*x22 + m27*x23 + m33*x24; 987 x[21] -= m4*x19 + m10*x20 + m16*x21 + m22*x22 + m28*x23 + m34*x24; 988 x[22] -= m5*x19 + m11*x20 + m17*x21 + m23*x22 + m29*x23 + m35*x24; 989 x[23] -= m6*x19 + m12*x20 + m18*x21 + m24*x22 + m30*x23 + m36*x24; 990 991 x[24] -= m1*x25 + m7*x26 + m13*x27 + m19*x28 + m25*x29 + m31*x30; 992 x[25] -= m2*x25 + m8*x26 + m14*x27 + m20*x28 + m26*x29 + m32*x30; 993 x[26] -= m3*x25 + m9*x26 + m15*x27 + m21*x28 + m27*x29 + m33*x30; 994 x[27] -= m4*x25 + m10*x26 + m16*x27 + m22*x28 + m28*x29 + m34*x30; 995 x[28] -= m5*x25 + m11*x26 + m17*x27 + m23*x28 + m29*x29 + m35*x30; 996 x[29] -= m6*x25 + m12*x26 + m18*x27 + m24*x28 + m30*x29 + m36*x30; 997 998 x[30] -= m1*x31 + m7*x32 + m13*x33 + m19*x34 + m25*x35 + m31*x36; 999 x[31] -= m2*x31 + m8*x32 + m14*x33 + m20*x34 + m26*x35 + m32*x36; 1000 x[32] -= m3*x31 + m9*x32 + m15*x33 + m21*x34 + m27*x35 + m33*x36; 1001 x[33] -= m4*x31 + m10*x32 + m16*x33 + m22*x34 + m28*x35 + m34*x36; 1002 x[34] -= m5*x31 + m11*x32 + m17*x33 + m23*x34 + m29*x35 + m35*x36; 1003 x[35] -= m6*x31 + m12*x32 + m18*x33 + m24*x34 + m30*x35 + m36*x36; 1004 1005 pv += 36; 1006 } 1007 PLogFlops(432*nz+396); 1008 } 1009 row = *ajtmp++; 1010 } 1011 /* finished row so stick it into b->a */ 1012 pv = ba + 36*bi[i]; 1013 pj = bj + bi[i]; 1014 nz = bi[i+1] - bi[i]; 1015 for (j=0; j<nz; j++) { 1016 x = rtmp+36*pj[j]; 1017 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 1018 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; 1019 pv[8] = x[8]; pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; 1020 pv[12] = x[12]; pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 1021 pv[16] = x[16]; pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19]; 1022 pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23]; 1023 pv[24] = x[24]; pv[25] = x[25]; pv[26] = x[26]; pv[27] = x[27]; 1024 pv[28] = x[28]; pv[29] = x[29]; pv[30] = x[30]; pv[31] = x[31]; 1025 pv[32] = x[32]; pv[33] = x[33]; pv[34] = x[34]; pv[35] = x[35]; 1026 pv += 36; 1027 } 1028 /* invert diagonal block */ 1029 w = ba + 36*diag_offset[i]; 1030 ierr = Kernel_A_gets_inverse_A_6(w);CHKERRQ(ierr); 1031 } 1032 1033 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1034 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1035 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1036 C->factor = FACTOR_LU; 1037 C->assembled = PETSC_TRUE; 1038 PLogFlops(1.3333*216*b->mbs); /* from inverting diagonal blocks */ 1039 PetscFunctionReturn(0); 1040 } 1041 /* 1042 Version for when blocks are 6 by 6 Using natural ordering 1043 */ 1044 #undef __FUNC__ 1045 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering" 1046 int MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering(Mat A,Mat *B) 1047 { 1048 Mat C = *B; 1049 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 1050 int ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 1051 int *ajtmpold,*ajtmp,nz,row; 1052 int *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; 1053 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1054 MatScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 1055 MatScalar x16,x17,x18,x19,x20,x21,x22,x23,x24,x25; 1056 MatScalar p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15; 1057 MatScalar p16,p17,p18,p19,p20,p21,p22,p23,p24,p25; 1058 MatScalar m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14,m15; 1059 MatScalar m16,m17,m18,m19,m20,m21,m22,m23,m24,m25; 1060 MatScalar p26,p27,p28,p29,p30,p31,p32,p33,p34,p35,p36; 1061 MatScalar x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36; 1062 MatScalar m26,m27,m28,m29,m30,m31,m32,m33,m34,m35,m36; 1063 MatScalar *ba = b->a,*aa = a->a; 1064 1065 PetscFunctionBegin; 1066 rtmp = (MatScalar*)PetscMalloc(36*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 1067 for (i=0; i<n; i++) { 1068 nz = bi[i+1] - bi[i]; 1069 ajtmp = bj + bi[i]; 1070 for (j=0; j<nz; j++) { 1071 x = rtmp+36*ajtmp[j]; 1072 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 1073 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0; 1074 x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = x[25] = 0.0 ; 1075 x[26] = x[27] = x[28] = x[29] = x[30] = x[31] = x[32] = x[33] = 0.0 ; 1076 x[34] = x[35] = 0.0 ; 1077 } 1078 /* load in initial (unfactored row) */ 1079 nz = ai[i+1] - ai[i]; 1080 ajtmpold = aj + ai[i]; 1081 v = aa + 36*ai[i]; 1082 for (j=0; j<nz; j++) { 1083 x = rtmp+36*ajtmpold[j]; 1084 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 1085 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; 1086 x[8] = v[8]; x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; 1087 x[12] = v[12]; x[13] = v[13]; x[14] = v[14]; x[15] = v[15]; 1088 x[16] = v[16]; x[17] = v[17]; x[18] = v[18]; x[19] = v[19]; 1089 x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23]; 1090 x[24] = v[24]; x[25] = v[25]; x[26] = v[26]; x[27] = v[27]; 1091 x[28] = v[28]; x[29] = v[29]; x[30] = v[30]; x[31] = v[31]; 1092 x[32] = v[32]; x[33] = v[33]; x[34] = v[34]; x[35] = v[35]; 1093 v += 36; 1094 } 1095 row = *ajtmp++; 1096 while (row < i) { 1097 pc = rtmp + 36*row; 1098 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 1099 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; 1100 p9 = pc[8]; p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; 1101 p13 = pc[12]; p14 = pc[13]; p15 = pc[14]; p16 = pc[15]; 1102 p17 = pc[16]; p18 = pc[17]; p19 = pc[18]; p20 = pc[19]; 1103 p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23]; 1104 p25 = pc[24]; p26 = pc[25]; p27 = pc[26]; p28 = pc[27]; 1105 p29 = pc[28]; p30 = pc[29]; p31 = pc[30]; p32 = pc[31]; 1106 p33 = pc[32]; p34 = pc[33]; p35 = pc[34]; p36 = pc[35]; 1107 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || 1108 p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || 1109 p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || 1110 p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || 1111 p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || 1112 p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || 1113 p25 != 0.0 || p26 != 0.0 || p27 != 0.0 || p28 != 0.0 || 1114 p29 != 0.0 || p30 != 0.0 || p31 != 0.0 || p32 != 0.0 || 1115 p33 != 0.0 || p34 != 0.0 || p35 != 0.0 || p36 != 0.0) { 1116 pv = ba + 36*diag_offset[row]; 1117 pj = bj + diag_offset[row] + 1; 1118 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1119 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; 1120 x9 = pv[8]; x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; 1121 x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 1122 x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; 1123 x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 1124 x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27]; 1125 x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31]; 1126 x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35]; 1127 pc[0] = m1 = p1*x1 + p7*x2 + p13*x3 + p19*x4 + p25*x5 + p31*x6; 1128 pc[1] = m2 = p2*x1 + p8*x2 + p14*x3 + p20*x4 + p26*x5 + p32*x6; 1129 pc[2] = m3 = p3*x1 + p9*x2 + p15*x3 + p21*x4 + p27*x5 + p33*x6; 1130 pc[3] = m4 = p4*x1 + p10*x2 + p16*x3 + p22*x4 + p28*x5 + p34*x6; 1131 pc[4] = m5 = p5*x1 + p11*x2 + p17*x3 + p23*x4 + p29*x5 + p35*x6; 1132 pc[5] = m6 = p6*x1 + p12*x2 + p18*x3 + p24*x4 + p30*x5 + p36*x6; 1133 1134 pc[6] = m7 = p1*x7 + p7*x8 + p13*x9 + p19*x10 + p25*x11 + p31*x12; 1135 pc[7] = m8 = p2*x7 + p8*x8 + p14*x9 + p20*x10 + p26*x11 + p32*x12; 1136 pc[8] = m9 = p3*x7 + p9*x8 + p15*x9 + p21*x10 + p27*x11 + p33*x12; 1137 pc[9] = m10 = p4*x7 + p10*x8 + p16*x9 + p22*x10 + p28*x11 + p34*x12; 1138 pc[10] = m11 = p5*x7 + p11*x8 + p17*x9 + p23*x10 + p29*x11 + p35*x12; 1139 pc[11] = m12 = p6*x7 + p12*x8 + p18*x9 + p24*x10 + p30*x11 + p36*x12; 1140 1141 pc[12] = m13 = p1*x13 + p7*x14 + p13*x15 + p19*x16 + p25*x17 + p31*x18; 1142 pc[13] = m14 = p2*x13 + p8*x14 + p14*x15 + p20*x16 + p26*x17 + p32*x18; 1143 pc[14] = m15 = p3*x13 + p9*x14 + p15*x15 + p21*x16 + p27*x17 + p33*x18; 1144 pc[15] = m16 = p4*x13 + p10*x14 + p16*x15 + p22*x16 + p28*x17 + p34*x18; 1145 pc[16] = m17 = p5*x13 + p11*x14 + p17*x15 + p23*x16 + p29*x17 + p35*x18; 1146 pc[17] = m18 = p6*x13 + p12*x14 + p18*x15 + p24*x16 + p30*x17 + p36*x18; 1147 1148 pc[18] = m19 = p1*x19 + p7*x20 + p13*x21 + p19*x22 + p25*x23 + p31*x24; 1149 pc[19] = m20 = p2*x19 + p8*x20 + p14*x21 + p20*x22 + p26*x23 + p32*x24; 1150 pc[20] = m21 = p3*x19 + p9*x20 + p15*x21 + p21*x22 + p27*x23 + p33*x24; 1151 pc[21] = m22 = p4*x19 + p10*x20 + p16*x21 + p22*x22 + p28*x23 + p34*x24; 1152 pc[22] = m23 = p5*x19 + p11*x20 + p17*x21 + p23*x22 + p29*x23 + p35*x24; 1153 pc[23] = m24 = p6*x19 + p12*x20 + p18*x21 + p24*x22 + p30*x23 + p36*x24; 1154 1155 pc[24] = m25 = p1*x25 + p7*x26 + p13*x27 + p19*x28 + p25*x29 + p31*x30; 1156 pc[25] = m26 = p2*x25 + p8*x26 + p14*x27 + p20*x28 + p26*x29 + p32*x30; 1157 pc[26] = m27 = p3*x25 + p9*x26 + p15*x27 + p21*x28 + p27*x29 + p33*x30; 1158 pc[27] = m28 = p4*x25 + p10*x26 + p16*x27 + p22*x28 + p28*x29 + p34*x30; 1159 pc[28] = m29 = p5*x25 + p11*x26 + p17*x27 + p23*x28 + p29*x29 + p35*x30; 1160 pc[29] = m30 = p6*x25 + p12*x26 + p18*x27 + p24*x28 + p30*x29 + p36*x30; 1161 1162 pc[30] = m31 = p1*x31 + p7*x32 + p13*x33 + p19*x34 + p25*x35 + p31*x36; 1163 pc[31] = m32 = p2*x31 + p8*x32 + p14*x33 + p20*x34 + p26*x35 + p32*x36; 1164 pc[32] = m33 = p3*x31 + p9*x32 + p15*x33 + p21*x34 + p27*x35 + p33*x36; 1165 pc[33] = m34 = p4*x31 + p10*x32 + p16*x33 + p22*x34 + p28*x35 + p34*x36; 1166 pc[34] = m35 = p5*x31 + p11*x32 + p17*x33 + p23*x34 + p29*x35 + p35*x36; 1167 pc[35] = m36 = p6*x31 + p12*x32 + p18*x33 + p24*x34 + p30*x35 + p36*x36; 1168 1169 nz = bi[row+1] - diag_offset[row] - 1; 1170 pv += 36; 1171 for (j=0; j<nz; j++) { 1172 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1173 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; 1174 x9 = pv[8]; x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; 1175 x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 1176 x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; 1177 x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 1178 x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27]; 1179 x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31]; 1180 x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35]; 1181 x = rtmp + 36*pj[j]; 1182 x[0] -= m1*x1 + m7*x2 + m13*x3 + m19*x4 + m25*x5 + m31*x6; 1183 x[1] -= m2*x1 + m8*x2 + m14*x3 + m20*x4 + m26*x5 + m32*x6; 1184 x[2] -= m3*x1 + m9*x2 + m15*x3 + m21*x4 + m27*x5 + m33*x6; 1185 x[3] -= m4*x1 + m10*x2 + m16*x3 + m22*x4 + m28*x5 + m34*x6; 1186 x[4] -= m5*x1 + m11*x2 + m17*x3 + m23*x4 + m29*x5 + m35*x6; 1187 x[5] -= m6*x1 + m12*x2 + m18*x3 + m24*x4 + m30*x5 + m36*x6; 1188 1189 x[6] -= m1*x7 + m7*x8 + m13*x9 + m19*x10 + m25*x11 + m31*x12; 1190 x[7] -= m2*x7 + m8*x8 + m14*x9 + m20*x10 + m26*x11 + m32*x12; 1191 x[8] -= m3*x7 + m9*x8 + m15*x9 + m21*x10 + m27*x11 + m33*x12; 1192 x[9] -= m4*x7 + m10*x8 + m16*x9 + m22*x10 + m28*x11 + m34*x12; 1193 x[10] -= m5*x7 + m11*x8 + m17*x9 + m23*x10 + m29*x11 + m35*x12; 1194 x[11] -= m6*x7 + m12*x8 + m18*x9 + m24*x10 + m30*x11 + m36*x12; 1195 1196 x[12] -= m1*x13 + m7*x14 + m13*x15 + m19*x16 + m25*x17 + m31*x18; 1197 x[13] -= m2*x13 + m8*x14 + m14*x15 + m20*x16 + m26*x17 + m32*x18; 1198 x[14] -= m3*x13 + m9*x14 + m15*x15 + m21*x16 + m27*x17 + m33*x18; 1199 x[15] -= m4*x13 + m10*x14 + m16*x15 + m22*x16 + m28*x17 + m34*x18; 1200 x[16] -= m5*x13 + m11*x14 + m17*x15 + m23*x16 + m29*x17 + m35*x18; 1201 x[17] -= m6*x13 + m12*x14 + m18*x15 + m24*x16 + m30*x17 + m36*x18; 1202 1203 x[18] -= m1*x19 + m7*x20 + m13*x21 + m19*x22 + m25*x23 + m31*x24; 1204 x[19] -= m2*x19 + m8*x20 + m14*x21 + m20*x22 + m26*x23 + m32*x24; 1205 x[20] -= m3*x19 + m9*x20 + m15*x21 + m21*x22 + m27*x23 + m33*x24; 1206 x[21] -= m4*x19 + m10*x20 + m16*x21 + m22*x22 + m28*x23 + m34*x24; 1207 x[22] -= m5*x19 + m11*x20 + m17*x21 + m23*x22 + m29*x23 + m35*x24; 1208 x[23] -= m6*x19 + m12*x20 + m18*x21 + m24*x22 + m30*x23 + m36*x24; 1209 1210 x[24] -= m1*x25 + m7*x26 + m13*x27 + m19*x28 + m25*x29 + m31*x30; 1211 x[25] -= m2*x25 + m8*x26 + m14*x27 + m20*x28 + m26*x29 + m32*x30; 1212 x[26] -= m3*x25 + m9*x26 + m15*x27 + m21*x28 + m27*x29 + m33*x30; 1213 x[27] -= m4*x25 + m10*x26 + m16*x27 + m22*x28 + m28*x29 + m34*x30; 1214 x[28] -= m5*x25 + m11*x26 + m17*x27 + m23*x28 + m29*x29 + m35*x30; 1215 x[29] -= m6*x25 + m12*x26 + m18*x27 + m24*x28 + m30*x29 + m36*x30; 1216 1217 x[30] -= m1*x31 + m7*x32 + m13*x33 + m19*x34 + m25*x35 + m31*x36; 1218 x[31] -= m2*x31 + m8*x32 + m14*x33 + m20*x34 + m26*x35 + m32*x36; 1219 x[32] -= m3*x31 + m9*x32 + m15*x33 + m21*x34 + m27*x35 + m33*x36; 1220 x[33] -= m4*x31 + m10*x32 + m16*x33 + m22*x34 + m28*x35 + m34*x36; 1221 x[34] -= m5*x31 + m11*x32 + m17*x33 + m23*x34 + m29*x35 + m35*x36; 1222 x[35] -= m6*x31 + m12*x32 + m18*x33 + m24*x34 + m30*x35 + m36*x36; 1223 1224 pv += 36; 1225 } 1226 PLogFlops(432*nz+396); 1227 } 1228 row = *ajtmp++; 1229 } 1230 /* finished row so stick it into b->a */ 1231 pv = ba + 36*bi[i]; 1232 pj = bj + bi[i]; 1233 nz = bi[i+1] - bi[i]; 1234 for (j=0; j<nz; j++) { 1235 x = rtmp+36*pj[j]; 1236 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 1237 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; 1238 pv[8] = x[8]; pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; 1239 pv[12] = x[12]; pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 1240 pv[16] = x[16]; pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19]; 1241 pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23]; 1242 pv[24] = x[24]; pv[25] = x[25]; pv[26] = x[26]; pv[27] = x[27]; 1243 pv[28] = x[28]; pv[29] = x[29]; pv[30] = x[30]; pv[31] = x[31]; 1244 pv[32] = x[32]; pv[33] = x[33]; pv[34] = x[34]; pv[35] = x[35]; 1245 pv += 36; 1246 } 1247 /* invert diagonal block */ 1248 w = ba + 36*diag_offset[i]; 1249 ierr = Kernel_A_gets_inverse_A_6(w);CHKERRQ(ierr); 1250 } 1251 1252 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1253 C->factor = FACTOR_LU; 1254 C->assembled = PETSC_TRUE; 1255 PLogFlops(1.3333*216*b->mbs); /* from inverting diagonal blocks */ 1256 PetscFunctionReturn(0); 1257 } 1258 1259 /* 1260 Version for when blocks are 5 by 5 1261 */ 1262 #undef __FUNC__ 1263 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_5" 1264 int MatCholeskyFactorNumeric_SeqSBAIJ_5(Mat A,Mat *B) 1265 { 1266 Mat C = *B; 1267 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 1268 IS isrow = b->row,isicol = b->icol; 1269 int *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 1270 int *ajtmpold,*ajtmp,nz,row; 1271 int *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj; 1272 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1273 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 1274 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 1275 MatScalar x17,x18,x19,x20,x21,x22,x23,x24,x25,p10,p11,p12,p13,p14; 1276 MatScalar p15,p16,p17,p18,p19,p20,p21,p22,p23,p24,p25,m10,m11,m12; 1277 MatScalar m13,m14,m15,m16,m17,m18,m19,m20,m21,m22,m23,m24,m25; 1278 MatScalar *ba = b->a,*aa = a->a; 1279 1280 PetscFunctionBegin; 1281 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1282 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1283 rtmp = (MatScalar*)PetscMalloc(25*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 1284 1285 for (i=0; i<n; i++) { 1286 nz = bi[i+1] - bi[i]; 1287 ajtmp = bj + bi[i]; 1288 for (j=0; j<nz; j++) { 1289 x = rtmp+25*ajtmp[j]; 1290 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 1291 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0; 1292 x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = 0.0; 1293 } 1294 /* load in initial (unfactored row) */ 1295 idx = r[i]; 1296 nz = ai[idx+1] - ai[idx]; 1297 ajtmpold = aj + ai[idx]; 1298 v = aa + 25*ai[idx]; 1299 for (j=0; j<nz; j++) { 1300 x = rtmp+25*ic[ajtmpold[j]]; 1301 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 1302 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 1303 x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 1304 x[14] = v[14]; x[15] = v[15]; x[16] = v[16]; x[17] = v[17]; 1305 x[18] = v[18]; x[19] = v[19]; x[20] = v[20]; x[21] = v[21]; 1306 x[22] = v[22]; x[23] = v[23]; x[24] = v[24]; 1307 v += 25; 1308 } 1309 row = *ajtmp++; 1310 while (row < i) { 1311 pc = rtmp + 25*row; 1312 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 1313 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 1314 p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 1315 p15 = pc[14]; p16 = pc[15]; p17 = pc[16]; p18 = pc[17]; p19 = pc[18]; 1316 p20 = pc[19]; p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23]; 1317 p25 = pc[24]; 1318 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 1319 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 1320 p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 1321 || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || 1322 p20 != 0.0 || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || 1323 p24 != 0.0 || p25 != 0.0) { 1324 pv = ba + 25*diag_offset[row]; 1325 pj = bj + diag_offset[row] + 1; 1326 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1327 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 1328 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 1329 x15 = pv[14]; x16 = pv[15]; x17 = pv[16]; x18 = pv[17]; 1330 x19 = pv[18]; x20 = pv[19]; x21 = pv[20]; x22 = pv[21]; 1331 x23 = pv[22]; x24 = pv[23]; x25 = pv[24]; 1332 pc[0] = m1 = p1*x1 + p6*x2 + p11*x3 + p16*x4 + p21*x5; 1333 pc[1] = m2 = p2*x1 + p7*x2 + p12*x3 + p17*x4 + p22*x5; 1334 pc[2] = m3 = p3*x1 + p8*x2 + p13*x3 + p18*x4 + p23*x5; 1335 pc[3] = m4 = p4*x1 + p9*x2 + p14*x3 + p19*x4 + p24*x5; 1336 pc[4] = m5 = p5*x1 + p10*x2 + p15*x3 + p20*x4 + p25*x5; 1337 1338 pc[5] = m6 = p1*x6 + p6*x7 + p11*x8 + p16*x9 + p21*x10; 1339 pc[6] = m7 = p2*x6 + p7*x7 + p12*x8 + p17*x9 + p22*x10; 1340 pc[7] = m8 = p3*x6 + p8*x7 + p13*x8 + p18*x9 + p23*x10; 1341 pc[8] = m9 = p4*x6 + p9*x7 + p14*x8 + p19*x9 + p24*x10; 1342 pc[9] = m10 = p5*x6 + p10*x7 + p15*x8 + p20*x9 + p25*x10; 1343 1344 pc[10] = m11 = p1*x11 + p6*x12 + p11*x13 + p16*x14 + p21*x15; 1345 pc[11] = m12 = p2*x11 + p7*x12 + p12*x13 + p17*x14 + p22*x15; 1346 pc[12] = m13 = p3*x11 + p8*x12 + p13*x13 + p18*x14 + p23*x15; 1347 pc[13] = m14 = p4*x11 + p9*x12 + p14*x13 + p19*x14 + p24*x15; 1348 pc[14] = m15 = p5*x11 + p10*x12 + p15*x13 + p20*x14 + p25*x15; 1349 1350 pc[15] = m16 = p1*x16 + p6*x17 + p11*x18 + p16*x19 + p21*x20; 1351 pc[16] = m17 = p2*x16 + p7*x17 + p12*x18 + p17*x19 + p22*x20; 1352 pc[17] = m18 = p3*x16 + p8*x17 + p13*x18 + p18*x19 + p23*x20; 1353 pc[18] = m19 = p4*x16 + p9*x17 + p14*x18 + p19*x19 + p24*x20; 1354 pc[19] = m20 = p5*x16 + p10*x17 + p15*x18 + p20*x19 + p25*x20; 1355 1356 pc[20] = m21 = p1*x21 + p6*x22 + p11*x23 + p16*x24 + p21*x25; 1357 pc[21] = m22 = p2*x21 + p7*x22 + p12*x23 + p17*x24 + p22*x25; 1358 pc[22] = m23 = p3*x21 + p8*x22 + p13*x23 + p18*x24 + p23*x25; 1359 pc[23] = m24 = p4*x21 + p9*x22 + p14*x23 + p19*x24 + p24*x25; 1360 pc[24] = m25 = p5*x21 + p10*x22 + p15*x23 + p20*x24 + p25*x25; 1361 1362 nz = bi[row+1] - diag_offset[row] - 1; 1363 pv += 25; 1364 for (j=0; j<nz; j++) { 1365 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1366 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 1367 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 1368 x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; x17 = pv[16]; 1369 x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; x21 = pv[20]; 1370 x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; x25 = pv[24]; 1371 x = rtmp + 25*pj[j]; 1372 x[0] -= m1*x1 + m6*x2 + m11*x3 + m16*x4 + m21*x5; 1373 x[1] -= m2*x1 + m7*x2 + m12*x3 + m17*x4 + m22*x5; 1374 x[2] -= m3*x1 + m8*x2 + m13*x3 + m18*x4 + m23*x5; 1375 x[3] -= m4*x1 + m9*x2 + m14*x3 + m19*x4 + m24*x5; 1376 x[4] -= m5*x1 + m10*x2 + m15*x3 + m20*x4 + m25*x5; 1377 1378 x[5] -= m1*x6 + m6*x7 + m11*x8 + m16*x9 + m21*x10; 1379 x[6] -= m2*x6 + m7*x7 + m12*x8 + m17*x9 + m22*x10; 1380 x[7] -= m3*x6 + m8*x7 + m13*x8 + m18*x9 + m23*x10; 1381 x[8] -= m4*x6 + m9*x7 + m14*x8 + m19*x9 + m24*x10; 1382 x[9] -= m5*x6 + m10*x7 + m15*x8 + m20*x9 + m25*x10; 1383 1384 x[10] -= m1*x11 + m6*x12 + m11*x13 + m16*x14 + m21*x15; 1385 x[11] -= m2*x11 + m7*x12 + m12*x13 + m17*x14 + m22*x15; 1386 x[12] -= m3*x11 + m8*x12 + m13*x13 + m18*x14 + m23*x15; 1387 x[13] -= m4*x11 + m9*x12 + m14*x13 + m19*x14 + m24*x15; 1388 x[14] -= m5*x11 + m10*x12 + m15*x13 + m20*x14 + m25*x15; 1389 1390 x[15] -= m1*x16 + m6*x17 + m11*x18 + m16*x19 + m21*x20; 1391 x[16] -= m2*x16 + m7*x17 + m12*x18 + m17*x19 + m22*x20; 1392 x[17] -= m3*x16 + m8*x17 + m13*x18 + m18*x19 + m23*x20; 1393 x[18] -= m4*x16 + m9*x17 + m14*x18 + m19*x19 + m24*x20; 1394 x[19] -= m5*x16 + m10*x17 + m15*x18 + m20*x19 + m25*x20; 1395 1396 x[20] -= m1*x21 + m6*x22 + m11*x23 + m16*x24 + m21*x25; 1397 x[21] -= m2*x21 + m7*x22 + m12*x23 + m17*x24 + m22*x25; 1398 x[22] -= m3*x21 + m8*x22 + m13*x23 + m18*x24 + m23*x25; 1399 x[23] -= m4*x21 + m9*x22 + m14*x23 + m19*x24 + m24*x25; 1400 x[24] -= m5*x21 + m10*x22 + m15*x23 + m20*x24 + m25*x25; 1401 1402 pv += 25; 1403 } 1404 PLogFlops(250*nz+225); 1405 } 1406 row = *ajtmp++; 1407 } 1408 /* finished row so stick it into b->a */ 1409 pv = ba + 25*bi[i]; 1410 pj = bj + bi[i]; 1411 nz = bi[i+1] - bi[i]; 1412 for (j=0; j<nz; j++) { 1413 x = rtmp+25*pj[j]; 1414 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 1415 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 1416 pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 1417 pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; pv[16] = x[16]; 1418 pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19]; pv[20] = x[20]; 1419 pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23]; pv[24] = x[24]; 1420 pv += 25; 1421 } 1422 /* invert diagonal block */ 1423 w = ba + 25*diag_offset[i]; 1424 ierr = Kernel_A_gets_inverse_A_5(w);CHKERRQ(ierr); 1425 } 1426 1427 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1428 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1429 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1430 C->factor = FACTOR_LU; 1431 C->assembled = PETSC_TRUE; 1432 PLogFlops(1.3333*125*b->mbs); /* from inverting diagonal blocks */ 1433 PetscFunctionReturn(0); 1434 } 1435 /* 1436 Version for when blocks are 5 by 5 Using natural ordering 1437 */ 1438 #undef __FUNC__ 1439 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering" 1440 int MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering(Mat A,Mat *B) 1441 { 1442 Mat C = *B; 1443 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 1444 int ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 1445 int *ajtmpold,*ajtmp,nz,row; 1446 int *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; 1447 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1448 MatScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 1449 MatScalar x16,x17,x18,x19,x20,x21,x22,x23,x24,x25; 1450 MatScalar p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15; 1451 MatScalar p16,p17,p18,p19,p20,p21,p22,p23,p24,p25; 1452 MatScalar m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14,m15; 1453 MatScalar m16,m17,m18,m19,m20,m21,m22,m23,m24,m25; 1454 MatScalar *ba = b->a,*aa = a->a; 1455 1456 PetscFunctionBegin; 1457 rtmp = (MatScalar*)PetscMalloc(25*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 1458 for (i=0; i<n; i++) { 1459 nz = bi[i+1] - bi[i]; 1460 ajtmp = bj + bi[i]; 1461 for (j=0; j<nz; j++) { 1462 x = rtmp+25*ajtmp[j]; 1463 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 1464 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 1465 x[16] = x[17] = x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = 0.0; 1466 } 1467 /* load in initial (unfactored row) */ 1468 nz = ai[i+1] - ai[i]; 1469 ajtmpold = aj + ai[i]; 1470 v = aa + 25*ai[i]; 1471 for (j=0; j<nz; j++) { 1472 x = rtmp+25*ajtmpold[j]; 1473 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 1474 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 1475 x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 1476 x[14] = v[14]; x[15] = v[15]; x[16] = v[16]; x[17] = v[17]; x[18] = v[18]; 1477 x[19] = v[19]; x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23]; 1478 x[24] = v[24]; 1479 v += 25; 1480 } 1481 row = *ajtmp++; 1482 while (row < i) { 1483 pc = rtmp + 25*row; 1484 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 1485 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 1486 p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 1487 p15 = pc[14]; p16 = pc[15]; p17 = pc[16]; p18 = pc[17]; 1488 p19 = pc[18]; p20 = pc[19]; p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; 1489 p24 = pc[23]; p25 = pc[24]; 1490 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 1491 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 1492 p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 1493 || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 1494 || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || p25 != 0.0) { 1495 pv = ba + 25*diag_offset[row]; 1496 pj = bj + diag_offset[row] + 1; 1497 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1498 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 1499 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 1500 x15 = pv[14]; x16 = pv[15]; x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; 1501 x20 = pv[19]; x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 1502 x25 = pv[24]; 1503 pc[0] = m1 = p1*x1 + p6*x2 + p11*x3 + p16*x4 + p21*x5; 1504 pc[1] = m2 = p2*x1 + p7*x2 + p12*x3 + p17*x4 + p22*x5; 1505 pc[2] = m3 = p3*x1 + p8*x2 + p13*x3 + p18*x4 + p23*x5; 1506 pc[3] = m4 = p4*x1 + p9*x2 + p14*x3 + p19*x4 + p24*x5; 1507 pc[4] = m5 = p5*x1 + p10*x2 + p15*x3 + p20*x4 + p25*x5; 1508 1509 pc[5] = m6 = p1*x6 + p6*x7 + p11*x8 + p16*x9 + p21*x10; 1510 pc[6] = m7 = p2*x6 + p7*x7 + p12*x8 + p17*x9 + p22*x10; 1511 pc[7] = m8 = p3*x6 + p8*x7 + p13*x8 + p18*x9 + p23*x10; 1512 pc[8] = m9 = p4*x6 + p9*x7 + p14*x8 + p19*x9 + p24*x10; 1513 pc[9] = m10 = p5*x6 + p10*x7 + p15*x8 + p20*x9 + p25*x10; 1514 1515 pc[10] = m11 = p1*x11 + p6*x12 + p11*x13 + p16*x14 + p21*x15; 1516 pc[11] = m12 = p2*x11 + p7*x12 + p12*x13 + p17*x14 + p22*x15; 1517 pc[12] = m13 = p3*x11 + p8*x12 + p13*x13 + p18*x14 + p23*x15; 1518 pc[13] = m14 = p4*x11 + p9*x12 + p14*x13 + p19*x14 + p24*x15; 1519 pc[14] = m15 = p5*x11 + p10*x12 + p15*x13 + p20*x14 + p25*x15; 1520 1521 pc[15] = m16 = p1*x16 + p6*x17 + p11*x18 + p16*x19 + p21*x20; 1522 pc[16] = m17 = p2*x16 + p7*x17 + p12*x18 + p17*x19 + p22*x20; 1523 pc[17] = m18 = p3*x16 + p8*x17 + p13*x18 + p18*x19 + p23*x20; 1524 pc[18] = m19 = p4*x16 + p9*x17 + p14*x18 + p19*x19 + p24*x20; 1525 pc[19] = m20 = p5*x16 + p10*x17 + p15*x18 + p20*x19 + p25*x20; 1526 1527 pc[20] = m21 = p1*x21 + p6*x22 + p11*x23 + p16*x24 + p21*x25; 1528 pc[21] = m22 = p2*x21 + p7*x22 + p12*x23 + p17*x24 + p22*x25; 1529 pc[22] = m23 = p3*x21 + p8*x22 + p13*x23 + p18*x24 + p23*x25; 1530 pc[23] = m24 = p4*x21 + p9*x22 + p14*x23 + p19*x24 + p24*x25; 1531 pc[24] = m25 = p5*x21 + p10*x22 + p15*x23 + p20*x24 + p25*x25; 1532 1533 nz = bi[row+1] - diag_offset[row] - 1; 1534 pv += 25; 1535 for (j=0; j<nz; j++) { 1536 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1537 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 1538 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 1539 x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; x17 = pv[16]; x18 = pv[17]; 1540 x19 = pv[18]; x20 = pv[19]; x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; 1541 x24 = pv[23]; x25 = pv[24]; 1542 x = rtmp + 25*pj[j]; 1543 x[0] -= m1*x1 + m6*x2 + m11*x3 + m16*x4 + m21*x5; 1544 x[1] -= m2*x1 + m7*x2 + m12*x3 + m17*x4 + m22*x5; 1545 x[2] -= m3*x1 + m8*x2 + m13*x3 + m18*x4 + m23*x5; 1546 x[3] -= m4*x1 + m9*x2 + m14*x3 + m19*x4 + m24*x5; 1547 x[4] -= m5*x1 + m10*x2 + m15*x3 + m20*x4 + m25*x5; 1548 1549 x[5] -= m1*x6 + m6*x7 + m11*x8 + m16*x9 + m21*x10; 1550 x[6] -= m2*x6 + m7*x7 + m12*x8 + m17*x9 + m22*x10; 1551 x[7] -= m3*x6 + m8*x7 + m13*x8 + m18*x9 + m23*x10; 1552 x[8] -= m4*x6 + m9*x7 + m14*x8 + m19*x9 + m24*x10; 1553 x[9] -= m5*x6 + m10*x7 + m15*x8 + m20*x9 + m25*x10; 1554 1555 x[10] -= m1*x11 + m6*x12 + m11*x13 + m16*x14 + m21*x15; 1556 x[11] -= m2*x11 + m7*x12 + m12*x13 + m17*x14 + m22*x15; 1557 x[12] -= m3*x11 + m8*x12 + m13*x13 + m18*x14 + m23*x15; 1558 x[13] -= m4*x11 + m9*x12 + m14*x13 + m19*x14 + m24*x15; 1559 x[14] -= m5*x11 + m10*x12 + m15*x13 + m20*x14 + m25*x15; 1560 1561 x[15] -= m1*x16 + m6*x17 + m11*x18 + m16*x19 + m21*x20; 1562 x[16] -= m2*x16 + m7*x17 + m12*x18 + m17*x19 + m22*x20; 1563 x[17] -= m3*x16 + m8*x17 + m13*x18 + m18*x19 + m23*x20; 1564 x[18] -= m4*x16 + m9*x17 + m14*x18 + m19*x19 + m24*x20; 1565 x[19] -= m5*x16 + m10*x17 + m15*x18 + m20*x19 + m25*x20; 1566 1567 x[20] -= m1*x21 + m6*x22 + m11*x23 + m16*x24 + m21*x25; 1568 x[21] -= m2*x21 + m7*x22 + m12*x23 + m17*x24 + m22*x25; 1569 x[22] -= m3*x21 + m8*x22 + m13*x23 + m18*x24 + m23*x25; 1570 x[23] -= m4*x21 + m9*x22 + m14*x23 + m19*x24 + m24*x25; 1571 x[24] -= m5*x21 + m10*x22 + m15*x23 + m20*x24 + m25*x25; 1572 pv += 25; 1573 } 1574 PLogFlops(250*nz+225); 1575 } 1576 row = *ajtmp++; 1577 } 1578 /* finished row so stick it into b->a */ 1579 pv = ba + 25*bi[i]; 1580 pj = bj + bi[i]; 1581 nz = bi[i+1] - bi[i]; 1582 for (j=0; j<nz; j++) { 1583 x = rtmp+25*pj[j]; 1584 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 1585 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 1586 pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 1587 pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; pv[16] = x[16]; pv[17] = x[17]; 1588 pv[18] = x[18]; pv[19] = x[19]; pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22]; 1589 pv[23] = x[23]; pv[24] = x[24]; 1590 pv += 25; 1591 } 1592 /* invert diagonal block */ 1593 w = ba + 25*diag_offset[i]; 1594 ierr = Kernel_A_gets_inverse_A_5(w);CHKERRQ(ierr); 1595 } 1596 1597 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1598 C->factor = FACTOR_LU; 1599 C->assembled = PETSC_TRUE; 1600 PLogFlops(1.3333*125*b->mbs); /* from inverting diagonal blocks */ 1601 PetscFunctionReturn(0); 1602 } 1603 1604 /* 1605 Version for when blocks are 4 by 4 1606 */ 1607 #undef __FUNC__ 1608 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_4" 1609 int MatCholeskyFactorNumeric_SeqSBAIJ_4(Mat A,Mat *B) 1610 { 1611 Mat C = *B; 1612 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 1613 IS isrow = b->row,isicol = b->icol; 1614 int *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 1615 int *ajtmpold,*ajtmp,nz,row; 1616 int *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj; 1617 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1618 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 1619 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 1620 MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; 1621 MatScalar m13,m14,m15,m16; 1622 MatScalar *ba = b->a,*aa = a->a; 1623 1624 PetscFunctionBegin; 1625 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1626 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1627 rtmp = (MatScalar*)PetscMalloc(16*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 1628 1629 for (i=0; i<n; i++) { 1630 nz = bi[i+1] - bi[i]; 1631 ajtmp = bj + bi[i]; 1632 for (j=0; j<nz; j++) { 1633 x = rtmp+16*ajtmp[j]; 1634 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 1635 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 1636 } 1637 /* load in initial (unfactored row) */ 1638 idx = r[i]; 1639 nz = ai[idx+1] - ai[idx]; 1640 ajtmpold = aj + ai[idx]; 1641 v = aa + 16*ai[idx]; 1642 for (j=0; j<nz; j++) { 1643 x = rtmp+16*ic[ajtmpold[j]]; 1644 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 1645 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 1646 x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 1647 x[14] = v[14]; x[15] = v[15]; 1648 v += 16; 1649 } 1650 row = *ajtmp++; 1651 while (row < i) { 1652 pc = rtmp + 16*row; 1653 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 1654 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 1655 p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 1656 p15 = pc[14]; p16 = pc[15]; 1657 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 1658 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 1659 p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 1660 || p16 != 0.0) { 1661 pv = ba + 16*diag_offset[row]; 1662 pj = bj + diag_offset[row] + 1; 1663 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1664 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 1665 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 1666 x15 = pv[14]; x16 = pv[15]; 1667 pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; 1668 pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; 1669 pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; 1670 pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; 1671 1672 pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; 1673 pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; 1674 pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; 1675 pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; 1676 1677 pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; 1678 pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; 1679 pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; 1680 pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; 1681 1682 pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; 1683 pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; 1684 pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; 1685 pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; 1686 1687 nz = bi[row+1] - diag_offset[row] - 1; 1688 pv += 16; 1689 for (j=0; j<nz; j++) { 1690 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1691 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 1692 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 1693 x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 1694 x = rtmp + 16*pj[j]; 1695 x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; 1696 x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; 1697 x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; 1698 x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; 1699 1700 x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; 1701 x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; 1702 x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; 1703 x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; 1704 1705 x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; 1706 x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; 1707 x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; 1708 x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; 1709 1710 x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; 1711 x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; 1712 x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; 1713 x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; 1714 1715 pv += 16; 1716 } 1717 PLogFlops(128*nz+112); 1718 } 1719 row = *ajtmp++; 1720 } 1721 /* finished row so stick it into b->a */ 1722 pv = ba + 16*bi[i]; 1723 pj = bj + bi[i]; 1724 nz = bi[i+1] - bi[i]; 1725 for (j=0; j<nz; j++) { 1726 x = rtmp+16*pj[j]; 1727 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 1728 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 1729 pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 1730 pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 1731 pv += 16; 1732 } 1733 /* invert diagonal block */ 1734 w = ba + 16*diag_offset[i]; 1735 ierr = Kernel_A_gets_inverse_A_4(w);CHKERRQ(ierr); 1736 } 1737 1738 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1739 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1740 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1741 C->factor = FACTOR_LU; 1742 C->assembled = PETSC_TRUE; 1743 PLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */ 1744 PetscFunctionReturn(0); 1745 } 1746 /* 1747 Version for when blocks are 4 by 4 Using natural ordering 1748 */ 1749 #undef __FUNC__ 1750 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering" 1751 int MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering(Mat A,Mat *B) 1752 { 1753 Mat C = *B; 1754 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 1755 int ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 1756 int *ajtmpold,*ajtmp,nz,row; 1757 int *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; 1758 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1759 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 1760 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 1761 MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; 1762 MatScalar m13,m14,m15,m16; 1763 MatScalar *ba = b->a,*aa = a->a; 1764 1765 PetscFunctionBegin; 1766 rtmp = (MatScalar*)PetscMalloc(16*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 1767 1768 for (i=0; i<n; i++) { 1769 nz = bi[i+1] - bi[i]; 1770 ajtmp = bj + bi[i]; 1771 for (j=0; j<nz; j++) { 1772 x = rtmp+16*ajtmp[j]; 1773 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 1774 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 1775 } 1776 /* load in initial (unfactored row) */ 1777 nz = ai[i+1] - ai[i]; 1778 ajtmpold = aj + ai[i]; 1779 v = aa + 16*ai[i]; 1780 for (j=0; j<nz; j++) { 1781 x = rtmp+16*ajtmpold[j]; 1782 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 1783 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 1784 x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 1785 x[14] = v[14]; x[15] = v[15]; 1786 v += 16; 1787 } 1788 row = *ajtmp++; 1789 while (row < i) { 1790 pc = rtmp + 16*row; 1791 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 1792 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 1793 p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 1794 p15 = pc[14]; p16 = pc[15]; 1795 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 1796 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 1797 p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 1798 || p16 != 0.0) { 1799 pv = ba + 16*diag_offset[row]; 1800 pj = bj + diag_offset[row] + 1; 1801 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1802 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 1803 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 1804 x15 = pv[14]; x16 = pv[15]; 1805 pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; 1806 pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; 1807 pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; 1808 pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; 1809 1810 pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; 1811 pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; 1812 pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; 1813 pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; 1814 1815 pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; 1816 pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; 1817 pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; 1818 pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; 1819 1820 pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; 1821 pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; 1822 pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; 1823 pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; 1824 1825 nz = bi[row+1] - diag_offset[row] - 1; 1826 pv += 16; 1827 for (j=0; j<nz; j++) { 1828 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1829 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 1830 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 1831 x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 1832 x = rtmp + 16*pj[j]; 1833 x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; 1834 x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; 1835 x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; 1836 x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; 1837 1838 x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; 1839 x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; 1840 x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; 1841 x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; 1842 1843 x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; 1844 x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; 1845 x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; 1846 x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; 1847 1848 x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; 1849 x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; 1850 x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; 1851 x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; 1852 1853 pv += 16; 1854 } 1855 PLogFlops(128*nz+112); 1856 } 1857 row = *ajtmp++; 1858 } 1859 /* finished row so stick it into b->a */ 1860 pv = ba + 16*bi[i]; 1861 pj = bj + bi[i]; 1862 nz = bi[i+1] - bi[i]; 1863 for (j=0; j<nz; j++) { 1864 x = rtmp+16*pj[j]; 1865 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 1866 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 1867 pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 1868 pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 1869 pv += 16; 1870 } 1871 /* invert diagonal block */ 1872 w = ba + 16*diag_offset[i]; 1873 ierr = Kernel_A_gets_inverse_A_4(w);CHKERRQ(ierr); 1874 } 1875 1876 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1877 C->factor = FACTOR_LU; 1878 C->assembled = PETSC_TRUE; 1879 PLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */ 1880 PetscFunctionReturn(0); 1881 } 1882 1883 /* 1884 Version for when blocks are 3 by 3 1885 */ 1886 #undef __FUNC__ 1887 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_3" 1888 int MatCholeskyFactorNumeric_SeqSBAIJ_3(Mat A,Mat *B) 1889 { 1890 Mat C = *B; 1891 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 1892 IS isrow = b->row,isicol = b->icol; 1893 int *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 1894 int *ajtmpold,*ajtmp,nz,row,*ai=a->i,*aj=a->j; 1895 int *diag_offset = b->diag,idx,*pj; 1896 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1897 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 1898 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9; 1899 MatScalar *ba = b->a,*aa = a->a; 1900 1901 PetscFunctionBegin; 1902 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1903 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1904 rtmp = (MatScalar*)PetscMalloc(9*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 1905 1906 for (i=0; i<n; i++) { 1907 nz = bi[i+1] - bi[i]; 1908 ajtmp = bj + bi[i]; 1909 for (j=0; j<nz; j++) { 1910 x = rtmp + 9*ajtmp[j]; 1911 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0; 1912 } 1913 /* load in initial (unfactored row) */ 1914 idx = r[i]; 1915 nz = ai[idx+1] - ai[idx]; 1916 ajtmpold = aj + ai[idx]; 1917 v = aa + 9*ai[idx]; 1918 for (j=0; j<nz; j++) { 1919 x = rtmp + 9*ic[ajtmpold[j]]; 1920 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 1921 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 1922 v += 9; 1923 } 1924 row = *ajtmp++; 1925 while (row < i) { 1926 pc = rtmp + 9*row; 1927 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 1928 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 1929 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 1930 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) { 1931 pv = ba + 9*diag_offset[row]; 1932 pj = bj + diag_offset[row] + 1; 1933 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1934 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 1935 pc[0] = m1 = p1*x1 + p4*x2 + p7*x3; 1936 pc[1] = m2 = p2*x1 + p5*x2 + p8*x3; 1937 pc[2] = m3 = p3*x1 + p6*x2 + p9*x3; 1938 1939 pc[3] = m4 = p1*x4 + p4*x5 + p7*x6; 1940 pc[4] = m5 = p2*x4 + p5*x5 + p8*x6; 1941 pc[5] = m6 = p3*x4 + p6*x5 + p9*x6; 1942 1943 pc[6] = m7 = p1*x7 + p4*x8 + p7*x9; 1944 pc[7] = m8 = p2*x7 + p5*x8 + p8*x9; 1945 pc[8] = m9 = p3*x7 + p6*x8 + p9*x9; 1946 nz = bi[row+1] - diag_offset[row] - 1; 1947 pv += 9; 1948 for (j=0; j<nz; j++) { 1949 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1950 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 1951 x = rtmp + 9*pj[j]; 1952 x[0] -= m1*x1 + m4*x2 + m7*x3; 1953 x[1] -= m2*x1 + m5*x2 + m8*x3; 1954 x[2] -= m3*x1 + m6*x2 + m9*x3; 1955 1956 x[3] -= m1*x4 + m4*x5 + m7*x6; 1957 x[4] -= m2*x4 + m5*x5 + m8*x6; 1958 x[5] -= m3*x4 + m6*x5 + m9*x6; 1959 1960 x[6] -= m1*x7 + m4*x8 + m7*x9; 1961 x[7] -= m2*x7 + m5*x8 + m8*x9; 1962 x[8] -= m3*x7 + m6*x8 + m9*x9; 1963 pv += 9; 1964 } 1965 PLogFlops(54*nz+36); 1966 } 1967 row = *ajtmp++; 1968 } 1969 /* finished row so stick it into b->a */ 1970 pv = ba + 9*bi[i]; 1971 pj = bj + bi[i]; 1972 nz = bi[i+1] - bi[i]; 1973 for (j=0; j<nz; j++) { 1974 x = rtmp + 9*pj[j]; 1975 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 1976 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 1977 pv += 9; 1978 } 1979 /* invert diagonal block */ 1980 w = ba + 9*diag_offset[i]; 1981 ierr = Kernel_A_gets_inverse_A_3(w);CHKERRQ(ierr); 1982 } 1983 1984 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1985 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1986 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1987 C->factor = FACTOR_LU; 1988 C->assembled = PETSC_TRUE; 1989 PLogFlops(1.3333*27*b->mbs); /* from inverting diagonal blocks */ 1990 PetscFunctionReturn(0); 1991 } 1992 /* 1993 Version for when blocks are 3 by 3 Using natural ordering 1994 */ 1995 #undef __FUNC__ 1996 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering" 1997 int MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering(Mat A,Mat *B) 1998 { 1999 Mat C = *B; 2000 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 2001 int ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 2002 int *ajtmpold,*ajtmp,nz,row; 2003 int *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; 2004 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 2005 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 2006 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9; 2007 MatScalar *ba = b->a,*aa = a->a; 2008 2009 PetscFunctionBegin; 2010 rtmp = (MatScalar*)PetscMalloc(9*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 2011 2012 for (i=0; i<n; i++) { 2013 nz = bi[i+1] - bi[i]; 2014 ajtmp = bj + bi[i]; 2015 for (j=0; j<nz; j++) { 2016 x = rtmp+9*ajtmp[j]; 2017 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0; 2018 } 2019 /* load in initial (unfactored row) */ 2020 nz = ai[i+1] - ai[i]; 2021 ajtmpold = aj + ai[i]; 2022 v = aa + 9*ai[i]; 2023 for (j=0; j<nz; j++) { 2024 x = rtmp+9*ajtmpold[j]; 2025 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 2026 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 2027 v += 9; 2028 } 2029 row = *ajtmp++; 2030 while (row < i) { 2031 pc = rtmp + 9*row; 2032 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 2033 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 2034 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 2035 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) { 2036 pv = ba + 9*diag_offset[row]; 2037 pj = bj + diag_offset[row] + 1; 2038 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 2039 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 2040 pc[0] = m1 = p1*x1 + p4*x2 + p7*x3; 2041 pc[1] = m2 = p2*x1 + p5*x2 + p8*x3; 2042 pc[2] = m3 = p3*x1 + p6*x2 + p9*x3; 2043 2044 pc[3] = m4 = p1*x4 + p4*x5 + p7*x6; 2045 pc[4] = m5 = p2*x4 + p5*x5 + p8*x6; 2046 pc[5] = m6 = p3*x4 + p6*x5 + p9*x6; 2047 2048 pc[6] = m7 = p1*x7 + p4*x8 + p7*x9; 2049 pc[7] = m8 = p2*x7 + p5*x8 + p8*x9; 2050 pc[8] = m9 = p3*x7 + p6*x8 + p9*x9; 2051 2052 nz = bi[row+1] - diag_offset[row] - 1; 2053 pv += 9; 2054 for (j=0; j<nz; j++) { 2055 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 2056 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 2057 x = rtmp + 9*pj[j]; 2058 x[0] -= m1*x1 + m4*x2 + m7*x3; 2059 x[1] -= m2*x1 + m5*x2 + m8*x3; 2060 x[2] -= m3*x1 + m6*x2 + m9*x3; 2061 2062 x[3] -= m1*x4 + m4*x5 + m7*x6; 2063 x[4] -= m2*x4 + m5*x5 + m8*x6; 2064 x[5] -= m3*x4 + m6*x5 + m9*x6; 2065 2066 x[6] -= m1*x7 + m4*x8 + m7*x9; 2067 x[7] -= m2*x7 + m5*x8 + m8*x9; 2068 x[8] -= m3*x7 + m6*x8 + m9*x9; 2069 pv += 9; 2070 } 2071 PLogFlops(54*nz+36); 2072 } 2073 row = *ajtmp++; 2074 } 2075 /* finished row so stick it into b->a */ 2076 pv = ba + 9*bi[i]; 2077 pj = bj + bi[i]; 2078 nz = bi[i+1] - bi[i]; 2079 for (j=0; j<nz; j++) { 2080 x = rtmp+9*pj[j]; 2081 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 2082 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 2083 pv += 9; 2084 } 2085 /* invert diagonal block */ 2086 w = ba + 9*diag_offset[i]; 2087 ierr = Kernel_A_gets_inverse_A_3(w);CHKERRQ(ierr); 2088 } 2089 2090 ierr = PetscFree(rtmp);CHKERRQ(ierr); 2091 C->factor = FACTOR_LU; 2092 C->assembled = PETSC_TRUE; 2093 PLogFlops(1.3333*27*b->mbs); /* from inverting diagonal blocks */ 2094 PetscFunctionReturn(0); 2095 } 2096 2097 /* 2098 Version for when blocks are 2 by 2 2099 */ 2100 #undef __FUNC__ 2101 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_2" 2102 int MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,Mat *B) 2103 { 2104 Mat C = *B; 2105 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 2106 IS isrow = b->row,isicol = b->icol; 2107 int *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 2108 int *ajtmpold,*ajtmp,nz,row; 2109 int *diag_offset=b->diag,idx,*ai=a->i,*aj=a->j,*pj; 2110 MatScalar *pv,*v,*rtmp,m1,m2,m3,m4,*pc,*w,*x,x1,x2,x3,x4; 2111 MatScalar p1,p2,p3,p4; 2112 MatScalar *ba = b->a,*aa = a->a; 2113 2114 PetscFunctionBegin; 2115 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 2116 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 2117 rtmp = (MatScalar*)PetscMalloc(4*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 2118 2119 for (i=0; i<n; i++) { 2120 nz = bi[i+1] - bi[i]; 2121 ajtmp = bj + bi[i]; 2122 for (j=0; j<nz; j++) { 2123 x = rtmp+4*ajtmp[j]; x[0] = x[1] = x[2] = x[3] = 0.0; 2124 } 2125 /* load in initial (unfactored row) */ 2126 idx = r[i]; 2127 nz = ai[idx+1] - ai[idx]; 2128 ajtmpold = aj + ai[idx]; 2129 v = aa + 4*ai[idx]; 2130 for (j=0; j<nz; j++) { 2131 x = rtmp+4*ic[ajtmpold[j]]; 2132 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 2133 v += 4; 2134 } 2135 row = *ajtmp++; 2136 while (row < i) { 2137 pc = rtmp + 4*row; 2138 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 2139 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) { 2140 pv = ba + 4*diag_offset[row]; 2141 pj = bj + diag_offset[row] + 1; 2142 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 2143 pc[0] = m1 = p1*x1 + p3*x2; 2144 pc[1] = m2 = p2*x1 + p4*x2; 2145 pc[2] = m3 = p1*x3 + p3*x4; 2146 pc[3] = m4 = p2*x3 + p4*x4; 2147 nz = bi[row+1] - diag_offset[row] - 1; 2148 pv += 4; 2149 for (j=0; j<nz; j++) { 2150 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 2151 x = rtmp + 4*pj[j]; 2152 x[0] -= m1*x1 + m3*x2; 2153 x[1] -= m2*x1 + m4*x2; 2154 x[2] -= m1*x3 + m3*x4; 2155 x[3] -= m2*x3 + m4*x4; 2156 pv += 4; 2157 } 2158 PLogFlops(16*nz+12); 2159 } 2160 row = *ajtmp++; 2161 } 2162 /* finished row so stick it into b->a */ 2163 pv = ba + 4*bi[i]; 2164 pj = bj + bi[i]; 2165 nz = bi[i+1] - bi[i]; 2166 for (j=0; j<nz; j++) { 2167 x = rtmp+4*pj[j]; 2168 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 2169 pv += 4; 2170 } 2171 /* invert diagonal block */ 2172 w = ba + 4*diag_offset[i]; 2173 ierr = Kernel_A_gets_inverse_A_2(w);CHKERRQ(ierr); 2174 /*Kernel_A_gets_inverse_A(bs,w,v_pivots,v_work);*/ 2175 } 2176 2177 ierr = PetscFree(rtmp);CHKERRQ(ierr); 2178 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2179 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2180 C->factor = FACTOR_LU; 2181 C->assembled = PETSC_TRUE; 2182 PLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */ 2183 PetscFunctionReturn(0); 2184 } 2185 /* 2186 Version for when blocks are 2 by 2 Using natural ordering 2187 */ 2188 #undef __FUNC__ 2189 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering" 2190 int MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,Mat *B) 2191 { 2192 Mat C = *B; 2193 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 2194 int ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 2195 int *ajtmpold,*ajtmp,nz,row; 2196 int *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; 2197 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 2198 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,x1,x2,x3,x4; 2199 MatScalar *ba = b->a,*aa = a->a; 2200 2201 PetscFunctionBegin; 2202 rtmp = (MatScalar*)PetscMalloc(4*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 2203 2204 for (i=0; i<n; i++) { 2205 nz = bi[i+1] - bi[i]; 2206 ajtmp = bj + bi[i]; 2207 for (j=0; j<nz; j++) { 2208 x = rtmp+4*ajtmp[j]; 2209 x[0] = x[1] = x[2] = x[3] = 0.0; 2210 } 2211 /* load in initial (unfactored row) */ 2212 nz = ai[i+1] - ai[i]; 2213 ajtmpold = aj + ai[i]; 2214 v = aa + 4*ai[i]; 2215 for (j=0; j<nz; j++) { 2216 x = rtmp+4*ajtmpold[j]; 2217 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 2218 v += 4; 2219 } 2220 row = *ajtmp++; 2221 while (row < i) { 2222 pc = rtmp + 4*row; 2223 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 2224 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) { 2225 pv = ba + 4*diag_offset[row]; 2226 pj = bj + diag_offset[row] + 1; 2227 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 2228 pc[0] = m1 = p1*x1 + p3*x2; 2229 pc[1] = m2 = p2*x1 + p4*x2; 2230 pc[2] = m3 = p1*x3 + p3*x4; 2231 pc[3] = m4 = p2*x3 + p4*x4; 2232 nz = bi[row+1] - diag_offset[row] - 1; 2233 pv += 4; 2234 for (j=0; j<nz; j++) { 2235 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 2236 x = rtmp + 4*pj[j]; 2237 x[0] -= m1*x1 + m3*x2; 2238 x[1] -= m2*x1 + m4*x2; 2239 x[2] -= m1*x3 + m3*x4; 2240 x[3] -= m2*x3 + m4*x4; 2241 pv += 4; 2242 } 2243 PLogFlops(16*nz+12); 2244 } 2245 row = *ajtmp++; 2246 } 2247 /* finished row so stick it into b->a */ 2248 pv = ba + 4*bi[i]; 2249 pj = bj + bi[i]; 2250 nz = bi[i+1] - bi[i]; 2251 for (j=0; j<nz; j++) { 2252 x = rtmp+4*pj[j]; 2253 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 2254 pv += 4; 2255 } 2256 /* invert diagonal block */ 2257 w = ba + 4*diag_offset[i]; 2258 ierr = Kernel_A_gets_inverse_A_2(w);CHKERRQ(ierr); 2259 /*Kernel_A_gets_inverse_A(bs,w,v_pivots,v_work);*/ 2260 } 2261 2262 ierr = PetscFree(rtmp);CHKERRQ(ierr); 2263 C->factor = FACTOR_LU; 2264 C->assembled = PETSC_TRUE; 2265 PLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */ 2266 PetscFunctionReturn(0); 2267 } 2268 2269 /* 2270 Version for when blocks are 1 by 1. 2271 */ 2272 #undef __FUNC__ 2273 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_1" 2274 int MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,Mat *B) 2275 { 2276 Mat C = *B; 2277 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 2278 IS ip = b->row; 2279 int *rip,*riip,ierr,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j; 2280 int *ai = a->i,*aj = a->j; 2281 MatScalar *rtmp; 2282 MatScalar *ba = b->a,*aa = a->a; 2283 MatScalar dk,uikdi; 2284 int k,jmin,jmax,*jl,*il,vj,nexti,juj,ili; 2285 2286 PetscFunctionBegin; 2287 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 2288 riip = rip; 2289 2290 /* INITIALIZATION */ 2291 /* il and jl record the first nonzero element in each row of the accessing 2292 window U(0:k, k:mbs-1). 2293 jl: list of rows to be added to uneliminated rows 2294 i>= k: jl(i) is the first row to be added to row i 2295 i< k: jl(i) is the row following row i in some list of rows 2296 jl(i) = mbs indicates the end of a list 2297 il(i): points to the first nonzero element in columns k,...,mbs-1 of 2298 row i of U */ 2299 rtmp = (MatScalar*)PetscMalloc(mbs*sizeof(MatScalar));CHKPTRQ(rtmp); 2300 il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il); 2301 jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl); 2302 for (i=0; i<mbs; i++) { 2303 rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0; 2304 } 2305 2306 /* FOR EACH ROW K */ 2307 for (k = 0; k<mbs; k++){ 2308 2309 /* INITIALIZE K-TH ROW WITH ELEMENTS NONZERO IN ROW P(K) OF A */ 2310 jmin = ai[rip[k]]; jmax = ai[rip[k]+1]; 2311 if (jmin < jmax) { 2312 for (j = jmin; j < jmax; j++){ 2313 vj = riip[aj[j]]; 2314 if (k <= vj) rtmp[vj] = aa[j]; 2315 } 2316 } 2317 2318 /* MODIFY K-TH ROW BY ADDING IN THOSE ROWS I WITH U(I,K) NE 0 2319 FOR EACH ROW I TO BE ADDED IN */ 2320 dk = rtmp[k]; 2321 i = jl[k]; /* first row to be added to k_th row */ 2322 /* printf(" k=%d, pivot row = %d\n",k,i); */ 2323 2324 while (i < mbs){ 2325 nexti = jl[i]; /* next row to be added to k_th row */ 2326 /* printf(" pivot row = %d\n", nexti); */ 2327 2328 /* COMPUTE MULTIPLIER AND UPDATE DIAGONAL ELEMENT */ 2329 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 2330 uikdi = - ba[ili]*ba[i]; 2331 dk += uikdi*ba[ili]; 2332 ba[ili] = uikdi; /* update U(i,k) */ 2333 2334 /* ADD MULTIPLE OF ROW I TO K-TH ROW ... */ 2335 jmin = ili + 1; jmax = bi[i+1]; 2336 if (jmin < jmax){ 2337 for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j]; 2338 /* ... AND ADD I TO ROW LIST FOR NEXT NONZERO ENTRY */ 2339 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 2340 j = bj[jmin]; 2341 jl[i] = jl[j]; jl[j] = i; /* update jl */ 2342 } 2343 i = nexti; 2344 /* printf(" pivot row i=%d\n",i); */ 2345 } 2346 2347 /* CHECK FOR ZERO PIVOT AND SAVE DIAGONAL ELEMENT */ 2348 if (dk == 0.0){ 2349 SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,0,"Zero pivot"); 2350 } 2351 2352 /* SAVE NONZERO ENTRIES IN K-TH ROW OF U ... */ 2353 ba[k] = 1.0/dk; 2354 jmin = bi[k]; jmax = bi[k+1]; 2355 if (jmin < jmax) { 2356 for (j=jmin; j<jmax; j++){ 2357 juj = bj[j]; ba[j] = rtmp[juj]; rtmp[juj] = 0.0; 2358 } 2359 2360 /* ... AND ADD K TO ROW LIST FOR FIRST NONZERO ENTRY IN K-TH ROW */ 2361 il[k] = jmin; 2362 i = bj[jmin]; 2363 jl[k] = jl[i]; jl[i] = k; 2364 } 2365 } 2366 2367 ierr = PetscFree(rtmp);CHKERRQ(ierr); 2368 ierr = PetscFree(il);CHKERRQ(ierr); 2369 ierr = PetscFree(jl);CHKERRQ(ierr); 2370 2371 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 2372 C->factor = FACTOR_LU; 2373 C->assembled = PETSC_TRUE; 2374 PLogFlops(b->mbs); 2375 PetscFunctionReturn(0); 2376 } 2377 2378 #undef __FUNC__ 2379 #define __FUNC__ "MatCholeskyFactor_SeqSBAIJ" 2380 int MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,PetscReal f) 2381 { 2382 Mat_SeqSBAIJ *mat = (Mat_SeqSBAIJ*)A->data; 2383 int ierr,refct; 2384 Mat C; 2385 PetscOps *Abops; 2386 MatOps Aops; 2387 2388 PetscFunctionBegin; 2389 ierr = MatCholeskyFactorSymbolic(A,perm,f,&C);CHKERRQ(ierr); 2390 ierr = MatCholeskyFactorNumeric(A,&C);CHKERRQ(ierr); 2391 2392 /* free all the data structures from mat */ 2393 ierr = PetscFree(mat->a);CHKERRQ(ierr); 2394 if (!mat->singlemalloc) { 2395 ierr = PetscFree(mat->i);CHKERRQ(ierr); 2396 ierr = PetscFree(mat->j);CHKERRQ(ierr); 2397 } 2398 if (mat->diag) {ierr = PetscFree(mat->diag);CHKERRQ(ierr);} 2399 if (mat->ilen) {ierr = PetscFree(mat->ilen);CHKERRQ(ierr);} 2400 if (mat->imax) {ierr = PetscFree(mat->imax);CHKERRQ(ierr);} 2401 if (mat->solve_work) {ierr = PetscFree(mat->solve_work);CHKERRQ(ierr);} 2402 if (mat->mult_work) {ierr = PetscFree(mat->mult_work);CHKERRQ(ierr);} 2403 if (mat->icol) {ierr = ISDestroy(mat->icol);CHKERRQ(ierr);} 2404 ierr = PetscFree(mat);CHKERRQ(ierr); 2405 2406 ierr = MapDestroy(A->rmap);CHKERRQ(ierr); 2407 ierr = MapDestroy(A->cmap);CHKERRQ(ierr); 2408 2409 /* 2410 This is horrible,horrible code. We need to keep the 2411 A pointers for the bops and ops but copy everything 2412 else from C. 2413 */ 2414 Abops = A->bops; 2415 Aops = A->ops; 2416 refct = A->refct; 2417 ierr = PetscMemcpy(A,C,sizeof(struct _p_Mat));CHKERRQ(ierr); 2418 mat = (Mat_SeqSBAIJ*)A->data; 2419 PLogObjectParent(A,mat->icol); 2420 2421 A->bops = Abops; 2422 A->ops = Aops; 2423 A->qlist = 0; 2424 A->refct = refct; 2425 /* copy over the type_name and name */ 2426 ierr = PetscStrallocpy(C->type_name,&A->type_name);CHKERRQ(ierr); 2427 ierr = PetscStrallocpy(C->name,&A->name);CHKERRQ(ierr); 2428 2429 PetscHeaderDestroy(C); 2430 PetscFunctionReturn(0); 2431 } 2432 2433 2434