1 /* Using Modified Sparse Row (MSR) storage. 2 See page 85, "Iterative Methods ..." by Saad. */ 3 4 /*$Id: sbaijfact.c,v 1.7 2000/07/25 13:47:46 hzhang Exp hzhang $*/ 5 /* 6 Factorization code for SBAIJ format. 7 */ 8 #include "sbaij.h" 9 #include "src/mat/impls/baij/seq/baij.h" 10 #include "src/vec/vecimpl.h" 11 #include "src/inline/ilu.h" 12 13 #undef __FUNC__ 14 #define __FUNC__ "MatCholeskyFactorSymbolic_SeqSBAIJ" 15 int MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS iscol,PetscReal f,Mat *B) 16 { 17 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 18 IS isicol,isrow; 19 int *rip,*riip,ierr,i,mbs = a->mbs,*ai = a->i,*aj = a->j; 20 int *jutmp,bs = a->bs,bs2=a->bs2; 21 int m,nzi,realloc = 0; 22 int *jl,*q,jumin,jmin,jmax,juptr,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd; 23 /* PetscReal f = 1.0; */ 24 25 PetscFunctionBegin; 26 isrow = iscol; /* remove isrow later! */ 27 PetscValidHeaderSpecific(isrow,IS_COOKIE); 28 PetscValidHeaderSpecific(iscol,IS_COOKIE); 29 /* if (A->M != A->N) SETERRQ(PETSC_ERR_ARG_WRONG,0,"matrix must be square");*/ 30 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 31 ierr = ISGetIndices(isrow,&rip);CHKERRQ(ierr); 32 ierr = ISGetIndices(isicol,&riip);CHKERRQ(ierr); 33 34 for (k=0; k<mbs; k++) { 35 if ( rip[k] - riip[k] != 0 ) { 36 printf("Non-symm. permutation, use symm. permutation or general matrix format\n"); 37 break; 38 } 39 } 40 41 /* initialization */ 42 /* Don't know how many column pointers are needed so estimate. 43 Use Modified Sparse Row storage for u and ju, see Sasd pp.85 */ 44 /* if (info) f = info->fill; */ 45 umax = (int)(f*ai[mbs] + 1); umax += mbs + 1; 46 ju = iu = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(ju); 47 iu[0] = mbs+1; 48 juptr = mbs; 49 jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl); 50 q = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(q); 51 for (i=0; i<mbs; i++){ 52 jl[i] = mbs; q[i] = 0; 53 } 54 55 /* for each row k */ 56 for (k=0; k<mbs; k++){ 57 nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */ 58 q[k] = mbs; 59 /* initialize nonzero structure of k-th row to row rip[k] of A */ 60 jmin = ai[rip[k]]; 61 jmax = ai[rip[k]+1]; 62 for (j=jmin; j<jmax; j++){ 63 vj = riip[aj[j]]; /* col. value */ 64 if(vj > k){ 65 qm = k; 66 do { 67 m = qm; qm = q[m]; 68 } while(qm < vj); 69 if (qm == vj) { 70 printf(" error: duplicate entry in A\n"); break; 71 } 72 nzk++; 73 q[m] = vj; 74 q[vj] = qm; 75 } /* if(vj > k) */ 76 } /* for (j=jmin; j<jmax; j++) */ 77 78 /* modify nonzero structure of k-th row by computing fill-in 79 for each row i to be merged in */ 80 i = k; 81 i = jl[i]; /* next pivot row (== mbs for symbolic factorization) */ 82 /* printf(" next pivot row i=%d\n",i); */ 83 while (i < mbs){ 84 /* merge row i into k-th row */ 85 nzi = iu[i+1] - (iu[i]+1); 86 jmin = iu[i] + 1; jmax = iu[i] + nzi; 87 qm = k; 88 for (j=jmin; j<jmax+1; j++){ 89 vj = ju[j]; 90 do { 91 m = qm; qm = q[m]; 92 } while (qm < vj); 93 if (qm != vj){ 94 nzk++; q[m] = vj; q[vj] = qm; qm = vj; 95 } 96 } 97 i = jl[i]; /* next pivot row */ 98 } 99 100 /* add k to row list for first nonzero element in k-th row */ 101 if (nzk > 0){ 102 i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */ 103 jl[k] = jl[i]; jl[i] = k; 104 } 105 iu[k+1] = iu[k] + nzk; /* printf(" iu[%d]=%d, umax=%d\n", k+1, iu[k+1],umax);*/ 106 107 /* allocate more space to ju if needed */ 108 if (iu[k+1] > umax) { printf("allocate more space, iu[%d]=%d > umax=%d\n",k+1, iu[k+1],umax); 109 /* estimate how much additional space we will need */ 110 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 111 /* just double the memory each time */ 112 maxadd = umax; 113 if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2; 114 umax += maxadd; 115 116 /* allocate a longer ju (NOTE: iu poits to the beginning of ju) */ 117 jutmp = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(jutmp); 118 ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));CHKERRQ(ierr); 119 ierr = PetscFree(ju);CHKERRQ(ierr); 120 ju = iu = jutmp; 121 realloc++; /* count how many times we realloc */ 122 } 123 124 /* save nonzero structure of k-th row in ju */ 125 i=k; 126 jumin = juptr + 1; juptr += nzk; 127 for (j=jumin; j<juptr+1; j++){ 128 i=q[i]; 129 ju[j]=i; 130 /* printf(" k=%d, ju[%d]=%d\n",k,j,ju[j]);*/ 131 } 132 /* printf("\n"); */ 133 } /* for (k=0; k<mbs; k++) */ 134 135 if (ai[mbs] != 0) { 136 PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 137 PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af); 138 PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_lu_fill %g or use \n",af); 139 PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCLUSetFill(pc,%g);\n",af); 140 PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n"); 141 } else { 142 PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n"); 143 } 144 145 ierr = ISRestoreIndices(isrow,&rip);CHKERRQ(ierr); 146 ierr = ISRestoreIndices(isicol,&riip);CHKERRQ(ierr); 147 148 ierr = PetscFree(q);CHKERRQ(ierr); 149 ierr = PetscFree(jl);CHKERRQ(ierr); 150 151 /* put together the new matrix */ 152 ierr = MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);CHKERRQ(ierr); 153 PLogObjectParent(*B,isicol); 154 b = (Mat_SeqSBAIJ*)(*B)->data; 155 ierr = PetscFree(b->imax);CHKERRQ(ierr); 156 b->singlemalloc = PETSC_FALSE; 157 /* the next line frees the default space generated by the Create() */ 158 ierr = PetscFree(b->a);CHKERRQ(ierr); 159 ierr = PetscFree(b->ilen);CHKERRQ(ierr); 160 b->a = (MatScalar*)PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2);CHKPTRQ(b->a); 161 b->j = ju; 162 b->i = iu; 163 b->diag = 0; 164 b->ilen = 0; 165 b->imax = 0; 166 b->row = isrow; 167 b->col = iscol; 168 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 169 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 170 b->icol = isicol; 171 b->solve_work = (Scalar*)PetscMalloc((bs*mbs+bs)*sizeof(Scalar));CHKPTRQ(b->solve_work); 172 /* In b structure: Free imax, ilen, old a, old j. 173 Allocate idnew, solve_work, new a, new j */ 174 PLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar))); 175 b->s_maxnz = b->s_nz = iu[mbs]; 176 177 (*B)->factor = FACTOR_LU; 178 (*B)->info.factor_mallocs = realloc; 179 (*B)->info.fill_ratio_given = f; 180 if (ai[mbs] != 0) { 181 (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 182 } else { 183 (*B)->info.fill_ratio_needed = 0.0; 184 } 185 186 187 PetscFunctionReturn(0); 188 } 189 190 /* ----------------------------------------------------------- */ 191 #undef __FUNC__ 192 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_N" 193 int MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,Mat *B) 194 { 195 Mat C = *B; 196 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 197 IS isrow = b->row,isicol = b->icol; 198 int *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 199 int *ajtmpold,*ajtmp,nz,row,bslog,*ai=a->i,*aj=a->j,k,flg; 200 int *diag_offset=b->diag,diag,bs=a->bs,bs2 = a->bs2,*v_pivots,*pj; 201 MatScalar *ba = b->a,*aa = a->a,*pv,*v,*rtmp,*multiplier,*v_work,*pc,*w; 202 203 PetscFunctionBegin; 204 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 205 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 206 rtmp = (MatScalar*)PetscMalloc(bs2*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 207 ierr = PetscMemzero(rtmp,bs2*(n+1)*sizeof(MatScalar));CHKERRQ(ierr); 208 /* generate work space needed by dense LU factorization */ 209 v_work = (MatScalar*)PetscMalloc(bs*sizeof(int) + (bs+bs2)*sizeof(MatScalar));CHKPTRQ(v_work); 210 multiplier = v_work + bs; 211 v_pivots = (int*)(multiplier + bs2); 212 213 /* flops in while loop */ 214 bslog = 2*bs*bs2; 215 216 for (i=0; i<n; i++) { 217 nz = bi[i+1] - bi[i]; 218 ajtmp = bj + bi[i]; 219 for (j=0; j<nz; j++) { 220 ierr = PetscMemzero(rtmp+bs2*ajtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 221 } 222 /* load in initial (unfactored row) */ 223 nz = ai[r[i]+1] - ai[r[i]]; 224 ajtmpold = aj + ai[r[i]]; 225 v = aa + bs2*ai[r[i]]; 226 for (j=0; j<nz; j++) { 227 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmpold[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 228 } 229 row = *ajtmp++; 230 while (row < i) { 231 pc = rtmp + bs2*row; 232 /* if (*pc) { */ 233 for (flg=0,k=0; k<bs2; k++) { if (pc[k]!=0.0) { flg =1; break; }} 234 if (flg) { 235 pv = ba + bs2*diag_offset[row]; 236 pj = bj + diag_offset[row] + 1; 237 Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); 238 nz = bi[row+1] - diag_offset[row] - 1; 239 pv += bs2; 240 for (j=0; j<nz; j++) { 241 Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 242 } 243 PLogFlops(bslog*(nz+1)-bs); 244 } 245 row = *ajtmp++; 246 } 247 /* finished row so stick it into b->a */ 248 pv = ba + bs2*bi[i]; 249 pj = bj + bi[i]; 250 nz = bi[i+1] - bi[i]; 251 for (j=0; j<nz; j++) { 252 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 253 } 254 diag = diag_offset[i] - bi[i]; 255 /* invert diagonal block */ 256 w = pv + bs2*diag; 257 Kernel_A_gets_inverse_A(bs,w,v_pivots,v_work); 258 } 259 260 ierr = PetscFree(rtmp);CHKERRQ(ierr); 261 ierr = PetscFree(v_work);CHKERRQ(ierr); 262 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 263 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 264 C->factor = FACTOR_LU; 265 C->assembled = PETSC_TRUE; 266 PLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */ 267 PetscFunctionReturn(0); 268 } 269 /* ------------------------------------------------------------*/ 270 /* 271 Version for when blocks are 7 by 7 272 */ 273 #undef __FUNC__ 274 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_7" 275 int MatCholeskyFactorNumeric_SeqSBAIJ_7(Mat A,Mat *B) 276 { 277 Mat C = *B; 278 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 279 IS isrow = b->row,isicol = b->icol; 280 int *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 281 int *ajtmpold,*ajtmp,nz,row; 282 int *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj; 283 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 284 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 285 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 286 MatScalar x17,x18,x19,x20,x21,x22,x23,x24,x25,p10,p11,p12,p13,p14; 287 MatScalar p15,p16,p17,p18,p19,p20,p21,p22,p23,p24,p25,m10,m11,m12; 288 MatScalar m13,m14,m15,m16,m17,m18,m19,m20,m21,m22,m23,m24,m25; 289 MatScalar p26,p27,p28,p29,p30,p31,p32,p33,p34,p35,p36; 290 MatScalar p37,p38,p39,p40,p41,p42,p43,p44,p45,p46,p47,p48,p49; 291 MatScalar x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36; 292 MatScalar x37,x38,x39,x40,x41,x42,x43,x44,x45,x46,x47,x48,x49; 293 MatScalar m26,m27,m28,m29,m30,m31,m32,m33,m34,m35,m36; 294 MatScalar m37,m38,m39,m40,m41,m42,m43,m44,m45,m46,m47,m48,m49; 295 MatScalar *ba = b->a,*aa = a->a; 296 297 PetscFunctionBegin; 298 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 299 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 300 rtmp = (MatScalar*)PetscMalloc(49*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 301 302 for (i=0; i<n; i++) { 303 nz = bi[i+1] - bi[i]; 304 ajtmp = bj + bi[i]; 305 for (j=0; j<nz; j++) { 306 x = rtmp+49*ajtmp[j]; 307 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 308 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0; 309 x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = x[25] = 0.0 ; 310 x[26] = x[27] = x[28] = x[29] = x[30] = x[31] = x[32] = x[33] = 0.0 ; 311 x[34] = x[35] = x[36] = x[37] = x[38] = x[39] = x[40] = x[41] = 0.0 ; 312 x[42] = x[43] = x[44] = x[45] = x[46] = x[47] = x[48] = 0.0 ; 313 } 314 /* load in initial (unfactored row) */ 315 idx = r[i]; 316 nz = ai[idx+1] - ai[idx]; 317 ajtmpold = aj + ai[idx]; 318 v = aa + 49*ai[idx]; 319 for (j=0; j<nz; j++) { 320 x = rtmp+49*ic[ajtmpold[j]]; 321 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 322 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; 323 x[8] = v[8]; x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; 324 x[12] = v[12]; x[13] = v[13]; x[14] = v[14]; x[15] = v[15]; 325 x[16] = v[16]; x[17] = v[17]; x[18] = v[18]; x[19] = v[19]; 326 x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23]; 327 x[24] = v[24]; x[25] = v[25]; x[26] = v[26]; x[27] = v[27]; 328 x[28] = v[28]; x[29] = v[29]; x[30] = v[30]; x[31] = v[31]; 329 x[32] = v[32]; x[33] = v[33]; x[34] = v[34]; x[35] = v[35]; 330 x[36] = v[36]; x[37] = v[37]; x[38] = v[38]; x[39] = v[39]; 331 x[40] = v[40]; x[41] = v[41]; x[42] = v[42]; x[43] = v[43]; 332 x[44] = v[44]; x[45] = v[45]; x[46] = v[46]; x[47] = v[47]; 333 x[48] = v[48]; 334 v += 49; 335 } 336 row = *ajtmp++; 337 while (row < i) { 338 pc = rtmp + 49*row; 339 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 340 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; 341 p9 = pc[8]; p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; 342 p13 = pc[12]; p14 = pc[13]; p15 = pc[14]; p16 = pc[15]; 343 p17 = pc[16]; p18 = pc[17]; p19 = pc[18]; p20 = pc[19]; 344 p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23]; 345 p25 = pc[24]; p26 = pc[25]; p27 = pc[26]; p28 = pc[27]; 346 p29 = pc[28]; p30 = pc[29]; p31 = pc[30]; p32 = pc[31]; 347 p33 = pc[32]; p34 = pc[33]; p35 = pc[34]; p36 = pc[35]; 348 p37 = pc[36]; p38 = pc[37]; p39 = pc[38]; p40 = pc[39]; 349 p41 = pc[40]; p42 = pc[41]; p43 = pc[42]; p44 = pc[43]; 350 p45 = pc[44]; p46 = pc[45]; p47 = pc[46]; p48 = pc[47]; 351 p49 = pc[48]; 352 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || 353 p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || 354 p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || 355 p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || 356 p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || 357 p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || 358 p25 != 0.0 || p26 != 0.0 || p27 != 0.0 || p28 != 0.0 || 359 p29 != 0.0 || p30 != 0.0 || p31 != 0.0 || p32 != 0.0 || 360 p33 != 0.0 || p34 != 0.0 || p35 != 0.0 || p36 != 0.0 || 361 p37 != 0.0 || p38 != 0.0 || p39 != 0.0 || p40 != 0.0 || 362 p41 != 0.0 || p42 != 0.0 || p43 != 0.0 || p44 != 0.0 || 363 p45 != 0.0 || p46 != 0.0 || p47 != 0.0 || p48 != 0.0 || 364 p49 != 0.0) { 365 pv = ba + 49*diag_offset[row]; 366 pj = bj + diag_offset[row] + 1; 367 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 368 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; 369 x9 = pv[8]; x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; 370 x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 371 x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; 372 x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 373 x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27]; 374 x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31]; 375 x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35]; 376 x37 = pv[36]; x38 = pv[37]; x39 = pv[38]; x40 = pv[39]; 377 x41 = pv[40]; x42 = pv[41]; x43 = pv[42]; x44 = pv[43]; 378 x45 = pv[44]; x46 = pv[45]; x47 = pv[46]; x48 = pv[47]; 379 x49 = pv[48]; 380 pc[0] = m1 = p1*x1 + p8*x2 + p15*x3 + p22*x4 + p29*x5 + p36*x6 + p43*x7; 381 pc[1] = m2 = p2*x1 + p9*x2 + p16*x3 + p23*x4 + p30*x5 + p37*x6 + p44*x7; 382 pc[2] = m3 = p3*x1 + p10*x2 + p17*x3 + p24*x4 + p31*x5 + p38*x6 + p45*x7; 383 pc[3] = m4 = p4*x1 + p11*x2 + p18*x3 + p25*x4 + p32*x5 + p39*x6 + p46*x7; 384 pc[4] = m5 = p5*x1 + p12*x2 + p19*x3 + p26*x4 + p33*x5 + p40*x6 + p47*x7; 385 pc[5] = m6 = p6*x1 + p13*x2 + p20*x3 + p27*x4 + p34*x5 + p41*x6 + p48*x7; 386 pc[6] = m7 = p7*x1 + p14*x2 + p21*x3 + p28*x4 + p35*x5 + p42*x6 + p49*x7; 387 388 pc[7] = m8 = p1*x8 + p8*x9 + p15*x10 + p22*x11 + p29*x12 + p36*x13 + p43*x14; 389 pc[8] = m9 = p2*x8 + p9*x9 + p16*x10 + p23*x11 + p30*x12 + p37*x13 + p44*x14; 390 pc[9] = m10 = p3*x8 + p10*x9 + p17*x10 + p24*x11 + p31*x12 + p38*x13 + p45*x14; 391 pc[10] = m11 = p4*x8 + p11*x9 + p18*x10 + p25*x11 + p32*x12 + p39*x13 + p46*x14; 392 pc[11] = m12 = p5*x8 + p12*x9 + p19*x10 + p26*x11 + p33*x12 + p40*x13 + p47*x14; 393 pc[12] = m13 = p6*x8 + p13*x9 + p20*x10 + p27*x11 + p34*x12 + p41*x13 + p48*x14; 394 pc[13] = m14 = p7*x8 + p14*x9 + p21*x10 + p28*x11 + p35*x12 + p42*x13 + p49*x14; 395 396 pc[14] = m15 = p1*x15 + p8*x16 + p15*x17 + p22*x18 + p29*x19 + p36*x20 + p43*x21; 397 pc[15] = m16 = p2*x15 + p9*x16 + p16*x17 + p23*x18 + p30*x19 + p37*x20 + p44*x21; 398 pc[16] = m17 = p3*x15 + p10*x16 + p17*x17 + p24*x18 + p31*x19 + p38*x20 + p45*x21; 399 pc[17] = m18 = p4*x15 + p11*x16 + p18*x17 + p25*x18 + p32*x19 + p39*x20 + p46*x21; 400 pc[18] = m19 = p5*x15 + p12*x16 + p19*x17 + p26*x18 + p33*x19 + p40*x20 + p47*x21; 401 pc[19] = m20 = p6*x15 + p13*x16 + p20*x17 + p27*x18 + p34*x19 + p41*x20 + p48*x21; 402 pc[20] = m21 = p7*x15 + p14*x16 + p21*x17 + p28*x18 + p35*x19 + p42*x20 + p49*x21; 403 404 pc[21] = m22 = p1*x22 + p8*x23 + p15*x24 + p22*x25 + p29*x26 + p36*x27 + p43*x28; 405 pc[22] = m23 = p2*x22 + p9*x23 + p16*x24 + p23*x25 + p30*x26 + p37*x27 + p44*x28; 406 pc[23] = m24 = p3*x22 + p10*x23 + p17*x24 + p24*x25 + p31*x26 + p38*x27 + p45*x28; 407 pc[24] = m25 = p4*x22 + p11*x23 + p18*x24 + p25*x25 + p32*x26 + p39*x27 + p46*x28; 408 pc[25] = m26 = p5*x22 + p12*x23 + p19*x24 + p26*x25 + p33*x26 + p40*x27 + p47*x28; 409 pc[26] = m27 = p6*x22 + p13*x23 + p20*x24 + p27*x25 + p34*x26 + p41*x27 + p48*x28; 410 pc[27] = m28 = p7*x22 + p14*x23 + p21*x24 + p28*x25 + p35*x26 + p42*x27 + p49*x28; 411 412 pc[28] = m29 = p1*x29 + p8*x30 + p15*x31 + p22*x32 + p29*x33 + p36*x34 + p43*x35; 413 pc[29] = m30 = p2*x29 + p9*x30 + p16*x31 + p23*x32 + p30*x33 + p37*x34 + p44*x35; 414 pc[30] = m31 = p3*x29 + p10*x30 + p17*x31 + p24*x32 + p31*x33 + p38*x34 + p45*x35; 415 pc[31] = m32 = p4*x29 + p11*x30 + p18*x31 + p25*x32 + p32*x33 + p39*x34 + p46*x35; 416 pc[32] = m33 = p5*x29 + p12*x30 + p19*x31 + p26*x32 + p33*x33 + p40*x34 + p47*x35; 417 pc[33] = m34 = p6*x29 + p13*x30 + p20*x31 + p27*x32 + p34*x33 + p41*x34 + p48*x35; 418 pc[34] = m35 = p7*x29 + p14*x30 + p21*x31 + p28*x32 + p35*x33 + p42*x34 + p49*x35; 419 420 pc[35] = m36 = p1*x36 + p8*x37 + p15*x38 + p22*x39 + p29*x40 + p36*x41 + p43*x42; 421 pc[36] = m37 = p2*x36 + p9*x37 + p16*x38 + p23*x39 + p30*x40 + p37*x41 + p44*x42; 422 pc[37] = m38 = p3*x36 + p10*x37 + p17*x38 + p24*x39 + p31*x40 + p38*x41 + p45*x42; 423 pc[38] = m39 = p4*x36 + p11*x37 + p18*x38 + p25*x39 + p32*x40 + p39*x41 + p46*x42; 424 pc[39] = m40 = p5*x36 + p12*x37 + p19*x38 + p26*x39 + p33*x40 + p40*x41 + p47*x42; 425 pc[40] = m41 = p6*x36 + p13*x37 + p20*x38 + p27*x39 + p34*x40 + p41*x41 + p48*x42; 426 pc[41] = m42 = p7*x36 + p14*x37 + p21*x38 + p28*x39 + p35*x40 + p42*x41 + p49*x42; 427 428 pc[42] = m43 = p1*x43 + p8*x44 + p15*x45 + p22*x46 + p29*x47 + p36*x48 + p43*x49; 429 pc[43] = m44 = p2*x43 + p9*x44 + p16*x45 + p23*x46 + p30*x47 + p37*x48 + p44*x49; 430 pc[44] = m45 = p3*x43 + p10*x44 + p17*x45 + p24*x46 + p31*x47 + p38*x48 + p45*x49; 431 pc[45] = m46 = p4*x43 + p11*x44 + p18*x45 + p25*x46 + p32*x47 + p39*x48 + p46*x49; 432 pc[46] = m47 = p5*x43 + p12*x44 + p19*x45 + p26*x46 + p33*x47 + p40*x48 + p47*x49; 433 pc[47] = m48 = p6*x43 + p13*x44 + p20*x45 + p27*x46 + p34*x47 + p41*x48 + p48*x49; 434 pc[48] = m49 = p7*x43 + p14*x44 + p21*x45 + p28*x46 + p35*x47 + p42*x48 + p49*x49; 435 436 nz = bi[row+1] - diag_offset[row] - 1; 437 pv += 49; 438 for (j=0; j<nz; j++) { 439 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 440 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; 441 x9 = pv[8]; x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; 442 x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 443 x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; 444 x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 445 x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27]; 446 x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31]; 447 x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35]; 448 x37 = pv[36]; x38 = pv[37]; x39 = pv[38]; x40 = pv[39]; 449 x41 = pv[40]; x42 = pv[41]; x43 = pv[42]; x44 = pv[43]; 450 x45 = pv[44]; x46 = pv[45]; x47 = pv[46]; x48 = pv[47]; 451 x49 = pv[48]; 452 x = rtmp + 49*pj[j]; 453 x[0] -= m1*x1 + m8*x2 + m15*x3 + m22*x4 + m29*x5 + m36*x6 + m43*x7; 454 x[1] -= m2*x1 + m9*x2 + m16*x3 + m23*x4 + m30*x5 + m37*x6 + m44*x7; 455 x[2] -= m3*x1 + m10*x2 + m17*x3 + m24*x4 + m31*x5 + m38*x6 + m45*x7; 456 x[3] -= m4*x1 + m11*x2 + m18*x3 + m25*x4 + m32*x5 + m39*x6 + m46*x7; 457 x[4] -= m5*x1 + m12*x2 + m19*x3 + m26*x4 + m33*x5 + m40*x6 + m47*x7; 458 x[5] -= m6*x1 + m13*x2 + m20*x3 + m27*x4 + m34*x5 + m41*x6 + m48*x7; 459 x[6] -= m7*x1 + m14*x2 + m21*x3 + m28*x4 + m35*x5 + m42*x6 + m49*x7; 460 461 x[7] -= m1*x8 + m8*x9 + m15*x10 + m22*x11 + m29*x12 + m36*x13 + m43*x14; 462 x[8] -= m2*x8 + m9*x9 + m16*x10 + m23*x11 + m30*x12 + m37*x13 + m44*x14; 463 x[9] -= m3*x8 + m10*x9 + m17*x10 + m24*x11 + m31*x12 + m38*x13 + m45*x14; 464 x[10] -= m4*x8 + m11*x9 + m18*x10 + m25*x11 + m32*x12 + m39*x13 + m46*x14; 465 x[11] -= m5*x8 + m12*x9 + m19*x10 + m26*x11 + m33*x12 + m40*x13 + m47*x14; 466 x[12] -= m6*x8 + m13*x9 + m20*x10 + m27*x11 + m34*x12 + m41*x13 + m48*x14; 467 x[13] -= m7*x8 + m14*x9 + m21*x10 + m28*x11 + m35*x12 + m42*x13 + m49*x14; 468 469 x[14] -= m1*x15 + m8*x16 + m15*x17 + m22*x18 + m29*x19 + m36*x20 + m43*x21; 470 x[15] -= m2*x15 + m9*x16 + m16*x17 + m23*x18 + m30*x19 + m37*x20 + m44*x21; 471 x[16] -= m3*x15 + m10*x16 + m17*x17 + m24*x18 + m31*x19 + m38*x20 + m45*x21; 472 x[17] -= m4*x15 + m11*x16 + m18*x17 + m25*x18 + m32*x19 + m39*x20 + m46*x21; 473 x[18] -= m5*x15 + m12*x16 + m19*x17 + m26*x18 + m33*x19 + m40*x20 + m47*x21; 474 x[19] -= m6*x15 + m13*x16 + m20*x17 + m27*x18 + m34*x19 + m41*x20 + m48*x21; 475 x[20] -= m7*x15 + m14*x16 + m21*x17 + m28*x18 + m35*x19 + m42*x20 + m49*x21; 476 477 x[21] -= m1*x22 + m8*x23 + m15*x24 + m22*x25 + m29*x26 + m36*x27 + m43*x28; 478 x[22] -= m2*x22 + m9*x23 + m16*x24 + m23*x25 + m30*x26 + m37*x27 + m44*x28; 479 x[23] -= m3*x22 + m10*x23 + m17*x24 + m24*x25 + m31*x26 + m38*x27 + m45*x28; 480 x[24] -= m4*x22 + m11*x23 + m18*x24 + m25*x25 + m32*x26 + m39*x27 + m46*x28; 481 x[25] -= m5*x22 + m12*x23 + m19*x24 + m26*x25 + m33*x26 + m40*x27 + m47*x28; 482 x[26] -= m6*x22 + m13*x23 + m20*x24 + m27*x25 + m34*x26 + m41*x27 + m48*x28; 483 x[27] -= m7*x22 + m14*x23 + m21*x24 + m28*x25 + m35*x26 + m42*x27 + m49*x28; 484 485 x[28] -= m1*x29 + m8*x30 + m15*x31 + m22*x32 + m29*x33 + m36*x34 + m43*x35; 486 x[29] -= m2*x29 + m9*x30 + m16*x31 + m23*x32 + m30*x33 + m37*x34 + m44*x35; 487 x[30] -= m3*x29 + m10*x30 + m17*x31 + m24*x32 + m31*x33 + m38*x34 + m45*x35; 488 x[31] -= m4*x29 + m11*x30 + m18*x31 + m25*x32 + m32*x33 + m39*x34 + m46*x35; 489 x[32] -= m5*x29 + m12*x30 + m19*x31 + m26*x32 + m33*x33 + m40*x34 + m47*x35; 490 x[33] -= m6*x29 + m13*x30 + m20*x31 + m27*x32 + m34*x33 + m41*x34 + m48*x35; 491 x[34] -= m7*x29 + m14*x30 + m21*x31 + m28*x32 + m35*x33 + m42*x34 + m49*x35; 492 493 x[35] -= m1*x36 + m8*x37 + m15*x38 + m22*x39 + m29*x40 + m36*x41 + m43*x42; 494 x[36] -= m2*x36 + m9*x37 + m16*x38 + m23*x39 + m30*x40 + m37*x41 + m44*x42; 495 x[37] -= m3*x36 + m10*x37 + m17*x38 + m24*x39 + m31*x40 + m38*x41 + m45*x42; 496 x[38] -= m4*x36 + m11*x37 + m18*x38 + m25*x39 + m32*x40 + m39*x41 + m46*x42; 497 x[39] -= m5*x36 + m12*x37 + m19*x38 + m26*x39 + m33*x40 + m40*x41 + m47*x42; 498 x[40] -= m6*x36 + m13*x37 + m20*x38 + m27*x39 + m34*x40 + m41*x41 + m48*x42; 499 x[41] -= m7*x36 + m14*x37 + m21*x38 + m28*x39 + m35*x40 + m42*x41 + m49*x42; 500 501 x[42] -= m1*x43 + m8*x44 + m15*x45 + m22*x46 + m29*x47 + m36*x48 + m43*x49; 502 x[43] -= m2*x43 + m9*x44 + m16*x45 + m23*x46 + m30*x47 + m37*x48 + m44*x49; 503 x[44] -= m3*x43 + m10*x44 + m17*x45 + m24*x46 + m31*x47 + m38*x48 + m45*x49; 504 x[45] -= m4*x43 + m11*x44 + m18*x45 + m25*x46 + m32*x47 + m39*x48 + m46*x49; 505 x[46] -= m5*x43 + m12*x44 + m19*x45 + m26*x46 + m33*x47 + m40*x48 + m47*x49; 506 x[47] -= m6*x43 + m13*x44 + m20*x45 + m27*x46 + m34*x47 + m41*x48 + m48*x49; 507 x[48] -= m7*x43 + m14*x44 + m21*x45 + m28*x46 + m35*x47 + m42*x48 + m49*x49; 508 pv += 49; 509 } 510 PLogFlops(686*nz+637); 511 } 512 row = *ajtmp++; 513 } 514 /* finished row so stick it into b->a */ 515 pv = ba + 49*bi[i]; 516 pj = bj + bi[i]; 517 nz = bi[i+1] - bi[i]; 518 for (j=0; j<nz; j++) { 519 x = rtmp+49*pj[j]; 520 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 521 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; 522 pv[8] = x[8]; pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; 523 pv[12] = x[12]; pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 524 pv[16] = x[16]; pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19]; 525 pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23]; 526 pv[24] = x[24]; pv[25] = x[25]; pv[26] = x[26]; pv[27] = x[27]; 527 pv[28] = x[28]; pv[29] = x[29]; pv[30] = x[30]; pv[31] = x[31]; 528 pv[32] = x[32]; pv[33] = x[33]; pv[34] = x[34]; pv[35] = x[35]; 529 pv[36] = x[36]; pv[37] = x[37]; pv[38] = x[38]; pv[39] = x[39]; 530 pv[40] = x[40]; pv[41] = x[41]; pv[42] = x[42]; pv[43] = x[43]; 531 pv[44] = x[44]; pv[45] = x[45]; pv[46] = x[46]; pv[47] = x[47]; 532 pv[48] = x[48]; 533 pv += 49; 534 } 535 /* invert diagonal block */ 536 w = ba + 49*diag_offset[i]; 537 ierr = Kernel_A_gets_inverse_A_7(w);CHKERRQ(ierr); 538 } 539 540 ierr = PetscFree(rtmp);CHKERRQ(ierr); 541 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 542 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 543 C->factor = FACTOR_LU; 544 C->assembled = PETSC_TRUE; 545 PLogFlops(1.3333*343*b->mbs); /* from inverting diagonal blocks */ 546 PetscFunctionReturn(0); 547 } 548 549 /* 550 Version for when blocks are 7 by 7 Using natural ordering 551 */ 552 #undef __FUNC__ 553 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering" 554 int MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering(Mat A,Mat *B) 555 { 556 Mat C = *B; 557 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 558 int ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 559 int *ajtmpold,*ajtmp,nz,row; 560 int *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; 561 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 562 MatScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 563 MatScalar x16,x17,x18,x19,x20,x21,x22,x23,x24,x25; 564 MatScalar p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15; 565 MatScalar p16,p17,p18,p19,p20,p21,p22,p23,p24,p25; 566 MatScalar m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14,m15; 567 MatScalar m16,m17,m18,m19,m20,m21,m22,m23,m24,m25; 568 MatScalar p26,p27,p28,p29,p30,p31,p32,p33,p34,p35,p36; 569 MatScalar p37,p38,p39,p40,p41,p42,p43,p44,p45,p46,p47,p48,p49; 570 MatScalar x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36; 571 MatScalar x37,x38,x39,x40,x41,x42,x43,x44,x45,x46,x47,x48,x49; 572 MatScalar m26,m27,m28,m29,m30,m31,m32,m33,m34,m35,m36; 573 MatScalar m37,m38,m39,m40,m41,m42,m43,m44,m45,m46,m47,m48,m49; 574 MatScalar *ba = b->a,*aa = a->a; 575 576 PetscFunctionBegin; 577 rtmp = (MatScalar*)PetscMalloc(49*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 578 for (i=0; i<n; i++) { 579 nz = bi[i+1] - bi[i]; 580 ajtmp = bj + bi[i]; 581 for (j=0; j<nz; j++) { 582 x = rtmp+49*ajtmp[j]; 583 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 584 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0; 585 x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = x[25] = 0.0 ; 586 x[26] = x[27] = x[28] = x[29] = x[30] = x[31] = x[32] = x[33] = 0.0 ; 587 x[34] = x[35] = x[36] = x[37] = x[38] = x[39] = x[40] = x[41] = 0.0 ; 588 x[42] = x[43] = x[44] = x[45] = x[46] = x[47] = x[48] = 0.0 ; 589 } 590 /* load in initial (unfactored row) */ 591 nz = ai[i+1] - ai[i]; 592 ajtmpold = aj + ai[i]; 593 v = aa + 49*ai[i]; 594 for (j=0; j<nz; j++) { 595 x = rtmp+49*ajtmpold[j]; 596 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 597 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; 598 x[8] = v[8]; x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; 599 x[12] = v[12]; x[13] = v[13]; x[14] = v[14]; x[15] = v[15]; 600 x[16] = v[16]; x[17] = v[17]; x[18] = v[18]; x[19] = v[19]; 601 x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23]; 602 x[24] = v[24]; x[25] = v[25]; x[26] = v[26]; x[27] = v[27]; 603 x[28] = v[28]; x[29] = v[29]; x[30] = v[30]; x[31] = v[31]; 604 x[32] = v[32]; x[33] = v[33]; x[34] = v[34]; x[35] = v[35]; 605 x[36] = v[36]; x[37] = v[37]; x[38] = v[38]; x[39] = v[39]; 606 x[40] = v[40]; x[41] = v[41]; x[42] = v[42]; x[43] = v[43]; 607 x[44] = v[44]; x[45] = v[45]; x[46] = v[46]; x[47] = v[47]; 608 x[48] = v[48]; 609 v += 49; 610 } 611 row = *ajtmp++; 612 while (row < i) { 613 pc = rtmp + 49*row; 614 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 615 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; 616 p9 = pc[8]; p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; 617 p13 = pc[12]; p14 = pc[13]; p15 = pc[14]; p16 = pc[15]; 618 p17 = pc[16]; p18 = pc[17]; p19 = pc[18]; p20 = pc[19]; 619 p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23]; 620 p25 = pc[24]; p26 = pc[25]; p27 = pc[26]; p28 = pc[27]; 621 p29 = pc[28]; p30 = pc[29]; p31 = pc[30]; p32 = pc[31]; 622 p33 = pc[32]; p34 = pc[33]; p35 = pc[34]; p36 = pc[35]; 623 p37 = pc[36]; p38 = pc[37]; p39 = pc[38]; p40 = pc[39]; 624 p41 = pc[40]; p42 = pc[41]; p43 = pc[42]; p44 = pc[43]; 625 p45 = pc[44]; p46 = pc[45]; p47 = pc[46]; p48 = pc[47]; 626 p49 = pc[48]; 627 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || 628 p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || 629 p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || 630 p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || 631 p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || 632 p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || 633 p25 != 0.0 || p26 != 0.0 || p27 != 0.0 || p28 != 0.0 || 634 p29 != 0.0 || p30 != 0.0 || p31 != 0.0 || p32 != 0.0 || 635 p33 != 0.0 || p34 != 0.0 || p35 != 0.0 || p36 != 0.0 || 636 p37 != 0.0 || p38 != 0.0 || p39 != 0.0 || p40 != 0.0 || 637 p41 != 0.0 || p42 != 0.0 || p43 != 0.0 || p44 != 0.0 || 638 p45 != 0.0 || p46 != 0.0 || p47 != 0.0 || p48 != 0.0 || 639 p49 != 0.0) { 640 pv = ba + 49*diag_offset[row]; 641 pj = bj + diag_offset[row] + 1; 642 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 643 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; 644 x9 = pv[8]; x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; 645 x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 646 x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; 647 x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 648 x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27]; 649 x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31]; 650 x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35]; 651 x37 = pv[36]; x38 = pv[37]; x39 = pv[38]; x40 = pv[39]; 652 x41 = pv[40]; x42 = pv[41]; x43 = pv[42]; x44 = pv[43]; 653 x45 = pv[44]; x46 = pv[45]; x47 = pv[46]; x48 = pv[47]; 654 x49 = pv[48]; 655 pc[0] = m1 = p1*x1 + p8*x2 + p15*x3 + p22*x4 + p29*x5 + p36*x6 + p43*x7; 656 pc[1] = m2 = p2*x1 + p9*x2 + p16*x3 + p23*x4 + p30*x5 + p37*x6 + p44*x7; 657 pc[2] = m3 = p3*x1 + p10*x2 + p17*x3 + p24*x4 + p31*x5 + p38*x6 + p45*x7; 658 pc[3] = m4 = p4*x1 + p11*x2 + p18*x3 + p25*x4 + p32*x5 + p39*x6 + p46*x7; 659 pc[4] = m5 = p5*x1 + p12*x2 + p19*x3 + p26*x4 + p33*x5 + p40*x6 + p47*x7; 660 pc[5] = m6 = p6*x1 + p13*x2 + p20*x3 + p27*x4 + p34*x5 + p41*x6 + p48*x7; 661 pc[6] = m7 = p7*x1 + p14*x2 + p21*x3 + p28*x4 + p35*x5 + p42*x6 + p49*x7; 662 663 pc[7] = m8 = p1*x8 + p8*x9 + p15*x10 + p22*x11 + p29*x12 + p36*x13 + p43*x14; 664 pc[8] = m9 = p2*x8 + p9*x9 + p16*x10 + p23*x11 + p30*x12 + p37*x13 + p44*x14; 665 pc[9] = m10 = p3*x8 + p10*x9 + p17*x10 + p24*x11 + p31*x12 + p38*x13 + p45*x14; 666 pc[10] = m11 = p4*x8 + p11*x9 + p18*x10 + p25*x11 + p32*x12 + p39*x13 + p46*x14; 667 pc[11] = m12 = p5*x8 + p12*x9 + p19*x10 + p26*x11 + p33*x12 + p40*x13 + p47*x14; 668 pc[12] = m13 = p6*x8 + p13*x9 + p20*x10 + p27*x11 + p34*x12 + p41*x13 + p48*x14; 669 pc[13] = m14 = p7*x8 + p14*x9 + p21*x10 + p28*x11 + p35*x12 + p42*x13 + p49*x14; 670 671 pc[14] = m15 = p1*x15 + p8*x16 + p15*x17 + p22*x18 + p29*x19 + p36*x20 + p43*x21; 672 pc[15] = m16 = p2*x15 + p9*x16 + p16*x17 + p23*x18 + p30*x19 + p37*x20 + p44*x21; 673 pc[16] = m17 = p3*x15 + p10*x16 + p17*x17 + p24*x18 + p31*x19 + p38*x20 + p45*x21; 674 pc[17] = m18 = p4*x15 + p11*x16 + p18*x17 + p25*x18 + p32*x19 + p39*x20 + p46*x21; 675 pc[18] = m19 = p5*x15 + p12*x16 + p19*x17 + p26*x18 + p33*x19 + p40*x20 + p47*x21; 676 pc[19] = m20 = p6*x15 + p13*x16 + p20*x17 + p27*x18 + p34*x19 + p41*x20 + p48*x21; 677 pc[20] = m21 = p7*x15 + p14*x16 + p21*x17 + p28*x18 + p35*x19 + p42*x20 + p49*x21; 678 679 pc[21] = m22 = p1*x22 + p8*x23 + p15*x24 + p22*x25 + p29*x26 + p36*x27 + p43*x28; 680 pc[22] = m23 = p2*x22 + p9*x23 + p16*x24 + p23*x25 + p30*x26 + p37*x27 + p44*x28; 681 pc[23] = m24 = p3*x22 + p10*x23 + p17*x24 + p24*x25 + p31*x26 + p38*x27 + p45*x28; 682 pc[24] = m25 = p4*x22 + p11*x23 + p18*x24 + p25*x25 + p32*x26 + p39*x27 + p46*x28; 683 pc[25] = m26 = p5*x22 + p12*x23 + p19*x24 + p26*x25 + p33*x26 + p40*x27 + p47*x28; 684 pc[26] = m27 = p6*x22 + p13*x23 + p20*x24 + p27*x25 + p34*x26 + p41*x27 + p48*x28; 685 pc[27] = m28 = p7*x22 + p14*x23 + p21*x24 + p28*x25 + p35*x26 + p42*x27 + p49*x28; 686 687 pc[28] = m29 = p1*x29 + p8*x30 + p15*x31 + p22*x32 + p29*x33 + p36*x34 + p43*x35; 688 pc[29] = m30 = p2*x29 + p9*x30 + p16*x31 + p23*x32 + p30*x33 + p37*x34 + p44*x35; 689 pc[30] = m31 = p3*x29 + p10*x30 + p17*x31 + p24*x32 + p31*x33 + p38*x34 + p45*x35; 690 pc[31] = m32 = p4*x29 + p11*x30 + p18*x31 + p25*x32 + p32*x33 + p39*x34 + p46*x35; 691 pc[32] = m33 = p5*x29 + p12*x30 + p19*x31 + p26*x32 + p33*x33 + p40*x34 + p47*x35; 692 pc[33] = m34 = p6*x29 + p13*x30 + p20*x31 + p27*x32 + p34*x33 + p41*x34 + p48*x35; 693 pc[34] = m35 = p7*x29 + p14*x30 + p21*x31 + p28*x32 + p35*x33 + p42*x34 + p49*x35; 694 695 pc[35] = m36 = p1*x36 + p8*x37 + p15*x38 + p22*x39 + p29*x40 + p36*x41 + p43*x42; 696 pc[36] = m37 = p2*x36 + p9*x37 + p16*x38 + p23*x39 + p30*x40 + p37*x41 + p44*x42; 697 pc[37] = m38 = p3*x36 + p10*x37 + p17*x38 + p24*x39 + p31*x40 + p38*x41 + p45*x42; 698 pc[38] = m39 = p4*x36 + p11*x37 + p18*x38 + p25*x39 + p32*x40 + p39*x41 + p46*x42; 699 pc[39] = m40 = p5*x36 + p12*x37 + p19*x38 + p26*x39 + p33*x40 + p40*x41 + p47*x42; 700 pc[40] = m41 = p6*x36 + p13*x37 + p20*x38 + p27*x39 + p34*x40 + p41*x41 + p48*x42; 701 pc[41] = m42 = p7*x36 + p14*x37 + p21*x38 + p28*x39 + p35*x40 + p42*x41 + p49*x42; 702 703 pc[42] = m43 = p1*x43 + p8*x44 + p15*x45 + p22*x46 + p29*x47 + p36*x48 + p43*x49; 704 pc[43] = m44 = p2*x43 + p9*x44 + p16*x45 + p23*x46 + p30*x47 + p37*x48 + p44*x49; 705 pc[44] = m45 = p3*x43 + p10*x44 + p17*x45 + p24*x46 + p31*x47 + p38*x48 + p45*x49; 706 pc[45] = m46 = p4*x43 + p11*x44 + p18*x45 + p25*x46 + p32*x47 + p39*x48 + p46*x49; 707 pc[46] = m47 = p5*x43 + p12*x44 + p19*x45 + p26*x46 + p33*x47 + p40*x48 + p47*x49; 708 pc[47] = m48 = p6*x43 + p13*x44 + p20*x45 + p27*x46 + p34*x47 + p41*x48 + p48*x49; 709 pc[48] = m49 = p7*x43 + p14*x44 + p21*x45 + p28*x46 + p35*x47 + p42*x48 + p49*x49; 710 711 nz = bi[row+1] - diag_offset[row] - 1; 712 pv += 49; 713 for (j=0; j<nz; j++) { 714 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 715 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; 716 x9 = pv[8]; x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; 717 x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 718 x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; 719 x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 720 x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27]; 721 x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31]; 722 x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35]; 723 x37 = pv[36]; x38 = pv[37]; x39 = pv[38]; x40 = pv[39]; 724 x41 = pv[40]; x42 = pv[41]; x43 = pv[42]; x44 = pv[43]; 725 x45 = pv[44]; x46 = pv[45]; x47 = pv[46]; x48 = pv[47]; 726 x49 = pv[48]; 727 x = rtmp + 49*pj[j]; 728 x[0] -= m1*x1 + m8*x2 + m15*x3 + m22*x4 + m29*x5 + m36*x6 + m43*x7; 729 x[1] -= m2*x1 + m9*x2 + m16*x3 + m23*x4 + m30*x5 + m37*x6 + m44*x7; 730 x[2] -= m3*x1 + m10*x2 + m17*x3 + m24*x4 + m31*x5 + m38*x6 + m45*x7; 731 x[3] -= m4*x1 + m11*x2 + m18*x3 + m25*x4 + m32*x5 + m39*x6 + m46*x7; 732 x[4] -= m5*x1 + m12*x2 + m19*x3 + m26*x4 + m33*x5 + m40*x6 + m47*x7; 733 x[5] -= m6*x1 + m13*x2 + m20*x3 + m27*x4 + m34*x5 + m41*x6 + m48*x7; 734 x[6] -= m7*x1 + m14*x2 + m21*x3 + m28*x4 + m35*x5 + m42*x6 + m49*x7; 735 736 x[7] -= m1*x8 + m8*x9 + m15*x10 + m22*x11 + m29*x12 + m36*x13 + m43*x14; 737 x[8] -= m2*x8 + m9*x9 + m16*x10 + m23*x11 + m30*x12 + m37*x13 + m44*x14; 738 x[9] -= m3*x8 + m10*x9 + m17*x10 + m24*x11 + m31*x12 + m38*x13 + m45*x14; 739 x[10] -= m4*x8 + m11*x9 + m18*x10 + m25*x11 + m32*x12 + m39*x13 + m46*x14; 740 x[11] -= m5*x8 + m12*x9 + m19*x10 + m26*x11 + m33*x12 + m40*x13 + m47*x14; 741 x[12] -= m6*x8 + m13*x9 + m20*x10 + m27*x11 + m34*x12 + m41*x13 + m48*x14; 742 x[13] -= m7*x8 + m14*x9 + m21*x10 + m28*x11 + m35*x12 + m42*x13 + m49*x14; 743 744 x[14] -= m1*x15 + m8*x16 + m15*x17 + m22*x18 + m29*x19 + m36*x20 + m43*x21; 745 x[15] -= m2*x15 + m9*x16 + m16*x17 + m23*x18 + m30*x19 + m37*x20 + m44*x21; 746 x[16] -= m3*x15 + m10*x16 + m17*x17 + m24*x18 + m31*x19 + m38*x20 + m45*x21; 747 x[17] -= m4*x15 + m11*x16 + m18*x17 + m25*x18 + m32*x19 + m39*x20 + m46*x21; 748 x[18] -= m5*x15 + m12*x16 + m19*x17 + m26*x18 + m33*x19 + m40*x20 + m47*x21; 749 x[19] -= m6*x15 + m13*x16 + m20*x17 + m27*x18 + m34*x19 + m41*x20 + m48*x21; 750 x[20] -= m7*x15 + m14*x16 + m21*x17 + m28*x18 + m35*x19 + m42*x20 + m49*x21; 751 752 x[21] -= m1*x22 + m8*x23 + m15*x24 + m22*x25 + m29*x26 + m36*x27 + m43*x28; 753 x[22] -= m2*x22 + m9*x23 + m16*x24 + m23*x25 + m30*x26 + m37*x27 + m44*x28; 754 x[23] -= m3*x22 + m10*x23 + m17*x24 + m24*x25 + m31*x26 + m38*x27 + m45*x28; 755 x[24] -= m4*x22 + m11*x23 + m18*x24 + m25*x25 + m32*x26 + m39*x27 + m46*x28; 756 x[25] -= m5*x22 + m12*x23 + m19*x24 + m26*x25 + m33*x26 + m40*x27 + m47*x28; 757 x[26] -= m6*x22 + m13*x23 + m20*x24 + m27*x25 + m34*x26 + m41*x27 + m48*x28; 758 x[27] -= m7*x22 + m14*x23 + m21*x24 + m28*x25 + m35*x26 + m42*x27 + m49*x28; 759 760 x[28] -= m1*x29 + m8*x30 + m15*x31 + m22*x32 + m29*x33 + m36*x34 + m43*x35; 761 x[29] -= m2*x29 + m9*x30 + m16*x31 + m23*x32 + m30*x33 + m37*x34 + m44*x35; 762 x[30] -= m3*x29 + m10*x30 + m17*x31 + m24*x32 + m31*x33 + m38*x34 + m45*x35; 763 x[31] -= m4*x29 + m11*x30 + m18*x31 + m25*x32 + m32*x33 + m39*x34 + m46*x35; 764 x[32] -= m5*x29 + m12*x30 + m19*x31 + m26*x32 + m33*x33 + m40*x34 + m47*x35; 765 x[33] -= m6*x29 + m13*x30 + m20*x31 + m27*x32 + m34*x33 + m41*x34 + m48*x35; 766 x[34] -= m7*x29 + m14*x30 + m21*x31 + m28*x32 + m35*x33 + m42*x34 + m49*x35; 767 768 x[35] -= m1*x36 + m8*x37 + m15*x38 + m22*x39 + m29*x40 + m36*x41 + m43*x42; 769 x[36] -= m2*x36 + m9*x37 + m16*x38 + m23*x39 + m30*x40 + m37*x41 + m44*x42; 770 x[37] -= m3*x36 + m10*x37 + m17*x38 + m24*x39 + m31*x40 + m38*x41 + m45*x42; 771 x[38] -= m4*x36 + m11*x37 + m18*x38 + m25*x39 + m32*x40 + m39*x41 + m46*x42; 772 x[39] -= m5*x36 + m12*x37 + m19*x38 + m26*x39 + m33*x40 + m40*x41 + m47*x42; 773 x[40] -= m6*x36 + m13*x37 + m20*x38 + m27*x39 + m34*x40 + m41*x41 + m48*x42; 774 x[41] -= m7*x36 + m14*x37 + m21*x38 + m28*x39 + m35*x40 + m42*x41 + m49*x42; 775 776 x[42] -= m1*x43 + m8*x44 + m15*x45 + m22*x46 + m29*x47 + m36*x48 + m43*x49; 777 x[43] -= m2*x43 + m9*x44 + m16*x45 + m23*x46 + m30*x47 + m37*x48 + m44*x49; 778 x[44] -= m3*x43 + m10*x44 + m17*x45 + m24*x46 + m31*x47 + m38*x48 + m45*x49; 779 x[45] -= m4*x43 + m11*x44 + m18*x45 + m25*x46 + m32*x47 + m39*x48 + m46*x49; 780 x[46] -= m5*x43 + m12*x44 + m19*x45 + m26*x46 + m33*x47 + m40*x48 + m47*x49; 781 x[47] -= m6*x43 + m13*x44 + m20*x45 + m27*x46 + m34*x47 + m41*x48 + m48*x49; 782 x[48] -= m7*x43 + m14*x44 + m21*x45 + m28*x46 + m35*x47 + m42*x48 + m49*x49; 783 pv += 49; 784 } 785 PLogFlops(686*nz+637); 786 } 787 row = *ajtmp++; 788 } 789 /* finished row so stick it into b->a */ 790 pv = ba + 49*bi[i]; 791 pj = bj + bi[i]; 792 nz = bi[i+1] - bi[i]; 793 for (j=0; j<nz; j++) { 794 x = rtmp+49*pj[j]; 795 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 796 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; 797 pv[8] = x[8]; pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; 798 pv[12] = x[12]; pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 799 pv[16] = x[16]; pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19]; 800 pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23]; 801 pv[24] = x[24]; pv[25] = x[25]; pv[26] = x[26]; pv[27] = x[27]; 802 pv[28] = x[28]; pv[29] = x[29]; pv[30] = x[30]; pv[31] = x[31]; 803 pv[32] = x[32]; pv[33] = x[33]; pv[34] = x[34]; pv[35] = x[35]; 804 pv[36] = x[36]; pv[37] = x[37]; pv[38] = x[38]; pv[39] = x[39]; 805 pv[40] = x[40]; pv[41] = x[41]; pv[42] = x[42]; pv[43] = x[43]; 806 pv[44] = x[44]; pv[45] = x[45]; pv[46] = x[46]; pv[47] = x[47]; 807 pv[48] = x[48]; 808 pv += 49; 809 } 810 /* invert diagonal block */ 811 w = ba + 49*diag_offset[i]; 812 ierr = Kernel_A_gets_inverse_A_7(w);CHKERRQ(ierr); 813 } 814 815 ierr = PetscFree(rtmp);CHKERRQ(ierr); 816 C->factor = FACTOR_LU; 817 C->assembled = PETSC_TRUE; 818 PLogFlops(1.3333*343*b->mbs); /* from inverting diagonal blocks */ 819 PetscFunctionReturn(0); 820 } 821 822 /* ------------------------------------------------------------*/ 823 /* 824 Version for when blocks are 6 by 6 825 */ 826 #undef __FUNC__ 827 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_6" 828 int MatCholeskyFactorNumeric_SeqSBAIJ_6(Mat A,Mat *B) 829 { 830 Mat C = *B; 831 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 832 IS isrow = b->row,isicol = b->icol; 833 int *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 834 int *ajtmpold,*ajtmp,nz,row; 835 int *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj; 836 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 837 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 838 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 839 MatScalar x17,x18,x19,x20,x21,x22,x23,x24,x25,p10,p11,p12,p13,p14; 840 MatScalar p15,p16,p17,p18,p19,p20,p21,p22,p23,p24,p25,m10,m11,m12; 841 MatScalar m13,m14,m15,m16,m17,m18,m19,m20,m21,m22,m23,m24,m25; 842 MatScalar p26,p27,p28,p29,p30,p31,p32,p33,p34,p35,p36; 843 MatScalar x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36; 844 MatScalar m26,m27,m28,m29,m30,m31,m32,m33,m34,m35,m36; 845 MatScalar *ba = b->a,*aa = a->a; 846 847 PetscFunctionBegin; 848 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 849 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 850 rtmp = (MatScalar*)PetscMalloc(36*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 851 852 for (i=0; i<n; i++) { 853 nz = bi[i+1] - bi[i]; 854 ajtmp = bj + bi[i]; 855 for (j=0; j<nz; j++) { 856 x = rtmp+36*ajtmp[j]; 857 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 858 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0; 859 x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = x[25] = 0.0 ; 860 x[26] = x[27] = x[28] = x[29] = x[30] = x[31] = x[32] = x[33] = 0.0 ; 861 x[34] = x[35] = 0.0 ; 862 } 863 /* load in initial (unfactored row) */ 864 idx = r[i]; 865 nz = ai[idx+1] - ai[idx]; 866 ajtmpold = aj + ai[idx]; 867 v = aa + 36*ai[idx]; 868 for (j=0; j<nz; j++) { 869 x = rtmp+36*ic[ajtmpold[j]]; 870 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 871 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; 872 x[8] = v[8]; x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; 873 x[12] = v[12]; x[13] = v[13]; x[14] = v[14]; x[15] = v[15]; 874 x[16] = v[16]; x[17] = v[17]; x[18] = v[18]; x[19] = v[19]; 875 x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23]; 876 x[24] = v[24]; x[25] = v[25]; x[26] = v[26]; x[27] = v[27]; 877 x[28] = v[28]; x[29] = v[29]; x[30] = v[30]; x[31] = v[31]; 878 x[32] = v[32]; x[33] = v[33]; x[34] = v[34]; x[35] = v[35]; 879 v += 36; 880 } 881 row = *ajtmp++; 882 while (row < i) { 883 pc = rtmp + 36*row; 884 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 885 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; 886 p9 = pc[8]; p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; 887 p13 = pc[12]; p14 = pc[13]; p15 = pc[14]; p16 = pc[15]; 888 p17 = pc[16]; p18 = pc[17]; p19 = pc[18]; p20 = pc[19]; 889 p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23]; 890 p25 = pc[24]; p26 = pc[25]; p27 = pc[26]; p28 = pc[27]; 891 p29 = pc[28]; p30 = pc[29]; p31 = pc[30]; p32 = pc[31]; 892 p33 = pc[32]; p34 = pc[33]; p35 = pc[34]; p36 = pc[35]; 893 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || 894 p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || 895 p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || 896 p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || 897 p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || 898 p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || 899 p25 != 0.0 || p26 != 0.0 || p27 != 0.0 || p28 != 0.0 || 900 p29 != 0.0 || p30 != 0.0 || p31 != 0.0 || p32 != 0.0 || 901 p33 != 0.0 || p34 != 0.0 || p35 != 0.0 || p36 != 0.0) { 902 pv = ba + 36*diag_offset[row]; 903 pj = bj + diag_offset[row] + 1; 904 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 905 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; 906 x9 = pv[8]; x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; 907 x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 908 x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; 909 x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 910 x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27]; 911 x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31]; 912 x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35]; 913 pc[0] = m1 = p1*x1 + p7*x2 + p13*x3 + p19*x4 + p25*x5 + p31*x6; 914 pc[1] = m2 = p2*x1 + p8*x2 + p14*x3 + p20*x4 + p26*x5 + p32*x6; 915 pc[2] = m3 = p3*x1 + p9*x2 + p15*x3 + p21*x4 + p27*x5 + p33*x6; 916 pc[3] = m4 = p4*x1 + p10*x2 + p16*x3 + p22*x4 + p28*x5 + p34*x6; 917 pc[4] = m5 = p5*x1 + p11*x2 + p17*x3 + p23*x4 + p29*x5 + p35*x6; 918 pc[5] = m6 = p6*x1 + p12*x2 + p18*x3 + p24*x4 + p30*x5 + p36*x6; 919 920 pc[6] = m7 = p1*x7 + p7*x8 + p13*x9 + p19*x10 + p25*x11 + p31*x12; 921 pc[7] = m8 = p2*x7 + p8*x8 + p14*x9 + p20*x10 + p26*x11 + p32*x12; 922 pc[8] = m9 = p3*x7 + p9*x8 + p15*x9 + p21*x10 + p27*x11 + p33*x12; 923 pc[9] = m10 = p4*x7 + p10*x8 + p16*x9 + p22*x10 + p28*x11 + p34*x12; 924 pc[10] = m11 = p5*x7 + p11*x8 + p17*x9 + p23*x10 + p29*x11 + p35*x12; 925 pc[11] = m12 = p6*x7 + p12*x8 + p18*x9 + p24*x10 + p30*x11 + p36*x12; 926 927 pc[12] = m13 = p1*x13 + p7*x14 + p13*x15 + p19*x16 + p25*x17 + p31*x18; 928 pc[13] = m14 = p2*x13 + p8*x14 + p14*x15 + p20*x16 + p26*x17 + p32*x18; 929 pc[14] = m15 = p3*x13 + p9*x14 + p15*x15 + p21*x16 + p27*x17 + p33*x18; 930 pc[15] = m16 = p4*x13 + p10*x14 + p16*x15 + p22*x16 + p28*x17 + p34*x18; 931 pc[16] = m17 = p5*x13 + p11*x14 + p17*x15 + p23*x16 + p29*x17 + p35*x18; 932 pc[17] = m18 = p6*x13 + p12*x14 + p18*x15 + p24*x16 + p30*x17 + p36*x18; 933 934 pc[18] = m19 = p1*x19 + p7*x20 + p13*x21 + p19*x22 + p25*x23 + p31*x24; 935 pc[19] = m20 = p2*x19 + p8*x20 + p14*x21 + p20*x22 + p26*x23 + p32*x24; 936 pc[20] = m21 = p3*x19 + p9*x20 + p15*x21 + p21*x22 + p27*x23 + p33*x24; 937 pc[21] = m22 = p4*x19 + p10*x20 + p16*x21 + p22*x22 + p28*x23 + p34*x24; 938 pc[22] = m23 = p5*x19 + p11*x20 + p17*x21 + p23*x22 + p29*x23 + p35*x24; 939 pc[23] = m24 = p6*x19 + p12*x20 + p18*x21 + p24*x22 + p30*x23 + p36*x24; 940 941 pc[24] = m25 = p1*x25 + p7*x26 + p13*x27 + p19*x28 + p25*x29 + p31*x30; 942 pc[25] = m26 = p2*x25 + p8*x26 + p14*x27 + p20*x28 + p26*x29 + p32*x30; 943 pc[26] = m27 = p3*x25 + p9*x26 + p15*x27 + p21*x28 + p27*x29 + p33*x30; 944 pc[27] = m28 = p4*x25 + p10*x26 + p16*x27 + p22*x28 + p28*x29 + p34*x30; 945 pc[28] = m29 = p5*x25 + p11*x26 + p17*x27 + p23*x28 + p29*x29 + p35*x30; 946 pc[29] = m30 = p6*x25 + p12*x26 + p18*x27 + p24*x28 + p30*x29 + p36*x30; 947 948 pc[30] = m31 = p1*x31 + p7*x32 + p13*x33 + p19*x34 + p25*x35 + p31*x36; 949 pc[31] = m32 = p2*x31 + p8*x32 + p14*x33 + p20*x34 + p26*x35 + p32*x36; 950 pc[32] = m33 = p3*x31 + p9*x32 + p15*x33 + p21*x34 + p27*x35 + p33*x36; 951 pc[33] = m34 = p4*x31 + p10*x32 + p16*x33 + p22*x34 + p28*x35 + p34*x36; 952 pc[34] = m35 = p5*x31 + p11*x32 + p17*x33 + p23*x34 + p29*x35 + p35*x36; 953 pc[35] = m36 = p6*x31 + p12*x32 + p18*x33 + p24*x34 + p30*x35 + p36*x36; 954 955 nz = bi[row+1] - diag_offset[row] - 1; 956 pv += 36; 957 for (j=0; j<nz; j++) { 958 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 959 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; 960 x9 = pv[8]; x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; 961 x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 962 x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; 963 x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 964 x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27]; 965 x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31]; 966 x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35]; 967 x = rtmp + 36*pj[j]; 968 x[0] -= m1*x1 + m7*x2 + m13*x3 + m19*x4 + m25*x5 + m31*x6; 969 x[1] -= m2*x1 + m8*x2 + m14*x3 + m20*x4 + m26*x5 + m32*x6; 970 x[2] -= m3*x1 + m9*x2 + m15*x3 + m21*x4 + m27*x5 + m33*x6; 971 x[3] -= m4*x1 + m10*x2 + m16*x3 + m22*x4 + m28*x5 + m34*x6; 972 x[4] -= m5*x1 + m11*x2 + m17*x3 + m23*x4 + m29*x5 + m35*x6; 973 x[5] -= m6*x1 + m12*x2 + m18*x3 + m24*x4 + m30*x5 + m36*x6; 974 975 x[6] -= m1*x7 + m7*x8 + m13*x9 + m19*x10 + m25*x11 + m31*x12; 976 x[7] -= m2*x7 + m8*x8 + m14*x9 + m20*x10 + m26*x11 + m32*x12; 977 x[8] -= m3*x7 + m9*x8 + m15*x9 + m21*x10 + m27*x11 + m33*x12; 978 x[9] -= m4*x7 + m10*x8 + m16*x9 + m22*x10 + m28*x11 + m34*x12; 979 x[10] -= m5*x7 + m11*x8 + m17*x9 + m23*x10 + m29*x11 + m35*x12; 980 x[11] -= m6*x7 + m12*x8 + m18*x9 + m24*x10 + m30*x11 + m36*x12; 981 982 x[12] -= m1*x13 + m7*x14 + m13*x15 + m19*x16 + m25*x17 + m31*x18; 983 x[13] -= m2*x13 + m8*x14 + m14*x15 + m20*x16 + m26*x17 + m32*x18; 984 x[14] -= m3*x13 + m9*x14 + m15*x15 + m21*x16 + m27*x17 + m33*x18; 985 x[15] -= m4*x13 + m10*x14 + m16*x15 + m22*x16 + m28*x17 + m34*x18; 986 x[16] -= m5*x13 + m11*x14 + m17*x15 + m23*x16 + m29*x17 + m35*x18; 987 x[17] -= m6*x13 + m12*x14 + m18*x15 + m24*x16 + m30*x17 + m36*x18; 988 989 x[18] -= m1*x19 + m7*x20 + m13*x21 + m19*x22 + m25*x23 + m31*x24; 990 x[19] -= m2*x19 + m8*x20 + m14*x21 + m20*x22 + m26*x23 + m32*x24; 991 x[20] -= m3*x19 + m9*x20 + m15*x21 + m21*x22 + m27*x23 + m33*x24; 992 x[21] -= m4*x19 + m10*x20 + m16*x21 + m22*x22 + m28*x23 + m34*x24; 993 x[22] -= m5*x19 + m11*x20 + m17*x21 + m23*x22 + m29*x23 + m35*x24; 994 x[23] -= m6*x19 + m12*x20 + m18*x21 + m24*x22 + m30*x23 + m36*x24; 995 996 x[24] -= m1*x25 + m7*x26 + m13*x27 + m19*x28 + m25*x29 + m31*x30; 997 x[25] -= m2*x25 + m8*x26 + m14*x27 + m20*x28 + m26*x29 + m32*x30; 998 x[26] -= m3*x25 + m9*x26 + m15*x27 + m21*x28 + m27*x29 + m33*x30; 999 x[27] -= m4*x25 + m10*x26 + m16*x27 + m22*x28 + m28*x29 + m34*x30; 1000 x[28] -= m5*x25 + m11*x26 + m17*x27 + m23*x28 + m29*x29 + m35*x30; 1001 x[29] -= m6*x25 + m12*x26 + m18*x27 + m24*x28 + m30*x29 + m36*x30; 1002 1003 x[30] -= m1*x31 + m7*x32 + m13*x33 + m19*x34 + m25*x35 + m31*x36; 1004 x[31] -= m2*x31 + m8*x32 + m14*x33 + m20*x34 + m26*x35 + m32*x36; 1005 x[32] -= m3*x31 + m9*x32 + m15*x33 + m21*x34 + m27*x35 + m33*x36; 1006 x[33] -= m4*x31 + m10*x32 + m16*x33 + m22*x34 + m28*x35 + m34*x36; 1007 x[34] -= m5*x31 + m11*x32 + m17*x33 + m23*x34 + m29*x35 + m35*x36; 1008 x[35] -= m6*x31 + m12*x32 + m18*x33 + m24*x34 + m30*x35 + m36*x36; 1009 1010 pv += 36; 1011 } 1012 PLogFlops(432*nz+396); 1013 } 1014 row = *ajtmp++; 1015 } 1016 /* finished row so stick it into b->a */ 1017 pv = ba + 36*bi[i]; 1018 pj = bj + bi[i]; 1019 nz = bi[i+1] - bi[i]; 1020 for (j=0; j<nz; j++) { 1021 x = rtmp+36*pj[j]; 1022 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 1023 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; 1024 pv[8] = x[8]; pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; 1025 pv[12] = x[12]; pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 1026 pv[16] = x[16]; pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19]; 1027 pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23]; 1028 pv[24] = x[24]; pv[25] = x[25]; pv[26] = x[26]; pv[27] = x[27]; 1029 pv[28] = x[28]; pv[29] = x[29]; pv[30] = x[30]; pv[31] = x[31]; 1030 pv[32] = x[32]; pv[33] = x[33]; pv[34] = x[34]; pv[35] = x[35]; 1031 pv += 36; 1032 } 1033 /* invert diagonal block */ 1034 w = ba + 36*diag_offset[i]; 1035 ierr = Kernel_A_gets_inverse_A_6(w);CHKERRQ(ierr); 1036 } 1037 1038 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1039 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1040 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1041 C->factor = FACTOR_LU; 1042 C->assembled = PETSC_TRUE; 1043 PLogFlops(1.3333*216*b->mbs); /* from inverting diagonal blocks */ 1044 PetscFunctionReturn(0); 1045 } 1046 /* 1047 Version for when blocks are 6 by 6 Using natural ordering 1048 */ 1049 #undef __FUNC__ 1050 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering" 1051 int MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering(Mat A,Mat *B) 1052 { 1053 Mat C = *B; 1054 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 1055 int ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 1056 int *ajtmpold,*ajtmp,nz,row; 1057 int *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; 1058 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1059 MatScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 1060 MatScalar x16,x17,x18,x19,x20,x21,x22,x23,x24,x25; 1061 MatScalar p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15; 1062 MatScalar p16,p17,p18,p19,p20,p21,p22,p23,p24,p25; 1063 MatScalar m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14,m15; 1064 MatScalar m16,m17,m18,m19,m20,m21,m22,m23,m24,m25; 1065 MatScalar p26,p27,p28,p29,p30,p31,p32,p33,p34,p35,p36; 1066 MatScalar x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36; 1067 MatScalar m26,m27,m28,m29,m30,m31,m32,m33,m34,m35,m36; 1068 MatScalar *ba = b->a,*aa = a->a; 1069 1070 PetscFunctionBegin; 1071 rtmp = (MatScalar*)PetscMalloc(36*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 1072 for (i=0; i<n; i++) { 1073 nz = bi[i+1] - bi[i]; 1074 ajtmp = bj + bi[i]; 1075 for (j=0; j<nz; j++) { 1076 x = rtmp+36*ajtmp[j]; 1077 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 1078 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0; 1079 x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = x[25] = 0.0 ; 1080 x[26] = x[27] = x[28] = x[29] = x[30] = x[31] = x[32] = x[33] = 0.0 ; 1081 x[34] = x[35] = 0.0 ; 1082 } 1083 /* load in initial (unfactored row) */ 1084 nz = ai[i+1] - ai[i]; 1085 ajtmpold = aj + ai[i]; 1086 v = aa + 36*ai[i]; 1087 for (j=0; j<nz; j++) { 1088 x = rtmp+36*ajtmpold[j]; 1089 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 1090 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; 1091 x[8] = v[8]; x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; 1092 x[12] = v[12]; x[13] = v[13]; x[14] = v[14]; x[15] = v[15]; 1093 x[16] = v[16]; x[17] = v[17]; x[18] = v[18]; x[19] = v[19]; 1094 x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23]; 1095 x[24] = v[24]; x[25] = v[25]; x[26] = v[26]; x[27] = v[27]; 1096 x[28] = v[28]; x[29] = v[29]; x[30] = v[30]; x[31] = v[31]; 1097 x[32] = v[32]; x[33] = v[33]; x[34] = v[34]; x[35] = v[35]; 1098 v += 36; 1099 } 1100 row = *ajtmp++; 1101 while (row < i) { 1102 pc = rtmp + 36*row; 1103 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 1104 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; 1105 p9 = pc[8]; p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; 1106 p13 = pc[12]; p14 = pc[13]; p15 = pc[14]; p16 = pc[15]; 1107 p17 = pc[16]; p18 = pc[17]; p19 = pc[18]; p20 = pc[19]; 1108 p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23]; 1109 p25 = pc[24]; p26 = pc[25]; p27 = pc[26]; p28 = pc[27]; 1110 p29 = pc[28]; p30 = pc[29]; p31 = pc[30]; p32 = pc[31]; 1111 p33 = pc[32]; p34 = pc[33]; p35 = pc[34]; p36 = pc[35]; 1112 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || 1113 p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || 1114 p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || 1115 p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || 1116 p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || 1117 p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || 1118 p25 != 0.0 || p26 != 0.0 || p27 != 0.0 || p28 != 0.0 || 1119 p29 != 0.0 || p30 != 0.0 || p31 != 0.0 || p32 != 0.0 || 1120 p33 != 0.0 || p34 != 0.0 || p35 != 0.0 || p36 != 0.0) { 1121 pv = ba + 36*diag_offset[row]; 1122 pj = bj + diag_offset[row] + 1; 1123 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1124 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; 1125 x9 = pv[8]; x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; 1126 x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 1127 x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; 1128 x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 1129 x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27]; 1130 x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31]; 1131 x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35]; 1132 pc[0] = m1 = p1*x1 + p7*x2 + p13*x3 + p19*x4 + p25*x5 + p31*x6; 1133 pc[1] = m2 = p2*x1 + p8*x2 + p14*x3 + p20*x4 + p26*x5 + p32*x6; 1134 pc[2] = m3 = p3*x1 + p9*x2 + p15*x3 + p21*x4 + p27*x5 + p33*x6; 1135 pc[3] = m4 = p4*x1 + p10*x2 + p16*x3 + p22*x4 + p28*x5 + p34*x6; 1136 pc[4] = m5 = p5*x1 + p11*x2 + p17*x3 + p23*x4 + p29*x5 + p35*x6; 1137 pc[5] = m6 = p6*x1 + p12*x2 + p18*x3 + p24*x4 + p30*x5 + p36*x6; 1138 1139 pc[6] = m7 = p1*x7 + p7*x8 + p13*x9 + p19*x10 + p25*x11 + p31*x12; 1140 pc[7] = m8 = p2*x7 + p8*x8 + p14*x9 + p20*x10 + p26*x11 + p32*x12; 1141 pc[8] = m9 = p3*x7 + p9*x8 + p15*x9 + p21*x10 + p27*x11 + p33*x12; 1142 pc[9] = m10 = p4*x7 + p10*x8 + p16*x9 + p22*x10 + p28*x11 + p34*x12; 1143 pc[10] = m11 = p5*x7 + p11*x8 + p17*x9 + p23*x10 + p29*x11 + p35*x12; 1144 pc[11] = m12 = p6*x7 + p12*x8 + p18*x9 + p24*x10 + p30*x11 + p36*x12; 1145 1146 pc[12] = m13 = p1*x13 + p7*x14 + p13*x15 + p19*x16 + p25*x17 + p31*x18; 1147 pc[13] = m14 = p2*x13 + p8*x14 + p14*x15 + p20*x16 + p26*x17 + p32*x18; 1148 pc[14] = m15 = p3*x13 + p9*x14 + p15*x15 + p21*x16 + p27*x17 + p33*x18; 1149 pc[15] = m16 = p4*x13 + p10*x14 + p16*x15 + p22*x16 + p28*x17 + p34*x18; 1150 pc[16] = m17 = p5*x13 + p11*x14 + p17*x15 + p23*x16 + p29*x17 + p35*x18; 1151 pc[17] = m18 = p6*x13 + p12*x14 + p18*x15 + p24*x16 + p30*x17 + p36*x18; 1152 1153 pc[18] = m19 = p1*x19 + p7*x20 + p13*x21 + p19*x22 + p25*x23 + p31*x24; 1154 pc[19] = m20 = p2*x19 + p8*x20 + p14*x21 + p20*x22 + p26*x23 + p32*x24; 1155 pc[20] = m21 = p3*x19 + p9*x20 + p15*x21 + p21*x22 + p27*x23 + p33*x24; 1156 pc[21] = m22 = p4*x19 + p10*x20 + p16*x21 + p22*x22 + p28*x23 + p34*x24; 1157 pc[22] = m23 = p5*x19 + p11*x20 + p17*x21 + p23*x22 + p29*x23 + p35*x24; 1158 pc[23] = m24 = p6*x19 + p12*x20 + p18*x21 + p24*x22 + p30*x23 + p36*x24; 1159 1160 pc[24] = m25 = p1*x25 + p7*x26 + p13*x27 + p19*x28 + p25*x29 + p31*x30; 1161 pc[25] = m26 = p2*x25 + p8*x26 + p14*x27 + p20*x28 + p26*x29 + p32*x30; 1162 pc[26] = m27 = p3*x25 + p9*x26 + p15*x27 + p21*x28 + p27*x29 + p33*x30; 1163 pc[27] = m28 = p4*x25 + p10*x26 + p16*x27 + p22*x28 + p28*x29 + p34*x30; 1164 pc[28] = m29 = p5*x25 + p11*x26 + p17*x27 + p23*x28 + p29*x29 + p35*x30; 1165 pc[29] = m30 = p6*x25 + p12*x26 + p18*x27 + p24*x28 + p30*x29 + p36*x30; 1166 1167 pc[30] = m31 = p1*x31 + p7*x32 + p13*x33 + p19*x34 + p25*x35 + p31*x36; 1168 pc[31] = m32 = p2*x31 + p8*x32 + p14*x33 + p20*x34 + p26*x35 + p32*x36; 1169 pc[32] = m33 = p3*x31 + p9*x32 + p15*x33 + p21*x34 + p27*x35 + p33*x36; 1170 pc[33] = m34 = p4*x31 + p10*x32 + p16*x33 + p22*x34 + p28*x35 + p34*x36; 1171 pc[34] = m35 = p5*x31 + p11*x32 + p17*x33 + p23*x34 + p29*x35 + p35*x36; 1172 pc[35] = m36 = p6*x31 + p12*x32 + p18*x33 + p24*x34 + p30*x35 + p36*x36; 1173 1174 nz = bi[row+1] - diag_offset[row] - 1; 1175 pv += 36; 1176 for (j=0; j<nz; j++) { 1177 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1178 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; 1179 x9 = pv[8]; x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; 1180 x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 1181 x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; 1182 x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 1183 x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27]; 1184 x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31]; 1185 x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35]; 1186 x = rtmp + 36*pj[j]; 1187 x[0] -= m1*x1 + m7*x2 + m13*x3 + m19*x4 + m25*x5 + m31*x6; 1188 x[1] -= m2*x1 + m8*x2 + m14*x3 + m20*x4 + m26*x5 + m32*x6; 1189 x[2] -= m3*x1 + m9*x2 + m15*x3 + m21*x4 + m27*x5 + m33*x6; 1190 x[3] -= m4*x1 + m10*x2 + m16*x3 + m22*x4 + m28*x5 + m34*x6; 1191 x[4] -= m5*x1 + m11*x2 + m17*x3 + m23*x4 + m29*x5 + m35*x6; 1192 x[5] -= m6*x1 + m12*x2 + m18*x3 + m24*x4 + m30*x5 + m36*x6; 1193 1194 x[6] -= m1*x7 + m7*x8 + m13*x9 + m19*x10 + m25*x11 + m31*x12; 1195 x[7] -= m2*x7 + m8*x8 + m14*x9 + m20*x10 + m26*x11 + m32*x12; 1196 x[8] -= m3*x7 + m9*x8 + m15*x9 + m21*x10 + m27*x11 + m33*x12; 1197 x[9] -= m4*x7 + m10*x8 + m16*x9 + m22*x10 + m28*x11 + m34*x12; 1198 x[10] -= m5*x7 + m11*x8 + m17*x9 + m23*x10 + m29*x11 + m35*x12; 1199 x[11] -= m6*x7 + m12*x8 + m18*x9 + m24*x10 + m30*x11 + m36*x12; 1200 1201 x[12] -= m1*x13 + m7*x14 + m13*x15 + m19*x16 + m25*x17 + m31*x18; 1202 x[13] -= m2*x13 + m8*x14 + m14*x15 + m20*x16 + m26*x17 + m32*x18; 1203 x[14] -= m3*x13 + m9*x14 + m15*x15 + m21*x16 + m27*x17 + m33*x18; 1204 x[15] -= m4*x13 + m10*x14 + m16*x15 + m22*x16 + m28*x17 + m34*x18; 1205 x[16] -= m5*x13 + m11*x14 + m17*x15 + m23*x16 + m29*x17 + m35*x18; 1206 x[17] -= m6*x13 + m12*x14 + m18*x15 + m24*x16 + m30*x17 + m36*x18; 1207 1208 x[18] -= m1*x19 + m7*x20 + m13*x21 + m19*x22 + m25*x23 + m31*x24; 1209 x[19] -= m2*x19 + m8*x20 + m14*x21 + m20*x22 + m26*x23 + m32*x24; 1210 x[20] -= m3*x19 + m9*x20 + m15*x21 + m21*x22 + m27*x23 + m33*x24; 1211 x[21] -= m4*x19 + m10*x20 + m16*x21 + m22*x22 + m28*x23 + m34*x24; 1212 x[22] -= m5*x19 + m11*x20 + m17*x21 + m23*x22 + m29*x23 + m35*x24; 1213 x[23] -= m6*x19 + m12*x20 + m18*x21 + m24*x22 + m30*x23 + m36*x24; 1214 1215 x[24] -= m1*x25 + m7*x26 + m13*x27 + m19*x28 + m25*x29 + m31*x30; 1216 x[25] -= m2*x25 + m8*x26 + m14*x27 + m20*x28 + m26*x29 + m32*x30; 1217 x[26] -= m3*x25 + m9*x26 + m15*x27 + m21*x28 + m27*x29 + m33*x30; 1218 x[27] -= m4*x25 + m10*x26 + m16*x27 + m22*x28 + m28*x29 + m34*x30; 1219 x[28] -= m5*x25 + m11*x26 + m17*x27 + m23*x28 + m29*x29 + m35*x30; 1220 x[29] -= m6*x25 + m12*x26 + m18*x27 + m24*x28 + m30*x29 + m36*x30; 1221 1222 x[30] -= m1*x31 + m7*x32 + m13*x33 + m19*x34 + m25*x35 + m31*x36; 1223 x[31] -= m2*x31 + m8*x32 + m14*x33 + m20*x34 + m26*x35 + m32*x36; 1224 x[32] -= m3*x31 + m9*x32 + m15*x33 + m21*x34 + m27*x35 + m33*x36; 1225 x[33] -= m4*x31 + m10*x32 + m16*x33 + m22*x34 + m28*x35 + m34*x36; 1226 x[34] -= m5*x31 + m11*x32 + m17*x33 + m23*x34 + m29*x35 + m35*x36; 1227 x[35] -= m6*x31 + m12*x32 + m18*x33 + m24*x34 + m30*x35 + m36*x36; 1228 1229 pv += 36; 1230 } 1231 PLogFlops(432*nz+396); 1232 } 1233 row = *ajtmp++; 1234 } 1235 /* finished row so stick it into b->a */ 1236 pv = ba + 36*bi[i]; 1237 pj = bj + bi[i]; 1238 nz = bi[i+1] - bi[i]; 1239 for (j=0; j<nz; j++) { 1240 x = rtmp+36*pj[j]; 1241 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 1242 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; 1243 pv[8] = x[8]; pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; 1244 pv[12] = x[12]; pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 1245 pv[16] = x[16]; pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19]; 1246 pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23]; 1247 pv[24] = x[24]; pv[25] = x[25]; pv[26] = x[26]; pv[27] = x[27]; 1248 pv[28] = x[28]; pv[29] = x[29]; pv[30] = x[30]; pv[31] = x[31]; 1249 pv[32] = x[32]; pv[33] = x[33]; pv[34] = x[34]; pv[35] = x[35]; 1250 pv += 36; 1251 } 1252 /* invert diagonal block */ 1253 w = ba + 36*diag_offset[i]; 1254 ierr = Kernel_A_gets_inverse_A_6(w);CHKERRQ(ierr); 1255 } 1256 1257 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1258 C->factor = FACTOR_LU; 1259 C->assembled = PETSC_TRUE; 1260 PLogFlops(1.3333*216*b->mbs); /* from inverting diagonal blocks */ 1261 PetscFunctionReturn(0); 1262 } 1263 1264 /* ------------------------------------------------------------*/ 1265 /* 1266 Version for when blocks are 5 by 5 1267 */ 1268 #undef __FUNC__ 1269 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_5" 1270 int MatCholeskyFactorNumeric_SeqSBAIJ_5(Mat A,Mat *B) 1271 { 1272 Mat C = *B; 1273 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 1274 IS isrow = b->row,isicol = b->icol; 1275 int *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 1276 int *ajtmpold,*ajtmp,nz,row; 1277 int *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj; 1278 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1279 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 1280 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 1281 MatScalar x17,x18,x19,x20,x21,x22,x23,x24,x25,p10,p11,p12,p13,p14; 1282 MatScalar p15,p16,p17,p18,p19,p20,p21,p22,p23,p24,p25,m10,m11,m12; 1283 MatScalar m13,m14,m15,m16,m17,m18,m19,m20,m21,m22,m23,m24,m25; 1284 MatScalar *ba = b->a,*aa = a->a; 1285 1286 PetscFunctionBegin; 1287 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1288 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1289 rtmp = (MatScalar*)PetscMalloc(25*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 1290 1291 for (i=0; i<n; i++) { 1292 nz = bi[i+1] - bi[i]; 1293 ajtmp = bj + bi[i]; 1294 for (j=0; j<nz; j++) { 1295 x = rtmp+25*ajtmp[j]; 1296 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 1297 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0; 1298 x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = 0.0; 1299 } 1300 /* load in initial (unfactored row) */ 1301 idx = r[i]; 1302 nz = ai[idx+1] - ai[idx]; 1303 ajtmpold = aj + ai[idx]; 1304 v = aa + 25*ai[idx]; 1305 for (j=0; j<nz; j++) { 1306 x = rtmp+25*ic[ajtmpold[j]]; 1307 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 1308 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 1309 x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 1310 x[14] = v[14]; x[15] = v[15]; x[16] = v[16]; x[17] = v[17]; 1311 x[18] = v[18]; x[19] = v[19]; x[20] = v[20]; x[21] = v[21]; 1312 x[22] = v[22]; x[23] = v[23]; x[24] = v[24]; 1313 v += 25; 1314 } 1315 row = *ajtmp++; 1316 while (row < i) { 1317 pc = rtmp + 25*row; 1318 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 1319 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 1320 p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 1321 p15 = pc[14]; p16 = pc[15]; p17 = pc[16]; p18 = pc[17]; p19 = pc[18]; 1322 p20 = pc[19]; p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23]; 1323 p25 = pc[24]; 1324 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 1325 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 1326 p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 1327 || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || 1328 p20 != 0.0 || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || 1329 p24 != 0.0 || p25 != 0.0) { 1330 pv = ba + 25*diag_offset[row]; 1331 pj = bj + diag_offset[row] + 1; 1332 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1333 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 1334 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 1335 x15 = pv[14]; x16 = pv[15]; x17 = pv[16]; x18 = pv[17]; 1336 x19 = pv[18]; x20 = pv[19]; x21 = pv[20]; x22 = pv[21]; 1337 x23 = pv[22]; x24 = pv[23]; x25 = pv[24]; 1338 pc[0] = m1 = p1*x1 + p6*x2 + p11*x3 + p16*x4 + p21*x5; 1339 pc[1] = m2 = p2*x1 + p7*x2 + p12*x3 + p17*x4 + p22*x5; 1340 pc[2] = m3 = p3*x1 + p8*x2 + p13*x3 + p18*x4 + p23*x5; 1341 pc[3] = m4 = p4*x1 + p9*x2 + p14*x3 + p19*x4 + p24*x5; 1342 pc[4] = m5 = p5*x1 + p10*x2 + p15*x3 + p20*x4 + p25*x5; 1343 1344 pc[5] = m6 = p1*x6 + p6*x7 + p11*x8 + p16*x9 + p21*x10; 1345 pc[6] = m7 = p2*x6 + p7*x7 + p12*x8 + p17*x9 + p22*x10; 1346 pc[7] = m8 = p3*x6 + p8*x7 + p13*x8 + p18*x9 + p23*x10; 1347 pc[8] = m9 = p4*x6 + p9*x7 + p14*x8 + p19*x9 + p24*x10; 1348 pc[9] = m10 = p5*x6 + p10*x7 + p15*x8 + p20*x9 + p25*x10; 1349 1350 pc[10] = m11 = p1*x11 + p6*x12 + p11*x13 + p16*x14 + p21*x15; 1351 pc[11] = m12 = p2*x11 + p7*x12 + p12*x13 + p17*x14 + p22*x15; 1352 pc[12] = m13 = p3*x11 + p8*x12 + p13*x13 + p18*x14 + p23*x15; 1353 pc[13] = m14 = p4*x11 + p9*x12 + p14*x13 + p19*x14 + p24*x15; 1354 pc[14] = m15 = p5*x11 + p10*x12 + p15*x13 + p20*x14 + p25*x15; 1355 1356 pc[15] = m16 = p1*x16 + p6*x17 + p11*x18 + p16*x19 + p21*x20; 1357 pc[16] = m17 = p2*x16 + p7*x17 + p12*x18 + p17*x19 + p22*x20; 1358 pc[17] = m18 = p3*x16 + p8*x17 + p13*x18 + p18*x19 + p23*x20; 1359 pc[18] = m19 = p4*x16 + p9*x17 + p14*x18 + p19*x19 + p24*x20; 1360 pc[19] = m20 = p5*x16 + p10*x17 + p15*x18 + p20*x19 + p25*x20; 1361 1362 pc[20] = m21 = p1*x21 + p6*x22 + p11*x23 + p16*x24 + p21*x25; 1363 pc[21] = m22 = p2*x21 + p7*x22 + p12*x23 + p17*x24 + p22*x25; 1364 pc[22] = m23 = p3*x21 + p8*x22 + p13*x23 + p18*x24 + p23*x25; 1365 pc[23] = m24 = p4*x21 + p9*x22 + p14*x23 + p19*x24 + p24*x25; 1366 pc[24] = m25 = p5*x21 + p10*x22 + p15*x23 + p20*x24 + p25*x25; 1367 1368 nz = bi[row+1] - diag_offset[row] - 1; 1369 pv += 25; 1370 for (j=0; j<nz; j++) { 1371 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1372 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 1373 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 1374 x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; x17 = pv[16]; 1375 x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; x21 = pv[20]; 1376 x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; x25 = pv[24]; 1377 x = rtmp + 25*pj[j]; 1378 x[0] -= m1*x1 + m6*x2 + m11*x3 + m16*x4 + m21*x5; 1379 x[1] -= m2*x1 + m7*x2 + m12*x3 + m17*x4 + m22*x5; 1380 x[2] -= m3*x1 + m8*x2 + m13*x3 + m18*x4 + m23*x5; 1381 x[3] -= m4*x1 + m9*x2 + m14*x3 + m19*x4 + m24*x5; 1382 x[4] -= m5*x1 + m10*x2 + m15*x3 + m20*x4 + m25*x5; 1383 1384 x[5] -= m1*x6 + m6*x7 + m11*x8 + m16*x9 + m21*x10; 1385 x[6] -= m2*x6 + m7*x7 + m12*x8 + m17*x9 + m22*x10; 1386 x[7] -= m3*x6 + m8*x7 + m13*x8 + m18*x9 + m23*x10; 1387 x[8] -= m4*x6 + m9*x7 + m14*x8 + m19*x9 + m24*x10; 1388 x[9] -= m5*x6 + m10*x7 + m15*x8 + m20*x9 + m25*x10; 1389 1390 x[10] -= m1*x11 + m6*x12 + m11*x13 + m16*x14 + m21*x15; 1391 x[11] -= m2*x11 + m7*x12 + m12*x13 + m17*x14 + m22*x15; 1392 x[12] -= m3*x11 + m8*x12 + m13*x13 + m18*x14 + m23*x15; 1393 x[13] -= m4*x11 + m9*x12 + m14*x13 + m19*x14 + m24*x15; 1394 x[14] -= m5*x11 + m10*x12 + m15*x13 + m20*x14 + m25*x15; 1395 1396 x[15] -= m1*x16 + m6*x17 + m11*x18 + m16*x19 + m21*x20; 1397 x[16] -= m2*x16 + m7*x17 + m12*x18 + m17*x19 + m22*x20; 1398 x[17] -= m3*x16 + m8*x17 + m13*x18 + m18*x19 + m23*x20; 1399 x[18] -= m4*x16 + m9*x17 + m14*x18 + m19*x19 + m24*x20; 1400 x[19] -= m5*x16 + m10*x17 + m15*x18 + m20*x19 + m25*x20; 1401 1402 x[20] -= m1*x21 + m6*x22 + m11*x23 + m16*x24 + m21*x25; 1403 x[21] -= m2*x21 + m7*x22 + m12*x23 + m17*x24 + m22*x25; 1404 x[22] -= m3*x21 + m8*x22 + m13*x23 + m18*x24 + m23*x25; 1405 x[23] -= m4*x21 + m9*x22 + m14*x23 + m19*x24 + m24*x25; 1406 x[24] -= m5*x21 + m10*x22 + m15*x23 + m20*x24 + m25*x25; 1407 1408 pv += 25; 1409 } 1410 PLogFlops(250*nz+225); 1411 } 1412 row = *ajtmp++; 1413 } 1414 /* finished row so stick it into b->a */ 1415 pv = ba + 25*bi[i]; 1416 pj = bj + bi[i]; 1417 nz = bi[i+1] - bi[i]; 1418 for (j=0; j<nz; j++) { 1419 x = rtmp+25*pj[j]; 1420 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 1421 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 1422 pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 1423 pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; pv[16] = x[16]; 1424 pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19]; pv[20] = x[20]; 1425 pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23]; pv[24] = x[24]; 1426 pv += 25; 1427 } 1428 /* invert diagonal block */ 1429 w = ba + 25*diag_offset[i]; 1430 ierr = Kernel_A_gets_inverse_A_5(w);CHKERRQ(ierr); 1431 } 1432 1433 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1434 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1435 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1436 C->factor = FACTOR_LU; 1437 C->assembled = PETSC_TRUE; 1438 PLogFlops(1.3333*125*b->mbs); /* from inverting diagonal blocks */ 1439 PetscFunctionReturn(0); 1440 } 1441 /* 1442 Version for when blocks are 5 by 5 Using natural ordering 1443 */ 1444 #undef __FUNC__ 1445 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering" 1446 int MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering(Mat A,Mat *B) 1447 { 1448 Mat C = *B; 1449 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 1450 int ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 1451 int *ajtmpold,*ajtmp,nz,row; 1452 int *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; 1453 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1454 MatScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 1455 MatScalar x16,x17,x18,x19,x20,x21,x22,x23,x24,x25; 1456 MatScalar p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15; 1457 MatScalar p16,p17,p18,p19,p20,p21,p22,p23,p24,p25; 1458 MatScalar m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14,m15; 1459 MatScalar m16,m17,m18,m19,m20,m21,m22,m23,m24,m25; 1460 MatScalar *ba = b->a,*aa = a->a; 1461 1462 PetscFunctionBegin; 1463 rtmp = (MatScalar*)PetscMalloc(25*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 1464 for (i=0; i<n; i++) { 1465 nz = bi[i+1] - bi[i]; 1466 ajtmp = bj + bi[i]; 1467 for (j=0; j<nz; j++) { 1468 x = rtmp+25*ajtmp[j]; 1469 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 1470 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 1471 x[16] = x[17] = x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = 0.0; 1472 } 1473 /* load in initial (unfactored row) */ 1474 nz = ai[i+1] - ai[i]; 1475 ajtmpold = aj + ai[i]; 1476 v = aa + 25*ai[i]; 1477 for (j=0; j<nz; j++) { 1478 x = rtmp+25*ajtmpold[j]; 1479 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 1480 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 1481 x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 1482 x[14] = v[14]; x[15] = v[15]; x[16] = v[16]; x[17] = v[17]; x[18] = v[18]; 1483 x[19] = v[19]; x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23]; 1484 x[24] = v[24]; 1485 v += 25; 1486 } 1487 row = *ajtmp++; 1488 while (row < i) { 1489 pc = rtmp + 25*row; 1490 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 1491 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 1492 p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 1493 p15 = pc[14]; p16 = pc[15]; p17 = pc[16]; p18 = pc[17]; 1494 p19 = pc[18]; p20 = pc[19]; p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; 1495 p24 = pc[23]; p25 = pc[24]; 1496 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 1497 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 1498 p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 1499 || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 1500 || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || p25 != 0.0) { 1501 pv = ba + 25*diag_offset[row]; 1502 pj = bj + diag_offset[row] + 1; 1503 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1504 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 1505 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 1506 x15 = pv[14]; x16 = pv[15]; x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; 1507 x20 = pv[19]; x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 1508 x25 = pv[24]; 1509 pc[0] = m1 = p1*x1 + p6*x2 + p11*x3 + p16*x4 + p21*x5; 1510 pc[1] = m2 = p2*x1 + p7*x2 + p12*x3 + p17*x4 + p22*x5; 1511 pc[2] = m3 = p3*x1 + p8*x2 + p13*x3 + p18*x4 + p23*x5; 1512 pc[3] = m4 = p4*x1 + p9*x2 + p14*x3 + p19*x4 + p24*x5; 1513 pc[4] = m5 = p5*x1 + p10*x2 + p15*x3 + p20*x4 + p25*x5; 1514 1515 pc[5] = m6 = p1*x6 + p6*x7 + p11*x8 + p16*x9 + p21*x10; 1516 pc[6] = m7 = p2*x6 + p7*x7 + p12*x8 + p17*x9 + p22*x10; 1517 pc[7] = m8 = p3*x6 + p8*x7 + p13*x8 + p18*x9 + p23*x10; 1518 pc[8] = m9 = p4*x6 + p9*x7 + p14*x8 + p19*x9 + p24*x10; 1519 pc[9] = m10 = p5*x6 + p10*x7 + p15*x8 + p20*x9 + p25*x10; 1520 1521 pc[10] = m11 = p1*x11 + p6*x12 + p11*x13 + p16*x14 + p21*x15; 1522 pc[11] = m12 = p2*x11 + p7*x12 + p12*x13 + p17*x14 + p22*x15; 1523 pc[12] = m13 = p3*x11 + p8*x12 + p13*x13 + p18*x14 + p23*x15; 1524 pc[13] = m14 = p4*x11 + p9*x12 + p14*x13 + p19*x14 + p24*x15; 1525 pc[14] = m15 = p5*x11 + p10*x12 + p15*x13 + p20*x14 + p25*x15; 1526 1527 pc[15] = m16 = p1*x16 + p6*x17 + p11*x18 + p16*x19 + p21*x20; 1528 pc[16] = m17 = p2*x16 + p7*x17 + p12*x18 + p17*x19 + p22*x20; 1529 pc[17] = m18 = p3*x16 + p8*x17 + p13*x18 + p18*x19 + p23*x20; 1530 pc[18] = m19 = p4*x16 + p9*x17 + p14*x18 + p19*x19 + p24*x20; 1531 pc[19] = m20 = p5*x16 + p10*x17 + p15*x18 + p20*x19 + p25*x20; 1532 1533 pc[20] = m21 = p1*x21 + p6*x22 + p11*x23 + p16*x24 + p21*x25; 1534 pc[21] = m22 = p2*x21 + p7*x22 + p12*x23 + p17*x24 + p22*x25; 1535 pc[22] = m23 = p3*x21 + p8*x22 + p13*x23 + p18*x24 + p23*x25; 1536 pc[23] = m24 = p4*x21 + p9*x22 + p14*x23 + p19*x24 + p24*x25; 1537 pc[24] = m25 = p5*x21 + p10*x22 + p15*x23 + p20*x24 + p25*x25; 1538 1539 nz = bi[row+1] - diag_offset[row] - 1; 1540 pv += 25; 1541 for (j=0; j<nz; j++) { 1542 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1543 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 1544 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 1545 x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; x17 = pv[16]; x18 = pv[17]; 1546 x19 = pv[18]; x20 = pv[19]; x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; 1547 x24 = pv[23]; x25 = pv[24]; 1548 x = rtmp + 25*pj[j]; 1549 x[0] -= m1*x1 + m6*x2 + m11*x3 + m16*x4 + m21*x5; 1550 x[1] -= m2*x1 + m7*x2 + m12*x3 + m17*x4 + m22*x5; 1551 x[2] -= m3*x1 + m8*x2 + m13*x3 + m18*x4 + m23*x5; 1552 x[3] -= m4*x1 + m9*x2 + m14*x3 + m19*x4 + m24*x5; 1553 x[4] -= m5*x1 + m10*x2 + m15*x3 + m20*x4 + m25*x5; 1554 1555 x[5] -= m1*x6 + m6*x7 + m11*x8 + m16*x9 + m21*x10; 1556 x[6] -= m2*x6 + m7*x7 + m12*x8 + m17*x9 + m22*x10; 1557 x[7] -= m3*x6 + m8*x7 + m13*x8 + m18*x9 + m23*x10; 1558 x[8] -= m4*x6 + m9*x7 + m14*x8 + m19*x9 + m24*x10; 1559 x[9] -= m5*x6 + m10*x7 + m15*x8 + m20*x9 + m25*x10; 1560 1561 x[10] -= m1*x11 + m6*x12 + m11*x13 + m16*x14 + m21*x15; 1562 x[11] -= m2*x11 + m7*x12 + m12*x13 + m17*x14 + m22*x15; 1563 x[12] -= m3*x11 + m8*x12 + m13*x13 + m18*x14 + m23*x15; 1564 x[13] -= m4*x11 + m9*x12 + m14*x13 + m19*x14 + m24*x15; 1565 x[14] -= m5*x11 + m10*x12 + m15*x13 + m20*x14 + m25*x15; 1566 1567 x[15] -= m1*x16 + m6*x17 + m11*x18 + m16*x19 + m21*x20; 1568 x[16] -= m2*x16 + m7*x17 + m12*x18 + m17*x19 + m22*x20; 1569 x[17] -= m3*x16 + m8*x17 + m13*x18 + m18*x19 + m23*x20; 1570 x[18] -= m4*x16 + m9*x17 + m14*x18 + m19*x19 + m24*x20; 1571 x[19] -= m5*x16 + m10*x17 + m15*x18 + m20*x19 + m25*x20; 1572 1573 x[20] -= m1*x21 + m6*x22 + m11*x23 + m16*x24 + m21*x25; 1574 x[21] -= m2*x21 + m7*x22 + m12*x23 + m17*x24 + m22*x25; 1575 x[22] -= m3*x21 + m8*x22 + m13*x23 + m18*x24 + m23*x25; 1576 x[23] -= m4*x21 + m9*x22 + m14*x23 + m19*x24 + m24*x25; 1577 x[24] -= m5*x21 + m10*x22 + m15*x23 + m20*x24 + m25*x25; 1578 pv += 25; 1579 } 1580 PLogFlops(250*nz+225); 1581 } 1582 row = *ajtmp++; 1583 } 1584 /* finished row so stick it into b->a */ 1585 pv = ba + 25*bi[i]; 1586 pj = bj + bi[i]; 1587 nz = bi[i+1] - bi[i]; 1588 for (j=0; j<nz; j++) { 1589 x = rtmp+25*pj[j]; 1590 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 1591 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 1592 pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 1593 pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; pv[16] = x[16]; pv[17] = x[17]; 1594 pv[18] = x[18]; pv[19] = x[19]; pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22]; 1595 pv[23] = x[23]; pv[24] = x[24]; 1596 pv += 25; 1597 } 1598 /* invert diagonal block */ 1599 w = ba + 25*diag_offset[i]; 1600 ierr = Kernel_A_gets_inverse_A_5(w);CHKERRQ(ierr); 1601 } 1602 1603 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1604 C->factor = FACTOR_LU; 1605 C->assembled = PETSC_TRUE; 1606 PLogFlops(1.3333*125*b->mbs); /* from inverting diagonal blocks */ 1607 PetscFunctionReturn(0); 1608 } 1609 1610 /* ------------------------------------------------------------*/ 1611 /* 1612 Version for when blocks are 4 by 4 1613 */ 1614 #undef __FUNC__ 1615 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_4" 1616 int MatCholeskyFactorNumeric_SeqSBAIJ_4(Mat A,Mat *B) 1617 { 1618 Mat C = *B; 1619 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 1620 IS isrow = b->row,isicol = b->icol; 1621 int *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 1622 int *ajtmpold,*ajtmp,nz,row; 1623 int *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj; 1624 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1625 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 1626 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 1627 MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; 1628 MatScalar m13,m14,m15,m16; 1629 MatScalar *ba = b->a,*aa = a->a; 1630 1631 PetscFunctionBegin; 1632 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1633 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1634 rtmp = (MatScalar*)PetscMalloc(16*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 1635 1636 for (i=0; i<n; i++) { 1637 nz = bi[i+1] - bi[i]; 1638 ajtmp = bj + bi[i]; 1639 for (j=0; j<nz; j++) { 1640 x = rtmp+16*ajtmp[j]; 1641 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 1642 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 1643 } 1644 /* load in initial (unfactored row) */ 1645 idx = r[i]; 1646 nz = ai[idx+1] - ai[idx]; 1647 ajtmpold = aj + ai[idx]; 1648 v = aa + 16*ai[idx]; 1649 for (j=0; j<nz; j++) { 1650 x = rtmp+16*ic[ajtmpold[j]]; 1651 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 1652 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 1653 x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 1654 x[14] = v[14]; x[15] = v[15]; 1655 v += 16; 1656 } 1657 row = *ajtmp++; 1658 while (row < i) { 1659 pc = rtmp + 16*row; 1660 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 1661 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 1662 p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 1663 p15 = pc[14]; p16 = pc[15]; 1664 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 1665 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 1666 p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 1667 || p16 != 0.0) { 1668 pv = ba + 16*diag_offset[row]; 1669 pj = bj + diag_offset[row] + 1; 1670 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1671 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 1672 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 1673 x15 = pv[14]; x16 = pv[15]; 1674 pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; 1675 pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; 1676 pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; 1677 pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; 1678 1679 pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; 1680 pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; 1681 pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; 1682 pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; 1683 1684 pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; 1685 pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; 1686 pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; 1687 pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; 1688 1689 pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; 1690 pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; 1691 pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; 1692 pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; 1693 1694 nz = bi[row+1] - diag_offset[row] - 1; 1695 pv += 16; 1696 for (j=0; j<nz; j++) { 1697 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1698 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 1699 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 1700 x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 1701 x = rtmp + 16*pj[j]; 1702 x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; 1703 x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; 1704 x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; 1705 x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; 1706 1707 x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; 1708 x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; 1709 x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; 1710 x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; 1711 1712 x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; 1713 x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; 1714 x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; 1715 x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; 1716 1717 x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; 1718 x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; 1719 x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; 1720 x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; 1721 1722 pv += 16; 1723 } 1724 PLogFlops(128*nz+112); 1725 } 1726 row = *ajtmp++; 1727 } 1728 /* finished row so stick it into b->a */ 1729 pv = ba + 16*bi[i]; 1730 pj = bj + bi[i]; 1731 nz = bi[i+1] - bi[i]; 1732 for (j=0; j<nz; j++) { 1733 x = rtmp+16*pj[j]; 1734 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 1735 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 1736 pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 1737 pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 1738 pv += 16; 1739 } 1740 /* invert diagonal block */ 1741 w = ba + 16*diag_offset[i]; 1742 ierr = Kernel_A_gets_inverse_A_4(w);CHKERRQ(ierr); 1743 } 1744 1745 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1746 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1747 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1748 C->factor = FACTOR_LU; 1749 C->assembled = PETSC_TRUE; 1750 PLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */ 1751 PetscFunctionReturn(0); 1752 } 1753 /* 1754 Version for when blocks are 4 by 4 Using natural ordering 1755 */ 1756 #undef __FUNC__ 1757 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering" 1758 int MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering(Mat A,Mat *B) 1759 { 1760 Mat C = *B; 1761 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 1762 int ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 1763 int *ajtmpold,*ajtmp,nz,row; 1764 int *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; 1765 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1766 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 1767 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 1768 MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; 1769 MatScalar m13,m14,m15,m16; 1770 MatScalar *ba = b->a,*aa = a->a; 1771 1772 PetscFunctionBegin; 1773 rtmp = (MatScalar*)PetscMalloc(16*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 1774 1775 for (i=0; i<n; i++) { 1776 nz = bi[i+1] - bi[i]; 1777 ajtmp = bj + bi[i]; 1778 for (j=0; j<nz; j++) { 1779 x = rtmp+16*ajtmp[j]; 1780 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 1781 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 1782 } 1783 /* load in initial (unfactored row) */ 1784 nz = ai[i+1] - ai[i]; 1785 ajtmpold = aj + ai[i]; 1786 v = aa + 16*ai[i]; 1787 for (j=0; j<nz; j++) { 1788 x = rtmp+16*ajtmpold[j]; 1789 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 1790 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 1791 x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 1792 x[14] = v[14]; x[15] = v[15]; 1793 v += 16; 1794 } 1795 row = *ajtmp++; 1796 while (row < i) { 1797 pc = rtmp + 16*row; 1798 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 1799 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 1800 p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 1801 p15 = pc[14]; p16 = pc[15]; 1802 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 1803 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 1804 p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 1805 || p16 != 0.0) { 1806 pv = ba + 16*diag_offset[row]; 1807 pj = bj + diag_offset[row] + 1; 1808 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1809 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 1810 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 1811 x15 = pv[14]; x16 = pv[15]; 1812 pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; 1813 pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; 1814 pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; 1815 pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; 1816 1817 pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; 1818 pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; 1819 pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; 1820 pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; 1821 1822 pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; 1823 pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; 1824 pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; 1825 pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; 1826 1827 pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; 1828 pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; 1829 pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; 1830 pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; 1831 1832 nz = bi[row+1] - diag_offset[row] - 1; 1833 pv += 16; 1834 for (j=0; j<nz; j++) { 1835 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1836 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 1837 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 1838 x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 1839 x = rtmp + 16*pj[j]; 1840 x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; 1841 x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; 1842 x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; 1843 x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; 1844 1845 x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; 1846 x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; 1847 x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; 1848 x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; 1849 1850 x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; 1851 x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; 1852 x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; 1853 x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; 1854 1855 x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; 1856 x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; 1857 x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; 1858 x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; 1859 1860 pv += 16; 1861 } 1862 PLogFlops(128*nz+112); 1863 } 1864 row = *ajtmp++; 1865 } 1866 /* finished row so stick it into b->a */ 1867 pv = ba + 16*bi[i]; 1868 pj = bj + bi[i]; 1869 nz = bi[i+1] - bi[i]; 1870 for (j=0; j<nz; j++) { 1871 x = rtmp+16*pj[j]; 1872 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 1873 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 1874 pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 1875 pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 1876 pv += 16; 1877 } 1878 /* invert diagonal block */ 1879 w = ba + 16*diag_offset[i]; 1880 ierr = Kernel_A_gets_inverse_A_4(w);CHKERRQ(ierr); 1881 } 1882 1883 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1884 C->factor = FACTOR_LU; 1885 C->assembled = PETSC_TRUE; 1886 PLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */ 1887 PetscFunctionReturn(0); 1888 } 1889 1890 1891 /* ------------------------------------------------------------*/ 1892 /* 1893 Version for when blocks are 3 by 3 1894 */ 1895 #undef __FUNC__ 1896 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_3" 1897 int MatCholeskyFactorNumeric_SeqSBAIJ_3(Mat A,Mat *B) 1898 { 1899 Mat C = *B; 1900 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 1901 IS isrow = b->row,isicol = b->icol; 1902 int *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 1903 int *ajtmpold,*ajtmp,nz,row,*ai=a->i,*aj=a->j; 1904 int *diag_offset = b->diag,idx,*pj; 1905 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1906 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 1907 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9; 1908 MatScalar *ba = b->a,*aa = a->a; 1909 1910 PetscFunctionBegin; 1911 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1912 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1913 rtmp = (MatScalar*)PetscMalloc(9*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 1914 1915 for (i=0; i<n; i++) { 1916 nz = bi[i+1] - bi[i]; 1917 ajtmp = bj + bi[i]; 1918 for (j=0; j<nz; j++) { 1919 x = rtmp + 9*ajtmp[j]; 1920 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0; 1921 } 1922 /* load in initial (unfactored row) */ 1923 idx = r[i]; 1924 nz = ai[idx+1] - ai[idx]; 1925 ajtmpold = aj + ai[idx]; 1926 v = aa + 9*ai[idx]; 1927 for (j=0; j<nz; j++) { 1928 x = rtmp + 9*ic[ajtmpold[j]]; 1929 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 1930 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 1931 v += 9; 1932 } 1933 row = *ajtmp++; 1934 while (row < i) { 1935 pc = rtmp + 9*row; 1936 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 1937 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 1938 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 1939 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) { 1940 pv = ba + 9*diag_offset[row]; 1941 pj = bj + diag_offset[row] + 1; 1942 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1943 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 1944 pc[0] = m1 = p1*x1 + p4*x2 + p7*x3; 1945 pc[1] = m2 = p2*x1 + p5*x2 + p8*x3; 1946 pc[2] = m3 = p3*x1 + p6*x2 + p9*x3; 1947 1948 pc[3] = m4 = p1*x4 + p4*x5 + p7*x6; 1949 pc[4] = m5 = p2*x4 + p5*x5 + p8*x6; 1950 pc[5] = m6 = p3*x4 + p6*x5 + p9*x6; 1951 1952 pc[6] = m7 = p1*x7 + p4*x8 + p7*x9; 1953 pc[7] = m8 = p2*x7 + p5*x8 + p8*x9; 1954 pc[8] = m9 = p3*x7 + p6*x8 + p9*x9; 1955 nz = bi[row+1] - diag_offset[row] - 1; 1956 pv += 9; 1957 for (j=0; j<nz; j++) { 1958 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1959 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 1960 x = rtmp + 9*pj[j]; 1961 x[0] -= m1*x1 + m4*x2 + m7*x3; 1962 x[1] -= m2*x1 + m5*x2 + m8*x3; 1963 x[2] -= m3*x1 + m6*x2 + m9*x3; 1964 1965 x[3] -= m1*x4 + m4*x5 + m7*x6; 1966 x[4] -= m2*x4 + m5*x5 + m8*x6; 1967 x[5] -= m3*x4 + m6*x5 + m9*x6; 1968 1969 x[6] -= m1*x7 + m4*x8 + m7*x9; 1970 x[7] -= m2*x7 + m5*x8 + m8*x9; 1971 x[8] -= m3*x7 + m6*x8 + m9*x9; 1972 pv += 9; 1973 } 1974 PLogFlops(54*nz+36); 1975 } 1976 row = *ajtmp++; 1977 } 1978 /* finished row so stick it into b->a */ 1979 pv = ba + 9*bi[i]; 1980 pj = bj + bi[i]; 1981 nz = bi[i+1] - bi[i]; 1982 for (j=0; j<nz; j++) { 1983 x = rtmp + 9*pj[j]; 1984 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 1985 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 1986 pv += 9; 1987 } 1988 /* invert diagonal block */ 1989 w = ba + 9*diag_offset[i]; 1990 ierr = Kernel_A_gets_inverse_A_3(w);CHKERRQ(ierr); 1991 } 1992 1993 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1994 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1995 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1996 C->factor = FACTOR_LU; 1997 C->assembled = PETSC_TRUE; 1998 PLogFlops(1.3333*27*b->mbs); /* from inverting diagonal blocks */ 1999 PetscFunctionReturn(0); 2000 } 2001 /* 2002 Version for when blocks are 3 by 3 Using natural ordering 2003 */ 2004 #undef __FUNC__ 2005 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering" 2006 int MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering(Mat A,Mat *B) 2007 { 2008 Mat C = *B; 2009 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 2010 int ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 2011 int *ajtmpold,*ajtmp,nz,row; 2012 int *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; 2013 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 2014 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 2015 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9; 2016 MatScalar *ba = b->a,*aa = a->a; 2017 2018 PetscFunctionBegin; 2019 rtmp = (MatScalar*)PetscMalloc(9*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 2020 2021 for (i=0; i<n; i++) { 2022 nz = bi[i+1] - bi[i]; 2023 ajtmp = bj + bi[i]; 2024 for (j=0; j<nz; j++) { 2025 x = rtmp+9*ajtmp[j]; 2026 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0; 2027 } 2028 /* load in initial (unfactored row) */ 2029 nz = ai[i+1] - ai[i]; 2030 ajtmpold = aj + ai[i]; 2031 v = aa + 9*ai[i]; 2032 for (j=0; j<nz; j++) { 2033 x = rtmp+9*ajtmpold[j]; 2034 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 2035 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 2036 v += 9; 2037 } 2038 row = *ajtmp++; 2039 while (row < i) { 2040 pc = rtmp + 9*row; 2041 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 2042 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 2043 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 2044 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) { 2045 pv = ba + 9*diag_offset[row]; 2046 pj = bj + diag_offset[row] + 1; 2047 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 2048 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 2049 pc[0] = m1 = p1*x1 + p4*x2 + p7*x3; 2050 pc[1] = m2 = p2*x1 + p5*x2 + p8*x3; 2051 pc[2] = m3 = p3*x1 + p6*x2 + p9*x3; 2052 2053 pc[3] = m4 = p1*x4 + p4*x5 + p7*x6; 2054 pc[4] = m5 = p2*x4 + p5*x5 + p8*x6; 2055 pc[5] = m6 = p3*x4 + p6*x5 + p9*x6; 2056 2057 pc[6] = m7 = p1*x7 + p4*x8 + p7*x9; 2058 pc[7] = m8 = p2*x7 + p5*x8 + p8*x9; 2059 pc[8] = m9 = p3*x7 + p6*x8 + p9*x9; 2060 2061 nz = bi[row+1] - diag_offset[row] - 1; 2062 pv += 9; 2063 for (j=0; j<nz; j++) { 2064 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 2065 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 2066 x = rtmp + 9*pj[j]; 2067 x[0] -= m1*x1 + m4*x2 + m7*x3; 2068 x[1] -= m2*x1 + m5*x2 + m8*x3; 2069 x[2] -= m3*x1 + m6*x2 + m9*x3; 2070 2071 x[3] -= m1*x4 + m4*x5 + m7*x6; 2072 x[4] -= m2*x4 + m5*x5 + m8*x6; 2073 x[5] -= m3*x4 + m6*x5 + m9*x6; 2074 2075 x[6] -= m1*x7 + m4*x8 + m7*x9; 2076 x[7] -= m2*x7 + m5*x8 + m8*x9; 2077 x[8] -= m3*x7 + m6*x8 + m9*x9; 2078 pv += 9; 2079 } 2080 PLogFlops(54*nz+36); 2081 } 2082 row = *ajtmp++; 2083 } 2084 /* finished row so stick it into b->a */ 2085 pv = ba + 9*bi[i]; 2086 pj = bj + bi[i]; 2087 nz = bi[i+1] - bi[i]; 2088 for (j=0; j<nz; j++) { 2089 x = rtmp+9*pj[j]; 2090 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 2091 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 2092 pv += 9; 2093 } 2094 /* invert diagonal block */ 2095 w = ba + 9*diag_offset[i]; 2096 ierr = Kernel_A_gets_inverse_A_3(w);CHKERRQ(ierr); 2097 } 2098 2099 ierr = PetscFree(rtmp);CHKERRQ(ierr); 2100 C->factor = FACTOR_LU; 2101 C->assembled = PETSC_TRUE; 2102 PLogFlops(1.3333*27*b->mbs); /* from inverting diagonal blocks */ 2103 PetscFunctionReturn(0); 2104 } 2105 2106 /* ------------------------------------------------------------*/ 2107 /* 2108 Version for when blocks are 2 by 2 2109 */ 2110 #undef __FUNC__ 2111 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_2" 2112 int MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,Mat *B) 2113 { 2114 Mat C = *B; 2115 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 2116 IS isrow = b->row,isicol = b->icol; 2117 int *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 2118 int *ajtmpold,*ajtmp,nz,row; 2119 int *diag_offset=b->diag,idx,*ai=a->i,*aj=a->j,*pj; 2120 MatScalar *pv,*v,*rtmp,m1,m2,m3,m4,*pc,*w,*x,x1,x2,x3,x4; 2121 MatScalar p1,p2,p3,p4; 2122 MatScalar *ba = b->a,*aa = a->a; 2123 2124 PetscFunctionBegin; 2125 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 2126 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 2127 rtmp = (MatScalar*)PetscMalloc(4*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 2128 2129 for (i=0; i<n; i++) { 2130 nz = bi[i+1] - bi[i]; 2131 ajtmp = bj + bi[i]; 2132 for (j=0; j<nz; j++) { 2133 x = rtmp+4*ajtmp[j]; x[0] = x[1] = x[2] = x[3] = 0.0; 2134 } 2135 /* load in initial (unfactored row) */ 2136 idx = r[i]; 2137 nz = ai[idx+1] - ai[idx]; 2138 ajtmpold = aj + ai[idx]; 2139 v = aa + 4*ai[idx]; 2140 for (j=0; j<nz; j++) { 2141 x = rtmp+4*ic[ajtmpold[j]]; 2142 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 2143 v += 4; 2144 } 2145 row = *ajtmp++; 2146 while (row < i) { 2147 pc = rtmp + 4*row; 2148 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 2149 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) { 2150 pv = ba + 4*diag_offset[row]; 2151 pj = bj + diag_offset[row] + 1; 2152 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 2153 pc[0] = m1 = p1*x1 + p3*x2; 2154 pc[1] = m2 = p2*x1 + p4*x2; 2155 pc[2] = m3 = p1*x3 + p3*x4; 2156 pc[3] = m4 = p2*x3 + p4*x4; 2157 nz = bi[row+1] - diag_offset[row] - 1; 2158 pv += 4; 2159 for (j=0; j<nz; j++) { 2160 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 2161 x = rtmp + 4*pj[j]; 2162 x[0] -= m1*x1 + m3*x2; 2163 x[1] -= m2*x1 + m4*x2; 2164 x[2] -= m1*x3 + m3*x4; 2165 x[3] -= m2*x3 + m4*x4; 2166 pv += 4; 2167 } 2168 PLogFlops(16*nz+12); 2169 } 2170 row = *ajtmp++; 2171 } 2172 /* finished row so stick it into b->a */ 2173 pv = ba + 4*bi[i]; 2174 pj = bj + bi[i]; 2175 nz = bi[i+1] - bi[i]; 2176 for (j=0; j<nz; j++) { 2177 x = rtmp+4*pj[j]; 2178 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 2179 pv += 4; 2180 } 2181 /* invert diagonal block */ 2182 w = ba + 4*diag_offset[i]; 2183 ierr = Kernel_A_gets_inverse_A_2(w);CHKERRQ(ierr); 2184 /*Kernel_A_gets_inverse_A(bs,w,v_pivots,v_work);*/ 2185 } 2186 2187 ierr = PetscFree(rtmp);CHKERRQ(ierr); 2188 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2189 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2190 C->factor = FACTOR_LU; 2191 C->assembled = PETSC_TRUE; 2192 PLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */ 2193 PetscFunctionReturn(0); 2194 } 2195 /* 2196 Version for when blocks are 2 by 2 Using natural ordering 2197 */ 2198 #undef __FUNC__ 2199 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering" 2200 int MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,Mat *B) 2201 { 2202 Mat C = *B; 2203 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 2204 int ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 2205 int *ajtmpold,*ajtmp,nz,row; 2206 int *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; 2207 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 2208 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,x1,x2,x3,x4; 2209 MatScalar *ba = b->a,*aa = a->a; 2210 2211 PetscFunctionBegin; 2212 rtmp = (MatScalar*)PetscMalloc(4*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 2213 2214 for (i=0; i<n; i++) { 2215 nz = bi[i+1] - bi[i]; 2216 ajtmp = bj + bi[i]; 2217 for (j=0; j<nz; j++) { 2218 x = rtmp+4*ajtmp[j]; 2219 x[0] = x[1] = x[2] = x[3] = 0.0; 2220 } 2221 /* load in initial (unfactored row) */ 2222 nz = ai[i+1] - ai[i]; 2223 ajtmpold = aj + ai[i]; 2224 v = aa + 4*ai[i]; 2225 for (j=0; j<nz; j++) { 2226 x = rtmp+4*ajtmpold[j]; 2227 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 2228 v += 4; 2229 } 2230 row = *ajtmp++; 2231 while (row < i) { 2232 pc = rtmp + 4*row; 2233 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 2234 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) { 2235 pv = ba + 4*diag_offset[row]; 2236 pj = bj + diag_offset[row] + 1; 2237 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 2238 pc[0] = m1 = p1*x1 + p3*x2; 2239 pc[1] = m2 = p2*x1 + p4*x2; 2240 pc[2] = m3 = p1*x3 + p3*x4; 2241 pc[3] = m4 = p2*x3 + p4*x4; 2242 nz = bi[row+1] - diag_offset[row] - 1; 2243 pv += 4; 2244 for (j=0; j<nz; j++) { 2245 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 2246 x = rtmp + 4*pj[j]; 2247 x[0] -= m1*x1 + m3*x2; 2248 x[1] -= m2*x1 + m4*x2; 2249 x[2] -= m1*x3 + m3*x4; 2250 x[3] -= m2*x3 + m4*x4; 2251 pv += 4; 2252 } 2253 PLogFlops(16*nz+12); 2254 } 2255 row = *ajtmp++; 2256 } 2257 /* finished row so stick it into b->a */ 2258 pv = ba + 4*bi[i]; 2259 pj = bj + bi[i]; 2260 nz = bi[i+1] - bi[i]; 2261 for (j=0; j<nz; j++) { 2262 x = rtmp+4*pj[j]; 2263 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 2264 pv += 4; 2265 } 2266 /* invert diagonal block */ 2267 w = ba + 4*diag_offset[i]; 2268 ierr = Kernel_A_gets_inverse_A_2(w);CHKERRQ(ierr); 2269 /*Kernel_A_gets_inverse_A(bs,w,v_pivots,v_work);*/ 2270 } 2271 2272 ierr = PetscFree(rtmp);CHKERRQ(ierr); 2273 C->factor = FACTOR_LU; 2274 C->assembled = PETSC_TRUE; 2275 PLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */ 2276 PetscFunctionReturn(0); 2277 } 2278 2279 /* ----------------------------------------------------------- */ 2280 /* 2281 Version for when blocks are 1 by 1. 2282 */ 2283 #undef __FUNC__ 2284 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_1" 2285 int MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,Mat *B) 2286 { 2287 Mat C = *B; 2288 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 2289 IS ip = b->row; 2290 int *rip,*riip,ierr,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j; 2291 int *ai = a->i,*aj = a->j; 2292 MatScalar *rtmp; 2293 MatScalar *ba = b->a,*aa = a->a; 2294 MatScalar dk,uikdi; 2295 int k,jmin,jmax,*jl,*il,vj,nexti,juj,ili; 2296 2297 PetscFunctionBegin; 2298 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 2299 riip = rip; 2300 2301 /* INITIALIZATION */ 2302 /* il and jl record the first nonzero element in each row of the accessing 2303 window U(0:k, k:mbs-1). 2304 jl: list of rows to be added to uneliminated rows 2305 i>= k: jl(i) is the first row to be added to row i 2306 i< k: jl(i) is the row following row i in some list of rows 2307 jl(i) = mbs indicates the end of a list 2308 il(i): points to the first nonzero element in columns k,...,mbs-1 of 2309 row i of U */ 2310 rtmp = (MatScalar*)PetscMalloc(mbs*sizeof(MatScalar));CHKPTRQ(rtmp); 2311 il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il); 2312 jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl); 2313 for (i=0; i<mbs; i++) { 2314 rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0; 2315 } 2316 2317 /* FOR EACH ROW K */ 2318 for (k = 0; k<mbs; k++){ 2319 2320 /* INITIALIZE K-TH ROW WITH ELEMENTS NONZERO IN ROW P(K) OF A */ 2321 jmin = ai[rip[k]]; jmax = ai[rip[k]+1]; 2322 if (jmin < jmax) { 2323 for (j = jmin; j < jmax; j++){ 2324 vj = riip[aj[j]]; 2325 if (k <= vj) rtmp[vj] = aa[j]; 2326 } 2327 } 2328 2329 /* MODIFY K-TH ROW BY ADDING IN THOSE ROWS I WITH U(I,K) NE 0 2330 FOR EACH ROW I TO BE ADDED IN */ 2331 dk = rtmp[k]; 2332 i = jl[k]; /* first row to be added to k_th row */ 2333 /* printf(" k=%d, pivot row = %d\n",k,i); */ 2334 2335 while (i < mbs){ 2336 nexti = jl[i]; /* next row to be added to k_th row */ 2337 /* printf(" pivot row = %d\n", nexti); */ 2338 2339 /* COMPUTE MULTIPLIER AND UPDATE DIAGONAL ELEMENT */ 2340 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 2341 uikdi = - ba[ili]*ba[i]; 2342 dk += uikdi*ba[ili]; 2343 ba[ili] = uikdi; /* update U(i,k) */ 2344 2345 /* ADD MULTIPLE OF ROW I TO K-TH ROW ... */ 2346 jmin = ili + 1; jmax = bi[i+1]; 2347 if (jmin < jmax){ 2348 for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j]; 2349 /* ... AND ADD I TO ROW LIST FOR NEXT NONZERO ENTRY */ 2350 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 2351 j = bj[jmin]; 2352 jl[i] = jl[j]; jl[j] = i; /* update jl */ 2353 } 2354 i = nexti; 2355 /* printf(" pivot row i=%d\n",i); */ 2356 } 2357 2358 /* CHECK FOR ZERO PIVOT AND SAVE DIAGONAL ELEMENT */ 2359 if (dk == 0.0){ 2360 SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,0,"Zero pivot"); 2361 } 2362 2363 /* SAVE NONZERO ENTRIES IN K-TH ROW OF U ... */ 2364 ba[k] = 1.0/dk; 2365 jmin = bi[k]; jmax = bi[k+1]; 2366 if (jmin < jmax) { 2367 for (j=jmin; j<jmax; j++){ 2368 juj = bj[j]; ba[j] = rtmp[juj]; rtmp[juj] = 0.0; 2369 } 2370 2371 /* ... AND ADD K TO ROW LIST FOR FIRST NONZERO ENTRY IN K-TH ROW */ 2372 il[k] = jmin; 2373 i = bj[jmin]; 2374 jl[k] = jl[i]; jl[i] = k; 2375 } 2376 } 2377 2378 ierr = PetscFree(rtmp);CHKERRQ(ierr); 2379 ierr = PetscFree(il);CHKERRQ(ierr); 2380 ierr = PetscFree(jl);CHKERRQ(ierr); 2381 2382 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 2383 C->factor = FACTOR_LU; 2384 C->assembled = PETSC_TRUE; 2385 PLogFlops(b->mbs); 2386 PetscFunctionReturn(0); 2387 } 2388 2389 #undef __FUNC__ 2390 #define __FUNC__ "MatCholeskyFactor_SeqSBAIJ" 2391 int MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,PetscReal f) 2392 { 2393 Mat_SeqSBAIJ *mat = (Mat_SeqSBAIJ*)A->data; 2394 int ierr,refct; 2395 Mat C; 2396 PetscOps *Abops; 2397 MatOps Aops; 2398 2399 PetscFunctionBegin; 2400 ierr = MatCholeskyFactorSymbolic(A,perm,f,&C);CHKERRQ(ierr); 2401 ierr = MatCholeskyFactorNumeric(A,&C);CHKERRQ(ierr); 2402 2403 /* free all the data structures from mat */ 2404 ierr = PetscFree(mat->a);CHKERRQ(ierr); 2405 if (!mat->singlemalloc) { 2406 ierr = PetscFree(mat->i);CHKERRQ(ierr); 2407 ierr = PetscFree(mat->j);CHKERRQ(ierr); 2408 } 2409 if (mat->diag) {ierr = PetscFree(mat->diag);CHKERRQ(ierr);} 2410 if (mat->ilen) {ierr = PetscFree(mat->ilen);CHKERRQ(ierr);} 2411 if (mat->imax) {ierr = PetscFree(mat->imax);CHKERRQ(ierr);} 2412 if (mat->solve_work) {ierr = PetscFree(mat->solve_work);CHKERRQ(ierr);} 2413 if (mat->mult_work) {ierr = PetscFree(mat->mult_work);CHKERRQ(ierr);} 2414 if (mat->icol) {ierr = ISDestroy(mat->icol);CHKERRQ(ierr);} 2415 ierr = PetscFree(mat);CHKERRQ(ierr); 2416 2417 ierr = MapDestroy(A->rmap);CHKERRQ(ierr); 2418 ierr = MapDestroy(A->cmap);CHKERRQ(ierr); 2419 2420 /* 2421 This is horrible,horrible code. We need to keep the 2422 A pointers for the bops and ops but copy everything 2423 else from C. 2424 */ 2425 Abops = A->bops; 2426 Aops = A->ops; 2427 refct = A->refct; 2428 ierr = PetscMemcpy(A,C,sizeof(struct _p_Mat));CHKERRQ(ierr); 2429 mat = (Mat_SeqSBAIJ*)A->data; 2430 PLogObjectParent(A,mat->icol); 2431 2432 A->bops = Abops; 2433 A->ops = Aops; 2434 A->qlist = 0; 2435 A->refct = refct; 2436 /* copy over the type_name and name */ 2437 ierr = PetscStrallocpy(C->type_name,&A->type_name);CHKERRQ(ierr); 2438 ierr = PetscStrallocpy(C->name,&A->name);CHKERRQ(ierr); 2439 2440 PetscHeaderDestroy(C); 2441 PetscFunctionReturn(0); 2442 } 2443 2444 2445