1a5eb4965SSatish Balay #ifdef PETSC_RCS_HEADER 2*4cd38560SBarry Smith static char vcid[] = "$Id: baijfact.c,v 1.68 1999/02/03 17:22:53 balay Exp bsmith $"; 35d517e7eSBarry Smith #endif 45d517e7eSBarry Smith /* 5ec3a8b7bSBarry Smith Factorization code for BAIJ format. 65d517e7eSBarry Smith */ 75d517e7eSBarry Smith 870f55243SBarry Smith #include "src/mat/impls/baij/seq/baij.h" 9c16cb8f2SBarry Smith #include "src/vec/vecimpl.h" 104078e994SBarry Smith #include "src/inline/ilu.h" 11ec3a8b7bSBarry Smith 12c16cb8f2SBarry Smith 13ec3a8b7bSBarry Smith /* 14ec3a8b7bSBarry Smith The symbolic factorization code is identical to that for AIJ format, 15ec3a8b7bSBarry Smith except for very small changes since this is now a SeqBAIJ datastructure. 16ec3a8b7bSBarry Smith NOT good code reuse. 17ec3a8b7bSBarry Smith */ 185615d1e5SSatish Balay #undef __FUNC__ 195615d1e5SSatish Balay #define __FUNC__ "MatLUFactorSymbolic_SeqBAIJ" 20ec3a8b7bSBarry Smith int MatLUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,double f,Mat *B) 215d517e7eSBarry Smith { 22ec3a8b7bSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b; 235d517e7eSBarry Smith IS isicol; 24ec3a8b7bSBarry Smith int *r,*ic, ierr, i, n = a->mbs, *ai = a->i, *aj = a->j; 259df24120SSatish Balay int *ainew,*ajnew, jmax,*fill, *ajtmp, nz, bs = a->bs, bs2=a->bs2; 2691bf9adeSBarry Smith int *idnew, idx, row,m,fm, nnz, nzi,realloc = 0,nzbd,*im; 275d517e7eSBarry Smith 283a40ed3dSBarry Smith PetscFunctionBegin; 29d3cbd50cSLois Curfman McInnes PetscValidHeaderSpecific(isrow,IS_COOKIE); 30d3cbd50cSLois Curfman McInnes PetscValidHeaderSpecific(iscol,IS_COOKIE); 315d517e7eSBarry Smith ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr); 325d517e7eSBarry Smith ISGetIndices(isrow,&r); ISGetIndices(isicol,&ic); 335d517e7eSBarry Smith 345d517e7eSBarry Smith /* get new row pointers */ 355d517e7eSBarry Smith ainew = (int *) PetscMalloc( (n+1)*sizeof(int) ); CHKPTRQ(ainew); 36de6a44a3SBarry Smith ainew[0] = 0; 375d517e7eSBarry Smith /* don't know how many column pointers are needed so estimate */ 38de6a44a3SBarry Smith jmax = (int) (f*ai[n] + 1); 395d517e7eSBarry Smith ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajnew); 405d517e7eSBarry Smith /* fill is a linked list of nonzeros in active row */ 415d517e7eSBarry Smith fill = (int *) PetscMalloc( (2*n+1)*sizeof(int)); CHKPTRQ(fill); 425d517e7eSBarry Smith im = fill + n + 1; 435d517e7eSBarry Smith /* idnew is location of diagonal in factor */ 445d517e7eSBarry Smith idnew = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(idnew); 45de6a44a3SBarry Smith idnew[0] = 0; 465d517e7eSBarry Smith 475d517e7eSBarry Smith for ( i=0; i<n; i++ ) { 485d517e7eSBarry Smith /* first copy previous fill into linked list */ 495d517e7eSBarry Smith nnz = nz = ai[r[i]+1] - ai[r[i]]; 506b96ef82SSatish Balay if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,1,"Empty row in matrix"); 51de6a44a3SBarry Smith ajtmp = aj + ai[r[i]]; 525d517e7eSBarry Smith fill[n] = n; 535d517e7eSBarry Smith while (nz--) { 545d517e7eSBarry Smith fm = n; 55de6a44a3SBarry Smith idx = ic[*ajtmp++]; 565d517e7eSBarry Smith do { 575d517e7eSBarry Smith m = fm; 585d517e7eSBarry Smith fm = fill[m]; 595d517e7eSBarry Smith } while (fm < idx); 605d517e7eSBarry Smith fill[m] = idx; 615d517e7eSBarry Smith fill[idx] = fm; 625d517e7eSBarry Smith } 635d517e7eSBarry Smith row = fill[n]; 645d517e7eSBarry Smith while ( row < i ) { 65de6a44a3SBarry Smith ajtmp = ajnew + idnew[row] + 1; 665d517e7eSBarry Smith nzbd = 1 + idnew[row] - ainew[row]; 675d517e7eSBarry Smith nz = im[row] - nzbd; 685d517e7eSBarry Smith fm = row; 695d517e7eSBarry Smith while (nz-- > 0) { 70de6a44a3SBarry Smith idx = *ajtmp++; 715d517e7eSBarry Smith nzbd++; 725d517e7eSBarry Smith if (idx == i) im[row] = nzbd; 735d517e7eSBarry Smith do { 745d517e7eSBarry Smith m = fm; 755d517e7eSBarry Smith fm = fill[m]; 765d517e7eSBarry Smith } while (fm < idx); 775d517e7eSBarry Smith if (fm != idx) { 785d517e7eSBarry Smith fill[m] = idx; 795d517e7eSBarry Smith fill[idx] = fm; 805d517e7eSBarry Smith fm = idx; 815d517e7eSBarry Smith nnz++; 825d517e7eSBarry Smith } 835d517e7eSBarry Smith } 845d517e7eSBarry Smith row = fill[row]; 855d517e7eSBarry Smith } 865d517e7eSBarry Smith /* copy new filled row into permanent storage */ 875d517e7eSBarry Smith ainew[i+1] = ainew[i] + nnz; 884078e994SBarry Smith if (ainew[i+1] > jmax) { 89ecf371e4SBarry Smith 90ecf371e4SBarry Smith /* estimate how much additional space we will need */ 91ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 92ecf371e4SBarry Smith /* just double the memory each time */ 93ecf371e4SBarry Smith int maxadd = jmax; 94ecf371e4SBarry Smith /* maxadd = (int) ((f*(ai[n]+1)*(n-i+5))/n); */ 955d517e7eSBarry Smith if (maxadd < nnz) maxadd = (n-i)*(nnz+1); 965d517e7eSBarry Smith jmax += maxadd; 97ecf371e4SBarry Smith 98ecf371e4SBarry Smith /* allocate a longer ajnew */ 995d517e7eSBarry Smith ajtmp = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(ajtmp); 100de6a44a3SBarry Smith PetscMemcpy(ajtmp,ajnew,ainew[i]*sizeof(int)); 1015d517e7eSBarry Smith PetscFree(ajnew); 1025d517e7eSBarry Smith ajnew = ajtmp; 1035d517e7eSBarry Smith realloc++; /* count how many times we realloc */ 1045d517e7eSBarry Smith } 105de6a44a3SBarry Smith ajtmp = ajnew + ainew[i]; 1065d517e7eSBarry Smith fm = fill[n]; 1075d517e7eSBarry Smith nzi = 0; 1085d517e7eSBarry Smith im[i] = nnz; 1095d517e7eSBarry Smith while (nnz--) { 1105d517e7eSBarry Smith if (fm < i) nzi++; 111de6a44a3SBarry Smith *ajtmp++ = fm; 1125d517e7eSBarry Smith fm = fill[fm]; 1135d517e7eSBarry Smith } 1145d517e7eSBarry Smith idnew[i] = ainew[i] + nzi; 1155d517e7eSBarry Smith } 1165d517e7eSBarry Smith 117fbe2de07SSatish Balay if (ai[n] != 0) { 118fbe2de07SSatish Balay double af = ((double)ainew[n])/((double)ai[n]); 119981c4779SBarry Smith PLogInfo(A,"MatLUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n", 120f64065bbSBarry Smith realloc,f,af); 121981c4779SBarry Smith PLogInfo(A,"MatLUFactorSymbolic_SeqBAIJ:Run with -pc_lu_fill %g or use \n",af); 122981c4779SBarry Smith PLogInfo(A,"MatLUFactorSymbolic_SeqBAIJ:PCLUSetFill(pc,%g);\n",af); 123981c4779SBarry Smith PLogInfo(A,"MatLUFactorSymbolic_SeqBAIJ:for best performance.\n"); 12470794672SSatish Balay } else { 125981c4779SBarry Smith PLogInfo(A,"MatLUFactorSymbolic_SeqBAIJ:Empty matrix.\n"); 12670794672SSatish Balay } 12770794672SSatish Balay 1285d517e7eSBarry Smith ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 1295d517e7eSBarry Smith ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 1305d517e7eSBarry Smith 1315d517e7eSBarry Smith PetscFree(fill); 1325d517e7eSBarry Smith 1335d517e7eSBarry Smith /* put together the new matrix */ 134ec3a8b7bSBarry Smith ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,B); CHKERRQ(ierr); 1355d517e7eSBarry Smith PLogObjectParent(*B,isicol); 136ec3a8b7bSBarry Smith b = (Mat_SeqBAIJ *) (*B)->data; 1375d517e7eSBarry Smith PetscFree(b->imax); 1385d517e7eSBarry Smith b->singlemalloc = 0; 1395d517e7eSBarry Smith /* the next line frees the default space generated by the Create() */ 1405d517e7eSBarry Smith PetscFree(b->a); PetscFree(b->ilen); 1413f1db9ecSBarry Smith b->a = (MatScalar *) PetscMalloc((ainew[n]+1)*sizeof(MatScalar)*bs2);CHKPTRQ(b->a); 1425d517e7eSBarry Smith b->j = ajnew; 1435d517e7eSBarry Smith b->i = ainew; 1445d517e7eSBarry Smith b->diag = idnew; 1455d517e7eSBarry Smith b->ilen = 0; 1465d517e7eSBarry Smith b->imax = 0; 1475d517e7eSBarry Smith b->row = isrow; 1485d517e7eSBarry Smith b->col = iscol; 149e51c0b9cSSatish Balay b->icol = isicol; 1503a40ed3dSBarry Smith b->solve_work = (Scalar *) PetscMalloc( (bs*n+bs)*sizeof(Scalar));CHKPTRQ(b->solve_work); 1515d517e7eSBarry Smith /* In b structure: Free imax, ilen, old a, old j. 1525d517e7eSBarry Smith Allocate idnew, solve_work, new a, new j */ 1533f1db9ecSBarry Smith PLogObjectMemory(*B,(ainew[n]-n)*(sizeof(int)+sizeof(MatScalar))); 154de6a44a3SBarry Smith b->maxnz = b->nz = ainew[n]; 1555d517e7eSBarry Smith 156e93ccc4dSBarry Smith (*B)->factor = FACTOR_LU; 157ae068f58SLois Curfman McInnes (*B)->info.factor_mallocs = realloc; 158ae068f58SLois Curfman McInnes (*B)->info.fill_ratio_given = f; 159e93ccc4dSBarry Smith if (ai[n] != 0) { 160e93ccc4dSBarry Smith (*B)->info.fill_ratio_needed = ((double)ainew[n])/((double)ai[n]); 161966b9fa7SBarry Smith } else { 162966b9fa7SBarry Smith (*B)->info.fill_ratio_needed = 0.0; 163966b9fa7SBarry Smith } 164966b9fa7SBarry Smith 165ae068f58SLois Curfman McInnes 1663a40ed3dSBarry Smith PetscFunctionReturn(0); 1675d517e7eSBarry Smith } 1685d517e7eSBarry Smith 169ec3a8b7bSBarry Smith /* ----------------------------------------------------------- */ 1705615d1e5SSatish Balay #undef __FUNC__ 1715615d1e5SSatish Balay #define __FUNC__ "MatLUFactorNumeric_SeqBAIJ_N" 1727fc0212eSBarry Smith int MatLUFactorNumeric_SeqBAIJ_N(Mat A,Mat *B) 1735d517e7eSBarry Smith { 1745d517e7eSBarry Smith Mat C = *B; 175ec3a8b7bSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data; 17608b7f112SBarry Smith IS isrow = b->row, isicol = b->icol; 1774078e994SBarry Smith int *r,*ic, ierr, i, j, n = a->mbs, *bi = b->i, *bj = b->j; 17848e9ddb2SSatish Balay int *ajtmpold, *ajtmp, nz, row, bslog,*ai=a->i,*aj=a->j,k,flg; 17948e9ddb2SSatish Balay int *diag_offset=b->diag,diag,bs=a->bs,bs2 = a->bs2,*v_pivots; 1805d517e7eSBarry Smith register int *pj; 1813f1db9ecSBarry Smith register MatScalar *pv,*v,*rtmp,*multiplier,*v_work,*pc,*w; 1823f1db9ecSBarry Smith MatScalar *ba = b->a,*aa = a->a; 1835d517e7eSBarry Smith 1843a40ed3dSBarry Smith PetscFunctionBegin; 1855d517e7eSBarry Smith ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr); 1865d517e7eSBarry Smith ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr); 1873f1db9ecSBarry Smith rtmp = (MatScalar *) PetscMalloc(bs2*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 1883f1db9ecSBarry Smith PetscMemzero(rtmp,bs2*(n+1)*sizeof(MatScalar)); 189ec3a8b7bSBarry Smith /* generate work space needed by dense LU factorization */ 1903f1db9ecSBarry Smith v_work = (MatScalar *) PetscMalloc(bs*sizeof(int) + (bs+bs2)*sizeof(MatScalar));CHKPTRQ(v_work); 1916d84be18SBarry Smith multiplier = v_work + bs; 192de6a44a3SBarry Smith v_pivots = (int *) (multiplier + bs2); 193de6a44a3SBarry Smith 194de6a44a3SBarry Smith /* flops in while loop */ 1954eeb42bcSBarry Smith bslog = 2*bs*bs2; 1965d517e7eSBarry Smith 1975d517e7eSBarry Smith for ( i=0; i<n; i++ ) { 1984078e994SBarry Smith nz = bi[i+1] - bi[i]; 1994078e994SBarry Smith ajtmp = bj + bi[i]; 2004eeb42bcSBarry Smith for ( j=0; j<nz; j++ ) { 2013f1db9ecSBarry Smith PetscMemzero(rtmp+bs2*ajtmp[j],bs2*sizeof(MatScalar)); 2024eeb42bcSBarry Smith } 2035d517e7eSBarry Smith /* load in initial (unfactored row) */ 2044078e994SBarry Smith nz = ai[r[i]+1] - ai[r[i]]; 2054078e994SBarry Smith ajtmpold = aj + ai[r[i]]; 2064078e994SBarry Smith v = aa + bs2*ai[r[i]]; 2074eeb42bcSBarry Smith for ( j=0; j<nz; j++ ) { 2083f1db9ecSBarry Smith PetscMemcpy(rtmp+bs2*ic[ajtmpold[j]],v+bs2*j,bs2*sizeof(MatScalar)); 2094eeb42bcSBarry Smith } 210de6a44a3SBarry Smith row = *ajtmp++; 2115d517e7eSBarry Smith while (row < i) { 212de6a44a3SBarry Smith pc = rtmp + bs2*row; 213de6a44a3SBarry Smith /* if (*pc) { */ 214974088aaSSatish Balay for ( flg=0,k=0; k<bs2; k++ ) { if (pc[k]!=0.0) { flg =1; break; }} 21548e9ddb2SSatish Balay if (flg) { 2164078e994SBarry Smith pv = ba + bs2*diag_offset[row]; 2174078e994SBarry Smith pj = bj + diag_offset[row] + 1; 2184078e994SBarry Smith Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); 2194078e994SBarry Smith nz = bi[row+1] - diag_offset[row] - 1; 220ec3a8b7bSBarry Smith pv += bs2; 2214eeb42bcSBarry Smith for (j=0; j<nz; j++) { 2224078e994SBarry Smith Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 2234eeb42bcSBarry Smith } 2244eeb42bcSBarry Smith PLogFlops(bslog*(nz+1)-bs); 22548e9ddb2SSatish Balay } 226de6a44a3SBarry Smith row = *ajtmp++; 2275d517e7eSBarry Smith } 2285d517e7eSBarry Smith /* finished row so stick it into b->a */ 2294078e994SBarry Smith pv = ba + bs2*bi[i]; 2304078e994SBarry Smith pj = bj + bi[i]; 2314078e994SBarry Smith nz = bi[i+1] - bi[i]; 2324eeb42bcSBarry Smith for ( j=0; j<nz; j++ ) { 2333f1db9ecSBarry Smith PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar)); 2344eeb42bcSBarry Smith } 2354078e994SBarry Smith diag = diag_offset[i] - bi[i]; 236ec3a8b7bSBarry Smith /* invert diagonal block */ 2374eeb42bcSBarry Smith w = pv + bs2*diag; 2384078e994SBarry Smith Kernel_A_gets_inverse_A(bs,w,v_pivots,v_work); 2395d517e7eSBarry Smith } 2405d517e7eSBarry Smith 241de6a44a3SBarry Smith PetscFree(rtmp); PetscFree(v_work); 2425d517e7eSBarry Smith ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 2435d517e7eSBarry Smith ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 2445d517e7eSBarry Smith C->factor = FACTOR_LU; 2455d517e7eSBarry Smith C->assembled = PETSC_TRUE; 246de6a44a3SBarry Smith PLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */ 2473a40ed3dSBarry Smith PetscFunctionReturn(0); 2485d517e7eSBarry Smith } 2494eeb42bcSBarry Smith /* ------------------------------------------------------------*/ 2504eeb42bcSBarry Smith /* 251*4cd38560SBarry Smith Version for when blocks are 7 by 7 252*4cd38560SBarry Smith */ 253*4cd38560SBarry Smith #undef __FUNC__ 254*4cd38560SBarry Smith #define __FUNC__ "MatLUFactorNumeric_SeqBAIJ_7" 255*4cd38560SBarry Smith int MatLUFactorNumeric_SeqBAIJ_7(Mat A,Mat *B) 256*4cd38560SBarry Smith { 257*4cd38560SBarry Smith Mat C = *B; 258*4cd38560SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data; 259*4cd38560SBarry Smith IS isrow = b->row, isicol = b->icol; 260*4cd38560SBarry Smith int *r,*ic, ierr, i, j, n = a->mbs, *bi = b->i, *bj = b->j; 261*4cd38560SBarry Smith int *ajtmpold, *ajtmp, nz, row; 262*4cd38560SBarry Smith int *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j; 263*4cd38560SBarry Smith register int *pj; 264*4cd38560SBarry Smith register MatScalar *pv,*v,*rtmp,*pc,*w,*x; 265*4cd38560SBarry Smith MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 266*4cd38560SBarry Smith MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 267*4cd38560SBarry Smith MatScalar x17,x18,x19,x20,x21,x22,x23,x24,x25,p10,p11,p12,p13,p14; 268*4cd38560SBarry Smith MatScalar p15,p16,p17,p18,p19,p20,p21,p22,p23,p24,p25,m10,m11,m12; 269*4cd38560SBarry Smith MatScalar m13,m14,m15,m16,m17,m18,m19,m20,m21,m22,m23,m24,m25; 270*4cd38560SBarry Smith MatScalar p26,p27,p28,p29,p30,p31,p32,p33,p34,p35,p36; 271*4cd38560SBarry Smith MatScalar p37,p38,p39,p40,p41,p42,p43,p44,p45,p46,p47,p48,p49; 272*4cd38560SBarry Smith MatScalar x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36; 273*4cd38560SBarry Smith MatScalar x37,x38,x39,x40,x41,x42,x43,x44,x45,x46,x47,x48,x49; 274*4cd38560SBarry Smith MatScalar m26,m27,m28,m29,m30,m31,m32,m33,m34,m35,m36; 275*4cd38560SBarry Smith MatScalar m37,m38,m39,m40,m41,m42,m43,m44,m45,m46,m47,m48,m49; 276*4cd38560SBarry Smith MatScalar *ba = b->a,*aa = a->a; 277*4cd38560SBarry Smith 278*4cd38560SBarry Smith PetscFunctionBegin; 279*4cd38560SBarry Smith ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr); 280*4cd38560SBarry Smith ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr); 281*4cd38560SBarry Smith rtmp = (MatScalar *) PetscMalloc(49*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 282*4cd38560SBarry Smith 283*4cd38560SBarry Smith for ( i=0; i<n; i++ ) { 284*4cd38560SBarry Smith nz = bi[i+1] - bi[i]; 285*4cd38560SBarry Smith ajtmp = bj + bi[i]; 286*4cd38560SBarry Smith for ( j=0; j<nz; j++ ) { 287*4cd38560SBarry Smith x = rtmp+49*ajtmp[j]; 288*4cd38560SBarry Smith x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 289*4cd38560SBarry Smith x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0; 290*4cd38560SBarry Smith x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = x[25] = 0.0 ; 291*4cd38560SBarry Smith x[26] = x[27] = x[28] = x[29] = x[30] = x[31] = x[32] = x[33] = 0.0 ; 292*4cd38560SBarry Smith x[34] = x[35] = x[36] = x[37] = x[38] = x[39] = x[40] = x[41] = 0.0 ; 293*4cd38560SBarry Smith x[42] = x[43] = x[44] = x[45] = x[46] = x[47] = x[48] = x[49] = 0.0 ; 294*4cd38560SBarry Smith } 295*4cd38560SBarry Smith /* load in initial (unfactored row) */ 296*4cd38560SBarry Smith idx = r[i]; 297*4cd38560SBarry Smith nz = ai[idx+1] - ai[idx]; 298*4cd38560SBarry Smith ajtmpold = aj + ai[idx]; 299*4cd38560SBarry Smith v = aa + 49*ai[idx]; 300*4cd38560SBarry Smith for ( j=0; j<nz; j++ ) { 301*4cd38560SBarry Smith x = rtmp+49*ic[ajtmpold[j]]; 302*4cd38560SBarry Smith x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 303*4cd38560SBarry Smith x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; 304*4cd38560SBarry Smith x[8] = v[8]; x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; 305*4cd38560SBarry Smith x[12] = v[12]; x[13] = v[13]; x[14] = v[14]; x[15] = v[15]; 306*4cd38560SBarry Smith x[16] = v[16]; x[17] = v[17]; x[18] = v[18]; x[19] = v[19]; 307*4cd38560SBarry Smith x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23]; 308*4cd38560SBarry Smith x[24] = v[24]; x[25] = v[25]; x[26] = v[26]; x[27] = v[27]; 309*4cd38560SBarry Smith x[28] = v[28]; x[29] = v[29]; x[30] = v[30]; x[31] = v[31]; 310*4cd38560SBarry Smith x[32] = v[32]; x[33] = v[33]; x[34] = v[34]; x[35] = v[35]; 311*4cd38560SBarry Smith x[36] = v[36]; x[37] = v[37]; x[38] = v[38]; x[39] = v[39]; 312*4cd38560SBarry Smith x[40] = v[40]; x[41] = v[41]; x[42] = v[42]; x[43] = v[43]; 313*4cd38560SBarry Smith x[44] = v[44]; x[45] = v[45]; x[46] = v[46]; x[47] = v[47]; 314*4cd38560SBarry Smith x[48] = v[48]; 315*4cd38560SBarry Smith v += 49; 316*4cd38560SBarry Smith } 317*4cd38560SBarry Smith row = *ajtmp++; 318*4cd38560SBarry Smith while (row < i) { 319*4cd38560SBarry Smith pc = rtmp + 49*row; 320*4cd38560SBarry Smith p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 321*4cd38560SBarry Smith p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; 322*4cd38560SBarry Smith p9 = pc[8]; p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; 323*4cd38560SBarry Smith p13 = pc[12]; p14 = pc[13]; p15 = pc[14]; p16 = pc[15]; 324*4cd38560SBarry Smith p17 = pc[16]; p18 = pc[17]; p19 = pc[18]; p20 = pc[19]; 325*4cd38560SBarry Smith p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23]; 326*4cd38560SBarry Smith p25 = pc[24]; p26 = pc[25]; p27 = pc[26]; p28 = pc[27]; 327*4cd38560SBarry Smith p29 = pc[28]; p30 = pc[29]; p31 = pc[30]; p32 = pc[31]; 328*4cd38560SBarry Smith p33 = pc[32]; p34 = pc[33]; p35 = pc[34]; p36 = pc[35]; 329*4cd38560SBarry Smith p37 = pc[36]; p38 = pc[37]; p39 = pc[38]; p40 = pc[39]; 330*4cd38560SBarry Smith p41 = pc[40]; p42 = pc[41]; p43 = pc[42]; p44 = pc[43]; 331*4cd38560SBarry Smith p45 = pc[44]; p46 = pc[45]; p47 = pc[46]; p48 = pc[47]; 332*4cd38560SBarry Smith p49 = pc[48]; 333*4cd38560SBarry Smith if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || 334*4cd38560SBarry Smith p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || 335*4cd38560SBarry Smith p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || 336*4cd38560SBarry Smith p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || 337*4cd38560SBarry Smith p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || 338*4cd38560SBarry Smith p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || 339*4cd38560SBarry Smith p25 != 0.0 || p26 != 0.0 || p27 != 0.0 || p28 != 0.0 || 340*4cd38560SBarry Smith p29 != 0.0 || p30 != 0.0 || p31 != 0.0 || p32 != 0.0 || 341*4cd38560SBarry Smith p33 != 0.0 || p34 != 0.0 || p35 != 0.0 || p36 != 0.0 || 342*4cd38560SBarry Smith p37 != 0.0 || p38 != 0.0 || p39 != 0.0 || p40 != 0.0 || 343*4cd38560SBarry Smith p41 != 0.0 || p42 != 0.0 || p43 != 0.0 || p44 != 0.0 || 344*4cd38560SBarry Smith p45 != 0.0 || p46 != 0.0 || p47 != 0.0 || p48 != 0.0 || 345*4cd38560SBarry Smith p49 != 0.0) { 346*4cd38560SBarry Smith pv = ba + 49*diag_offset[row]; 347*4cd38560SBarry Smith pj = bj + diag_offset[row] + 1; 348*4cd38560SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 349*4cd38560SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; 350*4cd38560SBarry Smith x9 = pv[8]; x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; 351*4cd38560SBarry Smith x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 352*4cd38560SBarry Smith x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; 353*4cd38560SBarry Smith x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 354*4cd38560SBarry Smith x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27]; 355*4cd38560SBarry Smith x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31]; 356*4cd38560SBarry Smith x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35]; 357*4cd38560SBarry Smith x37 = pv[36]; x38 = pv[37]; x39 = pv[38]; x40 = pv[39]; 358*4cd38560SBarry Smith x41 = pv[40]; x42 = pv[41]; x43 = pv[42]; x44 = pv[43]; 359*4cd38560SBarry Smith x45 = pv[44]; x46 = pv[45]; x47 = pv[46]; x48 = pv[47]; 360*4cd38560SBarry Smith x49 = pv[48]; 361*4cd38560SBarry Smith pc[0] = m1 = p1*x1 + p8*x2 + p15*x3 + p22*x4 + p29*x5 + p36*x6 + p43*x7; 362*4cd38560SBarry Smith pc[1] = m2 = p2*x1 + p9*x2 + p16*x3 + p23*x4 + p30*x5 + p37*x6 + p44*x7; 363*4cd38560SBarry Smith pc[2] = m3 = p3*x1 + p10*x2 + p17*x3 + p24*x4 + p31*x5 + p38*x6 + p45*x7; 364*4cd38560SBarry Smith pc[3] = m4 = p4*x1 + p11*x2 + p18*x3 + p25*x4 + p32*x5 + p39*x6 + p46*x7; 365*4cd38560SBarry Smith pc[4] = m5 = p5*x1 + p12*x2 + p19*x3 + p26*x4 + p33*x5 + p40*x6 + p47*x7; 366*4cd38560SBarry Smith pc[5] = m6 = p6*x1 + p13*x2 + p20*x3 + p27*x4 + p34*x5 + p41*x6 + p48*x7; 367*4cd38560SBarry Smith pc[6] = m7 = p7*x1 + p14*x2 + p21*x3 + p28*x4 + p35*x5 + p42*x6 + p49*x7; 368*4cd38560SBarry Smith 369*4cd38560SBarry Smith pc[7] = m8 = p1*x8 + p8*x9 + p15*x10 + p22*x11 + p29*x12 + p36*x13 + p43*x14; 370*4cd38560SBarry Smith pc[8] = m9 = p2*x8 + p9*x9 + p16*x10 + p23*x11 + p30*x12 + p37*x13 + p44*x14; 371*4cd38560SBarry Smith pc[9] = m10 = p3*x8 + p10*x9 + p17*x10 + p24*x11 + p31*x12 + p38*x13 + p45*x14; 372*4cd38560SBarry Smith pc[10] = m11 = p4*x8 + p11*x9 + p18*x10 + p25*x11 + p32*x12 + p39*x13 + p46*x14; 373*4cd38560SBarry Smith pc[11] = m12 = p5*x8 + p12*x9 + p19*x10 + p26*x11 + p33*x12 + p40*x13 + p47*x14; 374*4cd38560SBarry Smith pc[12] = m13 = p6*x8 + p13*x9 + p20*x10 + p27*x11 + p34*x12 + p41*x13 + p48*x14; 375*4cd38560SBarry Smith pc[13] = m14 = p7*x8 + p14*x9 + p21*x10 + p28*x11 + p35*x12 + p42*x13 + p49*x14; 376*4cd38560SBarry Smith 377*4cd38560SBarry Smith pc[14] = m15 = p1*x15 + p8*x16 + p15*x17 + p22*x18 + p29*x19 + p36*x20 + p43*x21; 378*4cd38560SBarry Smith pc[15] = m16 = p2*x15 + p9*x16 + p16*x17 + p23*x18 + p30*x19 + p37*x20 + p44*x21; 379*4cd38560SBarry Smith pc[16] = m17 = p3*x15 + p10*x16 + p17*x17 + p24*x18 + p31*x19 + p38*x20 + p45*x21; 380*4cd38560SBarry Smith pc[17] = m18 = p4*x15 + p11*x16 + p18*x17 + p25*x18 + p32*x19 + p39*x20 + p46*x21; 381*4cd38560SBarry Smith pc[18] = m19 = p5*x15 + p12*x16 + p19*x17 + p26*x18 + p33*x19 + p40*x20 + p47*x21; 382*4cd38560SBarry Smith pc[19] = m20 = p6*x15 + p13*x16 + p20*x17 + p27*x18 + p34*x19 + p41*x20 + p48*x21; 383*4cd38560SBarry Smith pc[20] = m21 = p7*x15 + p14*x16 + p21*x17 + p28*x18 + p35*x19 + p42*x20 + p49*x21; 384*4cd38560SBarry Smith 385*4cd38560SBarry Smith pc[21] = m22 = p1*x22 + p8*x23 + p15*x24 + p22*x25 + p29*x26 + p36*x27 + p43*x28; 386*4cd38560SBarry Smith pc[22] = m23 = p2*x22 + p9*x23 + p16*x24 + p23*x25 + p30*x26 + p37*x27 + p44*x28; 387*4cd38560SBarry Smith pc[23] = m24 = p3*x22 + p10*x23 + p17*x24 + p24*x25 + p31*x26 + p38*x27 + p45*x28; 388*4cd38560SBarry Smith pc[24] = m25 = p4*x22 + p11*x23 + p18*x24 + p25*x25 + p32*x26 + p39*x27 + p46*x28; 389*4cd38560SBarry Smith pc[25] = m26 = p5*x22 + p12*x23 + p19*x24 + p26*x25 + p33*x26 + p40*x27 + p47*x28; 390*4cd38560SBarry Smith pc[26] = m27 = p6*x22 + p13*x23 + p20*x24 + p27*x25 + p34*x26 + p41*x27 + p48*x28; 391*4cd38560SBarry Smith pc[27] = m28 = p7*x22 + p14*x23 + p21*x24 + p28*x25 + p35*x26 + p42*x27 + p49*x28; 392*4cd38560SBarry Smith 393*4cd38560SBarry Smith pc[28] = m29 = p1*x29 + p8*x30 + p15*x31 + p22*x32 + p29*x33 + p36*x34 + p43*x35; 394*4cd38560SBarry Smith pc[29] = m30 = p2*x29 + p9*x30 + p16*x31 + p23*x32 + p30*x33 + p37*x34 + p44*x35; 395*4cd38560SBarry Smith pc[30] = m31 = p3*x29 + p10*x30 + p17*x31 + p24*x32 + p31*x33 + p38*x34 + p45*x35; 396*4cd38560SBarry Smith pc[31] = m32 = p4*x29 + p11*x30 + p18*x31 + p25*x32 + p32*x33 + p39*x34 + p46*x35; 397*4cd38560SBarry Smith pc[32] = m33 = p5*x29 + p12*x30 + p19*x31 + p26*x32 + p33*x33 + p40*x34 + p47*x35; 398*4cd38560SBarry Smith pc[33] = m34 = p6*x29 + p13*x30 + p20*x31 + p27*x32 + p34*x33 + p41*x34 + p48*x35; 399*4cd38560SBarry Smith pc[34] = m35 = p7*x29 + p14*x30 + p21*x31 + p28*x32 + p35*x33 + p42*x34 + p49*x35; 400*4cd38560SBarry Smith 401*4cd38560SBarry Smith pc[35] = m36 = p1*x36 + p8*x37 + p15*x38 + p22*x39 + p29*x40 + p36*x41 + p43*x42; 402*4cd38560SBarry Smith pc[36] = m37 = p2*x36 + p9*x37 + p16*x38 + p23*x39 + p30*x40 + p37*x41 + p44*x42; 403*4cd38560SBarry Smith pc[37] = m38 = p3*x36 + p10*x37 + p17*x38 + p24*x39 + p31*x40 + p38*x41 + p45*x42; 404*4cd38560SBarry Smith pc[38] = m39 = p4*x36 + p11*x37 + p18*x38 + p25*x39 + p32*x40 + p39*x41 + p46*x42; 405*4cd38560SBarry Smith pc[39] = m40 = p5*x36 + p12*x37 + p19*x38 + p26*x39 + p33*x40 + p40*x41 + p47*x42; 406*4cd38560SBarry Smith pc[40] = m41 = p6*x36 + p13*x37 + p20*x38 + p27*x39 + p34*x40 + p41*x41 + p48*x42; 407*4cd38560SBarry Smith pc[41] = m42 = p7*x36 + p14*x37 + p21*x38 + p28*x39 + p35*x40 + p42*x41 + p49*x42; 408*4cd38560SBarry Smith 409*4cd38560SBarry Smith pc[42] = m43 = p1*x43 + p8*x44 + p15*x45 + p22*x46 + p29*x47 + p36*x48 + p43*x49; 410*4cd38560SBarry Smith pc[43] = m44 = p2*x43 + p9*x44 + p16*x45 + p23*x46 + p30*x47 + p37*x48 + p44*x49; 411*4cd38560SBarry Smith pc[44] = m45 = p3*x43 + p10*x44 + p17*x45 + p24*x46 + p31*x47 + p38*x48 + p45*x49; 412*4cd38560SBarry Smith pc[45] = m46 = p4*x43 + p11*x44 + p18*x45 + p25*x46 + p32*x47 + p39*x48 + p46*x49; 413*4cd38560SBarry Smith pc[46] = m47 = p5*x43 + p12*x44 + p19*x45 + p26*x46 + p33*x47 + p40*x48 + p47*x49; 414*4cd38560SBarry Smith pc[47] = m48 = p6*x43 + p13*x44 + p20*x45 + p27*x46 + p34*x47 + p41*x48 + p48*x49; 415*4cd38560SBarry Smith pc[48] = m49 = p7*x43 + p14*x44 + p21*x45 + p28*x46 + p35*x47 + p42*x48 + p49*x49; 416*4cd38560SBarry Smith 417*4cd38560SBarry Smith nz = bi[row+1] - diag_offset[row] - 1; 418*4cd38560SBarry Smith pv += 49; 419*4cd38560SBarry Smith for (j=0; j<nz; j++) { 420*4cd38560SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 421*4cd38560SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; 422*4cd38560SBarry Smith x9 = pv[8]; x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; 423*4cd38560SBarry Smith x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 424*4cd38560SBarry Smith x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; 425*4cd38560SBarry Smith x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 426*4cd38560SBarry Smith x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27]; 427*4cd38560SBarry Smith x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31]; 428*4cd38560SBarry Smith x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35]; 429*4cd38560SBarry Smith x37 = pv[36]; x38 = pv[37]; x39 = pv[38]; x40 = pv[39]; 430*4cd38560SBarry Smith x41 = pv[40]; x42 = pv[41]; x43 = pv[42]; x44 = pv[43]; 431*4cd38560SBarry Smith x45 = pv[44]; x46 = pv[45]; x47 = pv[46]; x48 = pv[47]; 432*4cd38560SBarry Smith x49 = pv[48]; 433*4cd38560SBarry Smith x = rtmp + 49*pj[j]; 434*4cd38560SBarry Smith x[0] -= m1*x1 + m8*x2 + m15*x3 + m22*x4 + m29*x5 + m36*x6 + m43*x7; 435*4cd38560SBarry Smith x[1] -= m2*x1 + m9*x2 + m16*x3 + m23*x4 + m30*x5 + m37*x6 + m44*x7; 436*4cd38560SBarry Smith x[2] -= m3*x1 + m10*x2 + m17*x3 + m24*x4 + m31*x5 + m38*x6 + m45*x7; 437*4cd38560SBarry Smith x[3] -= m4*x1 + m11*x2 + m18*x3 + m25*x4 + m32*x5 + m39*x6 + m46*x7; 438*4cd38560SBarry Smith x[4] -= m5*x1 + m12*x2 + m19*x3 + m26*x4 + m33*x5 + m40*x6 + m47*x7; 439*4cd38560SBarry Smith x[5] -= m6*x1 + m13*x2 + m20*x3 + m27*x4 + m34*x5 + m41*x6 + m48*x7; 440*4cd38560SBarry Smith x[6] -= m7*x1 + m14*x2 + m21*x3 + m28*x4 + m35*x5 + m42*x6 + m49*x7; 441*4cd38560SBarry Smith 442*4cd38560SBarry Smith x[7] -= m1*x8 + m8*x9 + m15*x10 + m22*x11 + m29*x12 + m36*x13 + m43*x14; 443*4cd38560SBarry Smith x[8] -= m2*x8 + m9*x9 + m16*x10 + m23*x11 + m30*x12 + m37*x13 + m44*x14; 444*4cd38560SBarry Smith x[9] -= m3*x8 + m10*x9 + m17*x10 + m24*x11 + m31*x12 + m38*x13 + m45*x14; 445*4cd38560SBarry Smith x[10] -= m4*x8 + m11*x9 + m18*x10 + m25*x11 + m32*x12 + m39*x13 + m46*x14; 446*4cd38560SBarry Smith x[11] -= m5*x8 + m12*x9 + m19*x10 + m26*x11 + m33*x12 + m40*x13 + m47*x14; 447*4cd38560SBarry Smith x[12] -= m6*x8 + m13*x9 + m20*x10 + m27*x11 + m34*x12 + m41*x13 + m48*x14; 448*4cd38560SBarry Smith x[13] -= m7*x8 + m14*x9 + m21*x10 + m28*x11 + m35*x12 + m42*x13 + m49*x14; 449*4cd38560SBarry Smith 450*4cd38560SBarry Smith x[14] -= m1*x15 + m8*x16 + m15*x17 + m22*x18 + m29*x19 + m36*x20 + m43*x21; 451*4cd38560SBarry Smith x[15] -= m2*x15 + m9*x16 + m16*x17 + m23*x18 + m30*x19 + m37*x20 + m44*x21; 452*4cd38560SBarry Smith x[16] -= m3*x15 + m10*x16 + m17*x17 + m24*x18 + m31*x19 + m38*x20 + m45*x21; 453*4cd38560SBarry Smith x[17] -= m4*x15 + m11*x16 + m18*x17 + m25*x18 + m32*x19 + m39*x20 + m46*x21; 454*4cd38560SBarry Smith x[18] -= m5*x15 + m12*x16 + m19*x17 + m26*x18 + m33*x19 + m40*x20 + m47*x21; 455*4cd38560SBarry Smith x[19] -= m6*x15 + m13*x16 + m20*x17 + m27*x18 + m34*x19 + m41*x20 + m48*x21; 456*4cd38560SBarry Smith x[20] -= m7*x15 + m14*x16 + m21*x17 + m28*x18 + m35*x19 + m42*x20 + m49*x21; 457*4cd38560SBarry Smith 458*4cd38560SBarry Smith x[21] -= m1*x22 + m8*x23 + m15*x24 + m22*x25 + m29*x26 + m36*x27 + m43*x28; 459*4cd38560SBarry Smith x[22] -= m2*x22 + m9*x23 + m16*x24 + m23*x25 + m30*x26 + m37*x27 + m44*x28; 460*4cd38560SBarry Smith x[23] -= m3*x22 + m10*x23 + m17*x24 + m24*x25 + m31*x26 + m38*x27 + m45*x28; 461*4cd38560SBarry Smith x[24] -= m4*x22 + m11*x23 + m18*x24 + m25*x25 + m32*x26 + m39*x27 + m46*x28; 462*4cd38560SBarry Smith x[25] -= m5*x22 + m12*x23 + m19*x24 + m26*x25 + m33*x26 + m40*x27 + m47*x28; 463*4cd38560SBarry Smith x[26] -= m6*x22 + m13*x23 + m20*x24 + m27*x25 + m34*x26 + m41*x27 + m48*x28; 464*4cd38560SBarry Smith x[27] -= m7*x22 + m14*x23 + m21*x24 + m28*x25 + m35*x26 + m42*x27 + m49*x28; 465*4cd38560SBarry Smith 466*4cd38560SBarry Smith x[28] -= m1*x29 + m8*x30 + m15*x31 + m22*x32 + m29*x33 + m36*x34 + m43*x35; 467*4cd38560SBarry Smith x[29] -= m2*x29 + m9*x30 + m16*x31 + m23*x32 + m30*x33 + m37*x34 + m44*x35; 468*4cd38560SBarry Smith x[30] -= m3*x29 + m10*x30 + m17*x31 + m24*x32 + m31*x33 + m38*x34 + m45*x35; 469*4cd38560SBarry Smith x[31] -= m4*x29 + m11*x30 + m18*x31 + m25*x32 + m32*x33 + m39*x34 + m46*x35; 470*4cd38560SBarry Smith x[32] -= m5*x29 + m12*x30 + m19*x31 + m26*x32 + m33*x33 + m40*x34 + m47*x35; 471*4cd38560SBarry Smith x[33] -= m6*x29 + m13*x30 + m20*x31 + m27*x32 + m34*x33 + m41*x34 + m48*x35; 472*4cd38560SBarry Smith x[34] -= m7*x29 + m14*x30 + m21*x31 + m28*x32 + m35*x33 + m42*x34 + m49*x35; 473*4cd38560SBarry Smith 474*4cd38560SBarry Smith x[35] -= m1*x36 + m8*x37 + m15*x38 + m22*x39 + m29*x40 + m36*x41 + m43*x42; 475*4cd38560SBarry Smith x[36] -= m2*x36 + m9*x37 + m16*x38 + m23*x39 + m30*x40 + m37*x41 + m44*x42; 476*4cd38560SBarry Smith x[37] -= m3*x36 + m10*x37 + m17*x38 + m24*x39 + m31*x40 + m38*x41 + m45*x42; 477*4cd38560SBarry Smith x[38] -= m4*x36 + m11*x37 + m18*x38 + m25*x39 + m32*x40 + m39*x41 + m46*x42; 478*4cd38560SBarry Smith x[39] -= m5*x36 + m12*x37 + m19*x38 + m26*x39 + m33*x40 + m40*x41 + m47*x42; 479*4cd38560SBarry Smith x[40] -= m6*x36 + m13*x37 + m20*x38 + m27*x39 + m34*x40 + m41*x41 + m48*x42; 480*4cd38560SBarry Smith x[41] -= m7*x36 + m14*x37 + m21*x38 + m28*x39 + m35*x40 + m42*x41 + m49*x42; 481*4cd38560SBarry Smith 482*4cd38560SBarry Smith x[42] -= m1*x43 + m8*x44 + m15*x45 + m22*x46 + m29*x47 + m36*x48 + m43*x49; 483*4cd38560SBarry Smith x[43] -= m2*x43 + m9*x44 + m16*x45 + m23*x46 + m30*x47 + m37*x48 + m44*x49; 484*4cd38560SBarry Smith x[44] -= m3*x43 + m10*x44 + m17*x45 + m24*x46 + m31*x47 + m38*x48 + m45*x49; 485*4cd38560SBarry Smith x[45] -= m4*x43 + m11*x44 + m18*x45 + m25*x46 + m32*x47 + m39*x48 + m46*x49; 486*4cd38560SBarry Smith x[46] -= m5*x43 + m12*x44 + m19*x45 + m26*x46 + m33*x47 + m40*x48 + m47*x49; 487*4cd38560SBarry Smith x[47] -= m6*x43 + m13*x44 + m20*x45 + m27*x46 + m34*x47 + m41*x48 + m48*x49; 488*4cd38560SBarry Smith x[48] -= m7*x43 + m14*x44 + m21*x45 + m28*x46 + m35*x47 + m42*x48 + m49*x49; 489*4cd38560SBarry Smith pv += 49; 490*4cd38560SBarry Smith } 491*4cd38560SBarry Smith PLogFlops(686*nz+637); 492*4cd38560SBarry Smith } 493*4cd38560SBarry Smith row = *ajtmp++; 494*4cd38560SBarry Smith } 495*4cd38560SBarry Smith /* finished row so stick it into b->a */ 496*4cd38560SBarry Smith pv = ba + 49*bi[i]; 497*4cd38560SBarry Smith pj = bj + bi[i]; 498*4cd38560SBarry Smith nz = bi[i+1] - bi[i]; 499*4cd38560SBarry Smith for ( j=0; j<nz; j++ ) { 500*4cd38560SBarry Smith x = rtmp+49*pj[j]; 501*4cd38560SBarry Smith pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 502*4cd38560SBarry Smith pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; 503*4cd38560SBarry Smith pv[8] = x[8]; pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; 504*4cd38560SBarry Smith pv[12] = x[12]; pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 505*4cd38560SBarry Smith pv[16] = x[16]; pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19]; 506*4cd38560SBarry Smith pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23]; 507*4cd38560SBarry Smith pv[24] = x[24]; pv[25] = x[25]; pv[26] = x[26]; pv[27] = x[27]; 508*4cd38560SBarry Smith pv[28] = x[28]; pv[29] = x[29]; pv[30] = x[30]; pv[31] = x[31]; 509*4cd38560SBarry Smith pv[32] = x[32]; pv[33] = x[33]; pv[34] = x[34]; pv[35] = x[35]; 510*4cd38560SBarry Smith pv[36] = x[36]; pv[37] = x[37]; pv[38] = x[38]; pv[39] = x[39]; 511*4cd38560SBarry Smith pv[40] = x[40]; pv[41] = x[41]; pv[42] = x[42]; pv[43] = x[43]; 512*4cd38560SBarry Smith pv[44] = x[44]; pv[45] = x[45]; pv[46] = x[46]; pv[47] = x[47]; 513*4cd38560SBarry Smith pv[48] = v[48]; 514*4cd38560SBarry Smith pv += 49; 515*4cd38560SBarry Smith } 516*4cd38560SBarry Smith /* invert diagonal block */ 517*4cd38560SBarry Smith w = ba + 49*diag_offset[i]; 518*4cd38560SBarry Smith ierr = Kernel_A_gets_inverse_A_7(w); CHKERRQ(ierr); 519*4cd38560SBarry Smith } 520*4cd38560SBarry Smith 521*4cd38560SBarry Smith PetscFree(rtmp); 522*4cd38560SBarry Smith ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 523*4cd38560SBarry Smith ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 524*4cd38560SBarry Smith C->factor = FACTOR_LU; 525*4cd38560SBarry Smith C->assembled = PETSC_TRUE; 526*4cd38560SBarry Smith PLogFlops(1.3333*343*b->mbs); /* from inverting diagonal blocks */ 527*4cd38560SBarry Smith PetscFunctionReturn(0); 528*4cd38560SBarry Smith } 529*4cd38560SBarry Smith /* 530*4cd38560SBarry Smith Version for when blocks are 7 by 7 Using natural ordering 531*4cd38560SBarry Smith */ 532*4cd38560SBarry Smith #undef __FUNC__ 533*4cd38560SBarry Smith #define __FUNC__ "MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering" 534*4cd38560SBarry Smith int MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering(Mat A,Mat *B) 535*4cd38560SBarry Smith { 536*4cd38560SBarry Smith Mat C = *B; 537*4cd38560SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data; 538*4cd38560SBarry Smith int ierr, i, j, n = a->mbs, *bi = b->i, *bj = b->j; 539*4cd38560SBarry Smith int *ajtmpold, *ajtmp, nz, row; 540*4cd38560SBarry Smith int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 541*4cd38560SBarry Smith register int *pj; 542*4cd38560SBarry Smith register MatScalar *pv,*v,*rtmp,*pc,*w,*x; 543*4cd38560SBarry Smith MatScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 544*4cd38560SBarry Smith MatScalar x16,x17,x18,x19,x20,x21,x22,x23,x24,x25; 545*4cd38560SBarry Smith MatScalar p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15; 546*4cd38560SBarry Smith MatScalar p16,p17,p18,p19,p20,p21,p22,p23,p24,p25; 547*4cd38560SBarry Smith MatScalar m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14,m15; 548*4cd38560SBarry Smith MatScalar m16,m17,m18,m19,m20,m21,m22,m23,m24,m25; 549*4cd38560SBarry Smith MatScalar p26,p27,p28,p29,p30,p31,p32,p33,p34,p35,p36; 550*4cd38560SBarry Smith MatScalar p37,p38,p39,p40,p41,p42,p43,p44,p45,p46,p47,p48,p49; 551*4cd38560SBarry Smith MatScalar x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36; 552*4cd38560SBarry Smith MatScalar x37,x38,x39,x40,x41,x42,x43,x44,x45,x46,x47,x48,x49; 553*4cd38560SBarry Smith MatScalar m26,m27,m28,m29,m30,m31,m32,m33,m34,m35,m36; 554*4cd38560SBarry Smith MatScalar m37,m38,m39,m40,m41,m42,m43,m44,m45,m46,m47,m48,m49; 555*4cd38560SBarry Smith MatScalar *ba = b->a,*aa = a->a; 556*4cd38560SBarry Smith 557*4cd38560SBarry Smith PetscFunctionBegin; 558*4cd38560SBarry Smith rtmp = (MatScalar *) PetscMalloc(49*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 559*4cd38560SBarry Smith for ( i=0; i<n; i++ ) { 560*4cd38560SBarry Smith nz = bi[i+1] - bi[i]; 561*4cd38560SBarry Smith ajtmp = bj + bi[i]; 562*4cd38560SBarry Smith for ( j=0; j<nz; j++ ) { 563*4cd38560SBarry Smith x = rtmp+49*ajtmp[j]; 564*4cd38560SBarry Smith x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 565*4cd38560SBarry Smith x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0; 566*4cd38560SBarry Smith x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = x[25] = 0.0 ; 567*4cd38560SBarry Smith x[26] = x[27] = x[28] = x[29] = x[30] = x[31] = x[32] = x[33] = 0.0 ; 568*4cd38560SBarry Smith x[34] = x[35] = x[36] = x[37] = x[38] = x[39] = x[40] = x[41] = 0.0 ; 569*4cd38560SBarry Smith x[42] = x[43] = x[44] = x[45] = x[46] = x[47] = x[48] = x[49] = 0.0 ; 570*4cd38560SBarry Smith } 571*4cd38560SBarry Smith /* load in initial (unfactored row) */ 572*4cd38560SBarry Smith nz = ai[i+1] - ai[i]; 573*4cd38560SBarry Smith ajtmpold = aj + ai[i]; 574*4cd38560SBarry Smith v = aa + 49*ai[i]; 575*4cd38560SBarry Smith for ( j=0; j<nz; j++ ) { 576*4cd38560SBarry Smith x = rtmp+49*ajtmpold[j]; 577*4cd38560SBarry Smith x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 578*4cd38560SBarry Smith x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; 579*4cd38560SBarry Smith x[8] = v[8]; x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; 580*4cd38560SBarry Smith x[12] = v[12]; x[13] = v[13]; x[14] = v[14]; x[15] = v[15]; 581*4cd38560SBarry Smith x[16] = v[16]; x[17] = v[17]; x[18] = v[18]; x[19] = v[19]; 582*4cd38560SBarry Smith x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23]; 583*4cd38560SBarry Smith x[24] = v[24]; x[25] = v[25]; x[26] = v[26]; x[27] = v[27]; 584*4cd38560SBarry Smith x[28] = v[28]; x[29] = v[29]; x[30] = v[30]; x[31] = v[31]; 585*4cd38560SBarry Smith x[32] = v[32]; x[33] = v[33]; x[34] = v[34]; x[35] = v[35]; 586*4cd38560SBarry Smith x[36] = v[36]; x[37] = v[37]; x[38] = v[38]; x[39] = v[39]; 587*4cd38560SBarry Smith x[40] = v[40]; x[41] = v[41]; x[42] = v[42]; x[43] = v[43]; 588*4cd38560SBarry Smith x[44] = v[44]; x[45] = v[45]; x[46] = v[46]; x[47] = v[47]; 589*4cd38560SBarry Smith x[48] = v[48]; 590*4cd38560SBarry Smith v += 49; 591*4cd38560SBarry Smith } 592*4cd38560SBarry Smith row = *ajtmp++; 593*4cd38560SBarry Smith while (row < i) { 594*4cd38560SBarry Smith pc = rtmp + 49*row; 595*4cd38560SBarry Smith p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 596*4cd38560SBarry Smith p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; 597*4cd38560SBarry Smith p9 = pc[8]; p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; 598*4cd38560SBarry Smith p13 = pc[12]; p14 = pc[13]; p15 = pc[14]; p16 = pc[15]; 599*4cd38560SBarry Smith p17 = pc[16]; p18 = pc[17]; p19 = pc[18]; p20 = pc[19]; 600*4cd38560SBarry Smith p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23]; 601*4cd38560SBarry Smith p25 = pc[24]; p26 = pc[25]; p27 = pc[26]; p28 = pc[27]; 602*4cd38560SBarry Smith p29 = pc[28]; p30 = pc[29]; p31 = pc[30]; p32 = pc[31]; 603*4cd38560SBarry Smith p33 = pc[32]; p34 = pc[33]; p35 = pc[34]; p36 = pc[35]; 604*4cd38560SBarry Smith p37 = pc[36]; p38 = pc[37]; p39 = pc[38]; p40 = pc[39]; 605*4cd38560SBarry Smith p41 = pc[40]; p42 = pc[41]; p43 = pc[42]; p44 = pc[43]; 606*4cd38560SBarry Smith p45 = pc[44]; p46 = pc[45]; p47 = pc[46]; p48 = pc[47]; 607*4cd38560SBarry Smith p49 = pc[48]; 608*4cd38560SBarry Smith if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || 609*4cd38560SBarry Smith p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || 610*4cd38560SBarry Smith p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || 611*4cd38560SBarry Smith p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || 612*4cd38560SBarry Smith p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || 613*4cd38560SBarry Smith p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || 614*4cd38560SBarry Smith p25 != 0.0 || p26 != 0.0 || p27 != 0.0 || p28 != 0.0 || 615*4cd38560SBarry Smith p29 != 0.0 || p30 != 0.0 || p31 != 0.0 || p32 != 0.0 || 616*4cd38560SBarry Smith p33 != 0.0 || p34 != 0.0 || p35 != 0.0 || p36 != 0.0 || 617*4cd38560SBarry Smith p37 != 0.0 || p38 != 0.0 || p39 != 0.0 || p40 != 0.0 || 618*4cd38560SBarry Smith p41 != 0.0 || p42 != 0.0 || p43 != 0.0 || p44 != 0.0 || 619*4cd38560SBarry Smith p45 != 0.0 || p46 != 0.0 || p47 != 0.0 || p48 != 0.0 || 620*4cd38560SBarry Smith p49 != 0.0) { 621*4cd38560SBarry Smith pv = ba + 36*diag_offset[row]; 622*4cd38560SBarry Smith pj = bj + diag_offset[row] + 1; 623*4cd38560SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 624*4cd38560SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; 625*4cd38560SBarry Smith x9 = pv[8]; x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; 626*4cd38560SBarry Smith x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 627*4cd38560SBarry Smith x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; 628*4cd38560SBarry Smith x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 629*4cd38560SBarry Smith x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27]; 630*4cd38560SBarry Smith x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31]; 631*4cd38560SBarry Smith x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35]; 632*4cd38560SBarry Smith x37 = pv[36]; x38 = pv[37]; x39 = pv[38]; x40 = pv[39]; 633*4cd38560SBarry Smith x41 = pv[40]; x42 = pv[41]; x43 = pv[42]; x44 = pv[43]; 634*4cd38560SBarry Smith x45 = pv[44]; x46 = pv[45]; x47 = pv[46]; x48 = pv[47]; 635*4cd38560SBarry Smith x49 = pv[48]; 636*4cd38560SBarry Smith pc[0] = m1 = p1*x1 + p8*x2 + p15*x3 + p22*x4 + p29*x5 + p36*x6 + p43*x7; 637*4cd38560SBarry Smith pc[1] = m2 = p2*x1 + p9*x2 + p16*x3 + p23*x4 + p30*x5 + p37*x6 + p44*x7; 638*4cd38560SBarry Smith pc[2] = m3 = p3*x1 + p10*x2 + p17*x3 + p24*x4 + p31*x5 + p38*x6 + p45*x7; 639*4cd38560SBarry Smith pc[3] = m4 = p4*x1 + p11*x2 + p18*x3 + p25*x4 + p32*x5 + p39*x6 + p46*x7; 640*4cd38560SBarry Smith pc[4] = m5 = p5*x1 + p12*x2 + p19*x3 + p26*x4 + p33*x5 + p40*x6 + p47*x7; 641*4cd38560SBarry Smith pc[5] = m6 = p6*x1 + p13*x2 + p20*x3 + p27*x4 + p34*x5 + p41*x6 + p48*x7; 642*4cd38560SBarry Smith pc[6] = m7 = p7*x1 + p14*x2 + p21*x3 + p28*x4 + p35*x5 + p42*x6 + p49*x7; 643*4cd38560SBarry Smith 644*4cd38560SBarry Smith pc[7] = m8 = p1*x8 + p8*x9 + p15*x10 + p22*x11 + p29*x12 + p36*x13 + p43*x14; 645*4cd38560SBarry Smith pc[8] = m9 = p2*x8 + p9*x9 + p16*x10 + p23*x11 + p30*x12 + p37*x13 + p44*x14; 646*4cd38560SBarry Smith pc[9] = m10 = p3*x8 + p10*x9 + p17*x10 + p24*x11 + p31*x12 + p38*x13 + p45*x14; 647*4cd38560SBarry Smith pc[10] = m11 = p4*x8 + p11*x9 + p18*x10 + p25*x11 + p32*x12 + p39*x13 + p46*x14; 648*4cd38560SBarry Smith pc[11] = m12 = p5*x8 + p12*x9 + p19*x10 + p26*x11 + p33*x12 + p40*x13 + p47*x14; 649*4cd38560SBarry Smith pc[12] = m13 = p6*x8 + p13*x9 + p20*x10 + p27*x11 + p34*x12 + p41*x13 + p48*x14; 650*4cd38560SBarry Smith pc[13] = m14 = p7*x8 + p14*x9 + p21*x10 + p28*x11 + p35*x12 + p42*x13 + p49*x14; 651*4cd38560SBarry Smith 652*4cd38560SBarry Smith pc[14] = m15 = p1*x15 + p8*x16 + p15*x17 + p22*x18 + p29*x19 + p36*x20 + p43*x21; 653*4cd38560SBarry Smith pc[15] = m16 = p2*x15 + p9*x16 + p16*x17 + p23*x18 + p30*x19 + p37*x20 + p44*x21; 654*4cd38560SBarry Smith pc[16] = m17 = p3*x15 + p10*x16 + p17*x17 + p24*x18 + p31*x19 + p38*x20 + p45*x21; 655*4cd38560SBarry Smith pc[17] = m18 = p4*x15 + p11*x16 + p18*x17 + p25*x18 + p32*x19 + p39*x20 + p46*x21; 656*4cd38560SBarry Smith pc[18] = m19 = p5*x15 + p12*x16 + p19*x17 + p26*x18 + p33*x19 + p40*x20 + p47*x21; 657*4cd38560SBarry Smith pc[19] = m20 = p6*x15 + p13*x16 + p20*x17 + p27*x18 + p34*x19 + p41*x20 + p48*x21; 658*4cd38560SBarry Smith pc[20] = m21 = p7*x15 + p14*x16 + p21*x17 + p28*x18 + p35*x19 + p42*x20 + p49*x21; 659*4cd38560SBarry Smith 660*4cd38560SBarry Smith pc[21] = m22 = p1*x22 + p8*x23 + p15*x24 + p22*x25 + p29*x26 + p36*x27 + p43*x28; 661*4cd38560SBarry Smith pc[22] = m23 = p2*x22 + p9*x23 + p16*x24 + p23*x25 + p30*x26 + p37*x27 + p44*x28; 662*4cd38560SBarry Smith pc[23] = m24 = p3*x22 + p10*x23 + p17*x24 + p24*x25 + p31*x26 + p38*x27 + p45*x28; 663*4cd38560SBarry Smith pc[24] = m25 = p4*x22 + p11*x23 + p18*x24 + p25*x25 + p32*x26 + p39*x27 + p46*x28; 664*4cd38560SBarry Smith pc[25] = m26 = p5*x22 + p12*x23 + p19*x24 + p26*x25 + p33*x26 + p40*x27 + p47*x28; 665*4cd38560SBarry Smith pc[26] = m27 = p6*x22 + p13*x23 + p20*x24 + p27*x25 + p34*x26 + p41*x27 + p48*x28; 666*4cd38560SBarry Smith pc[27] = m28 = p7*x22 + p14*x23 + p21*x24 + p28*x25 + p35*x26 + p42*x27 + p49*x28; 667*4cd38560SBarry Smith 668*4cd38560SBarry Smith pc[28] = m29 = p1*x29 + p8*x30 + p15*x31 + p22*x32 + p29*x33 + p36*x34 + p43*x35; 669*4cd38560SBarry Smith pc[29] = m30 = p2*x29 + p9*x30 + p16*x31 + p23*x32 + p30*x33 + p37*x34 + p44*x35; 670*4cd38560SBarry Smith pc[30] = m31 = p3*x29 + p10*x30 + p17*x31 + p24*x32 + p31*x33 + p38*x34 + p45*x35; 671*4cd38560SBarry Smith pc[31] = m32 = p4*x29 + p11*x30 + p18*x31 + p25*x32 + p32*x33 + p39*x34 + p46*x35; 672*4cd38560SBarry Smith pc[32] = m33 = p5*x29 + p12*x30 + p19*x31 + p26*x32 + p33*x33 + p40*x34 + p47*x35; 673*4cd38560SBarry Smith pc[33] = m34 = p6*x29 + p13*x30 + p20*x31 + p27*x32 + p34*x33 + p41*x34 + p48*x35; 674*4cd38560SBarry Smith pc[34] = m35 = p7*x29 + p14*x30 + p21*x31 + p28*x32 + p35*x33 + p42*x34 + p49*x35; 675*4cd38560SBarry Smith 676*4cd38560SBarry Smith pc[35] = m36 = p1*x36 + p8*x37 + p15*x38 + p22*x39 + p29*x40 + p36*x41 + p43*x42; 677*4cd38560SBarry Smith pc[36] = m37 = p2*x36 + p9*x37 + p16*x38 + p23*x39 + p30*x40 + p37*x41 + p44*x42; 678*4cd38560SBarry Smith pc[37] = m38 = p3*x36 + p10*x37 + p17*x38 + p24*x39 + p31*x40 + p38*x41 + p45*x42; 679*4cd38560SBarry Smith pc[38] = m39 = p4*x36 + p11*x37 + p18*x38 + p25*x39 + p32*x40 + p39*x41 + p46*x42; 680*4cd38560SBarry Smith pc[39] = m40 = p5*x36 + p12*x37 + p19*x38 + p26*x39 + p33*x40 + p40*x41 + p47*x42; 681*4cd38560SBarry Smith pc[40] = m41 = p6*x36 + p13*x37 + p20*x38 + p27*x39 + p34*x40 + p41*x41 + p48*x42; 682*4cd38560SBarry Smith pc[41] = m42 = p7*x36 + p14*x37 + p21*x38 + p28*x39 + p35*x40 + p42*x41 + p49*x42; 683*4cd38560SBarry Smith 684*4cd38560SBarry Smith pc[42] = m43 = p1*x43 + p8*x44 + p15*x45 + p22*x46 + p29*x47 + p36*x48 + p43*x49; 685*4cd38560SBarry Smith pc[43] = m44 = p2*x43 + p9*x44 + p16*x45 + p23*x46 + p30*x47 + p37*x48 + p44*x49; 686*4cd38560SBarry Smith pc[44] = m45 = p3*x43 + p10*x44 + p17*x45 + p24*x46 + p31*x47 + p38*x48 + p45*x49; 687*4cd38560SBarry Smith pc[45] = m46 = p4*x43 + p11*x44 + p18*x45 + p25*x46 + p32*x47 + p39*x48 + p46*x49; 688*4cd38560SBarry Smith pc[46] = m47 = p5*x43 + p12*x44 + p19*x45 + p26*x46 + p33*x47 + p40*x48 + p47*x49; 689*4cd38560SBarry Smith pc[47] = m48 = p6*x43 + p13*x44 + p20*x45 + p27*x46 + p34*x47 + p41*x48 + p48*x49; 690*4cd38560SBarry Smith pc[48] = m49 = p7*x43 + p14*x44 + p21*x45 + p28*x46 + p35*x47 + p42*x48 + p49*x49; 691*4cd38560SBarry Smith 692*4cd38560SBarry Smith nz = bi[row+1] - diag_offset[row] - 1; 693*4cd38560SBarry Smith pv += 49; 694*4cd38560SBarry Smith for (j=0; j<nz; j++) { 695*4cd38560SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 696*4cd38560SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; 697*4cd38560SBarry Smith x9 = pv[8]; x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; 698*4cd38560SBarry Smith x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 699*4cd38560SBarry Smith x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; 700*4cd38560SBarry Smith x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 701*4cd38560SBarry Smith x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27]; 702*4cd38560SBarry Smith x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31]; 703*4cd38560SBarry Smith x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35]; 704*4cd38560SBarry Smith x37 = pv[36]; x38 = pv[37]; x39 = pv[38]; x40 = pv[39]; 705*4cd38560SBarry Smith x41 = pv[40]; x42 = pv[41]; x43 = pv[42]; x44 = pv[43]; 706*4cd38560SBarry Smith x45 = pv[44]; x46 = pv[45]; x47 = pv[46]; x48 = pv[47]; 707*4cd38560SBarry Smith x49 = pv[48]; 708*4cd38560SBarry Smith x = rtmp + 49*pj[j]; 709*4cd38560SBarry Smith x[0] -= m1*x1 + m8*x2 + m15*x3 + m22*x4 + m29*x5 + m36*x6 + m43*x7; 710*4cd38560SBarry Smith x[1] -= m2*x1 + m9*x2 + m16*x3 + m23*x4 + m30*x5 + m37*x6 + m44*x7; 711*4cd38560SBarry Smith x[2] -= m3*x1 + m10*x2 + m17*x3 + m24*x4 + m31*x5 + m38*x6 + m45*x7; 712*4cd38560SBarry Smith x[3] -= m4*x1 + m11*x2 + m18*x3 + m25*x4 + m32*x5 + m39*x6 + m46*x7; 713*4cd38560SBarry Smith x[4] -= m5*x1 + m12*x2 + m19*x3 + m26*x4 + m33*x5 + m40*x6 + m47*x7; 714*4cd38560SBarry Smith x[5] -= m6*x1 + m13*x2 + m20*x3 + m27*x4 + m34*x5 + m41*x6 + m48*x7; 715*4cd38560SBarry Smith x[6] -= m7*x1 + m14*x2 + m21*x3 + m28*x4 + m35*x5 + m42*x6 + m49*x7; 716*4cd38560SBarry Smith 717*4cd38560SBarry Smith x[7] -= m1*x8 + m8*x9 + m15*x10 + m22*x11 + m29*x12 + m36*x13 + m43*x14; 718*4cd38560SBarry Smith x[8] -= m2*x8 + m9*x9 + m16*x10 + m23*x11 + m30*x12 + m37*x13 + m44*x14; 719*4cd38560SBarry Smith x[9] -= m3*x8 + m10*x9 + m17*x10 + m24*x11 + m31*x12 + m38*x13 + m45*x14; 720*4cd38560SBarry Smith x[10] -= m4*x8 + m11*x9 + m18*x10 + m25*x11 + m32*x12 + m39*x13 + m46*x14; 721*4cd38560SBarry Smith x[11] -= m5*x8 + m12*x9 + m19*x10 + m26*x11 + m33*x12 + m40*x13 + m47*x14; 722*4cd38560SBarry Smith x[12] -= m6*x8 + m13*x9 + m20*x10 + m27*x11 + m34*x12 + m41*x13 + m48*x14; 723*4cd38560SBarry Smith x[13] -= m7*x8 + m14*x9 + m21*x10 + m28*x11 + m35*x12 + m42*x13 + m49*x14; 724*4cd38560SBarry Smith 725*4cd38560SBarry Smith x[14] -= m1*x15 + m8*x16 + m15*x17 + m22*x18 + m29*x19 + m36*x20 + m43*x21; 726*4cd38560SBarry Smith x[15] -= m2*x15 + m9*x16 + m16*x17 + m23*x18 + m30*x19 + m37*x20 + m44*x21; 727*4cd38560SBarry Smith x[16] -= m3*x15 + m10*x16 + m17*x17 + m24*x18 + m31*x19 + m38*x20 + m45*x21; 728*4cd38560SBarry Smith x[17] -= m4*x15 + m11*x16 + m18*x17 + m25*x18 + m32*x19 + m39*x20 + m46*x21; 729*4cd38560SBarry Smith x[18] -= m5*x15 + m12*x16 + m19*x17 + m26*x18 + m33*x19 + m40*x20 + m47*x21; 730*4cd38560SBarry Smith x[19] -= m6*x15 + m13*x16 + m20*x17 + m27*x18 + m34*x19 + m41*x20 + m48*x21; 731*4cd38560SBarry Smith x[20] -= m7*x15 + m14*x16 + m21*x17 + m28*x18 + m35*x19 + m42*x20 + m49*x21; 732*4cd38560SBarry Smith 733*4cd38560SBarry Smith x[21] -= m1*x22 + m8*x23 + m15*x24 + m22*x25 + m29*x26 + m36*x27 + m43*x28; 734*4cd38560SBarry Smith x[22] -= m2*x22 + m9*x23 + m16*x24 + m23*x25 + m30*x26 + m37*x27 + m44*x28; 735*4cd38560SBarry Smith x[23] -= m3*x22 + m10*x23 + m17*x24 + m24*x25 + m31*x26 + m38*x27 + m45*x28; 736*4cd38560SBarry Smith x[24] -= m4*x22 + m11*x23 + m18*x24 + m25*x25 + m32*x26 + m39*x27 + m46*x28; 737*4cd38560SBarry Smith x[25] -= m5*x22 + m12*x23 + m19*x24 + m26*x25 + m33*x26 + m40*x27 + m47*x28; 738*4cd38560SBarry Smith x[26] -= m6*x22 + m13*x23 + m20*x24 + m27*x25 + m34*x26 + m41*x27 + m48*x28; 739*4cd38560SBarry Smith x[27] -= m7*x22 + m14*x23 + m21*x24 + m28*x25 + m35*x26 + m42*x27 + m49*x28; 740*4cd38560SBarry Smith 741*4cd38560SBarry Smith x[28] -= m1*x29 + m8*x30 + m15*x31 + m22*x32 + m29*x33 + m36*x34 + m43*x35; 742*4cd38560SBarry Smith x[29] -= m2*x29 + m9*x30 + m16*x31 + m23*x32 + m30*x33 + m37*x34 + m44*x35; 743*4cd38560SBarry Smith x[30] -= m3*x29 + m10*x30 + m17*x31 + m24*x32 + m31*x33 + m38*x34 + m45*x35; 744*4cd38560SBarry Smith x[31] -= m4*x29 + m11*x30 + m18*x31 + m25*x32 + m32*x33 + m39*x34 + m46*x35; 745*4cd38560SBarry Smith x[32] -= m5*x29 + m12*x30 + m19*x31 + m26*x32 + m33*x33 + m40*x34 + m47*x35; 746*4cd38560SBarry Smith x[33] -= m6*x29 + m13*x30 + m20*x31 + m27*x32 + m34*x33 + m41*x34 + m48*x35; 747*4cd38560SBarry Smith x[34] -= m7*x29 + m14*x30 + m21*x31 + m28*x32 + m35*x33 + m42*x34 + m49*x35; 748*4cd38560SBarry Smith 749*4cd38560SBarry Smith x[35] -= m1*x36 + m8*x37 + m15*x38 + m22*x39 + m29*x40 + m36*x41 + m43*x42; 750*4cd38560SBarry Smith x[36] -= m2*x36 + m9*x37 + m16*x38 + m23*x39 + m30*x40 + m37*x41 + m44*x42; 751*4cd38560SBarry Smith x[37] -= m3*x36 + m10*x37 + m17*x38 + m24*x39 + m31*x40 + m38*x41 + m45*x42; 752*4cd38560SBarry Smith x[38] -= m4*x36 + m11*x37 + m18*x38 + m25*x39 + m32*x40 + m39*x41 + m46*x42; 753*4cd38560SBarry Smith x[39] -= m5*x36 + m12*x37 + m19*x38 + m26*x39 + m33*x40 + m40*x41 + m47*x42; 754*4cd38560SBarry Smith x[40] -= m6*x36 + m13*x37 + m20*x38 + m27*x39 + m34*x40 + m41*x41 + m48*x42; 755*4cd38560SBarry Smith x[41] -= m7*x36 + m14*x37 + m21*x38 + m28*x39 + m35*x40 + m42*x41 + m49*x42; 756*4cd38560SBarry Smith 757*4cd38560SBarry Smith x[42] -= m1*x43 + m8*x44 + m15*x45 + m22*x46 + m29*x47 + m36*x48 + m43*x49; 758*4cd38560SBarry Smith x[43] -= m2*x43 + m9*x44 + m16*x45 + m23*x46 + m30*x47 + m37*x48 + m44*x49; 759*4cd38560SBarry Smith x[44] -= m3*x43 + m10*x44 + m17*x45 + m24*x46 + m31*x47 + m38*x48 + m45*x49; 760*4cd38560SBarry Smith x[45] -= m4*x43 + m11*x44 + m18*x45 + m25*x46 + m32*x47 + m39*x48 + m46*x49; 761*4cd38560SBarry Smith x[46] -= m5*x43 + m12*x44 + m19*x45 + m26*x46 + m33*x47 + m40*x48 + m47*x49; 762*4cd38560SBarry Smith x[47] -= m6*x43 + m13*x44 + m20*x45 + m27*x46 + m34*x47 + m41*x48 + m48*x49; 763*4cd38560SBarry Smith x[48] -= m7*x43 + m14*x44 + m21*x45 + m28*x46 + m35*x47 + m42*x48 + m49*x49; 764*4cd38560SBarry Smith pv += 49; 765*4cd38560SBarry Smith } 766*4cd38560SBarry Smith PLogFlops(686*nz+637); 767*4cd38560SBarry Smith } 768*4cd38560SBarry Smith row = *ajtmp++; 769*4cd38560SBarry Smith } 770*4cd38560SBarry Smith /* finished row so stick it into b->a */ 771*4cd38560SBarry Smith pv = ba + 49*bi[i]; 772*4cd38560SBarry Smith pj = bj + bi[i]; 773*4cd38560SBarry Smith nz = bi[i+1] - bi[i]; 774*4cd38560SBarry Smith for ( j=0; j<nz; j++ ) { 775*4cd38560SBarry Smith x = rtmp+49*pj[j]; 776*4cd38560SBarry Smith pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 777*4cd38560SBarry Smith pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; 778*4cd38560SBarry Smith pv[8] = x[8]; pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; 779*4cd38560SBarry Smith pv[12] = x[12]; pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 780*4cd38560SBarry Smith pv[16] = x[16]; pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19]; 781*4cd38560SBarry Smith pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23]; 782*4cd38560SBarry Smith pv[24] = x[24]; pv[25] = x[25]; pv[26] = x[26]; pv[27] = x[27]; 783*4cd38560SBarry Smith pv[28] = x[28]; pv[29] = x[29]; pv[30] = x[30]; pv[31] = x[31]; 784*4cd38560SBarry Smith pv[32] = x[32]; pv[33] = x[33]; pv[34] = x[34]; pv[35] = x[35]; 785*4cd38560SBarry Smith pv[36] = x[36]; pv[37] = x[37]; pv[38] = x[38]; pv[39] = x[39]; 786*4cd38560SBarry Smith pv[40] = x[40]; pv[41] = x[41]; pv[42] = x[42]; pv[43] = x[43]; 787*4cd38560SBarry Smith pv[44] = x[44]; pv[45] = x[45]; pv[46] = x[46]; pv[47] = x[47]; 788*4cd38560SBarry Smith pv[48] = v[48]; 789*4cd38560SBarry Smith pv += 49; 790*4cd38560SBarry Smith } 791*4cd38560SBarry Smith /* invert diagonal block */ 792*4cd38560SBarry Smith w = ba + 49*diag_offset[i]; 793*4cd38560SBarry Smith ierr = Kernel_A_gets_inverse_A_7(w); CHKERRQ(ierr); 794*4cd38560SBarry Smith } 795*4cd38560SBarry Smith 796*4cd38560SBarry Smith PetscFree(rtmp); 797*4cd38560SBarry Smith C->factor = FACTOR_LU; 798*4cd38560SBarry Smith C->assembled = PETSC_TRUE; 799*4cd38560SBarry Smith PLogFlops(1.3333*343*b->mbs); /* from inverting diagonal blocks */ 800*4cd38560SBarry Smith PetscFunctionReturn(0); 801*4cd38560SBarry Smith } 802*4cd38560SBarry Smith 803*4cd38560SBarry Smith /* ------------------------------------------------------------*/ 804*4cd38560SBarry Smith /* 805*4cd38560SBarry Smith Version for when blocks are 6 by 6 806*4cd38560SBarry Smith */ 807*4cd38560SBarry Smith #undef __FUNC__ 808*4cd38560SBarry Smith #define __FUNC__ "MatLUFactorNumeric_SeqBAIJ_6" 809*4cd38560SBarry Smith int MatLUFactorNumeric_SeqBAIJ_6(Mat A,Mat *B) 810*4cd38560SBarry Smith { 811*4cd38560SBarry Smith Mat C = *B; 812*4cd38560SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data; 813*4cd38560SBarry Smith IS isrow = b->row, isicol = b->icol; 814*4cd38560SBarry Smith int *r,*ic, ierr, i, j, n = a->mbs, *bi = b->i, *bj = b->j; 815*4cd38560SBarry Smith int *ajtmpold, *ajtmp, nz, row; 816*4cd38560SBarry Smith int *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j; 817*4cd38560SBarry Smith register int *pj; 818*4cd38560SBarry Smith register MatScalar *pv,*v,*rtmp,*pc,*w,*x; 819*4cd38560SBarry Smith MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 820*4cd38560SBarry Smith MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 821*4cd38560SBarry Smith MatScalar x17,x18,x19,x20,x21,x22,x23,x24,x25,p10,p11,p12,p13,p14; 822*4cd38560SBarry Smith MatScalar p15,p16,p17,p18,p19,p20,p21,p22,p23,p24,p25,m10,m11,m12; 823*4cd38560SBarry Smith MatScalar m13,m14,m15,m16,m17,m18,m19,m20,m21,m22,m23,m24,m25; 824*4cd38560SBarry Smith MatScalar p26,p27,p28,p29,p30,p31,p32,p33,p34,p35,p36; 825*4cd38560SBarry Smith MatScalar x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36; 826*4cd38560SBarry Smith MatScalar m26,m27,m28,m29,m30,m31,m32,m33,m34,m35,m36; 827*4cd38560SBarry Smith MatScalar *ba = b->a,*aa = a->a; 828*4cd38560SBarry Smith 829*4cd38560SBarry Smith PetscFunctionBegin; 830*4cd38560SBarry Smith ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr); 831*4cd38560SBarry Smith ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr); 832*4cd38560SBarry Smith rtmp = (MatScalar *) PetscMalloc(36*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 833*4cd38560SBarry Smith 834*4cd38560SBarry Smith for ( i=0; i<n; i++ ) { 835*4cd38560SBarry Smith nz = bi[i+1] - bi[i]; 836*4cd38560SBarry Smith ajtmp = bj + bi[i]; 837*4cd38560SBarry Smith for ( j=0; j<nz; j++ ) { 838*4cd38560SBarry Smith x = rtmp+36*ajtmp[j]; 839*4cd38560SBarry Smith x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 840*4cd38560SBarry Smith x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0; 841*4cd38560SBarry Smith x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = x[25] = 0.0 ; 842*4cd38560SBarry Smith x[26] = x[27] = x[28] = x[29] = x[30] = x[31] = x[32] = x[33] = 0.0 ; 843*4cd38560SBarry Smith x[34] = x[35] = 0.0 ; 844*4cd38560SBarry Smith } 845*4cd38560SBarry Smith /* load in initial (unfactored row) */ 846*4cd38560SBarry Smith idx = r[i]; 847*4cd38560SBarry Smith nz = ai[idx+1] - ai[idx]; 848*4cd38560SBarry Smith ajtmpold = aj + ai[idx]; 849*4cd38560SBarry Smith v = aa + 36*ai[idx]; 850*4cd38560SBarry Smith for ( j=0; j<nz; j++ ) { 851*4cd38560SBarry Smith x = rtmp+36*ic[ajtmpold[j]]; 852*4cd38560SBarry Smith x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 853*4cd38560SBarry Smith x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; 854*4cd38560SBarry Smith x[8] = v[8]; x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; 855*4cd38560SBarry Smith x[12] = v[12]; x[13] = v[13]; x[14] = v[14]; x[15] = v[15]; 856*4cd38560SBarry Smith x[16] = v[16]; x[17] = v[17]; x[18] = v[18]; x[19] = v[19]; 857*4cd38560SBarry Smith x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23]; 858*4cd38560SBarry Smith x[24] = v[24]; x[25] = v[25]; x[26] = v[26]; x[27] = v[27]; 859*4cd38560SBarry Smith x[28] = v[28]; x[29] = v[29]; x[30] = v[30]; x[31] = v[31]; 860*4cd38560SBarry Smith x[32] = v[32]; x[33] = v[33]; x[34] = v[34]; x[35] = v[35]; 861*4cd38560SBarry Smith v += 36; 862*4cd38560SBarry Smith } 863*4cd38560SBarry Smith row = *ajtmp++; 864*4cd38560SBarry Smith while (row < i) { 865*4cd38560SBarry Smith pc = rtmp + 36*row; 866*4cd38560SBarry Smith p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 867*4cd38560SBarry Smith p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; 868*4cd38560SBarry Smith p9 = pc[8]; p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; 869*4cd38560SBarry Smith p13 = pc[12]; p14 = pc[13]; p15 = pc[14]; p16 = pc[15]; 870*4cd38560SBarry Smith p17 = pc[16]; p18 = pc[17]; p19 = pc[18]; p20 = pc[19]; 871*4cd38560SBarry Smith p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23]; 872*4cd38560SBarry Smith p25 = pc[24]; p26 = pc[25]; p27 = pc[26]; p28 = pc[27]; 873*4cd38560SBarry Smith p29 = pc[28]; p30 = pc[29]; p31 = pc[30]; p32 = pc[31]; 874*4cd38560SBarry Smith p33 = pc[32]; p34 = pc[33]; p35 = pc[34]; p36 = pc[35]; 875*4cd38560SBarry Smith if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || 876*4cd38560SBarry Smith p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || 877*4cd38560SBarry Smith p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || 878*4cd38560SBarry Smith p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || 879*4cd38560SBarry Smith p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || 880*4cd38560SBarry Smith p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || 881*4cd38560SBarry Smith p25 != 0.0 || p26 != 0.0 || p27 != 0.0 || p28 != 0.0 || 882*4cd38560SBarry Smith p29 != 0.0 || p30 != 0.0 || p31 != 0.0 || p32 != 0.0 || 883*4cd38560SBarry Smith p33 != 0.0 || p34 != 0.0 || p35 != 0.0 || p36 != 0.0) { 884*4cd38560SBarry Smith pv = ba + 36*diag_offset[row]; 885*4cd38560SBarry Smith pj = bj + diag_offset[row] + 1; 886*4cd38560SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 887*4cd38560SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; 888*4cd38560SBarry Smith x9 = pv[8]; x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; 889*4cd38560SBarry Smith x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 890*4cd38560SBarry Smith x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; 891*4cd38560SBarry Smith x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 892*4cd38560SBarry Smith x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27]; 893*4cd38560SBarry Smith x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31]; 894*4cd38560SBarry Smith x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35]; 895*4cd38560SBarry Smith pc[0] = m1 = p1*x1 + p7*x2 + p13*x3 + p19*x4 + p25*x5 + p31*x6; 896*4cd38560SBarry Smith pc[1] = m2 = p2*x1 + p8*x2 + p14*x3 + p20*x4 + p26*x5 + p32*x6; 897*4cd38560SBarry Smith pc[2] = m3 = p3*x1 + p9*x2 + p15*x3 + p21*x4 + p27*x5 + p33*x6; 898*4cd38560SBarry Smith pc[3] = m4 = p4*x1 + p10*x2 + p16*x3 + p22*x4 + p28*x5 + p34*x6; 899*4cd38560SBarry Smith pc[4] = m5 = p5*x1 + p11*x2 + p17*x3 + p23*x4 + p29*x5 + p35*x6; 900*4cd38560SBarry Smith pc[5] = m6 = p6*x1 + p12*x2 + p18*x3 + p24*x4 + p30*x5 + p36*x6; 901*4cd38560SBarry Smith 902*4cd38560SBarry Smith pc[6] = m7 = p1*x7 + p7*x8 + p13*x9 + p19*x10 + p25*x11 + p31*x12; 903*4cd38560SBarry Smith pc[7] = m8 = p2*x7 + p8*x8 + p14*x9 + p20*x10 + p26*x11 + p32*x12; 904*4cd38560SBarry Smith pc[8] = m9 = p3*x7 + p9*x8 + p15*x9 + p21*x10 + p27*x11 + p33*x12; 905*4cd38560SBarry Smith pc[9] = m10 = p4*x7 + p10*x8 + p16*x9 + p22*x10 + p28*x11 + p34*x12; 906*4cd38560SBarry Smith pc[10] = m11 = p5*x7 + p11*x8 + p17*x9 + p23*x10 + p29*x11 + p35*x12; 907*4cd38560SBarry Smith pc[11] = m12 = p6*x7 + p12*x8 + p18*x9 + p24*x10 + p30*x11 + p36*x12; 908*4cd38560SBarry Smith 909*4cd38560SBarry Smith pc[12] = m13 = p1*x13 + p7*x14 + p13*x15 + p19*x16 + p25*x17 + p31*x18; 910*4cd38560SBarry Smith pc[13] = m14 = p2*x13 + p8*x14 + p14*x15 + p20*x16 + p26*x17 + p32*x18; 911*4cd38560SBarry Smith pc[14] = m15 = p3*x13 + p9*x14 + p15*x15 + p21*x16 + p27*x17 + p33*x18; 912*4cd38560SBarry Smith pc[15] = m16 = p4*x13 + p10*x14 + p16*x15 + p22*x16 + p28*x17 + p34*x18; 913*4cd38560SBarry Smith pc[16] = m17 = p5*x13 + p11*x14 + p17*x15 + p23*x16 + p29*x17 + p35*x18; 914*4cd38560SBarry Smith pc[17] = m18 = p6*x13 + p12*x14 + p18*x15 + p24*x16 + p30*x17 + p36*x18; 915*4cd38560SBarry Smith 916*4cd38560SBarry Smith pc[18] = m19 = p1*x19 + p7*x20 + p13*x21 + p19*x22 + p25*x23 + p31*x24; 917*4cd38560SBarry Smith pc[19] = m20 = p2*x19 + p8*x20 + p14*x21 + p20*x22 + p26*x23 + p32*x24; 918*4cd38560SBarry Smith pc[20] = m21 = p3*x19 + p9*x20 + p15*x21 + p21*x22 + p27*x23 + p33*x24; 919*4cd38560SBarry Smith pc[21] = m22 = p4*x19 + p10*x20 + p16*x21 + p22*x22 + p28*x23 + p34*x24; 920*4cd38560SBarry Smith pc[22] = m23 = p5*x19 + p11*x20 + p17*x21 + p23*x22 + p29*x23 + p35*x24; 921*4cd38560SBarry Smith pc[23] = m24 = p6*x19 + p12*x20 + p18*x21 + p24*x22 + p30*x23 + p36*x24; 922*4cd38560SBarry Smith 923*4cd38560SBarry Smith pc[24] = m25 = p1*x25 + p7*x26 + p13*x27 + p19*x28 + p25*x29 + p31*x30; 924*4cd38560SBarry Smith pc[25] = m26 = p2*x25 + p8*x26 + p14*x27 + p20*x28 + p26*x29 + p32*x30; 925*4cd38560SBarry Smith pc[26] = m27 = p3*x25 + p9*x26 + p15*x27 + p21*x28 + p27*x29 + p33*x30; 926*4cd38560SBarry Smith pc[27] = m28 = p4*x25 + p10*x26 + p16*x27 + p22*x28 + p28*x29 + p34*x30; 927*4cd38560SBarry Smith pc[28] = m29 = p5*x25 + p11*x26 + p17*x27 + p23*x28 + p29*x29 + p35*x30; 928*4cd38560SBarry Smith pc[29] = m30 = p6*x25 + p12*x26 + p18*x27 + p24*x28 + p30*x29 + p36*x30; 929*4cd38560SBarry Smith 930*4cd38560SBarry Smith pc[30] = m31 = p1*x31 + p7*x32 + p13*x33 + p19*x34 + p25*x35 + p31*x36; 931*4cd38560SBarry Smith pc[31] = m32 = p2*x31 + p8*x32 + p14*x33 + p20*x34 + p26*x35 + p32*x36; 932*4cd38560SBarry Smith pc[32] = m33 = p3*x31 + p9*x32 + p15*x33 + p21*x34 + p27*x35 + p33*x36; 933*4cd38560SBarry Smith pc[33] = m34 = p4*x31 + p10*x32 + p16*x33 + p22*x34 + p28*x35 + p34*x36; 934*4cd38560SBarry Smith pc[34] = m35 = p5*x31 + p11*x32 + p17*x33 + p23*x34 + p29*x35 + p35*x36; 935*4cd38560SBarry Smith pc[35] = m36 = p6*x31 + p12*x32 + p18*x33 + p24*x34 + p30*x35 + p36*x36; 936*4cd38560SBarry Smith 937*4cd38560SBarry Smith nz = bi[row+1] - diag_offset[row] - 1; 938*4cd38560SBarry Smith pv += 36; 939*4cd38560SBarry Smith for (j=0; j<nz; j++) { 940*4cd38560SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 941*4cd38560SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; 942*4cd38560SBarry Smith x9 = pv[8]; x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; 943*4cd38560SBarry Smith x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 944*4cd38560SBarry Smith x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; 945*4cd38560SBarry Smith x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 946*4cd38560SBarry Smith x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27]; 947*4cd38560SBarry Smith x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31]; 948*4cd38560SBarry Smith x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35]; 949*4cd38560SBarry Smith x = rtmp + 36*pj[j]; 950*4cd38560SBarry Smith x[0] -= m1*x1 + m7*x2 + m13*x3 + m19*x4 + m25*x5 + m31*x6; 951*4cd38560SBarry Smith x[1] -= m2*x1 + m8*x2 + m14*x3 + m20*x4 + m26*x5 + m32*x6; 952*4cd38560SBarry Smith x[2] -= m3*x1 + m9*x2 + m15*x3 + m21*x4 + m27*x5 + m33*x6; 953*4cd38560SBarry Smith x[3] -= m4*x1 + m10*x2 + m16*x3 + m22*x4 + m28*x5 + m34*x6; 954*4cd38560SBarry Smith x[4] -= m5*x1 + m11*x2 + m17*x3 + m23*x4 + m29*x5 + m35*x6; 955*4cd38560SBarry Smith x[5] -= m6*x1 + m12*x2 + m18*x3 + m24*x4 + m30*x5 + m36*x6; 956*4cd38560SBarry Smith 957*4cd38560SBarry Smith x[6] -= m1*x7 + m7*x8 + m13*x9 + m19*x10 + m25*x11 + m31*x12; 958*4cd38560SBarry Smith x[7] -= m2*x7 + m8*x8 + m14*x9 + m20*x10 + m26*x11 + m32*x12; 959*4cd38560SBarry Smith x[8] -= m3*x7 + m9*x8 + m15*x9 + m21*x10 + m27*x11 + m33*x12; 960*4cd38560SBarry Smith x[9] -= m4*x7 + m10*x8 + m16*x9 + m22*x10 + m28*x11 + m34*x12; 961*4cd38560SBarry Smith x[10] -= m5*x7 + m11*x8 + m17*x9 + m23*x10 + m29*x11 + m35*x12; 962*4cd38560SBarry Smith x[11] -= m6*x7 + m12*x8 + m18*x9 + m24*x10 + m30*x11 + m36*x12; 963*4cd38560SBarry Smith 964*4cd38560SBarry Smith x[12] -= m1*x13 + m7*x14 + m13*x15 + m19*x16 + m25*x17 + m31*x18; 965*4cd38560SBarry Smith x[13] -= m2*x13 + m8*x14 + m14*x15 + m20*x16 + m26*x17 + m32*x18; 966*4cd38560SBarry Smith x[14] -= m3*x13 + m9*x14 + m15*x15 + m21*x16 + m27*x17 + m33*x18; 967*4cd38560SBarry Smith x[15] -= m4*x13 + m10*x14 + m16*x15 + m22*x16 + m28*x17 + m34*x18; 968*4cd38560SBarry Smith x[16] -= m5*x13 + m11*x14 + m17*x15 + m23*x16 + m29*x17 + m35*x18; 969*4cd38560SBarry Smith x[17] -= m6*x13 + m12*x14 + m18*x15 + m24*x16 + m30*x17 + m36*x18; 970*4cd38560SBarry Smith 971*4cd38560SBarry Smith x[18] -= m1*x19 + m7*x20 + m13*x21 + m19*x22 + m25*x23 + m31*x24; 972*4cd38560SBarry Smith x[19] -= m2*x19 + m8*x20 + m14*x21 + m20*x22 + m26*x23 + m32*x24; 973*4cd38560SBarry Smith x[20] -= m3*x19 + m9*x20 + m15*x21 + m21*x22 + m27*x23 + m33*x24; 974*4cd38560SBarry Smith x[21] -= m4*x19 + m10*x20 + m16*x21 + m22*x22 + m28*x23 + m34*x24; 975*4cd38560SBarry Smith x[22] -= m5*x19 + m11*x20 + m17*x21 + m23*x22 + m29*x23 + m35*x24; 976*4cd38560SBarry Smith x[23] -= m6*x19 + m12*x20 + m18*x21 + m24*x22 + m30*x23 + m36*x24; 977*4cd38560SBarry Smith 978*4cd38560SBarry Smith x[24] -= m1*x25 + m7*x26 + m13*x27 + m19*x28 + m25*x29 + m31*x30; 979*4cd38560SBarry Smith x[25] -= m2*x25 + m8*x26 + m14*x27 + m20*x28 + m26*x29 + m32*x30; 980*4cd38560SBarry Smith x[26] -= m3*x25 + m9*x26 + m15*x27 + m21*x28 + m27*x29 + m33*x30; 981*4cd38560SBarry Smith x[27] -= m4*x25 + m10*x26 + m16*x27 + m22*x28 + m28*x29 + m34*x30; 982*4cd38560SBarry Smith x[28] -= m5*x25 + m11*x26 + m17*x27 + m23*x28 + m29*x29 + m35*x30; 983*4cd38560SBarry Smith x[29] -= m6*x25 + m12*x26 + m18*x27 + m24*x28 + m30*x29 + m36*x30; 984*4cd38560SBarry Smith 985*4cd38560SBarry Smith x[30] -= m1*x31 + m7*x32 + m13*x33 + m19*x34 + m25*x35 + m31*x36; 986*4cd38560SBarry Smith x[31] -= m2*x31 + m8*x32 + m14*x33 + m20*x34 + m26*x35 + m32*x36; 987*4cd38560SBarry Smith x[32] -= m3*x31 + m9*x32 + m15*x33 + m21*x34 + m27*x35 + m33*x36; 988*4cd38560SBarry Smith x[33] -= m4*x31 + m10*x32 + m16*x33 + m22*x34 + m28*x35 + m34*x36; 989*4cd38560SBarry Smith x[34] -= m5*x31 + m11*x32 + m17*x33 + m23*x34 + m29*x35 + m35*x36; 990*4cd38560SBarry Smith x[35] -= m6*x31 + m12*x32 + m18*x33 + m24*x34 + m30*x35 + m36*x36; 991*4cd38560SBarry Smith 992*4cd38560SBarry Smith pv += 36; 993*4cd38560SBarry Smith } 994*4cd38560SBarry Smith PLogFlops(432*nz+396); 995*4cd38560SBarry Smith } 996*4cd38560SBarry Smith row = *ajtmp++; 997*4cd38560SBarry Smith } 998*4cd38560SBarry Smith /* finished row so stick it into b->a */ 999*4cd38560SBarry Smith pv = ba + 36*bi[i]; 1000*4cd38560SBarry Smith pj = bj + bi[i]; 1001*4cd38560SBarry Smith nz = bi[i+1] - bi[i]; 1002*4cd38560SBarry Smith for ( j=0; j<nz; j++ ) { 1003*4cd38560SBarry Smith x = rtmp+36*pj[j]; 1004*4cd38560SBarry Smith pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 1005*4cd38560SBarry Smith pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; 1006*4cd38560SBarry Smith pv[8] = x[8]; pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; 1007*4cd38560SBarry Smith pv[12] = x[12]; pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 1008*4cd38560SBarry Smith pv[16] = x[16]; pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19]; 1009*4cd38560SBarry Smith pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23]; 1010*4cd38560SBarry Smith pv[24] = x[24]; pv[25] = x[25]; pv[26] = x[26]; pv[27] = x[27]; 1011*4cd38560SBarry Smith pv[28] = x[28]; pv[29] = x[29]; pv[30] = x[30]; pv[31] = x[31]; 1012*4cd38560SBarry Smith pv[32] = x[32]; pv[33] = x[33]; pv[34] = x[34]; pv[35] = x[35]; 1013*4cd38560SBarry Smith pv += 36; 1014*4cd38560SBarry Smith } 1015*4cd38560SBarry Smith /* invert diagonal block */ 1016*4cd38560SBarry Smith w = ba + 36*diag_offset[i]; 1017*4cd38560SBarry Smith ierr = Kernel_A_gets_inverse_A_6(w); CHKERRQ(ierr); 1018*4cd38560SBarry Smith } 1019*4cd38560SBarry Smith 1020*4cd38560SBarry Smith PetscFree(rtmp); 1021*4cd38560SBarry Smith ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 1022*4cd38560SBarry Smith ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 1023*4cd38560SBarry Smith C->factor = FACTOR_LU; 1024*4cd38560SBarry Smith C->assembled = PETSC_TRUE; 1025*4cd38560SBarry Smith PLogFlops(1.3333*216*b->mbs); /* from inverting diagonal blocks */ 1026*4cd38560SBarry Smith PetscFunctionReturn(0); 1027*4cd38560SBarry Smith } 1028*4cd38560SBarry Smith /* 1029*4cd38560SBarry Smith Version for when blocks are 6 by 6 Using natural ordering 1030*4cd38560SBarry Smith */ 1031*4cd38560SBarry Smith #undef __FUNC__ 1032*4cd38560SBarry Smith #define __FUNC__ "MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering" 1033*4cd38560SBarry Smith int MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering(Mat A,Mat *B) 1034*4cd38560SBarry Smith { 1035*4cd38560SBarry Smith Mat C = *B; 1036*4cd38560SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data; 1037*4cd38560SBarry Smith int ierr, i, j, n = a->mbs, *bi = b->i, *bj = b->j; 1038*4cd38560SBarry Smith int *ajtmpold, *ajtmp, nz, row; 1039*4cd38560SBarry Smith int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 1040*4cd38560SBarry Smith register int *pj; 1041*4cd38560SBarry Smith register MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1042*4cd38560SBarry Smith MatScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 1043*4cd38560SBarry Smith MatScalar x16,x17,x18,x19,x20,x21,x22,x23,x24,x25; 1044*4cd38560SBarry Smith MatScalar p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15; 1045*4cd38560SBarry Smith MatScalar p16,p17,p18,p19,p20,p21,p22,p23,p24,p25; 1046*4cd38560SBarry Smith MatScalar m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14,m15; 1047*4cd38560SBarry Smith MatScalar m16,m17,m18,m19,m20,m21,m22,m23,m24,m25; 1048*4cd38560SBarry Smith MatScalar p26,p27,p28,p29,p30,p31,p32,p33,p34,p35,p36; 1049*4cd38560SBarry Smith MatScalar x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36; 1050*4cd38560SBarry Smith MatScalar m26,m27,m28,m29,m30,m31,m32,m33,m34,m35,m36; 1051*4cd38560SBarry Smith MatScalar *ba = b->a,*aa = a->a; 1052*4cd38560SBarry Smith 1053*4cd38560SBarry Smith PetscFunctionBegin; 1054*4cd38560SBarry Smith rtmp = (MatScalar *) PetscMalloc(36*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 1055*4cd38560SBarry Smith for ( i=0; i<n; i++ ) { 1056*4cd38560SBarry Smith nz = bi[i+1] - bi[i]; 1057*4cd38560SBarry Smith ajtmp = bj + bi[i]; 1058*4cd38560SBarry Smith for ( j=0; j<nz; j++ ) { 1059*4cd38560SBarry Smith x = rtmp+36*ajtmp[j]; 1060*4cd38560SBarry Smith x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 1061*4cd38560SBarry Smith x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0; 1062*4cd38560SBarry Smith x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = x[25] = 0.0 ; 1063*4cd38560SBarry Smith x[26] = x[27] = x[28] = x[29] = x[30] = x[31] = x[32] = x[33] = 0.0 ; 1064*4cd38560SBarry Smith x[34] = x[35] = 0.0 ; 1065*4cd38560SBarry Smith } 1066*4cd38560SBarry Smith /* load in initial (unfactored row) */ 1067*4cd38560SBarry Smith nz = ai[i+1] - ai[i]; 1068*4cd38560SBarry Smith ajtmpold = aj + ai[i]; 1069*4cd38560SBarry Smith v = aa + 36*ai[i]; 1070*4cd38560SBarry Smith for ( j=0; j<nz; j++ ) { 1071*4cd38560SBarry Smith x = rtmp+36*ajtmpold[j]; 1072*4cd38560SBarry Smith x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 1073*4cd38560SBarry Smith x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; 1074*4cd38560SBarry Smith x[8] = v[8]; x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; 1075*4cd38560SBarry Smith x[12] = v[12]; x[13] = v[13]; x[14] = v[14]; x[15] = v[15]; 1076*4cd38560SBarry Smith x[16] = v[16]; x[17] = v[17]; x[18] = v[18]; x[19] = v[19]; 1077*4cd38560SBarry Smith x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23]; 1078*4cd38560SBarry Smith x[24] = v[24]; x[25] = v[25]; x[26] = v[26]; x[27] = v[27]; 1079*4cd38560SBarry Smith x[28] = v[28]; x[29] = v[29]; x[30] = v[30]; x[31] = v[31]; 1080*4cd38560SBarry Smith x[32] = v[32]; x[33] = v[33]; x[34] = v[34]; x[35] = v[35]; 1081*4cd38560SBarry Smith v += 36; 1082*4cd38560SBarry Smith } 1083*4cd38560SBarry Smith row = *ajtmp++; 1084*4cd38560SBarry Smith while (row < i) { 1085*4cd38560SBarry Smith pc = rtmp + 36*row; 1086*4cd38560SBarry Smith p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 1087*4cd38560SBarry Smith p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; 1088*4cd38560SBarry Smith p9 = pc[8]; p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; 1089*4cd38560SBarry Smith p13 = pc[12]; p14 = pc[13]; p15 = pc[14]; p16 = pc[15]; 1090*4cd38560SBarry Smith p17 = pc[16]; p18 = pc[17]; p19 = pc[18]; p20 = pc[19]; 1091*4cd38560SBarry Smith p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23]; 1092*4cd38560SBarry Smith p25 = pc[24]; p26 = pc[25]; p27 = pc[26]; p28 = pc[27]; 1093*4cd38560SBarry Smith p29 = pc[28]; p30 = pc[29]; p31 = pc[30]; p32 = pc[31]; 1094*4cd38560SBarry Smith p33 = pc[32]; p34 = pc[33]; p35 = pc[34]; p36 = pc[35]; 1095*4cd38560SBarry Smith if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || 1096*4cd38560SBarry Smith p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || 1097*4cd38560SBarry Smith p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || 1098*4cd38560SBarry Smith p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || 1099*4cd38560SBarry Smith p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || 1100*4cd38560SBarry Smith p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || 1101*4cd38560SBarry Smith p25 != 0.0 || p26 != 0.0 || p27 != 0.0 || p28 != 0.0 || 1102*4cd38560SBarry Smith p29 != 0.0 || p30 != 0.0 || p31 != 0.0 || p32 != 0.0 || 1103*4cd38560SBarry Smith p33 != 0.0 || p34 != 0.0 || p35 != 0.0 || p36 != 0.0) { 1104*4cd38560SBarry Smith pv = ba + 36*diag_offset[row]; 1105*4cd38560SBarry Smith pj = bj + diag_offset[row] + 1; 1106*4cd38560SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1107*4cd38560SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; 1108*4cd38560SBarry Smith x9 = pv[8]; x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; 1109*4cd38560SBarry Smith x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 1110*4cd38560SBarry Smith x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; 1111*4cd38560SBarry Smith x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 1112*4cd38560SBarry Smith x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27]; 1113*4cd38560SBarry Smith x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31]; 1114*4cd38560SBarry Smith x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35]; 1115*4cd38560SBarry Smith pc[0] = m1 = p1*x1 + p7*x2 + p13*x3 + p19*x4 + p25*x5 + p31*x6; 1116*4cd38560SBarry Smith pc[1] = m2 = p2*x1 + p8*x2 + p14*x3 + p20*x4 + p26*x5 + p32*x6; 1117*4cd38560SBarry Smith pc[2] = m3 = p3*x1 + p9*x2 + p15*x3 + p21*x4 + p27*x5 + p33*x6; 1118*4cd38560SBarry Smith pc[3] = m4 = p4*x1 + p10*x2 + p16*x3 + p22*x4 + p28*x5 + p34*x6; 1119*4cd38560SBarry Smith pc[4] = m5 = p5*x1 + p11*x2 + p17*x3 + p23*x4 + p29*x5 + p35*x6; 1120*4cd38560SBarry Smith pc[5] = m6 = p6*x1 + p12*x2 + p18*x3 + p24*x4 + p30*x5 + p36*x6; 1121*4cd38560SBarry Smith 1122*4cd38560SBarry Smith pc[6] = m7 = p1*x7 + p7*x8 + p13*x9 + p19*x10 + p25*x11 + p31*x12; 1123*4cd38560SBarry Smith pc[7] = m8 = p2*x7 + p8*x8 + p14*x9 + p20*x10 + p26*x11 + p32*x12; 1124*4cd38560SBarry Smith pc[8] = m9 = p3*x7 + p9*x8 + p15*x9 + p21*x10 + p27*x11 + p33*x12; 1125*4cd38560SBarry Smith pc[9] = m10 = p4*x7 + p10*x8 + p16*x9 + p22*x10 + p28*x11 + p34*x12; 1126*4cd38560SBarry Smith pc[10] = m11 = p5*x7 + p11*x8 + p17*x9 + p23*x10 + p29*x11 + p35*x12; 1127*4cd38560SBarry Smith pc[11] = m12 = p6*x7 + p12*x8 + p18*x9 + p24*x10 + p30*x11 + p36*x12; 1128*4cd38560SBarry Smith 1129*4cd38560SBarry Smith pc[12] = m13 = p1*x13 + p7*x14 + p13*x15 + p19*x16 + p25*x17 + p31*x18; 1130*4cd38560SBarry Smith pc[13] = m14 = p2*x13 + p8*x14 + p14*x15 + p20*x16 + p26*x17 + p32*x18; 1131*4cd38560SBarry Smith pc[14] = m15 = p3*x13 + p9*x14 + p15*x15 + p21*x16 + p27*x17 + p33*x18; 1132*4cd38560SBarry Smith pc[15] = m16 = p4*x13 + p10*x14 + p16*x15 + p22*x16 + p28*x17 + p34*x18; 1133*4cd38560SBarry Smith pc[16] = m17 = p5*x13 + p11*x14 + p17*x15 + p23*x16 + p29*x17 + p35*x18; 1134*4cd38560SBarry Smith pc[17] = m18 = p6*x13 + p12*x14 + p18*x15 + p24*x16 + p30*x17 + p36*x18; 1135*4cd38560SBarry Smith 1136*4cd38560SBarry Smith pc[18] = m19 = p1*x19 + p7*x20 + p13*x21 + p19*x22 + p25*x23 + p31*x24; 1137*4cd38560SBarry Smith pc[19] = m20 = p2*x19 + p8*x20 + p14*x21 + p20*x22 + p26*x23 + p32*x24; 1138*4cd38560SBarry Smith pc[20] = m21 = p3*x19 + p9*x20 + p15*x21 + p21*x22 + p27*x23 + p33*x24; 1139*4cd38560SBarry Smith pc[21] = m22 = p4*x19 + p10*x20 + p16*x21 + p22*x22 + p28*x23 + p34*x24; 1140*4cd38560SBarry Smith pc[22] = m23 = p5*x19 + p11*x20 + p17*x21 + p23*x22 + p29*x23 + p35*x24; 1141*4cd38560SBarry Smith pc[23] = m24 = p6*x19 + p12*x20 + p18*x21 + p24*x22 + p30*x23 + p36*x24; 1142*4cd38560SBarry Smith 1143*4cd38560SBarry Smith pc[24] = m25 = p1*x25 + p7*x26 + p13*x27 + p19*x28 + p25*x29 + p31*x30; 1144*4cd38560SBarry Smith pc[25] = m26 = p2*x25 + p8*x26 + p14*x27 + p20*x28 + p26*x29 + p32*x30; 1145*4cd38560SBarry Smith pc[26] = m27 = p3*x25 + p9*x26 + p15*x27 + p21*x28 + p27*x29 + p33*x30; 1146*4cd38560SBarry Smith pc[27] = m28 = p4*x25 + p10*x26 + p16*x27 + p22*x28 + p28*x29 + p34*x30; 1147*4cd38560SBarry Smith pc[28] = m29 = p5*x25 + p11*x26 + p17*x27 + p23*x28 + p29*x29 + p35*x30; 1148*4cd38560SBarry Smith pc[29] = m30 = p6*x25 + p12*x26 + p18*x27 + p24*x28 + p30*x29 + p36*x30; 1149*4cd38560SBarry Smith 1150*4cd38560SBarry Smith pc[30] = m31 = p1*x31 + p7*x32 + p13*x33 + p19*x34 + p25*x35 + p31*x36; 1151*4cd38560SBarry Smith pc[31] = m32 = p2*x31 + p8*x32 + p14*x33 + p20*x34 + p26*x35 + p32*x36; 1152*4cd38560SBarry Smith pc[32] = m33 = p3*x31 + p9*x32 + p15*x33 + p21*x34 + p27*x35 + p33*x36; 1153*4cd38560SBarry Smith pc[33] = m34 = p4*x31 + p10*x32 + p16*x33 + p22*x34 + p28*x35 + p34*x36; 1154*4cd38560SBarry Smith pc[34] = m35 = p5*x31 + p11*x32 + p17*x33 + p23*x34 + p29*x35 + p35*x36; 1155*4cd38560SBarry Smith pc[35] = m36 = p6*x31 + p12*x32 + p18*x33 + p24*x34 + p30*x35 + p36*x36; 1156*4cd38560SBarry Smith 1157*4cd38560SBarry Smith nz = bi[row+1] - diag_offset[row] - 1; 1158*4cd38560SBarry Smith pv += 36; 1159*4cd38560SBarry Smith for (j=0; j<nz; j++) { 1160*4cd38560SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1161*4cd38560SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; 1162*4cd38560SBarry Smith x9 = pv[8]; x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; 1163*4cd38560SBarry Smith x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 1164*4cd38560SBarry Smith x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; 1165*4cd38560SBarry Smith x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 1166*4cd38560SBarry Smith x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27]; 1167*4cd38560SBarry Smith x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31]; 1168*4cd38560SBarry Smith x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35]; 1169*4cd38560SBarry Smith x = rtmp + 36*pj[j]; 1170*4cd38560SBarry Smith x[0] -= m1*x1 + m7*x2 + m13*x3 + m19*x4 + m25*x5 + m31*x6; 1171*4cd38560SBarry Smith x[1] -= m2*x1 + m8*x2 + m14*x3 + m20*x4 + m26*x5 + m32*x6; 1172*4cd38560SBarry Smith x[2] -= m3*x1 + m9*x2 + m15*x3 + m21*x4 + m27*x5 + m33*x6; 1173*4cd38560SBarry Smith x[3] -= m4*x1 + m10*x2 + m16*x3 + m22*x4 + m28*x5 + m34*x6; 1174*4cd38560SBarry Smith x[4] -= m5*x1 + m11*x2 + m17*x3 + m23*x4 + m29*x5 + m35*x6; 1175*4cd38560SBarry Smith x[5] -= m6*x1 + m12*x2 + m18*x3 + m24*x4 + m30*x5 + m36*x6; 1176*4cd38560SBarry Smith 1177*4cd38560SBarry Smith x[6] -= m1*x7 + m7*x8 + m13*x9 + m19*x10 + m25*x11 + m31*x12; 1178*4cd38560SBarry Smith x[7] -= m2*x7 + m8*x8 + m14*x9 + m20*x10 + m26*x11 + m32*x12; 1179*4cd38560SBarry Smith x[8] -= m3*x7 + m9*x8 + m15*x9 + m21*x10 + m27*x11 + m33*x12; 1180*4cd38560SBarry Smith x[9] -= m4*x7 + m10*x8 + m16*x9 + m22*x10 + m28*x11 + m34*x12; 1181*4cd38560SBarry Smith x[10] -= m5*x7 + m11*x8 + m17*x9 + m23*x10 + m29*x11 + m35*x12; 1182*4cd38560SBarry Smith x[11] -= m6*x7 + m12*x8 + m18*x9 + m24*x10 + m30*x11 + m36*x12; 1183*4cd38560SBarry Smith 1184*4cd38560SBarry Smith x[12] -= m1*x13 + m7*x14 + m13*x15 + m19*x16 + m25*x17 + m31*x18; 1185*4cd38560SBarry Smith x[13] -= m2*x13 + m8*x14 + m14*x15 + m20*x16 + m26*x17 + m32*x18; 1186*4cd38560SBarry Smith x[14] -= m3*x13 + m9*x14 + m15*x15 + m21*x16 + m27*x17 + m33*x18; 1187*4cd38560SBarry Smith x[15] -= m4*x13 + m10*x14 + m16*x15 + m22*x16 + m28*x17 + m34*x18; 1188*4cd38560SBarry Smith x[16] -= m5*x13 + m11*x14 + m17*x15 + m23*x16 + m29*x17 + m35*x18; 1189*4cd38560SBarry Smith x[17] -= m6*x13 + m12*x14 + m18*x15 + m24*x16 + m30*x17 + m36*x18; 1190*4cd38560SBarry Smith 1191*4cd38560SBarry Smith x[18] -= m1*x19 + m7*x20 + m13*x21 + m19*x22 + m25*x23 + m31*x24; 1192*4cd38560SBarry Smith x[19] -= m2*x19 + m8*x20 + m14*x21 + m20*x22 + m26*x23 + m32*x24; 1193*4cd38560SBarry Smith x[20] -= m3*x19 + m9*x20 + m15*x21 + m21*x22 + m27*x23 + m33*x24; 1194*4cd38560SBarry Smith x[21] -= m4*x19 + m10*x20 + m16*x21 + m22*x22 + m28*x23 + m34*x24; 1195*4cd38560SBarry Smith x[22] -= m5*x19 + m11*x20 + m17*x21 + m23*x22 + m29*x23 + m35*x24; 1196*4cd38560SBarry Smith x[23] -= m6*x19 + m12*x20 + m18*x21 + m24*x22 + m30*x23 + m36*x24; 1197*4cd38560SBarry Smith 1198*4cd38560SBarry Smith x[24] -= m1*x25 + m7*x26 + m13*x27 + m19*x28 + m25*x29 + m31*x30; 1199*4cd38560SBarry Smith x[25] -= m2*x25 + m8*x26 + m14*x27 + m20*x28 + m26*x29 + m32*x30; 1200*4cd38560SBarry Smith x[26] -= m3*x25 + m9*x26 + m15*x27 + m21*x28 + m27*x29 + m33*x30; 1201*4cd38560SBarry Smith x[27] -= m4*x25 + m10*x26 + m16*x27 + m22*x28 + m28*x29 + m34*x30; 1202*4cd38560SBarry Smith x[28] -= m5*x25 + m11*x26 + m17*x27 + m23*x28 + m29*x29 + m35*x30; 1203*4cd38560SBarry Smith x[29] -= m6*x25 + m12*x26 + m18*x27 + m24*x28 + m30*x29 + m36*x30; 1204*4cd38560SBarry Smith 1205*4cd38560SBarry Smith x[30] -= m1*x31 + m7*x32 + m13*x33 + m19*x34 + m25*x35 + m31*x36; 1206*4cd38560SBarry Smith x[31] -= m2*x31 + m8*x32 + m14*x33 + m20*x34 + m26*x35 + m32*x36; 1207*4cd38560SBarry Smith x[32] -= m3*x31 + m9*x32 + m15*x33 + m21*x34 + m27*x35 + m33*x36; 1208*4cd38560SBarry Smith x[33] -= m4*x31 + m10*x32 + m16*x33 + m22*x34 + m28*x35 + m34*x36; 1209*4cd38560SBarry Smith x[34] -= m5*x31 + m11*x32 + m17*x33 + m23*x34 + m29*x35 + m35*x36; 1210*4cd38560SBarry Smith x[35] -= m6*x31 + m12*x32 + m18*x33 + m24*x34 + m30*x35 + m36*x36; 1211*4cd38560SBarry Smith 1212*4cd38560SBarry Smith pv += 36; 1213*4cd38560SBarry Smith } 1214*4cd38560SBarry Smith PLogFlops(432*nz+396); 1215*4cd38560SBarry Smith } 1216*4cd38560SBarry Smith row = *ajtmp++; 1217*4cd38560SBarry Smith } 1218*4cd38560SBarry Smith /* finished row so stick it into b->a */ 1219*4cd38560SBarry Smith pv = ba + 36*bi[i]; 1220*4cd38560SBarry Smith pj = bj + bi[i]; 1221*4cd38560SBarry Smith nz = bi[i+1] - bi[i]; 1222*4cd38560SBarry Smith for ( j=0; j<nz; j++ ) { 1223*4cd38560SBarry Smith x = rtmp+36*pj[j]; 1224*4cd38560SBarry Smith pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 1225*4cd38560SBarry Smith pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; 1226*4cd38560SBarry Smith pv[8] = x[8]; pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; 1227*4cd38560SBarry Smith pv[12] = x[12]; pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 1228*4cd38560SBarry Smith pv[16] = x[16]; pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19]; 1229*4cd38560SBarry Smith pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23]; 1230*4cd38560SBarry Smith pv[24] = x[24]; pv[25] = x[25]; pv[26] = x[26]; pv[27] = x[27]; 1231*4cd38560SBarry Smith pv[28] = x[28]; pv[29] = x[29]; pv[30] = x[30]; pv[31] = x[31]; 1232*4cd38560SBarry Smith pv[32] = x[32]; pv[33] = x[33]; pv[34] = x[34]; pv[35] = x[35]; 1233*4cd38560SBarry Smith pv += 36; 1234*4cd38560SBarry Smith } 1235*4cd38560SBarry Smith /* invert diagonal block */ 1236*4cd38560SBarry Smith w = ba + 36*diag_offset[i]; 1237*4cd38560SBarry Smith ierr = Kernel_A_gets_inverse_A_6(w); CHKERRQ(ierr); 1238*4cd38560SBarry Smith } 1239*4cd38560SBarry Smith 1240*4cd38560SBarry Smith PetscFree(rtmp); 1241*4cd38560SBarry Smith C->factor = FACTOR_LU; 1242*4cd38560SBarry Smith C->assembled = PETSC_TRUE; 1243*4cd38560SBarry Smith PLogFlops(1.3333*216*b->mbs); /* from inverting diagonal blocks */ 1244*4cd38560SBarry Smith PetscFunctionReturn(0); 1245*4cd38560SBarry Smith } 1246*4cd38560SBarry Smith 1247*4cd38560SBarry Smith /* ------------------------------------------------------------*/ 1248*4cd38560SBarry Smith /* 12496d84be18SBarry Smith Version for when blocks are 5 by 5 12506d84be18SBarry Smith */ 12515615d1e5SSatish Balay #undef __FUNC__ 12525615d1e5SSatish Balay #define __FUNC__ "MatLUFactorNumeric_SeqBAIJ_5" 12536d84be18SBarry Smith int MatLUFactorNumeric_SeqBAIJ_5(Mat A,Mat *B) 12546d84be18SBarry Smith { 12556d84be18SBarry Smith Mat C = *B; 12566d84be18SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data; 12577cdf2dbbSSatish Balay IS isrow = b->row, isicol = b->icol; 12584078e994SBarry Smith int *r,*ic, ierr, i, j, n = a->mbs, *bi = b->i, *bj = b->j; 12598a36c062SBarry Smith int *ajtmpold, *ajtmp, nz, row; 12608a36c062SBarry Smith int *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j; 12613a40ed3dSBarry Smith register int *pj; 12623f1db9ecSBarry Smith register MatScalar *pv,*v,*rtmp,*pc,*w,*x; 12633f1db9ecSBarry Smith MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 12643f1db9ecSBarry Smith MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 12653f1db9ecSBarry Smith MatScalar x17,x18,x19,x20,x21,x22,x23,x24,x25,p10,p11,p12,p13,p14; 12663f1db9ecSBarry Smith MatScalar p15,p16,p17,p18,p19,p20,p21,p22,p23,p24,p25,m10,m11,m12; 12673f1db9ecSBarry Smith MatScalar m13,m14,m15,m16,m17,m18,m19,m20,m21,m22,m23,m24,m25; 12683f1db9ecSBarry Smith MatScalar *ba = b->a,*aa = a->a; 12696d84be18SBarry Smith 12703a40ed3dSBarry Smith PetscFunctionBegin; 12716d84be18SBarry Smith ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr); 12726d84be18SBarry Smith ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr); 12733f1db9ecSBarry Smith rtmp = (MatScalar *) PetscMalloc(25*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 12746d84be18SBarry Smith 12756d84be18SBarry Smith for ( i=0; i<n; i++ ) { 12764078e994SBarry Smith nz = bi[i+1] - bi[i]; 12774078e994SBarry Smith ajtmp = bj + bi[i]; 12786d84be18SBarry Smith for ( j=0; j<nz; j++ ) { 12796d84be18SBarry Smith x = rtmp+25*ajtmp[j]; 12806d84be18SBarry Smith x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 12816d84be18SBarry Smith x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0; 12826d84be18SBarry Smith x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = 0.0; 12836d84be18SBarry Smith } 12846d84be18SBarry Smith /* load in initial (unfactored row) */ 12856d84be18SBarry Smith idx = r[i]; 12864078e994SBarry Smith nz = ai[idx+1] - ai[idx]; 12874078e994SBarry Smith ajtmpold = aj + ai[idx]; 12884078e994SBarry Smith v = aa + 25*ai[idx]; 12896d84be18SBarry Smith for ( j=0; j<nz; j++ ) { 12906d84be18SBarry Smith x = rtmp+25*ic[ajtmpold[j]]; 12916d84be18SBarry Smith x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 12926d84be18SBarry Smith x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 12936d84be18SBarry Smith x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 12946d84be18SBarry Smith x[14] = v[14]; x[15] = v[15]; x[16] = v[16]; x[17] = v[17]; 12956d84be18SBarry Smith x[18] = v[18]; x[19] = v[19]; x[20] = v[20]; x[21] = v[21]; 12966d84be18SBarry Smith x[22] = v[22]; x[23] = v[23]; x[24] = v[24]; 12976d84be18SBarry Smith v += 25; 12986d84be18SBarry Smith } 12996d84be18SBarry Smith row = *ajtmp++; 13006d84be18SBarry Smith while (row < i) { 13016d84be18SBarry Smith pc = rtmp + 25*row; 13026d84be18SBarry Smith p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 13036d84be18SBarry Smith p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 13046d84be18SBarry Smith p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 13056d84be18SBarry Smith p15 = pc[14]; p16 = pc[15]; p17 = pc[16]; p18 = pc[17]; p19 = pc[18]; 13066d84be18SBarry Smith p20 = pc[19]; p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23]; 13076d84be18SBarry Smith p25 = pc[24]; 13086d84be18SBarry Smith if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 13096d84be18SBarry Smith p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 13106d84be18SBarry Smith p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 13116d84be18SBarry Smith || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || 13126d84be18SBarry Smith p20 != 0.0 || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || 13134078e994SBarry Smith p24 != 0.0 || p25 != 0.0) { 13144078e994SBarry Smith pv = ba + 25*diag_offset[row]; 13154078e994SBarry Smith pj = bj + diag_offset[row] + 1; 13166d84be18SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 13176d84be18SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 13186d84be18SBarry Smith x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 13196d84be18SBarry Smith x15 = pv[14]; x16 = pv[15]; x17 = pv[16]; x18 = pv[17]; 13206d84be18SBarry Smith x19 = pv[18]; x20 = pv[19]; x21 = pv[20]; x22 = pv[21]; 13216d84be18SBarry Smith x23 = pv[22]; x24 = pv[23]; x25 = pv[24]; 13226d84be18SBarry Smith pc[0] = m1 = p1*x1 + p6*x2 + p11*x3 + p16*x4 + p21*x5; 13236d84be18SBarry Smith pc[1] = m2 = p2*x1 + p7*x2 + p12*x3 + p17*x4 + p22*x5; 13246d84be18SBarry Smith pc[2] = m3 = p3*x1 + p8*x2 + p13*x3 + p18*x4 + p23*x5; 13256d84be18SBarry Smith pc[3] = m4 = p4*x1 + p9*x2 + p14*x3 + p19*x4 + p24*x5; 13266d84be18SBarry Smith pc[4] = m5 = p5*x1 + p10*x2 + p15*x3 + p20*x4 + p25*x5; 13276d84be18SBarry Smith 13286d84be18SBarry Smith pc[5] = m6 = p1*x6 + p6*x7 + p11*x8 + p16*x9 + p21*x10; 13296d84be18SBarry Smith pc[6] = m7 = p2*x6 + p7*x7 + p12*x8 + p17*x9 + p22*x10; 13306d84be18SBarry Smith pc[7] = m8 = p3*x6 + p8*x7 + p13*x8 + p18*x9 + p23*x10; 13316d84be18SBarry Smith pc[8] = m9 = p4*x6 + p9*x7 + p14*x8 + p19*x9 + p24*x10; 13326d84be18SBarry Smith pc[9] = m10 = p5*x6 + p10*x7 + p15*x8 + p20*x9 + p25*x10; 13336d84be18SBarry Smith 13346d84be18SBarry Smith pc[10] = m11 = p1*x11 + p6*x12 + p11*x13 + p16*x14 + p21*x15; 13356d84be18SBarry Smith pc[11] = m12 = p2*x11 + p7*x12 + p12*x13 + p17*x14 + p22*x15; 13366d84be18SBarry Smith pc[12] = m13 = p3*x11 + p8*x12 + p13*x13 + p18*x14 + p23*x15; 13376d84be18SBarry Smith pc[13] = m14 = p4*x11 + p9*x12 + p14*x13 + p19*x14 + p24*x15; 13386d84be18SBarry Smith pc[14] = m15 = p5*x11 + p10*x12 + p15*x13 + p20*x14 + p25*x15; 13396d84be18SBarry Smith 13406d84be18SBarry Smith pc[15] = m16 = p1*x16 + p6*x17 + p11*x18 + p16*x19 + p21*x20; 13416d84be18SBarry Smith pc[16] = m17 = p2*x16 + p7*x17 + p12*x18 + p17*x19 + p22*x20; 13426d84be18SBarry Smith pc[17] = m18 = p3*x16 + p8*x17 + p13*x18 + p18*x19 + p23*x20; 13436d84be18SBarry Smith pc[18] = m19 = p4*x16 + p9*x17 + p14*x18 + p19*x19 + p24*x20; 13446d84be18SBarry Smith pc[19] = m20 = p5*x16 + p10*x17 + p15*x18 + p20*x19 + p25*x20; 13456d84be18SBarry Smith 13466d84be18SBarry Smith pc[20] = m21 = p1*x21 + p6*x22 + p11*x23 + p16*x24 + p21*x25; 13476d84be18SBarry Smith pc[21] = m22 = p2*x21 + p7*x22 + p12*x23 + p17*x24 + p22*x25; 13486d84be18SBarry Smith pc[22] = m23 = p3*x21 + p8*x22 + p13*x23 + p18*x24 + p23*x25; 13496d84be18SBarry Smith pc[23] = m24 = p4*x21 + p9*x22 + p14*x23 + p19*x24 + p24*x25; 13506d84be18SBarry Smith pc[24] = m25 = p5*x21 + p10*x22 + p15*x23 + p20*x24 + p25*x25; 13516d84be18SBarry Smith 13524078e994SBarry Smith nz = bi[row+1] - diag_offset[row] - 1; 13536d84be18SBarry Smith pv += 25; 13546d84be18SBarry Smith for (j=0; j<nz; j++) { 13556d84be18SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 13566d84be18SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 13576d84be18SBarry Smith x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 13586d84be18SBarry Smith x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; x17 = pv[16]; 13596d84be18SBarry Smith x18 = pv[17]; x19 = pv[18]; x20 = pv[19]; x21 = pv[20]; 13606d84be18SBarry Smith x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; x25 = pv[24]; 13616d84be18SBarry Smith x = rtmp + 25*pj[j]; 13626d84be18SBarry Smith x[0] -= m1*x1 + m6*x2 + m11*x3 + m16*x4 + m21*x5; 13636d84be18SBarry Smith x[1] -= m2*x1 + m7*x2 + m12*x3 + m17*x4 + m22*x5; 13646d84be18SBarry Smith x[2] -= m3*x1 + m8*x2 + m13*x3 + m18*x4 + m23*x5; 13656d84be18SBarry Smith x[3] -= m4*x1 + m9*x2 + m14*x3 + m19*x4 + m24*x5; 13666d84be18SBarry Smith x[4] -= m5*x1 + m10*x2 + m15*x3 + m20*x4 + m25*x5; 13676d84be18SBarry Smith 13686d84be18SBarry Smith x[5] -= m1*x6 + m6*x7 + m11*x8 + m16*x9 + m21*x10; 13696d84be18SBarry Smith x[6] -= m2*x6 + m7*x7 + m12*x8 + m17*x9 + m22*x10; 13706d84be18SBarry Smith x[7] -= m3*x6 + m8*x7 + m13*x8 + m18*x9 + m23*x10; 13716d84be18SBarry Smith x[8] -= m4*x6 + m9*x7 + m14*x8 + m19*x9 + m24*x10; 13726d84be18SBarry Smith x[9] -= m5*x6 + m10*x7 + m15*x8 + m20*x9 + m25*x10; 13736d84be18SBarry Smith 13746d84be18SBarry Smith x[10] -= m1*x11 + m6*x12 + m11*x13 + m16*x14 + m21*x15; 13756d84be18SBarry Smith x[11] -= m2*x11 + m7*x12 + m12*x13 + m17*x14 + m22*x15; 13766d84be18SBarry Smith x[12] -= m3*x11 + m8*x12 + m13*x13 + m18*x14 + m23*x15; 13776d84be18SBarry Smith x[13] -= m4*x11 + m9*x12 + m14*x13 + m19*x14 + m24*x15; 13786d84be18SBarry Smith x[14] -= m5*x11 + m10*x12 + m15*x13 + m20*x14 + m25*x15; 13796d84be18SBarry Smith 13806d84be18SBarry Smith x[15] -= m1*x16 + m6*x17 + m11*x18 + m16*x19 + m21*x20; 13816d84be18SBarry Smith x[16] -= m2*x16 + m7*x17 + m12*x18 + m17*x19 + m22*x20; 13826d84be18SBarry Smith x[17] -= m3*x16 + m8*x17 + m13*x18 + m18*x19 + m23*x20; 13836d84be18SBarry Smith x[18] -= m4*x16 + m9*x17 + m14*x18 + m19*x19 + m24*x20; 13846d84be18SBarry Smith x[19] -= m5*x16 + m10*x17 + m15*x18 + m20*x19 + m25*x20; 13856d84be18SBarry Smith 13866d84be18SBarry Smith x[20] -= m1*x21 + m6*x22 + m11*x23 + m16*x24 + m21*x25; 13876d84be18SBarry Smith x[21] -= m2*x21 + m7*x22 + m12*x23 + m17*x24 + m22*x25; 13886d84be18SBarry Smith x[22] -= m3*x21 + m8*x22 + m13*x23 + m18*x24 + m23*x25; 13896d84be18SBarry Smith x[23] -= m4*x21 + m9*x22 + m14*x23 + m19*x24 + m24*x25; 13906d84be18SBarry Smith x[24] -= m5*x21 + m10*x22 + m15*x23 + m20*x24 + m25*x25; 13916d84be18SBarry Smith 13926d84be18SBarry Smith pv += 25; 13936d84be18SBarry Smith } 13946d84be18SBarry Smith PLogFlops(250*nz+225); 13956d84be18SBarry Smith } 13966d84be18SBarry Smith row = *ajtmp++; 13976d84be18SBarry Smith } 13986d84be18SBarry Smith /* finished row so stick it into b->a */ 13994078e994SBarry Smith pv = ba + 25*bi[i]; 14004078e994SBarry Smith pj = bj + bi[i]; 14014078e994SBarry Smith nz = bi[i+1] - bi[i]; 14026d84be18SBarry Smith for ( j=0; j<nz; j++ ) { 14036d84be18SBarry Smith x = rtmp+25*pj[j]; 14046d84be18SBarry Smith pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 14056d84be18SBarry Smith pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 14066d84be18SBarry Smith pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 14076d84be18SBarry Smith pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; pv[16] = x[16]; 14086d84be18SBarry Smith pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19]; pv[20] = x[20]; 14096d84be18SBarry Smith pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23]; pv[24] = x[24]; 14106d84be18SBarry Smith pv += 25; 14116d84be18SBarry Smith } 14126d84be18SBarry Smith /* invert diagonal block */ 14134078e994SBarry Smith w = ba + 25*diag_offset[i]; 14148a36c062SBarry Smith ierr = Kernel_A_gets_inverse_A_5(w); CHKERRQ(ierr); 14156d84be18SBarry Smith } 14166d84be18SBarry Smith 14176d84be18SBarry Smith PetscFree(rtmp); 14186d84be18SBarry Smith ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 14196d84be18SBarry Smith ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 14206d84be18SBarry Smith C->factor = FACTOR_LU; 14216d84be18SBarry Smith C->assembled = PETSC_TRUE; 14226d84be18SBarry Smith PLogFlops(1.3333*125*b->mbs); /* from inverting diagonal blocks */ 14233a40ed3dSBarry Smith PetscFunctionReturn(0); 14246d84be18SBarry Smith } 1425*4cd38560SBarry Smith /* 1426*4cd38560SBarry Smith Version for when blocks are 5 by 5 Using natural ordering 1427*4cd38560SBarry Smith */ 1428*4cd38560SBarry Smith #undef __FUNC__ 1429*4cd38560SBarry Smith #define __FUNC__ "MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering" 1430*4cd38560SBarry Smith int MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering(Mat A,Mat *B) 1431*4cd38560SBarry Smith { 1432*4cd38560SBarry Smith Mat C = *B; 1433*4cd38560SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data; 1434*4cd38560SBarry Smith int ierr, i, j, n = a->mbs, *bi = b->i, *bj = b->j; 1435*4cd38560SBarry Smith int *ajtmpold, *ajtmp, nz, row; 1436*4cd38560SBarry Smith int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 1437*4cd38560SBarry Smith register int *pj; 1438*4cd38560SBarry Smith register MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1439*4cd38560SBarry Smith MatScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 1440*4cd38560SBarry Smith MatScalar x16,x17,x18,x19,x20,x21,x22,x23,x24,x25; 1441*4cd38560SBarry Smith MatScalar p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15; 1442*4cd38560SBarry Smith MatScalar p16,p17,p18,p19,p20,p21,p22,p23,p24,p25; 1443*4cd38560SBarry Smith MatScalar m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14,m15; 1444*4cd38560SBarry Smith MatScalar m16,m17,m18,m19,m20,m21,m22,m23,m24,m25; 1445*4cd38560SBarry Smith MatScalar *ba = b->a,*aa = a->a; 1446*4cd38560SBarry Smith 1447*4cd38560SBarry Smith PetscFunctionBegin; 1448*4cd38560SBarry Smith rtmp = (MatScalar *) PetscMalloc(25*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 1449*4cd38560SBarry Smith for ( i=0; i<n; i++ ) { 1450*4cd38560SBarry Smith nz = bi[i+1] - bi[i]; 1451*4cd38560SBarry Smith ajtmp = bj + bi[i]; 1452*4cd38560SBarry Smith for ( j=0; j<nz; j++ ) { 1453*4cd38560SBarry Smith x = rtmp+25*ajtmp[j]; 1454*4cd38560SBarry Smith x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 1455*4cd38560SBarry Smith x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 1456*4cd38560SBarry Smith x[16] = x[17] = x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = 0.0; 1457*4cd38560SBarry Smith } 1458*4cd38560SBarry Smith /* load in initial (unfactored row) */ 1459*4cd38560SBarry Smith nz = ai[i+1] - ai[i]; 1460*4cd38560SBarry Smith ajtmpold = aj + ai[i]; 1461*4cd38560SBarry Smith v = aa + 25*ai[i]; 1462*4cd38560SBarry Smith for ( j=0; j<nz; j++ ) { 1463*4cd38560SBarry Smith x = rtmp+25*ajtmpold[j]; 1464*4cd38560SBarry Smith x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 1465*4cd38560SBarry Smith x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 1466*4cd38560SBarry Smith x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 1467*4cd38560SBarry Smith x[14] = v[14]; x[15] = v[15]; x[16] = v[16]; x[17] = v[17]; x[18] = v[18]; 1468*4cd38560SBarry Smith x[19] = v[19]; x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23]; 1469*4cd38560SBarry Smith x[24] = v[24]; 1470*4cd38560SBarry Smith v += 25; 1471*4cd38560SBarry Smith } 1472*4cd38560SBarry Smith row = *ajtmp++; 1473*4cd38560SBarry Smith while (row < i) { 1474*4cd38560SBarry Smith pc = rtmp + 25*row; 1475*4cd38560SBarry Smith p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 1476*4cd38560SBarry Smith p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 1477*4cd38560SBarry Smith p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 1478*4cd38560SBarry Smith p15 = pc[14]; p16 = pc[15]; p17 = pc[16]; p18 = pc[17]; 1479*4cd38560SBarry Smith p19 = pc[18]; p20 = pc[19]; p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; 1480*4cd38560SBarry Smith p24 = pc[23]; p25 = pc[24]; 1481*4cd38560SBarry Smith if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 1482*4cd38560SBarry Smith p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 1483*4cd38560SBarry Smith p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 1484*4cd38560SBarry Smith || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 1485*4cd38560SBarry Smith || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || p25 != 0.0) { 1486*4cd38560SBarry Smith pv = ba + 25*diag_offset[row]; 1487*4cd38560SBarry Smith pj = bj + diag_offset[row] + 1; 1488*4cd38560SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1489*4cd38560SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 1490*4cd38560SBarry Smith x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 1491*4cd38560SBarry Smith x15 = pv[14]; x16 = pv[15]; x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; 1492*4cd38560SBarry Smith x20 = pv[19]; x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23]; 1493*4cd38560SBarry Smith x25 = pv[24]; 1494*4cd38560SBarry Smith pc[0] = m1 = p1*x1 + p6*x2 + p11*x3 + p16*x4 + p21*x5; 1495*4cd38560SBarry Smith pc[1] = m2 = p2*x1 + p7*x2 + p12*x3 + p17*x4 + p22*x5; 1496*4cd38560SBarry Smith pc[2] = m3 = p3*x1 + p8*x2 + p13*x3 + p18*x4 + p23*x5; 1497*4cd38560SBarry Smith pc[3] = m4 = p4*x1 + p9*x2 + p14*x3 + p19*x4 + p24*x5; 1498*4cd38560SBarry Smith pc[4] = m5 = p5*x1 + p10*x2 + p15*x3 + p20*x4 + p25*x5; 1499*4cd38560SBarry Smith 1500*4cd38560SBarry Smith pc[5] = m6 = p1*x6 + p6*x7 + p11*x8 + p16*x9 + p21*x10; 1501*4cd38560SBarry Smith pc[6] = m7 = p2*x6 + p7*x7 + p12*x8 + p17*x9 + p22*x10; 1502*4cd38560SBarry Smith pc[7] = m8 = p3*x6 + p8*x7 + p13*x8 + p18*x9 + p23*x10; 1503*4cd38560SBarry Smith pc[8] = m9 = p4*x6 + p9*x7 + p14*x8 + p19*x9 + p24*x10; 1504*4cd38560SBarry Smith pc[9] = m10 = p5*x6 + p10*x7 + p15*x8 + p20*x9 + p25*x10; 1505*4cd38560SBarry Smith 1506*4cd38560SBarry Smith pc[10] = m11 = p1*x11 + p6*x12 + p11*x13 + p16*x14 + p21*x15; 1507*4cd38560SBarry Smith pc[11] = m12 = p2*x11 + p7*x12 + p12*x13 + p17*x14 + p22*x15; 1508*4cd38560SBarry Smith pc[12] = m13 = p3*x11 + p8*x12 + p13*x13 + p18*x14 + p23*x15; 1509*4cd38560SBarry Smith pc[13] = m14 = p4*x11 + p9*x12 + p14*x13 + p19*x14 + p24*x15; 1510*4cd38560SBarry Smith pc[14] = m15 = p5*x11 + p10*x12 + p15*x13 + p20*x14 + p25*x15; 1511*4cd38560SBarry Smith 1512*4cd38560SBarry Smith pc[15] = m16 = p1*x16 + p6*x17 + p11*x18 + p16*x19 + p21*x20; 1513*4cd38560SBarry Smith pc[16] = m17 = p2*x16 + p7*x17 + p12*x18 + p17*x19 + p22*x20; 1514*4cd38560SBarry Smith pc[17] = m18 = p3*x16 + p8*x17 + p13*x18 + p18*x19 + p23*x20; 1515*4cd38560SBarry Smith pc[18] = m19 = p4*x16 + p9*x17 + p14*x18 + p19*x19 + p24*x20; 1516*4cd38560SBarry Smith pc[19] = m20 = p5*x16 + p10*x17 + p15*x18 + p20*x19 + p25*x20; 1517*4cd38560SBarry Smith 1518*4cd38560SBarry Smith pc[20] = m21 = p1*x21 + p6*x22 + p11*x23 + p16*x24 + p21*x25; 1519*4cd38560SBarry Smith pc[21] = m22 = p2*x21 + p7*x22 + p12*x23 + p17*x24 + p22*x25; 1520*4cd38560SBarry Smith pc[22] = m23 = p3*x21 + p8*x22 + p13*x23 + p18*x24 + p23*x25; 1521*4cd38560SBarry Smith pc[23] = m24 = p4*x21 + p9*x22 + p14*x23 + p19*x24 + p24*x25; 1522*4cd38560SBarry Smith pc[24] = m25 = p5*x21 + p10*x22 + p15*x23 + p20*x24 + p25*x25; 1523*4cd38560SBarry Smith 1524*4cd38560SBarry Smith nz = bi[row+1] - diag_offset[row] - 1; 1525*4cd38560SBarry Smith pv += 25; 1526*4cd38560SBarry Smith for (j=0; j<nz; j++) { 1527*4cd38560SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1528*4cd38560SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 1529*4cd38560SBarry Smith x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 1530*4cd38560SBarry Smith x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; x17 = pv[16]; x18 = pv[17]; 1531*4cd38560SBarry Smith x19 = pv[18]; x20 = pv[19]; x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; 1532*4cd38560SBarry Smith x24 = pv[23]; x25 = pv[24]; 1533*4cd38560SBarry Smith x = rtmp + 25*pj[j]; 1534*4cd38560SBarry Smith x[0] -= m1*x1 + m6*x2 + m11*x3 + m16*x4 + m21*x5; 1535*4cd38560SBarry Smith x[1] -= m2*x1 + m7*x2 + m12*x3 + m17*x4 + m22*x5; 1536*4cd38560SBarry Smith x[2] -= m3*x1 + m8*x2 + m13*x3 + m18*x4 + m23*x5; 1537*4cd38560SBarry Smith x[3] -= m4*x1 + m9*x2 + m14*x3 + m19*x4 + m24*x5; 1538*4cd38560SBarry Smith x[4] -= m5*x1 + m10*x2 + m15*x3 + m20*x4 + m25*x5; 1539*4cd38560SBarry Smith 1540*4cd38560SBarry Smith x[5] -= m1*x6 + m6*x7 + m11*x8 + m16*x9 + m21*x10; 1541*4cd38560SBarry Smith x[6] -= m2*x6 + m7*x7 + m12*x8 + m17*x9 + m22*x10; 1542*4cd38560SBarry Smith x[7] -= m3*x6 + m8*x7 + m13*x8 + m18*x9 + m23*x10; 1543*4cd38560SBarry Smith x[8] -= m4*x6 + m9*x7 + m14*x8 + m19*x9 + m24*x10; 1544*4cd38560SBarry Smith x[9] -= m5*x6 + m10*x7 + m15*x8 + m20*x9 + m25*x10; 1545*4cd38560SBarry Smith 1546*4cd38560SBarry Smith x[10] -= m1*x11 + m6*x12 + m11*x13 + m16*x14 + m21*x15; 1547*4cd38560SBarry Smith x[11] -= m2*x11 + m7*x12 + m12*x13 + m17*x14 + m22*x15; 1548*4cd38560SBarry Smith x[12] -= m3*x11 + m8*x12 + m13*x13 + m18*x14 + m23*x15; 1549*4cd38560SBarry Smith x[13] -= m4*x11 + m9*x12 + m14*x13 + m19*x14 + m24*x15; 1550*4cd38560SBarry Smith x[14] -= m5*x11 + m10*x12 + m15*x13 + m20*x14 + m25*x15; 1551*4cd38560SBarry Smith 1552*4cd38560SBarry Smith x[15] -= m1*x16 + m6*x17 + m11*x18 + m16*x19 + m21*x20; 1553*4cd38560SBarry Smith x[16] -= m2*x16 + m7*x17 + m12*x18 + m17*x19 + m22*x20; 1554*4cd38560SBarry Smith x[17] -= m3*x16 + m8*x17 + m13*x18 + m18*x19 + m23*x20; 1555*4cd38560SBarry Smith x[18] -= m4*x16 + m9*x17 + m14*x18 + m19*x19 + m24*x20; 1556*4cd38560SBarry Smith x[19] -= m5*x16 + m10*x17 + m15*x18 + m20*x19 + m25*x20; 1557*4cd38560SBarry Smith 1558*4cd38560SBarry Smith x[20] -= m1*x21 + m6*x22 + m11*x23 + m16*x24 + m21*x25; 1559*4cd38560SBarry Smith x[21] -= m2*x21 + m7*x22 + m12*x23 + m17*x24 + m22*x25; 1560*4cd38560SBarry Smith x[22] -= m3*x21 + m8*x22 + m13*x23 + m18*x24 + m23*x25; 1561*4cd38560SBarry Smith x[23] -= m4*x21 + m9*x22 + m14*x23 + m19*x24 + m24*x25; 1562*4cd38560SBarry Smith x[24] -= m5*x21 + m10*x22 + m15*x23 + m20*x24 + m25*x25; 1563*4cd38560SBarry Smith pv += 25; 1564*4cd38560SBarry Smith } 1565*4cd38560SBarry Smith PLogFlops(250*nz+225); 1566*4cd38560SBarry Smith } 1567*4cd38560SBarry Smith row = *ajtmp++; 1568*4cd38560SBarry Smith } 1569*4cd38560SBarry Smith /* finished row so stick it into b->a */ 1570*4cd38560SBarry Smith pv = ba + 25*bi[i]; 1571*4cd38560SBarry Smith pj = bj + bi[i]; 1572*4cd38560SBarry Smith nz = bi[i+1] - bi[i]; 1573*4cd38560SBarry Smith for ( j=0; j<nz; j++ ) { 1574*4cd38560SBarry Smith x = rtmp+25*pj[j]; 1575*4cd38560SBarry Smith pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 1576*4cd38560SBarry Smith pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 1577*4cd38560SBarry Smith pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 1578*4cd38560SBarry Smith pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; pv[16] = x[16]; pv[17] = x[17]; 1579*4cd38560SBarry Smith pv[18] = x[18]; pv[19] = x[19]; pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22]; 1580*4cd38560SBarry Smith pv[23] = x[23]; pv[24] = x[24]; 1581*4cd38560SBarry Smith pv += 25; 1582*4cd38560SBarry Smith } 1583*4cd38560SBarry Smith /* invert diagonal block */ 1584*4cd38560SBarry Smith w = ba + 25*diag_offset[i]; 1585*4cd38560SBarry Smith ierr = Kernel_A_gets_inverse_A_5(w); CHKERRQ(ierr); 1586*4cd38560SBarry Smith } 1587*4cd38560SBarry Smith 1588*4cd38560SBarry Smith PetscFree(rtmp); 1589*4cd38560SBarry Smith C->factor = FACTOR_LU; 1590*4cd38560SBarry Smith C->assembled = PETSC_TRUE; 1591*4cd38560SBarry Smith PLogFlops(1.3333*125*b->mbs); /* from inverting diagonal blocks */ 1592*4cd38560SBarry Smith PetscFunctionReturn(0); 1593*4cd38560SBarry Smith } 15946d84be18SBarry Smith 15956d84be18SBarry Smith /* ------------------------------------------------------------*/ 15966d84be18SBarry Smith /* 15976d84be18SBarry Smith Version for when blocks are 4 by 4 15986d84be18SBarry Smith */ 15995615d1e5SSatish Balay #undef __FUNC__ 16005615d1e5SSatish Balay #define __FUNC__ "MatLUFactorNumeric_SeqBAIJ_4" 16016d84be18SBarry Smith int MatLUFactorNumeric_SeqBAIJ_4(Mat A,Mat *B) 16026d84be18SBarry Smith { 16036d84be18SBarry Smith Mat C = *B; 16046d84be18SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data; 16057cdf2dbbSSatish Balay IS isrow = b->row, isicol = b->icol; 16064078e994SBarry Smith int *r,*ic, ierr, i, j, n = a->mbs, *bi = b->i, *bj = b->j; 16078a36c062SBarry Smith int *ajtmpold, *ajtmp, nz, row; 16088a36c062SBarry Smith int *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j; 16093a40ed3dSBarry Smith register int *pj; 16103f1db9ecSBarry Smith register MatScalar *pv,*v,*rtmp,*pc,*w,*x; 16113f1db9ecSBarry Smith MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 16123f1db9ecSBarry Smith MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 16133f1db9ecSBarry Smith MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; 16143f1db9ecSBarry Smith MatScalar m13,m14,m15,m16; 16153f1db9ecSBarry Smith MatScalar *ba = b->a,*aa = a->a; 16166d84be18SBarry Smith 16173a40ed3dSBarry Smith PetscFunctionBegin; 16186d84be18SBarry Smith ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr); 16196d84be18SBarry Smith ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr); 16203f1db9ecSBarry Smith rtmp = (MatScalar *) PetscMalloc(16*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 16216d84be18SBarry Smith 16226d84be18SBarry Smith for ( i=0; i<n; i++ ) { 16234078e994SBarry Smith nz = bi[i+1] - bi[i]; 16244078e994SBarry Smith ajtmp = bj + bi[i]; 16256d84be18SBarry Smith for ( j=0; j<nz; j++ ) { 16266d84be18SBarry Smith x = rtmp+16*ajtmp[j]; 16276d84be18SBarry Smith x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 16286d84be18SBarry Smith x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 16296d84be18SBarry Smith } 16306d84be18SBarry Smith /* load in initial (unfactored row) */ 16316d84be18SBarry Smith idx = r[i]; 16324078e994SBarry Smith nz = ai[idx+1] - ai[idx]; 16334078e994SBarry Smith ajtmpold = aj + ai[idx]; 16344078e994SBarry Smith v = aa + 16*ai[idx]; 16356d84be18SBarry Smith for ( j=0; j<nz; j++ ) { 16366d84be18SBarry Smith x = rtmp+16*ic[ajtmpold[j]]; 16376d84be18SBarry Smith x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 16386d84be18SBarry Smith x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 16396d84be18SBarry Smith x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 16406d84be18SBarry Smith x[14] = v[14]; x[15] = v[15]; 16416d84be18SBarry Smith v += 16; 16426d84be18SBarry Smith } 16436d84be18SBarry Smith row = *ajtmp++; 16446d84be18SBarry Smith while (row < i) { 16456d84be18SBarry Smith pc = rtmp + 16*row; 16466d84be18SBarry Smith p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 16476d84be18SBarry Smith p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 16486d84be18SBarry Smith p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 16496d84be18SBarry Smith p15 = pc[14]; p16 = pc[15]; 16506d84be18SBarry Smith if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 16516d84be18SBarry Smith p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 16526d84be18SBarry Smith p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 16536d84be18SBarry Smith || p16 != 0.0) { 16544078e994SBarry Smith pv = ba + 16*diag_offset[row]; 16554078e994SBarry Smith pj = bj + diag_offset[row] + 1; 16566d84be18SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 16576d84be18SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 16586d84be18SBarry Smith x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 16596d84be18SBarry Smith x15 = pv[14]; x16 = pv[15]; 16606d84be18SBarry Smith pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; 16616d84be18SBarry Smith pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; 16626d84be18SBarry Smith pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; 16636d84be18SBarry Smith pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; 16646d84be18SBarry Smith 16656d84be18SBarry Smith pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; 16666d84be18SBarry Smith pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; 16676d84be18SBarry Smith pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; 16686d84be18SBarry Smith pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; 16696d84be18SBarry Smith 16706d84be18SBarry Smith pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; 16716d84be18SBarry Smith pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; 16726d84be18SBarry Smith pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; 16736d84be18SBarry Smith pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; 16746d84be18SBarry Smith 16756d84be18SBarry Smith pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; 16766d84be18SBarry Smith pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; 16776d84be18SBarry Smith pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; 16786d84be18SBarry Smith pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; 16796d84be18SBarry Smith 16804078e994SBarry Smith nz = bi[row+1] - diag_offset[row] - 1; 16816d84be18SBarry Smith pv += 16; 16826d84be18SBarry Smith for (j=0; j<nz; j++) { 16836d84be18SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 16846d84be18SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 16856d84be18SBarry Smith x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 16866d84be18SBarry Smith x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 16876d84be18SBarry Smith x = rtmp + 16*pj[j]; 16886d84be18SBarry Smith x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; 16896d84be18SBarry Smith x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; 16906d84be18SBarry Smith x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; 16916d84be18SBarry Smith x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; 16926d84be18SBarry Smith 16936d84be18SBarry Smith x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; 16946d84be18SBarry Smith x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; 16956d84be18SBarry Smith x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; 16966d84be18SBarry Smith x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; 16976d84be18SBarry Smith 16986d84be18SBarry Smith x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; 16996d84be18SBarry Smith x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; 17006d84be18SBarry Smith x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; 17016d84be18SBarry Smith x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; 17026d84be18SBarry Smith 17036d84be18SBarry Smith x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; 17046d84be18SBarry Smith x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; 17056d84be18SBarry Smith x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; 17066d84be18SBarry Smith x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; 17076d84be18SBarry Smith 17086d84be18SBarry Smith pv += 16; 17096d84be18SBarry Smith } 17106d84be18SBarry Smith PLogFlops(128*nz+112); 17116d84be18SBarry Smith } 17126d84be18SBarry Smith row = *ajtmp++; 17136d84be18SBarry Smith } 17146d84be18SBarry Smith /* finished row so stick it into b->a */ 17154078e994SBarry Smith pv = ba + 16*bi[i]; 17164078e994SBarry Smith pj = bj + bi[i]; 17174078e994SBarry Smith nz = bi[i+1] - bi[i]; 17186d84be18SBarry Smith for ( j=0; j<nz; j++ ) { 17196d84be18SBarry Smith x = rtmp+16*pj[j]; 17206d84be18SBarry Smith pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 17216d84be18SBarry Smith pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 17226d84be18SBarry Smith pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 17236d84be18SBarry Smith pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 17246d84be18SBarry Smith pv += 16; 17256d84be18SBarry Smith } 17266d84be18SBarry Smith /* invert diagonal block */ 17274078e994SBarry Smith w = ba + 16*diag_offset[i]; 17288a36c062SBarry Smith ierr = Kernel_A_gets_inverse_A_4(w); CHKERRQ(ierr); 17296d84be18SBarry Smith } 17306d84be18SBarry Smith 17316d84be18SBarry Smith PetscFree(rtmp); 17326d84be18SBarry Smith ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 17336d84be18SBarry Smith ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 17346d84be18SBarry Smith C->factor = FACTOR_LU; 17356d84be18SBarry Smith C->assembled = PETSC_TRUE; 17366d84be18SBarry Smith PLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */ 17373a40ed3dSBarry Smith PetscFunctionReturn(0); 17386d84be18SBarry Smith } 173983559355SBarry Smith /* 174083559355SBarry Smith Version for when blocks are 4 by 4 Using natural ordering 174183559355SBarry Smith */ 174283559355SBarry Smith #undef __FUNC__ 174383559355SBarry Smith #define __FUNC__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering" 174483559355SBarry Smith int MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat A,Mat *B) 174583559355SBarry Smith { 174683559355SBarry Smith Mat C = *B; 174783559355SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data; 174883559355SBarry Smith int ierr, i, j, n = a->mbs, *bi = b->i, *bj = b->j; 174983559355SBarry Smith int *ajtmpold, *ajtmp, nz, row; 175083559355SBarry Smith int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 17513a40ed3dSBarry Smith register int *pj; 17523f1db9ecSBarry Smith register MatScalar *pv,*v,*rtmp,*pc,*w,*x; 17533f1db9ecSBarry Smith MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 17543f1db9ecSBarry Smith MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 17553f1db9ecSBarry Smith MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; 17563f1db9ecSBarry Smith MatScalar m13,m14,m15,m16; 17573f1db9ecSBarry Smith MatScalar *ba = b->a,*aa = a->a; 175883559355SBarry Smith 17593a40ed3dSBarry Smith PetscFunctionBegin; 17603f1db9ecSBarry Smith rtmp = (MatScalar *) PetscMalloc(16*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 176183559355SBarry Smith 176283559355SBarry Smith for ( i=0; i<n; i++ ) { 176383559355SBarry Smith nz = bi[i+1] - bi[i]; 176483559355SBarry Smith ajtmp = bj + bi[i]; 176583559355SBarry Smith for ( j=0; j<nz; j++ ) { 176683559355SBarry Smith x = rtmp+16*ajtmp[j]; 176783559355SBarry Smith x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 176883559355SBarry Smith x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 176983559355SBarry Smith } 177083559355SBarry Smith /* load in initial (unfactored row) */ 177183559355SBarry Smith nz = ai[i+1] - ai[i]; 177283559355SBarry Smith ajtmpold = aj + ai[i]; 177383559355SBarry Smith v = aa + 16*ai[i]; 177483559355SBarry Smith for ( j=0; j<nz; j++ ) { 177583559355SBarry Smith x = rtmp+16*ajtmpold[j]; 177683559355SBarry Smith x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 177783559355SBarry Smith x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 177883559355SBarry Smith x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 177983559355SBarry Smith x[14] = v[14]; x[15] = v[15]; 178083559355SBarry Smith v += 16; 178183559355SBarry Smith } 178283559355SBarry Smith row = *ajtmp++; 178383559355SBarry Smith while (row < i) { 178483559355SBarry Smith pc = rtmp + 16*row; 178583559355SBarry Smith p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 178683559355SBarry Smith p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 178783559355SBarry Smith p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 178883559355SBarry Smith p15 = pc[14]; p16 = pc[15]; 178983559355SBarry Smith if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 179083559355SBarry Smith p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 179183559355SBarry Smith p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 179283559355SBarry Smith || p16 != 0.0) { 179383559355SBarry Smith pv = ba + 16*diag_offset[row]; 179483559355SBarry Smith pj = bj + diag_offset[row] + 1; 179583559355SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 179683559355SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 179783559355SBarry Smith x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 179883559355SBarry Smith x15 = pv[14]; x16 = pv[15]; 179983559355SBarry Smith pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; 180083559355SBarry Smith pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; 180183559355SBarry Smith pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; 180283559355SBarry Smith pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; 180383559355SBarry Smith 180483559355SBarry Smith pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; 180583559355SBarry Smith pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; 180683559355SBarry Smith pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; 180783559355SBarry Smith pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; 180883559355SBarry Smith 180983559355SBarry Smith pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; 181083559355SBarry Smith pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; 181183559355SBarry Smith pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; 181283559355SBarry Smith pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; 181383559355SBarry Smith 181483559355SBarry Smith pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; 181583559355SBarry Smith pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; 181683559355SBarry Smith pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; 181783559355SBarry Smith pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; 181883559355SBarry Smith 181983559355SBarry Smith nz = bi[row+1] - diag_offset[row] - 1; 182083559355SBarry Smith pv += 16; 182183559355SBarry Smith for (j=0; j<nz; j++) { 182283559355SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 182383559355SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 182483559355SBarry Smith x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 182583559355SBarry Smith x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 182683559355SBarry Smith x = rtmp + 16*pj[j]; 182783559355SBarry Smith x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; 182883559355SBarry Smith x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; 182983559355SBarry Smith x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; 183083559355SBarry Smith x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; 183183559355SBarry Smith 183283559355SBarry Smith x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; 183383559355SBarry Smith x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; 183483559355SBarry Smith x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; 183583559355SBarry Smith x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; 183683559355SBarry Smith 183783559355SBarry Smith x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; 183883559355SBarry Smith x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; 183983559355SBarry Smith x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; 184083559355SBarry Smith x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; 184183559355SBarry Smith 184283559355SBarry Smith x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; 184383559355SBarry Smith x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; 184483559355SBarry Smith x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; 184583559355SBarry Smith x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; 184683559355SBarry Smith 184783559355SBarry Smith pv += 16; 184883559355SBarry Smith } 184983559355SBarry Smith PLogFlops(128*nz+112); 185083559355SBarry Smith } 185183559355SBarry Smith row = *ajtmp++; 185283559355SBarry Smith } 185383559355SBarry Smith /* finished row so stick it into b->a */ 185483559355SBarry Smith pv = ba + 16*bi[i]; 185583559355SBarry Smith pj = bj + bi[i]; 185683559355SBarry Smith nz = bi[i+1] - bi[i]; 185783559355SBarry Smith for ( j=0; j<nz; j++ ) { 185883559355SBarry Smith x = rtmp+16*pj[j]; 185983559355SBarry Smith pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 186083559355SBarry Smith pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 186183559355SBarry Smith pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 186283559355SBarry Smith pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 186383559355SBarry Smith pv += 16; 186483559355SBarry Smith } 186583559355SBarry Smith /* invert diagonal block */ 186683559355SBarry Smith w = ba + 16*diag_offset[i]; 186783559355SBarry Smith ierr = Kernel_A_gets_inverse_A_4(w); CHKERRQ(ierr); 186883559355SBarry Smith } 186983559355SBarry Smith 187083559355SBarry Smith PetscFree(rtmp); 187183559355SBarry Smith C->factor = FACTOR_LU; 187283559355SBarry Smith C->assembled = PETSC_TRUE; 187383559355SBarry Smith PLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */ 18743a40ed3dSBarry Smith PetscFunctionReturn(0); 187583559355SBarry Smith } 187683559355SBarry Smith 1877667159a5SBarry Smith 18786d84be18SBarry Smith /* ------------------------------------------------------------*/ 18796d84be18SBarry Smith /* 18806d84be18SBarry Smith Version for when blocks are 3 by 3 18816d84be18SBarry Smith */ 18825615d1e5SSatish Balay #undef __FUNC__ 18835615d1e5SSatish Balay #define __FUNC__ "MatLUFactorNumeric_SeqBAIJ_3" 18846d84be18SBarry Smith int MatLUFactorNumeric_SeqBAIJ_3(Mat A,Mat *B) 18856d84be18SBarry Smith { 18866d84be18SBarry Smith Mat C = *B; 18876d84be18SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data; 18887cdf2dbbSSatish Balay IS isrow = b->row, isicol = b->icol; 18894078e994SBarry Smith int *r,*ic, ierr, i, j, n = a->mbs, *bi = b->i, *bj = b->j; 1890a3c32e02SBarry Smith int *ajtmpold, *ajtmp, nz, row, *ai=a->i,*aj=a->j; 1891a3c32e02SBarry Smith int *diag_offset = b->diag,idx; 18923a40ed3dSBarry Smith register int *pj; 18933f1db9ecSBarry Smith register MatScalar *pv,*v,*rtmp,*pc,*w,*x; 18943f1db9ecSBarry Smith MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 18953f1db9ecSBarry Smith MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9; 18963f1db9ecSBarry Smith MatScalar *ba = b->a,*aa = a->a; 18976d84be18SBarry Smith 18983a40ed3dSBarry Smith PetscFunctionBegin; 18996d84be18SBarry Smith ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr); 19006d84be18SBarry Smith ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr); 19013f1db9ecSBarry Smith rtmp = (MatScalar *) PetscMalloc(9*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 19026d84be18SBarry Smith 19036d84be18SBarry Smith for ( i=0; i<n; i++ ) { 19044078e994SBarry Smith nz = bi[i+1] - bi[i]; 19054078e994SBarry Smith ajtmp = bj + bi[i]; 19066d84be18SBarry Smith for ( j=0; j<nz; j++ ) { 19076d84be18SBarry Smith x = rtmp + 9*ajtmp[j]; 1908*4cd38560SBarry Smith x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0; 19096d84be18SBarry Smith } 19106d84be18SBarry Smith /* load in initial (unfactored row) */ 19116d84be18SBarry Smith idx = r[i]; 19124078e994SBarry Smith nz = ai[idx+1] - ai[idx]; 19134078e994SBarry Smith ajtmpold = aj + ai[idx]; 19144078e994SBarry Smith v = aa + 9*ai[idx]; 19156d84be18SBarry Smith for ( j=0; j<nz; j++ ) { 19166d84be18SBarry Smith x = rtmp + 9*ic[ajtmpold[j]]; 19176d84be18SBarry Smith x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 19186d84be18SBarry Smith x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 19196d84be18SBarry Smith v += 9; 19206d84be18SBarry Smith } 19216d84be18SBarry Smith row = *ajtmp++; 19226d84be18SBarry Smith while (row < i) { 19236d84be18SBarry Smith pc = rtmp + 9*row; 19246d84be18SBarry Smith p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 19256d84be18SBarry Smith p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 19266d84be18SBarry Smith if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 19276d84be18SBarry Smith p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) { 19284078e994SBarry Smith pv = ba + 9*diag_offset[row]; 19294078e994SBarry Smith pj = bj + diag_offset[row] + 1; 19306d84be18SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 19316d84be18SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 19326d84be18SBarry Smith pc[0] = m1 = p1*x1 + p4*x2 + p7*x3; 19336d84be18SBarry Smith pc[1] = m2 = p2*x1 + p5*x2 + p8*x3; 19346d84be18SBarry Smith pc[2] = m3 = p3*x1 + p6*x2 + p9*x3; 19356d84be18SBarry Smith 19366d84be18SBarry Smith pc[3] = m4 = p1*x4 + p4*x5 + p7*x6; 19376d84be18SBarry Smith pc[4] = m5 = p2*x4 + p5*x5 + p8*x6; 19386d84be18SBarry Smith pc[5] = m6 = p3*x4 + p6*x5 + p9*x6; 19396d84be18SBarry Smith 19406d84be18SBarry Smith pc[6] = m7 = p1*x7 + p4*x8 + p7*x9; 19416d84be18SBarry Smith pc[7] = m8 = p2*x7 + p5*x8 + p8*x9; 19426d84be18SBarry Smith pc[8] = m9 = p3*x7 + p6*x8 + p9*x9; 19434078e994SBarry Smith nz = bi[row+1] - diag_offset[row] - 1; 19446d84be18SBarry Smith pv += 9; 19456d84be18SBarry Smith for (j=0; j<nz; j++) { 19466d84be18SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 19476d84be18SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 19486d84be18SBarry Smith x = rtmp + 9*pj[j]; 19496d84be18SBarry Smith x[0] -= m1*x1 + m4*x2 + m7*x3; 19506d84be18SBarry Smith x[1] -= m2*x1 + m5*x2 + m8*x3; 19516d84be18SBarry Smith x[2] -= m3*x1 + m6*x2 + m9*x3; 19526d84be18SBarry Smith 19536d84be18SBarry Smith x[3] -= m1*x4 + m4*x5 + m7*x6; 19546d84be18SBarry Smith x[4] -= m2*x4 + m5*x5 + m8*x6; 19556d84be18SBarry Smith x[5] -= m3*x4 + m6*x5 + m9*x6; 19566d84be18SBarry Smith 19576d84be18SBarry Smith x[6] -= m1*x7 + m4*x8 + m7*x9; 19586d84be18SBarry Smith x[7] -= m2*x7 + m5*x8 + m8*x9; 19596d84be18SBarry Smith x[8] -= m3*x7 + m6*x8 + m9*x9; 19606d84be18SBarry Smith pv += 9; 19616d84be18SBarry Smith } 19626d84be18SBarry Smith PLogFlops(54*nz+36); 19636d84be18SBarry Smith } 19646d84be18SBarry Smith row = *ajtmp++; 19656d84be18SBarry Smith } 19666d84be18SBarry Smith /* finished row so stick it into b->a */ 19674078e994SBarry Smith pv = ba + 9*bi[i]; 19684078e994SBarry Smith pj = bj + bi[i]; 19694078e994SBarry Smith nz = bi[i+1] - bi[i]; 19706d84be18SBarry Smith for ( j=0; j<nz; j++ ) { 19716d84be18SBarry Smith x = rtmp + 9*pj[j]; 19726d84be18SBarry Smith pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 19736d84be18SBarry Smith pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 19746d84be18SBarry Smith pv += 9; 19756d84be18SBarry Smith } 19766d84be18SBarry Smith /* invert diagonal block */ 19774078e994SBarry Smith w = ba + 9*diag_offset[i]; 19784078e994SBarry Smith ierr = Kernel_A_gets_inverse_A_3(w); CHKERRQ(ierr); 19796d84be18SBarry Smith } 19806d84be18SBarry Smith 19816d84be18SBarry Smith PetscFree(rtmp); 19826d84be18SBarry Smith ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 19836d84be18SBarry Smith ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 19846d84be18SBarry Smith C->factor = FACTOR_LU; 19856d84be18SBarry Smith C->assembled = PETSC_TRUE; 19866d84be18SBarry Smith PLogFlops(1.3333*27*b->mbs); /* from inverting diagonal blocks */ 19873a40ed3dSBarry Smith PetscFunctionReturn(0); 19886d84be18SBarry Smith } 1989*4cd38560SBarry Smith /* 1990*4cd38560SBarry Smith Version for when blocks are 3 by 3 Using natural ordering 1991*4cd38560SBarry Smith */ 1992*4cd38560SBarry Smith #undef __FUNC__ 1993*4cd38560SBarry Smith #define __FUNC__ "MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering" 1994*4cd38560SBarry Smith int MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering(Mat A,Mat *B) 1995*4cd38560SBarry Smith { 1996*4cd38560SBarry Smith Mat C = *B; 1997*4cd38560SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data; 1998*4cd38560SBarry Smith int ierr, i, j, n = a->mbs, *bi = b->i, *bj = b->j; 1999*4cd38560SBarry Smith int *ajtmpold, *ajtmp, nz, row; 2000*4cd38560SBarry Smith int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 2001*4cd38560SBarry Smith register int *pj; 2002*4cd38560SBarry Smith register MatScalar *pv,*v,*rtmp,*pc,*w,*x; 2003*4cd38560SBarry Smith MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 2004*4cd38560SBarry Smith MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9; 2005*4cd38560SBarry Smith MatScalar *ba = b->a,*aa = a->a; 2006*4cd38560SBarry Smith 2007*4cd38560SBarry Smith PetscFunctionBegin; 2008*4cd38560SBarry Smith rtmp = (MatScalar *) PetscMalloc(9*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 2009*4cd38560SBarry Smith 2010*4cd38560SBarry Smith for ( i=0; i<n; i++ ) { 2011*4cd38560SBarry Smith nz = bi[i+1] - bi[i]; 2012*4cd38560SBarry Smith ajtmp = bj + bi[i]; 2013*4cd38560SBarry Smith for ( j=0; j<nz; j++ ) { 2014*4cd38560SBarry Smith x = rtmp+9*ajtmp[j]; 2015*4cd38560SBarry Smith x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0; 2016*4cd38560SBarry Smith } 2017*4cd38560SBarry Smith /* load in initial (unfactored row) */ 2018*4cd38560SBarry Smith nz = ai[i+1] - ai[i]; 2019*4cd38560SBarry Smith ajtmpold = aj + ai[i]; 2020*4cd38560SBarry Smith v = aa + 9*ai[i]; 2021*4cd38560SBarry Smith for ( j=0; j<nz; j++ ) { 2022*4cd38560SBarry Smith x = rtmp+9*ajtmpold[j]; 2023*4cd38560SBarry Smith x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 2024*4cd38560SBarry Smith x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 2025*4cd38560SBarry Smith v += 9; 2026*4cd38560SBarry Smith } 2027*4cd38560SBarry Smith row = *ajtmp++; 2028*4cd38560SBarry Smith while (row < i) { 2029*4cd38560SBarry Smith pc = rtmp + 9*row; 2030*4cd38560SBarry Smith p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 2031*4cd38560SBarry Smith p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 2032*4cd38560SBarry Smith if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 2033*4cd38560SBarry Smith p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) { 2034*4cd38560SBarry Smith pv = ba + 9*diag_offset[row]; 2035*4cd38560SBarry Smith pj = bj + diag_offset[row] + 1; 2036*4cd38560SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 2037*4cd38560SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 2038*4cd38560SBarry Smith pc[0] = m1 = p1*x1 + p4*x2 + p7*x3; 2039*4cd38560SBarry Smith pc[1] = m2 = p2*x1 + p5*x2 + p8*x3; 2040*4cd38560SBarry Smith pc[2] = m3 = p3*x1 + p6*x2 + p9*x3; 2041*4cd38560SBarry Smith 2042*4cd38560SBarry Smith pc[3] = m4 = p1*x4 + p4*x5 + p7*x6; 2043*4cd38560SBarry Smith pc[4] = m5 = p2*x4 + p5*x5 + p8*x6; 2044*4cd38560SBarry Smith pc[5] = m6 = p3*x4 + p6*x5 + p9*x6; 2045*4cd38560SBarry Smith 2046*4cd38560SBarry Smith pc[6] = m7 = p1*x7 + p4*x8 + p7*x9; 2047*4cd38560SBarry Smith pc[7] = m8 = p2*x7 + p5*x8 + p8*x9; 2048*4cd38560SBarry Smith pc[8] = m9 = p3*x7 + p6*x8 + p9*x9; 2049*4cd38560SBarry Smith 2050*4cd38560SBarry Smith nz = bi[row+1] - diag_offset[row] - 1; 2051*4cd38560SBarry Smith pv += 9; 2052*4cd38560SBarry Smith for (j=0; j<nz; j++) { 2053*4cd38560SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 2054*4cd38560SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 2055*4cd38560SBarry Smith x = rtmp + 9*pj[j]; 2056*4cd38560SBarry Smith x[0] -= m1*x1 + m4*x2 + m7*x3; 2057*4cd38560SBarry Smith x[1] -= m2*x1 + m5*x2 + m8*x3; 2058*4cd38560SBarry Smith x[2] -= m3*x1 + m6*x2 + m9*x3; 2059*4cd38560SBarry Smith 2060*4cd38560SBarry Smith x[3] -= m1*x4 + m4*x5 + m7*x6; 2061*4cd38560SBarry Smith x[4] -= m2*x4 + m5*x5 + m8*x6; 2062*4cd38560SBarry Smith x[5] -= m3*x4 + m6*x5 + m9*x6; 2063*4cd38560SBarry Smith 2064*4cd38560SBarry Smith x[6] -= m1*x7 + m4*x8 + m7*x9; 2065*4cd38560SBarry Smith x[7] -= m2*x7 + m5*x8 + m8*x9; 2066*4cd38560SBarry Smith x[8] -= m3*x7 + m6*x8 + m9*x9; 2067*4cd38560SBarry Smith pv += 9; 2068*4cd38560SBarry Smith } 2069*4cd38560SBarry Smith PLogFlops(54*nz+36); 2070*4cd38560SBarry Smith } 2071*4cd38560SBarry Smith row = *ajtmp++; 2072*4cd38560SBarry Smith } 2073*4cd38560SBarry Smith /* finished row so stick it into b->a */ 2074*4cd38560SBarry Smith pv = ba + 9*bi[i]; 2075*4cd38560SBarry Smith pj = bj + bi[i]; 2076*4cd38560SBarry Smith nz = bi[i+1] - bi[i]; 2077*4cd38560SBarry Smith for ( j=0; j<nz; j++ ) { 2078*4cd38560SBarry Smith x = rtmp+9*pj[j]; 2079*4cd38560SBarry Smith pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 2080*4cd38560SBarry Smith pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 2081*4cd38560SBarry Smith pv += 9; 2082*4cd38560SBarry Smith } 2083*4cd38560SBarry Smith /* invert diagonal block */ 2084*4cd38560SBarry Smith w = ba + 9*diag_offset[i]; 2085*4cd38560SBarry Smith ierr = Kernel_A_gets_inverse_A_3(w); CHKERRQ(ierr); 2086*4cd38560SBarry Smith } 2087*4cd38560SBarry Smith 2088*4cd38560SBarry Smith PetscFree(rtmp); 2089*4cd38560SBarry Smith C->factor = FACTOR_LU; 2090*4cd38560SBarry Smith C->assembled = PETSC_TRUE; 2091*4cd38560SBarry Smith PLogFlops(1.3333*27*b->mbs); /* from inverting diagonal blocks */ 2092*4cd38560SBarry Smith PetscFunctionReturn(0); 2093*4cd38560SBarry Smith } 20946d84be18SBarry Smith 20956d84be18SBarry Smith /* ------------------------------------------------------------*/ 20966d84be18SBarry Smith /* 20974eeb42bcSBarry Smith Version for when blocks are 2 by 2 20984eeb42bcSBarry Smith */ 20995615d1e5SSatish Balay #undef __FUNC__ 21005615d1e5SSatish Balay #define __FUNC__ "MatLUFactorNumeric_SeqBAIJ_2" 21014eeb42bcSBarry Smith int MatLUFactorNumeric_SeqBAIJ_2(Mat A,Mat *B) 21024eeb42bcSBarry Smith { 21034eeb42bcSBarry Smith Mat C = *B; 21044eeb42bcSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data; 21057cdf2dbbSSatish Balay IS isrow = b->row, isicol = b->icol; 21064078e994SBarry Smith int *r,*ic, ierr, i, j, n = a->mbs, *bi = b->i, *bj = b->j; 21076d84be18SBarry Smith int *ajtmpold, *ajtmp, nz, row, v_pivots[2]; 21084078e994SBarry Smith int *diag_offset=b->diag,bs = 2,idx,*ai=a->i,*aj=a->j; 21093a40ed3dSBarry Smith register int *pj; 21103f1db9ecSBarry Smith register MatScalar *pv,*v,*rtmp,m1,m2,m3,m4,*pc,*w,*x,x1,x2,x3,x4; 21113f1db9ecSBarry Smith MatScalar p1,p2,p3,p4,v_work[2]; 21123f1db9ecSBarry Smith MatScalar *ba = b->a,*aa = a->a; 21134eeb42bcSBarry Smith 21143a40ed3dSBarry Smith PetscFunctionBegin; 21154eeb42bcSBarry Smith ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr); 21164eeb42bcSBarry Smith ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr); 21173f1db9ecSBarry Smith rtmp = (MatScalar *) PetscMalloc(4*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 21184eeb42bcSBarry Smith 21194eeb42bcSBarry Smith for ( i=0; i<n; i++ ) { 21204078e994SBarry Smith nz = bi[i+1] - bi[i]; 21214078e994SBarry Smith ajtmp = bj + bi[i]; 21224eeb42bcSBarry Smith for ( j=0; j<nz; j++ ) { 21234eeb42bcSBarry Smith x = rtmp+4*ajtmp[j]; x[0] = x[1] = x[2] = x[3] = 0.0; 21244eeb42bcSBarry Smith } 21254eeb42bcSBarry Smith /* load in initial (unfactored row) */ 21264eeb42bcSBarry Smith idx = r[i]; 21274078e994SBarry Smith nz = ai[idx+1] - ai[idx]; 21284078e994SBarry Smith ajtmpold = aj + ai[idx]; 21294078e994SBarry Smith v = aa + 4*ai[idx]; 21304eeb42bcSBarry Smith for ( j=0; j<nz; j++ ) { 21314eeb42bcSBarry Smith x = rtmp+4*ic[ajtmpold[j]]; 21324eeb42bcSBarry Smith x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 21334eeb42bcSBarry Smith v += 4; 21344eeb42bcSBarry Smith } 21354eeb42bcSBarry Smith row = *ajtmp++; 21364eeb42bcSBarry Smith while (row < i) { 21374eeb42bcSBarry Smith pc = rtmp + 4*row; 21384eeb42bcSBarry Smith p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 213988685aaeSLois Curfman McInnes if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) { 21404078e994SBarry Smith pv = ba + 4*diag_offset[row]; 21414078e994SBarry Smith pj = bj + diag_offset[row] + 1; 21424eeb42bcSBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 21434eeb42bcSBarry Smith pc[0] = m1 = p1*x1 + p3*x2; 21444eeb42bcSBarry Smith pc[1] = m2 = p2*x1 + p4*x2; 21454eeb42bcSBarry Smith pc[2] = m3 = p1*x3 + p3*x4; 21464eeb42bcSBarry Smith pc[3] = m4 = p2*x3 + p4*x4; 21474078e994SBarry Smith nz = bi[row+1] - diag_offset[row] - 1; 21484eeb42bcSBarry Smith pv += 4; 21494eeb42bcSBarry Smith for (j=0; j<nz; j++) { 21504eeb42bcSBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 21514eeb42bcSBarry Smith x = rtmp + 4*pj[j]; 21524eeb42bcSBarry Smith x[0] -= m1*x1 + m3*x2; 21534eeb42bcSBarry Smith x[1] -= m2*x1 + m4*x2; 21544eeb42bcSBarry Smith x[2] -= m1*x3 + m3*x4; 21554eeb42bcSBarry Smith x[3] -= m2*x3 + m4*x4; 21564eeb42bcSBarry Smith pv += 4; 21574eeb42bcSBarry Smith } 21584eeb42bcSBarry Smith PLogFlops(16*nz+12); 21594eeb42bcSBarry Smith } 21604eeb42bcSBarry Smith row = *ajtmp++; 21614eeb42bcSBarry Smith } 21624eeb42bcSBarry Smith /* finished row so stick it into b->a */ 21634078e994SBarry Smith pv = ba + 4*bi[i]; 21644078e994SBarry Smith pj = bj + bi[i]; 21654078e994SBarry Smith nz = bi[i+1] - bi[i]; 21664eeb42bcSBarry Smith for ( j=0; j<nz; j++ ) { 21674eeb42bcSBarry Smith x = rtmp+4*pj[j]; 21684eeb42bcSBarry Smith pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 21694eeb42bcSBarry Smith pv += 4; 21704eeb42bcSBarry Smith } 21714eeb42bcSBarry Smith /* invert diagonal block */ 21724078e994SBarry Smith w = ba + 4*diag_offset[i]; 2173*4cd38560SBarry Smith ierr = Kernel_A_gets_inverse_A_2(w); CHKERRQ(ierr); 2174*4cd38560SBarry Smith /*Kernel_A_gets_inverse_A(bs,w,v_pivots,v_work);*/ 21754eeb42bcSBarry Smith } 21764eeb42bcSBarry Smith 21776d84be18SBarry Smith PetscFree(rtmp); 21784eeb42bcSBarry Smith ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 21794eeb42bcSBarry Smith ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 21804eeb42bcSBarry Smith C->factor = FACTOR_LU; 21814eeb42bcSBarry Smith C->assembled = PETSC_TRUE; 21824eeb42bcSBarry Smith PLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */ 21833a40ed3dSBarry Smith PetscFunctionReturn(0); 21844eeb42bcSBarry Smith } 2185*4cd38560SBarry Smith /* 2186*4cd38560SBarry Smith Version for when blocks are 2 by 2 Using natural ordering 2187*4cd38560SBarry Smith */ 2188*4cd38560SBarry Smith #undef __FUNC__ 2189*4cd38560SBarry Smith #define __FUNC__ "MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering" 2190*4cd38560SBarry Smith int MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat A,Mat *B) 2191*4cd38560SBarry Smith { 2192*4cd38560SBarry Smith Mat C = *B; 2193*4cd38560SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data; 2194*4cd38560SBarry Smith int ierr, i, j, n = a->mbs, *bi = b->i, *bj = b->j; 2195*4cd38560SBarry Smith int *ajtmpold, *ajtmp, nz, row, v_pivots[2]; 2196*4cd38560SBarry Smith int *diag_offset = b->diag, bs = 2,*ai=a->i,*aj=a->j; 2197*4cd38560SBarry Smith register int *pj; 2198*4cd38560SBarry Smith register MatScalar *pv,*v,*rtmp,*pc,*w,*x; 2199*4cd38560SBarry Smith MatScalar p1,p2,p3,p4,m1,m2,m3,m4,x1,x2,x3,x4,v_work[2]; 2200*4cd38560SBarry Smith MatScalar *ba = b->a,*aa = a->a; 2201*4cd38560SBarry Smith 2202*4cd38560SBarry Smith PetscFunctionBegin; 2203*4cd38560SBarry Smith rtmp = (MatScalar *) PetscMalloc(4*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 2204*4cd38560SBarry Smith 2205*4cd38560SBarry Smith for ( i=0; i<n; i++ ) { 2206*4cd38560SBarry Smith nz = bi[i+1] - bi[i]; 2207*4cd38560SBarry Smith ajtmp = bj + bi[i]; 2208*4cd38560SBarry Smith for ( j=0; j<nz; j++ ) { 2209*4cd38560SBarry Smith x = rtmp+4*ajtmp[j]; 2210*4cd38560SBarry Smith x[0] = x[1] = x[2] = x[3] = 0.0; 2211*4cd38560SBarry Smith } 2212*4cd38560SBarry Smith /* load in initial (unfactored row) */ 2213*4cd38560SBarry Smith nz = ai[i+1] - ai[i]; 2214*4cd38560SBarry Smith ajtmpold = aj + ai[i]; 2215*4cd38560SBarry Smith v = aa + 4*ai[i]; 2216*4cd38560SBarry Smith for ( j=0; j<nz; j++ ) { 2217*4cd38560SBarry Smith x = rtmp+4*ajtmpold[j]; 2218*4cd38560SBarry Smith x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 2219*4cd38560SBarry Smith v += 4; 2220*4cd38560SBarry Smith } 2221*4cd38560SBarry Smith row = *ajtmp++; 2222*4cd38560SBarry Smith while (row < i) { 2223*4cd38560SBarry Smith pc = rtmp + 4*row; 2224*4cd38560SBarry Smith p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 2225*4cd38560SBarry Smith if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) { 2226*4cd38560SBarry Smith pv = ba + 4*diag_offset[row]; 2227*4cd38560SBarry Smith pj = bj + diag_offset[row] + 1; 2228*4cd38560SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 2229*4cd38560SBarry Smith pc[0] = m1 = p1*x1 + p3*x2; 2230*4cd38560SBarry Smith pc[1] = m2 = p2*x1 + p4*x2; 2231*4cd38560SBarry Smith pc[2] = m3 = p1*x3 + p3*x4; 2232*4cd38560SBarry Smith pc[3] = m4 = p2*x3 + p4*x4; 2233*4cd38560SBarry Smith nz = bi[row+1] - diag_offset[row] - 1; 2234*4cd38560SBarry Smith pv += 4; 2235*4cd38560SBarry Smith for (j=0; j<nz; j++) { 2236*4cd38560SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 2237*4cd38560SBarry Smith x = rtmp + 4*pj[j]; 2238*4cd38560SBarry Smith x[0] -= m1*x1 + m3*x2; 2239*4cd38560SBarry Smith x[1] -= m2*x1 + m4*x2; 2240*4cd38560SBarry Smith x[2] -= m1*x3 + m3*x4; 2241*4cd38560SBarry Smith x[3] -= m2*x3 + m4*x4; 2242*4cd38560SBarry Smith pv += 4; 2243*4cd38560SBarry Smith } 2244*4cd38560SBarry Smith PLogFlops(16*nz+12); 2245*4cd38560SBarry Smith } 2246*4cd38560SBarry Smith row = *ajtmp++; 2247*4cd38560SBarry Smith } 2248*4cd38560SBarry Smith /* finished row so stick it into b->a */ 2249*4cd38560SBarry Smith pv = ba + 4*bi[i]; 2250*4cd38560SBarry Smith pj = bj + bi[i]; 2251*4cd38560SBarry Smith nz = bi[i+1] - bi[i]; 2252*4cd38560SBarry Smith for ( j=0; j<nz; j++ ) { 2253*4cd38560SBarry Smith x = rtmp+4*pj[j]; 2254*4cd38560SBarry Smith pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 2255*4cd38560SBarry Smith pv += 4; 2256*4cd38560SBarry Smith } 2257*4cd38560SBarry Smith /* invert diagonal block */ 2258*4cd38560SBarry Smith w = ba + 4*diag_offset[i]; 2259*4cd38560SBarry Smith ierr = Kernel_A_gets_inverse_A_2(w); CHKERRQ(ierr); 2260*4cd38560SBarry Smith /*Kernel_A_gets_inverse_A(bs,w,v_pivots,v_work);*/ 2261*4cd38560SBarry Smith } 2262*4cd38560SBarry Smith 2263*4cd38560SBarry Smith PetscFree(rtmp); 2264*4cd38560SBarry Smith C->factor = FACTOR_LU; 2265*4cd38560SBarry Smith C->assembled = PETSC_TRUE; 2266*4cd38560SBarry Smith PLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */ 2267*4cd38560SBarry Smith PetscFunctionReturn(0); 2268*4cd38560SBarry Smith } 22697fc0212eSBarry Smith 22707fc0212eSBarry Smith /* ----------------------------------------------------------- */ 22717fc0212eSBarry Smith /* 22727fc0212eSBarry Smith Version for when blocks are 1 by 1. 22737fc0212eSBarry Smith */ 22745615d1e5SSatish Balay #undef __FUNC__ 22755615d1e5SSatish Balay #define __FUNC__ "MatLUFactorNumeric_SeqBAIJ_1" 22767fc0212eSBarry Smith int MatLUFactorNumeric_SeqBAIJ_1(Mat A,Mat *B) 22777fc0212eSBarry Smith { 22787fc0212eSBarry Smith Mat C = *B; 22797fc0212eSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b = (Mat_SeqBAIJ *)C->data; 22807cdf2dbbSSatish Balay IS isrow = b->row, isicol = b->icol; 22814078e994SBarry Smith int *r,*ic, ierr, i, j, n = a->mbs, *bi = b->i, *bj = b->j; 22824078e994SBarry Smith int *ajtmpold, *ajtmp, nz, row,*ai = a->i,*aj = a->j; 22837fc0212eSBarry Smith int *diag_offset = b->diag,diag; 22843a40ed3dSBarry Smith register int *pj; 22853f1db9ecSBarry Smith register MatScalar *pv,*v,*rtmp,multiplier,*pc; 22863f1db9ecSBarry Smith MatScalar *ba = b->a,*aa = a->a; 22877fc0212eSBarry Smith 22883a40ed3dSBarry Smith PetscFunctionBegin; 22897fc0212eSBarry Smith ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr); 22907fc0212eSBarry Smith ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr); 22913f1db9ecSBarry Smith rtmp = (MatScalar *) PetscMalloc((n+1)*sizeof(MatScalar));CHKPTRQ(rtmp); 22927fc0212eSBarry Smith 22937fc0212eSBarry Smith for ( i=0; i<n; i++ ) { 22944078e994SBarry Smith nz = bi[i+1] - bi[i]; 22954078e994SBarry Smith ajtmp = bj + bi[i]; 22967fc0212eSBarry Smith for ( j=0; j<nz; j++ ) rtmp[ajtmp[j]] = 0.0; 22977fc0212eSBarry Smith 22987fc0212eSBarry Smith /* load in initial (unfactored row) */ 22994078e994SBarry Smith nz = ai[r[i]+1] - ai[r[i]]; 23004078e994SBarry Smith ajtmpold = aj + ai[r[i]]; 23014078e994SBarry Smith v = aa + ai[r[i]]; 23027fc0212eSBarry Smith for ( j=0; j<nz; j++ ) rtmp[ic[ajtmpold[j]]] = v[j]; 23037fc0212eSBarry Smith 23047fc0212eSBarry Smith row = *ajtmp++; 23057fc0212eSBarry Smith while (row < i) { 23067fc0212eSBarry Smith pc = rtmp + row; 23077fc0212eSBarry Smith if (*pc != 0.0) { 23084078e994SBarry Smith pv = ba + diag_offset[row]; 23094078e994SBarry Smith pj = bj + diag_offset[row] + 1; 23107fc0212eSBarry Smith multiplier = *pc * *pv++; 23117fc0212eSBarry Smith *pc = multiplier; 23124078e994SBarry Smith nz = bi[row+1] - diag_offset[row] - 1; 23137fc0212eSBarry Smith for (j=0; j<nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 23144eeb42bcSBarry Smith PLogFlops(1+2*nz); 23157fc0212eSBarry Smith } 23167fc0212eSBarry Smith row = *ajtmp++; 23177fc0212eSBarry Smith } 23187fc0212eSBarry Smith /* finished row so stick it into b->a */ 23194078e994SBarry Smith pv = ba + bi[i]; 23204078e994SBarry Smith pj = bj + bi[i]; 23214078e994SBarry Smith nz = bi[i+1] - bi[i]; 23227fc0212eSBarry Smith for ( j=0; j<nz; j++ ) {pv[j] = rtmp[pj[j]];} 23234078e994SBarry Smith diag = diag_offset[i] - bi[i]; 23247fc0212eSBarry Smith /* check pivot entry for current row */ 23257fc0212eSBarry Smith if (pv[diag] == 0.0) { 2326e3372554SBarry Smith SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,0,"Zero pivot"); 23277fc0212eSBarry Smith } 23287fc0212eSBarry Smith pv[diag] = 1.0/pv[diag]; 23297fc0212eSBarry Smith } 23307fc0212eSBarry Smith 23317fc0212eSBarry Smith PetscFree(rtmp); 23327fc0212eSBarry Smith ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 23337fc0212eSBarry Smith ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 23347fc0212eSBarry Smith C->factor = FACTOR_LU; 23357fc0212eSBarry Smith C->assembled = PETSC_TRUE; 23367fc0212eSBarry Smith PLogFlops(b->n); 23373a40ed3dSBarry Smith PetscFunctionReturn(0); 23387fc0212eSBarry Smith } 23397fc0212eSBarry Smith 23405d517e7eSBarry Smith /* ----------------------------------------------------------- */ 23415615d1e5SSatish Balay #undef __FUNC__ 23425615d1e5SSatish Balay #define __FUNC__ "MatLUFactor_SeqBAIJ" 2343ec3a8b7bSBarry Smith int MatLUFactor_SeqBAIJ(Mat A,IS row,IS col,double f) 23445d517e7eSBarry Smith { 2345ec3a8b7bSBarry Smith Mat_SeqBAIJ *mat = (Mat_SeqBAIJ *) A->data; 23465d517e7eSBarry Smith int ierr; 23475d517e7eSBarry Smith Mat C; 2348f830108cSBarry Smith PetscOps *Abops; 234917642b18SBarry Smith MatOps Aops; 23505d517e7eSBarry Smith 23513a40ed3dSBarry Smith PetscFunctionBegin; 2352f2582111SSatish Balay ierr = MatLUFactorSymbolic(A,row,col,f,&C); CHKERRQ(ierr); 23537fc0212eSBarry Smith ierr = MatLUFactorNumeric(A,&C); CHKERRQ(ierr); 23545d517e7eSBarry Smith 23555d517e7eSBarry Smith /* free all the data structures from mat */ 23565d517e7eSBarry Smith PetscFree(mat->a); 23575d517e7eSBarry Smith if (!mat->singlemalloc) {PetscFree(mat->i); PetscFree(mat->j);} 23585d517e7eSBarry Smith if (mat->diag) PetscFree(mat->diag); 23595d517e7eSBarry Smith if (mat->ilen) PetscFree(mat->ilen); 23605d517e7eSBarry Smith if (mat->imax) PetscFree(mat->imax); 23615d517e7eSBarry Smith if (mat->solve_work) PetscFree(mat->solve_work); 2362bb1453f0SSatish Balay if (mat->mult_work) PetscFree(mat->mult_work); 2363e93ccc4dSBarry Smith if (mat->icol) {ierr = ISDestroy(mat->icol);CHKERRQ(ierr);} 23645d517e7eSBarry Smith PetscFree(mat); 23655d517e7eSBarry Smith 236617642b18SBarry Smith ierr = MapDestroy(A->rmap);CHKERRQ(ierr); 236717642b18SBarry Smith ierr = MapDestroy(A->cmap);CHKERRQ(ierr); 236817642b18SBarry Smith 2369f830108cSBarry Smith /* 2370f830108cSBarry Smith This is horrible, horrible code. We need to keep the 2371f830108cSBarry Smith A pointers for the bops and ops but copy everything 2372f830108cSBarry Smith else from C. 2373f830108cSBarry Smith */ 2374f830108cSBarry Smith Abops = A->bops; 2375f830108cSBarry Smith Aops = A->ops; 2376f09e8eb9SSatish Balay PetscMemcpy(A,C,sizeof(struct _p_Mat)); 2377451c4af8SSatish Balay mat = (Mat_SeqBAIJ *) A->data; 2378451c4af8SSatish Balay PLogObjectParent(A,mat->icol); 2379451c4af8SSatish Balay 2380f830108cSBarry Smith A->bops = Abops; 2381f830108cSBarry Smith A->ops = Aops; 238227a8da17SBarry Smith A->qlist = 0; 2383f830108cSBarry Smith 23845d517e7eSBarry Smith PetscHeaderDestroy(C); 23853a40ed3dSBarry Smith PetscFunctionReturn(0); 23865d517e7eSBarry Smith } 2387*4cd38560SBarry Smith 2388*4cd38560SBarry Smith 2389*4cd38560SBarry Smith 2390