15d517e7eSBarry Smith #ifndef lint 2*4eeb42bcSBarry Smith static char vcid[] = "$Id: baijfact.c,v 1.4 1996/02/18 00:40:22 bsmith Exp bsmith $"; 35d517e7eSBarry Smith #endif 45d517e7eSBarry Smith /* 5ec3a8b7bSBarry Smith Factorization code for BAIJ format. 65d517e7eSBarry Smith */ 75d517e7eSBarry Smith 8ec3a8b7bSBarry Smith #include "baij.h" 9ec3a8b7bSBarry Smith 10ec3a8b7bSBarry Smith /* 11ec3a8b7bSBarry Smith The symbolic factorization code is identical to that for AIJ format, 12ec3a8b7bSBarry Smith except for very small changes since this is now a SeqBAIJ datastructure. 13ec3a8b7bSBarry Smith NOT good code reuse. 14ec3a8b7bSBarry Smith */ 15ec3a8b7bSBarry Smith int MatLUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,double f,Mat *B) 165d517e7eSBarry Smith { 17ec3a8b7bSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b; 185d517e7eSBarry Smith IS isicol; 19ec3a8b7bSBarry Smith int *r,*ic, ierr, i, n = a->mbs, *ai = a->i, *aj = a->j; 20de6a44a3SBarry Smith int *ainew,*ajnew, jmax,*fill, *ajtmp, nz, bs = a->bs; 215d517e7eSBarry Smith int *idnew, idx, row,m,fm, nnz, nzi,len, realloc = 0,nzbd,*im; 225d517e7eSBarry Smith 23ec3a8b7bSBarry Smith if (a->m != a->n) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must be square"); 24ec3a8b7bSBarry Smith if (!isrow) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must have row permutation"); 25ec3a8b7bSBarry Smith if (!iscol) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must have col. permutation"); 265d517e7eSBarry Smith 275d517e7eSBarry Smith ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr); 285d517e7eSBarry Smith ISGetIndices(isrow,&r); ISGetIndices(isicol,&ic); 295d517e7eSBarry Smith 305d517e7eSBarry Smith /* get new row pointers */ 315d517e7eSBarry Smith ainew = (int *) PetscMalloc( (n+1)*sizeof(int) ); CHKPTRQ(ainew); 32de6a44a3SBarry Smith ainew[0] = 0; 335d517e7eSBarry Smith /* don't know how many column pointers are needed so estimate */ 34de6a44a3SBarry Smith jmax = (int) (f*ai[n] + 1); 355d517e7eSBarry Smith ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajnew); 365d517e7eSBarry Smith /* fill is a linked list of nonzeros in active row */ 375d517e7eSBarry Smith fill = (int *) PetscMalloc( (2*n+1)*sizeof(int)); CHKPTRQ(fill); 385d517e7eSBarry Smith im = fill + n + 1; 395d517e7eSBarry Smith /* idnew is location of diagonal in factor */ 405d517e7eSBarry Smith idnew = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(idnew); 41de6a44a3SBarry Smith idnew[0] = 0; 425d517e7eSBarry Smith 435d517e7eSBarry Smith for ( i=0; i<n; i++ ) { 445d517e7eSBarry Smith /* first copy previous fill into linked list */ 455d517e7eSBarry Smith nnz = nz = ai[r[i]+1] - ai[r[i]]; 46de6a44a3SBarry Smith ajtmp = aj + ai[r[i]]; 475d517e7eSBarry Smith fill[n] = n; 485d517e7eSBarry Smith while (nz--) { 495d517e7eSBarry Smith fm = n; 50de6a44a3SBarry Smith idx = ic[*ajtmp++]; 515d517e7eSBarry Smith do { 525d517e7eSBarry Smith m = fm; 535d517e7eSBarry Smith fm = fill[m]; 545d517e7eSBarry Smith } while (fm < idx); 555d517e7eSBarry Smith fill[m] = idx; 565d517e7eSBarry Smith fill[idx] = fm; 575d517e7eSBarry Smith } 585d517e7eSBarry Smith row = fill[n]; 595d517e7eSBarry Smith while ( row < i ) { 60de6a44a3SBarry Smith ajtmp = ajnew + idnew[row] + 1; 615d517e7eSBarry Smith nzbd = 1 + idnew[row] - ainew[row]; 625d517e7eSBarry Smith nz = im[row] - nzbd; 635d517e7eSBarry Smith fm = row; 645d517e7eSBarry Smith while (nz-- > 0) { 65de6a44a3SBarry Smith idx = *ajtmp++; 665d517e7eSBarry Smith nzbd++; 675d517e7eSBarry Smith if (idx == i) im[row] = nzbd; 685d517e7eSBarry Smith do { 695d517e7eSBarry Smith m = fm; 705d517e7eSBarry Smith fm = fill[m]; 715d517e7eSBarry Smith } while (fm < idx); 725d517e7eSBarry Smith if (fm != idx) { 735d517e7eSBarry Smith fill[m] = idx; 745d517e7eSBarry Smith fill[idx] = fm; 755d517e7eSBarry Smith fm = idx; 765d517e7eSBarry Smith nnz++; 775d517e7eSBarry Smith } 785d517e7eSBarry Smith } 795d517e7eSBarry Smith row = fill[row]; 805d517e7eSBarry Smith } 815d517e7eSBarry Smith /* copy new filled row into permanent storage */ 825d517e7eSBarry Smith ainew[i+1] = ainew[i] + nnz; 835d517e7eSBarry Smith if (ainew[i+1] > jmax+1) { 845d517e7eSBarry Smith /* allocate a longer ajnew */ 855d517e7eSBarry Smith int maxadd; 86de6a44a3SBarry Smith maxadd = (int) ((f*(ai[n]+1)*(n-i+5))/n); 875d517e7eSBarry Smith if (maxadd < nnz) maxadd = (n-i)*(nnz+1); 885d517e7eSBarry Smith jmax += maxadd; 895d517e7eSBarry Smith ajtmp = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(ajtmp); 90de6a44a3SBarry Smith PetscMemcpy(ajtmp,ajnew,ainew[i]*sizeof(int)); 915d517e7eSBarry Smith PetscFree(ajnew); 925d517e7eSBarry Smith ajnew = ajtmp; 935d517e7eSBarry Smith realloc++; /* count how many times we realloc */ 945d517e7eSBarry Smith } 95de6a44a3SBarry Smith ajtmp = ajnew + ainew[i]; 965d517e7eSBarry Smith fm = fill[n]; 975d517e7eSBarry Smith nzi = 0; 985d517e7eSBarry Smith im[i] = nnz; 995d517e7eSBarry Smith while (nnz--) { 1005d517e7eSBarry Smith if (fm < i) nzi++; 101de6a44a3SBarry Smith *ajtmp++ = fm; 1025d517e7eSBarry Smith fm = fill[fm]; 1035d517e7eSBarry Smith } 1045d517e7eSBarry Smith idnew[i] = ainew[i] + nzi; 1055d517e7eSBarry Smith } 1065d517e7eSBarry Smith 1075d517e7eSBarry Smith PLogInfo((PetscObject)A, 108ec3a8b7bSBarry Smith "Info:MatLUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n", 1095d517e7eSBarry Smith realloc,f,((double)ainew[n])/((double)ai[i])); 1105d517e7eSBarry Smith 1115d517e7eSBarry Smith ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 1125d517e7eSBarry Smith ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 1135d517e7eSBarry Smith 1145d517e7eSBarry Smith PetscFree(fill); 1155d517e7eSBarry Smith 1165d517e7eSBarry Smith /* put together the new matrix */ 117ec3a8b7bSBarry Smith ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,B); CHKERRQ(ierr); 1185d517e7eSBarry Smith PLogObjectParent(*B,isicol); 1195d517e7eSBarry Smith ierr = ISDestroy(isicol); CHKERRQ(ierr); 120ec3a8b7bSBarry Smith b = (Mat_SeqBAIJ *) (*B)->data; 1215d517e7eSBarry Smith PetscFree(b->imax); 1225d517e7eSBarry Smith b->singlemalloc = 0; 123de6a44a3SBarry Smith len = ainew[n]*sizeof(Scalar); 1245d517e7eSBarry Smith /* the next line frees the default space generated by the Create() */ 1255d517e7eSBarry Smith PetscFree(b->a); PetscFree(b->ilen); 126ec3a8b7bSBarry Smith b->a = (Scalar *) PetscMalloc( len*bs*bs ); CHKPTRQ(b->a); 1275d517e7eSBarry Smith b->j = ajnew; 1285d517e7eSBarry Smith b->i = ainew; 1295d517e7eSBarry Smith b->diag = idnew; 1305d517e7eSBarry Smith b->ilen = 0; 1315d517e7eSBarry Smith b->imax = 0; 1325d517e7eSBarry Smith b->row = isrow; 1335d517e7eSBarry Smith b->col = iscol; 134de6a44a3SBarry Smith b->solve_work = (Scalar *) PetscMalloc( (bs*n+bs)*sizeof(Scalar)); 1355d517e7eSBarry Smith CHKPTRQ(b->solve_work); 1365d517e7eSBarry Smith /* In b structure: Free imax, ilen, old a, old j. 1375d517e7eSBarry Smith Allocate idnew, solve_work, new a, new j */ 138de6a44a3SBarry Smith PLogObjectMemory(*B,(ainew[n]-n)*(sizeof(int)+sizeof(Scalar))); 139de6a44a3SBarry Smith b->maxnz = b->nz = ainew[n]; 1405d517e7eSBarry Smith 1415d517e7eSBarry Smith return 0; 1425d517e7eSBarry Smith } 1435d517e7eSBarry Smith 144ec3a8b7bSBarry Smith #include "pinclude/plapack.h" 145ec3a8b7bSBarry Smith /* ----------------------------------------------------------- */ 1467fc0212eSBarry Smith int MatLUFactorNumeric_SeqBAIJ_N(Mat A,Mat *B) 1475d517e7eSBarry Smith { 1485d517e7eSBarry Smith Mat C = *B; 149ec3a8b7bSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data; 1505d517e7eSBarry Smith IS iscol = b->col, isrow = b->row, isicol; 151ec3a8b7bSBarry Smith int *r,*ic, ierr, i, j, n = a->mbs, *ai = b->i, *aj = b->j; 152*4eeb42bcSBarry Smith int *ajtmpold, *ajtmp, nz, row, bslog,info; 153ec3a8b7bSBarry Smith int *diag_offset=b->diag,diag,bs=a->bs,bs2 = bs*bs,*v_pivots; 154*4eeb42bcSBarry Smith register Scalar *pv,*v,*rtmp,*multiplier,*v_work,*pc,*w; 155*4eeb42bcSBarry Smith Scalar one = 1.0, zero = 0.0, mone = -1.0; 1565d517e7eSBarry Smith register int *pj; 1575d517e7eSBarry Smith 1585d517e7eSBarry Smith ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr); 1595d517e7eSBarry Smith PLogObjectParent(*B,isicol); 1605d517e7eSBarry Smith ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr); 1615d517e7eSBarry Smith ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr); 162ec3a8b7bSBarry Smith rtmp = (Scalar *) PetscMalloc(bs2*(n+1)*sizeof(Scalar));CHKPTRQ(rtmp); 1635d517e7eSBarry Smith 164ec3a8b7bSBarry Smith /* generate work space needed by dense LU factorization */ 165de6a44a3SBarry Smith v_work = (Scalar *) PetscMalloc((bs+2*bs2)*sizeof(Scalar));CHKPTRQ(v_work); 166de6a44a3SBarry Smith multiplier = v_work + bs2; 167de6a44a3SBarry Smith v_pivots = (int *) (multiplier + bs2); 168de6a44a3SBarry Smith 169de6a44a3SBarry Smith /* flops in while loop */ 170*4eeb42bcSBarry Smith bslog = 2*bs*bs2; 1715d517e7eSBarry Smith 1725d517e7eSBarry Smith for ( i=0; i<n; i++ ) { 1735d517e7eSBarry Smith nz = ai[i+1] - ai[i]; 174de6a44a3SBarry Smith ajtmp = aj + ai[i]; 175*4eeb42bcSBarry Smith for ( j=0; j<nz; j++ ) { 176*4eeb42bcSBarry Smith PetscMemzero(rtmp+bs2*ajtmp[j],bs2*sizeof(Scalar)); 177*4eeb42bcSBarry Smith } 1785d517e7eSBarry Smith /* load in initial (unfactored row) */ 1795d517e7eSBarry Smith nz = a->i[r[i]+1] - a->i[r[i]]; 180de6a44a3SBarry Smith ajtmpold = a->j + a->i[r[i]]; 181de6a44a3SBarry Smith v = a->a + bs2*a->i[r[i]]; 182*4eeb42bcSBarry Smith for ( j=0; j<nz; j++ ) { 183*4eeb42bcSBarry Smith PetscMemcpy(rtmp+bs2*ic[ajtmpold[j]],v+bs2*j,bs2*sizeof(Scalar)); 184*4eeb42bcSBarry Smith } 185de6a44a3SBarry Smith row = *ajtmp++; 1865d517e7eSBarry Smith while (row < i) { 187de6a44a3SBarry Smith pc = rtmp + bs2*row; 188de6a44a3SBarry Smith /* if (*pc) { */ 189de6a44a3SBarry Smith pv = b->a + bs2*diag_offset[row]; 190de6a44a3SBarry Smith pj = b->j + diag_offset[row] + 1; 191*4eeb42bcSBarry Smith BLgemm_("N","N",&bs,&bs,&bs,&one,pc,&bs,pv,&bs,&zero, 192*4eeb42bcSBarry Smith multiplier,&bs); 193*4eeb42bcSBarry Smith PetscMemcpy(pc,multiplier,bs2*sizeof(Scalar)); 1945d517e7eSBarry Smith nz = ai[row+1] - diag_offset[row] - 1; 195ec3a8b7bSBarry Smith pv += bs2; 196*4eeb42bcSBarry Smith for (j=0; j<nz; j++) { 197*4eeb42bcSBarry Smith BLgemm_("N","N",&bs,&bs,&bs,&mone,multiplier,&bs,pv+bs2*j,&bs, 198*4eeb42bcSBarry Smith &one,rtmp+bs2*pj[j],&bs); 199*4eeb42bcSBarry Smith } 200*4eeb42bcSBarry Smith PLogFlops(bslog*(nz+1)-bs); 201de6a44a3SBarry Smith /* } */ 202de6a44a3SBarry Smith row = *ajtmp++; 2035d517e7eSBarry Smith } 2045d517e7eSBarry Smith /* finished row so stick it into b->a */ 205de6a44a3SBarry Smith pv = b->a + bs2*ai[i]; 206de6a44a3SBarry Smith pj = b->j + ai[i]; 2075d517e7eSBarry Smith nz = ai[i+1] - ai[i]; 208*4eeb42bcSBarry Smith for ( j=0; j<nz; j++ ) { 209*4eeb42bcSBarry Smith PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(Scalar)); 210*4eeb42bcSBarry Smith } 2115d517e7eSBarry Smith diag = diag_offset[i] - ai[i]; 212ec3a8b7bSBarry Smith /* invert diagonal block */ 213*4eeb42bcSBarry Smith w = pv + bs2*diag; 214*4eeb42bcSBarry Smith LAgetrf_(&bs,&bs,w,&bs,v_pivots,&info); CHKERRQ(info); 215*4eeb42bcSBarry Smith PetscMemzero(v_work,bs2*sizeof(Scalar)); 216*4eeb42bcSBarry Smith for ( j=0; j<bs; j++ ) { v_work[j + bs*j] = 1.0; } 217*4eeb42bcSBarry Smith LAgetrs_("N",&bs,&bs,w,&bs,v_pivots,v_work,&bs, &info);CHKERRQ(info); 218*4eeb42bcSBarry Smith PetscMemcpy(w,v_work,bs2*sizeof(Scalar)); 2195d517e7eSBarry Smith } 2205d517e7eSBarry Smith 221de6a44a3SBarry Smith PetscFree(rtmp); PetscFree(v_work); 2225d517e7eSBarry Smith ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 2235d517e7eSBarry Smith ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 2245d517e7eSBarry Smith ierr = ISDestroy(isicol); CHKERRQ(ierr); 2255d517e7eSBarry Smith C->factor = FACTOR_LU; 2265d517e7eSBarry Smith C->assembled = PETSC_TRUE; 227de6a44a3SBarry Smith PLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */ 2285d517e7eSBarry Smith return 0; 2295d517e7eSBarry Smith } 230*4eeb42bcSBarry Smith /* ------------------------------------------------------------*/ 231*4eeb42bcSBarry Smith /* 232*4eeb42bcSBarry Smith Version for when blocks are 2 by 2 233*4eeb42bcSBarry Smith */ 234*4eeb42bcSBarry Smith int MatLUFactorNumeric_SeqBAIJ_2(Mat A,Mat *B) 235*4eeb42bcSBarry Smith { 236*4eeb42bcSBarry Smith Mat C = *B; 237*4eeb42bcSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data; 238*4eeb42bcSBarry Smith IS iscol = b->col, isrow = b->row, isicol; 239*4eeb42bcSBarry Smith int *r,*ic, ierr, i, j, n = a->mbs, *ai = b->i, *aj = b->j; 240*4eeb42bcSBarry Smith int *ajtmpold, *ajtmp, nz, row, info; 241*4eeb42bcSBarry Smith int *diag_offset=b->diag,*v_pivots,bs = 2,idx; 242*4eeb42bcSBarry Smith register Scalar *pv,*v,*rtmp,m1,m2,m3,m4,*v_work,*pc,*w,*x,x1,x2,x3,x4; 243*4eeb42bcSBarry Smith Scalar p1,p2,p3,p4; 244*4eeb42bcSBarry Smith register int *pj; 245*4eeb42bcSBarry Smith 246*4eeb42bcSBarry Smith ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr); 247*4eeb42bcSBarry Smith PLogObjectParent(*B,isicol); 248*4eeb42bcSBarry Smith ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr); 249*4eeb42bcSBarry Smith ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr); 250*4eeb42bcSBarry Smith rtmp = (Scalar *) PetscMalloc(4*(n+1)*sizeof(Scalar));CHKPTRQ(rtmp); 251*4eeb42bcSBarry Smith 252*4eeb42bcSBarry Smith /* generate work space needed by dense LU factorization */ 253*4eeb42bcSBarry Smith v_work = (Scalar *) PetscMalloc(6*sizeof(Scalar));CHKPTRQ(v_work); 254*4eeb42bcSBarry Smith v_pivots = (int *) (v_work + 4); 255*4eeb42bcSBarry Smith 256*4eeb42bcSBarry Smith 257*4eeb42bcSBarry Smith for ( i=0; i<n; i++ ) { 258*4eeb42bcSBarry Smith nz = ai[i+1] - ai[i]; 259*4eeb42bcSBarry Smith ajtmp = aj + ai[i]; 260*4eeb42bcSBarry Smith for ( j=0; j<nz; j++ ) { 261*4eeb42bcSBarry Smith x = rtmp+4*ajtmp[j]; x[0] = x[1] = x[2] = x[3] = 0.0; 262*4eeb42bcSBarry Smith } 263*4eeb42bcSBarry Smith /* load in initial (unfactored row) */ 264*4eeb42bcSBarry Smith idx = r[i]; 265*4eeb42bcSBarry Smith nz = a->i[idx+1] - a->i[idx]; 266*4eeb42bcSBarry Smith ajtmpold = a->j + a->i[idx]; 267*4eeb42bcSBarry Smith v = a->a + 4*a->i[idx]; 268*4eeb42bcSBarry Smith for ( j=0; j<nz; j++ ) { 269*4eeb42bcSBarry Smith x = rtmp+4*ic[ajtmpold[j]]; 270*4eeb42bcSBarry Smith x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 271*4eeb42bcSBarry Smith v += 4; 272*4eeb42bcSBarry Smith } 273*4eeb42bcSBarry Smith row = *ajtmp++; 274*4eeb42bcSBarry Smith while (row < i) { 275*4eeb42bcSBarry Smith pc = rtmp + 4*row; 276*4eeb42bcSBarry Smith p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 277*4eeb42bcSBarry Smith if (p1 || p2 || p3 || p4) { 278*4eeb42bcSBarry Smith pv = b->a + 4*diag_offset[row]; 279*4eeb42bcSBarry Smith pj = b->j + diag_offset[row] + 1; 280*4eeb42bcSBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 281*4eeb42bcSBarry Smith pc[0] = m1 = p1*x1 + p3*x2; 282*4eeb42bcSBarry Smith pc[1] = m2 = p2*x1 + p4*x2; 283*4eeb42bcSBarry Smith pc[2] = m3 = p1*x3 + p3*x4; 284*4eeb42bcSBarry Smith pc[3] = m4 = p2*x3 + p4*x4; 285*4eeb42bcSBarry Smith nz = ai[row+1] - diag_offset[row] - 1; 286*4eeb42bcSBarry Smith pv += 4; 287*4eeb42bcSBarry Smith for (j=0; j<nz; j++) { 288*4eeb42bcSBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 289*4eeb42bcSBarry Smith x = rtmp + 4*pj[j]; 290*4eeb42bcSBarry Smith x[0] -= m1*x1 + m3*x2; 291*4eeb42bcSBarry Smith x[1] -= m2*x1 + m4*x2; 292*4eeb42bcSBarry Smith x[2] -= m1*x3 + m3*x4; 293*4eeb42bcSBarry Smith x[3] -= m2*x3 + m4*x4; 294*4eeb42bcSBarry Smith pv += 4; 295*4eeb42bcSBarry Smith } 296*4eeb42bcSBarry Smith PLogFlops(16*nz+12); 297*4eeb42bcSBarry Smith } 298*4eeb42bcSBarry Smith row = *ajtmp++; 299*4eeb42bcSBarry Smith } 300*4eeb42bcSBarry Smith /* finished row so stick it into b->a */ 301*4eeb42bcSBarry Smith pv = b->a + 4*ai[i]; 302*4eeb42bcSBarry Smith pj = b->j + ai[i]; 303*4eeb42bcSBarry Smith nz = ai[i+1] - ai[i]; 304*4eeb42bcSBarry Smith for ( j=0; j<nz; j++ ) { 305*4eeb42bcSBarry Smith x = rtmp+4*pj[j]; 306*4eeb42bcSBarry Smith pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 307*4eeb42bcSBarry Smith pv += 4; 308*4eeb42bcSBarry Smith } 309*4eeb42bcSBarry Smith /* invert diagonal block */ 310*4eeb42bcSBarry Smith w = b->a + 4*diag_offset[i]; 311*4eeb42bcSBarry Smith LAgetrf_(&bs,&bs,w,&bs,v_pivots,&info); CHKERRQ(info); 312*4eeb42bcSBarry Smith v_work[0] = 1.0; v_work[1] = 0.0; v_work[2] = 0.0; v_work[3] = 1.0; 313*4eeb42bcSBarry Smith LAgetrs_("N",&bs,&bs,w,&bs,v_pivots,v_work,&bs, &info);CHKERRQ(info); 314*4eeb42bcSBarry Smith w[0] = v_work[0]; w[1] = v_work[1]; w[2] = v_work[2]; w[3] = v_work[3]; 315*4eeb42bcSBarry Smith } 316*4eeb42bcSBarry Smith 317*4eeb42bcSBarry Smith PetscFree(rtmp); PetscFree(v_work); 318*4eeb42bcSBarry Smith ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 319*4eeb42bcSBarry Smith ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 320*4eeb42bcSBarry Smith ierr = ISDestroy(isicol); CHKERRQ(ierr); 321*4eeb42bcSBarry Smith C->factor = FACTOR_LU; 322*4eeb42bcSBarry Smith C->assembled = PETSC_TRUE; 323*4eeb42bcSBarry Smith PLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */ 324*4eeb42bcSBarry Smith return 0; 325*4eeb42bcSBarry Smith } 3267fc0212eSBarry Smith 3277fc0212eSBarry Smith /* ----------------------------------------------------------- */ 3287fc0212eSBarry Smith /* 3297fc0212eSBarry Smith Version for when blocks are 1 by 1. 3307fc0212eSBarry Smith */ 3317fc0212eSBarry Smith int MatLUFactorNumeric_SeqBAIJ_1(Mat A,Mat *B) 3327fc0212eSBarry Smith { 3337fc0212eSBarry Smith Mat C = *B; 3347fc0212eSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b = (Mat_SeqBAIJ *)C->data; 3357fc0212eSBarry Smith IS iscol = b->col, isrow = b->row, isicol; 3367fc0212eSBarry Smith int *r,*ic, ierr, i, j, n = a->mbs, *ai = b->i, *aj = b->j; 3377fc0212eSBarry Smith int *ajtmpold, *ajtmp, nz, row; 3387fc0212eSBarry Smith int *diag_offset = b->diag,diag; 3397fc0212eSBarry Smith register Scalar *pv,*v,*rtmp,multiplier,*pc; 3407fc0212eSBarry Smith register int *pj; 3417fc0212eSBarry Smith 3427fc0212eSBarry Smith ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr); 3437fc0212eSBarry Smith PLogObjectParent(*B,isicol); 3447fc0212eSBarry Smith ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr); 3457fc0212eSBarry Smith ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr); 3467fc0212eSBarry Smith rtmp = (Scalar *) PetscMalloc((n+1)*sizeof(Scalar));CHKPTRQ(rtmp); 3477fc0212eSBarry Smith 3487fc0212eSBarry Smith for ( i=0; i<n; i++ ) { 3497fc0212eSBarry Smith nz = ai[i+1] - ai[i]; 3507fc0212eSBarry Smith ajtmp = aj + ai[i]; 3517fc0212eSBarry Smith for ( j=0; j<nz; j++ ) rtmp[ajtmp[j]] = 0.0; 3527fc0212eSBarry Smith 3537fc0212eSBarry Smith /* load in initial (unfactored row) */ 3547fc0212eSBarry Smith nz = a->i[r[i]+1] - a->i[r[i]]; 3557fc0212eSBarry Smith ajtmpold = a->j + a->i[r[i]]; 3567fc0212eSBarry Smith v = a->a + a->i[r[i]]; 3577fc0212eSBarry Smith for ( j=0; j<nz; j++ ) rtmp[ic[ajtmpold[j]]] = v[j]; 3587fc0212eSBarry Smith 3597fc0212eSBarry Smith row = *ajtmp++; 3607fc0212eSBarry Smith while (row < i) { 3617fc0212eSBarry Smith pc = rtmp + row; 3627fc0212eSBarry Smith if (*pc != 0.0) { 3637fc0212eSBarry Smith pv = b->a + diag_offset[row]; 3647fc0212eSBarry Smith pj = b->j + diag_offset[row] + 1; 3657fc0212eSBarry Smith multiplier = *pc * *pv++; 3667fc0212eSBarry Smith *pc = multiplier; 3677fc0212eSBarry Smith nz = ai[row+1] - diag_offset[row] - 1; 3687fc0212eSBarry Smith for (j=0; j<nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 369*4eeb42bcSBarry Smith PLogFlops(1+2*nz); 3707fc0212eSBarry Smith } 3717fc0212eSBarry Smith row = *ajtmp++; 3727fc0212eSBarry Smith } 3737fc0212eSBarry Smith /* finished row so stick it into b->a */ 3747fc0212eSBarry Smith pv = b->a + ai[i]; 3757fc0212eSBarry Smith pj = b->j + ai[i]; 3767fc0212eSBarry Smith nz = ai[i+1] - ai[i]; 3777fc0212eSBarry Smith for ( j=0; j<nz; j++ ) {pv[j] = rtmp[pj[j]];} 3787fc0212eSBarry Smith diag = diag_offset[i] - ai[i]; 3797fc0212eSBarry Smith /* check pivot entry for current row */ 3807fc0212eSBarry Smith if (pv[diag] == 0.0) { 3817fc0212eSBarry Smith SETERRQ(1,"MatLUFactorNumeric_SeqAIJ:Zero pivot"); 3827fc0212eSBarry Smith } 3837fc0212eSBarry Smith pv[diag] = 1.0/pv[diag]; 3847fc0212eSBarry Smith } 3857fc0212eSBarry Smith 3867fc0212eSBarry Smith PetscFree(rtmp); 3877fc0212eSBarry Smith ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 3887fc0212eSBarry Smith ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 3897fc0212eSBarry Smith ierr = ISDestroy(isicol); CHKERRQ(ierr); 3907fc0212eSBarry Smith C->factor = FACTOR_LU; 3917fc0212eSBarry Smith C->assembled = PETSC_TRUE; 3927fc0212eSBarry Smith PLogFlops(b->n); 3937fc0212eSBarry Smith return 0; 3947fc0212eSBarry Smith } 3957fc0212eSBarry Smith 3965d517e7eSBarry Smith /* ----------------------------------------------------------- */ 397ec3a8b7bSBarry Smith int MatLUFactor_SeqBAIJ(Mat A,IS row,IS col,double f) 3985d517e7eSBarry Smith { 399ec3a8b7bSBarry Smith Mat_SeqBAIJ *mat = (Mat_SeqBAIJ *) A->data; 4005d517e7eSBarry Smith int ierr; 4015d517e7eSBarry Smith Mat C; 4025d517e7eSBarry Smith 403ec3a8b7bSBarry Smith ierr = MatLUFactorSymbolic_SeqBAIJ(A,row,col,f,&C); CHKERRQ(ierr); 4047fc0212eSBarry Smith ierr = MatLUFactorNumeric(A,&C); CHKERRQ(ierr); 4055d517e7eSBarry Smith 4065d517e7eSBarry Smith /* free all the data structures from mat */ 4075d517e7eSBarry Smith PetscFree(mat->a); 4085d517e7eSBarry Smith if (!mat->singlemalloc) {PetscFree(mat->i); PetscFree(mat->j);} 4095d517e7eSBarry Smith if (mat->diag) PetscFree(mat->diag); 4105d517e7eSBarry Smith if (mat->ilen) PetscFree(mat->ilen); 4115d517e7eSBarry Smith if (mat->imax) PetscFree(mat->imax); 4125d517e7eSBarry Smith if (mat->solve_work) PetscFree(mat->solve_work); 4135d517e7eSBarry Smith PetscFree(mat); 4145d517e7eSBarry Smith 4155d517e7eSBarry Smith PetscMemcpy(A,C,sizeof(struct _Mat)); 4165d517e7eSBarry Smith PetscHeaderDestroy(C); 4175d517e7eSBarry Smith return 0; 4185d517e7eSBarry Smith } 4195d517e7eSBarry Smith /* ----------------------------------------------------------- */ 420ec3a8b7bSBarry Smith int MatSolve_SeqBAIJ(Mat A,Vec bb, Vec xx) 4215d517e7eSBarry Smith { 422ec3a8b7bSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 4235d517e7eSBarry Smith IS iscol = a->col, isrow = a->row; 424de6a44a3SBarry Smith int *r,*c, ierr, i, n = a->mbs, *vi, *ai = a->i, *aj = a->j; 425de6a44a3SBarry Smith int nz,bs = a->bs,bs2 = bs*bs,idx,idt,idc, _One = 1; 426de6a44a3SBarry Smith Scalar *xa,*ba,*aa = a->a, *sum, _DOne = 1.0, _DMOne = -1.0; 427de6a44a3SBarry Smith Scalar _DZero = 0.0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 428de6a44a3SBarry Smith register Scalar *x, *b, *lsum, *tmp, *v; 4295d517e7eSBarry Smith 430ec3a8b7bSBarry Smith if (A->factor != FACTOR_LU) SETERRQ(1,"MatSolve_SeqBAIJ:Not for unfactored matrix"); 4315d517e7eSBarry Smith 432de6a44a3SBarry Smith ierr = VecGetArray(bb,&ba); CHKERRQ(ierr); b = ba; 433de6a44a3SBarry Smith ierr = VecGetArray(xx,&xa); CHKERRQ(ierr); x = xa; 4345d517e7eSBarry Smith tmp = a->solve_work; 4355d517e7eSBarry Smith 4365d517e7eSBarry Smith ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 4375d517e7eSBarry Smith ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); c = c + (n-1); 4385d517e7eSBarry Smith 439de6a44a3SBarry Smith switch (bs) { 440de6a44a3SBarry Smith case 1: 4415d517e7eSBarry Smith /* forward solve the lower triangular */ 4425d517e7eSBarry Smith tmp[0] = b[*r++]; 4435d517e7eSBarry Smith for ( i=1; i<n; i++ ) { 444de6a44a3SBarry Smith v = aa + ai[i]; 445de6a44a3SBarry Smith vi = aj + ai[i]; 4465d517e7eSBarry Smith nz = a->diag[i] - ai[i]; 447de6a44a3SBarry Smith sum1 = b[*r++]; 448de6a44a3SBarry Smith while (nz--) { 449de6a44a3SBarry Smith sum1 -= (*v++)*tmp[*vi++]; 4505d517e7eSBarry Smith } 451de6a44a3SBarry Smith tmp[i] = sum1; 452de6a44a3SBarry Smith } 4535d517e7eSBarry Smith /* backward solve the upper triangular */ 4545d517e7eSBarry Smith for ( i=n-1; i>=0; i-- ){ 455de6a44a3SBarry Smith v = aa + a->diag[i] + 1; 456de6a44a3SBarry Smith vi = aj + a->diag[i] + 1; 4575d517e7eSBarry Smith nz = ai[i+1] - a->diag[i] - 1; 458de6a44a3SBarry Smith sum1 = tmp[i]; 459de6a44a3SBarry Smith while (nz--) { 460de6a44a3SBarry Smith sum1 -= (*v++)*tmp[*vi++]; 461de6a44a3SBarry Smith } 462de6a44a3SBarry Smith x[*c--] = tmp[i] = aa[a->diag[i]]*sum1; 463de6a44a3SBarry Smith } 464de6a44a3SBarry Smith break; 465de6a44a3SBarry Smith case 2: 466de6a44a3SBarry Smith /* forward solve the lower triangular */ 467de6a44a3SBarry Smith idx = 2*(*r++); 468de6a44a3SBarry Smith tmp[0] = b[idx]; tmp[1] = b[1+idx]; 469de6a44a3SBarry Smith for ( i=1; i<n; i++ ) { 470de6a44a3SBarry Smith v = aa + 4*ai[i]; 471de6a44a3SBarry Smith vi = aj + ai[i]; 472de6a44a3SBarry Smith nz = a->diag[i] - ai[i]; 473de6a44a3SBarry Smith idx = 2*(*r++); 474de6a44a3SBarry Smith sum1 = b[idx]; sum2 = b[1+idx]; 475de6a44a3SBarry Smith while (nz--) { 476de6a44a3SBarry Smith idx = 2*(*vi++); 477de6a44a3SBarry Smith x1 = tmp[idx]; x2 = tmp[1+idx]; 478de6a44a3SBarry Smith sum1 -= v[0]*x1 + v[2]*x2; 479de6a44a3SBarry Smith sum2 -= v[1]*x1 + v[3]*x2; 480de6a44a3SBarry Smith v += 4; 481de6a44a3SBarry Smith } 482de6a44a3SBarry Smith idx = 2*i; 483de6a44a3SBarry Smith tmp[idx] = sum1; tmp[1+idx] = sum2; 484de6a44a3SBarry Smith } 485de6a44a3SBarry Smith /* backward solve the upper triangular */ 486de6a44a3SBarry Smith for ( i=n-1; i>=0; i-- ){ 487de6a44a3SBarry Smith v = aa + 4*a->diag[i] + 4; 488de6a44a3SBarry Smith vi = aj + a->diag[i] + 1; 489de6a44a3SBarry Smith nz = ai[i+1] - a->diag[i] - 1; 490de6a44a3SBarry Smith idt = 2*i; 491de6a44a3SBarry Smith sum1 = tmp[idt]; sum2 = tmp[1+idt]; 492de6a44a3SBarry Smith while (nz--) { 493de6a44a3SBarry Smith idx = 2*(*vi++); 494de6a44a3SBarry Smith x1 = tmp[idx]; x2 = tmp[1+idx]; 495de6a44a3SBarry Smith sum1 -= v[0]*x1 + v[2]*x2; 496de6a44a3SBarry Smith sum2 -= v[1]*x1 + v[3]*x2; 497de6a44a3SBarry Smith v += 4; 498de6a44a3SBarry Smith } 499de6a44a3SBarry Smith idc = 2*(*c--); 500de6a44a3SBarry Smith v = aa + 4*a->diag[i]; 501de6a44a3SBarry Smith x[idc] = tmp[idt] = v[0]*sum1 + v[2]*sum2; 502de6a44a3SBarry Smith x[1+idc] = tmp[1+idt] = v[1]*sum1 + v[3]*sum2; 503de6a44a3SBarry Smith } 504de6a44a3SBarry Smith break; 505de6a44a3SBarry Smith case 3: 506de6a44a3SBarry Smith /* forward solve the lower triangular */ 507de6a44a3SBarry Smith idx = 3*(*r++); 508de6a44a3SBarry Smith tmp[0] = b[idx]; tmp[1] = b[1+idx]; tmp[2] = b[2+idx]; 509de6a44a3SBarry Smith for ( i=1; i<n; i++ ) { 510de6a44a3SBarry Smith v = aa + 9*ai[i]; 511de6a44a3SBarry Smith vi = aj + ai[i]; 512de6a44a3SBarry Smith nz = a->diag[i] - ai[i]; 513de6a44a3SBarry Smith idx = 3*(*r++); 514de6a44a3SBarry Smith sum1 = b[idx]; sum2 = b[1+idx]; sum3 = b[2+idx]; 515de6a44a3SBarry Smith while (nz--) { 516de6a44a3SBarry Smith idx = 3*(*vi++); 517de6a44a3SBarry Smith x1 = tmp[idx]; x2 = tmp[1+idx]; x3 = tmp[2+idx]; 518de6a44a3SBarry Smith sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 519de6a44a3SBarry Smith sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 520de6a44a3SBarry Smith sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 521de6a44a3SBarry Smith v += 9; 522de6a44a3SBarry Smith } 523de6a44a3SBarry Smith idx = 3*i; 524de6a44a3SBarry Smith tmp[idx] = sum1; tmp[1+idx] = sum2; tmp[2+idx] = sum3; 525de6a44a3SBarry Smith } 526de6a44a3SBarry Smith /* backward solve the upper triangular */ 527de6a44a3SBarry Smith for ( i=n-1; i>=0; i-- ){ 528de6a44a3SBarry Smith v = aa + 9*a->diag[i] + 9; 529de6a44a3SBarry Smith vi = aj + a->diag[i] + 1; 530de6a44a3SBarry Smith nz = ai[i+1] - a->diag[i] - 1; 531de6a44a3SBarry Smith idt = 3*i; 532de6a44a3SBarry Smith sum1 = tmp[idt]; sum2 = tmp[1+idt]; sum3 = tmp[2+idt]; 533de6a44a3SBarry Smith while (nz--) { 534de6a44a3SBarry Smith idx = 3*(*vi++); 535de6a44a3SBarry Smith x1 = tmp[idx]; x2 = tmp[1+idx]; x3 = tmp[2+idx]; 536de6a44a3SBarry Smith sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 537de6a44a3SBarry Smith sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 538de6a44a3SBarry Smith sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 539de6a44a3SBarry Smith v += 9; 540de6a44a3SBarry Smith } 541de6a44a3SBarry Smith idc = 3*(*c--); 542de6a44a3SBarry Smith v = aa + 9*a->diag[i]; 543de6a44a3SBarry Smith x[idc] = tmp[idt] = v[0]*sum1 + v[3]*sum2 + v[6]*sum3; 544de6a44a3SBarry Smith x[1+idc] = tmp[1+idt] = v[1]*sum1 + v[4]*sum2 + v[7]*sum3; 545de6a44a3SBarry Smith x[2+idc] = tmp[2+idt] = v[2]*sum1 + v[5]*sum2 + v[8]*sum3; 546de6a44a3SBarry Smith } 547de6a44a3SBarry Smith break; 548de6a44a3SBarry Smith case 4: 549de6a44a3SBarry Smith /* forward solve the lower triangular */ 550de6a44a3SBarry Smith idx = 4*(*r++); 551de6a44a3SBarry Smith tmp[0] = b[idx]; tmp[1] = b[1+idx]; 552de6a44a3SBarry Smith tmp[2] = b[2+idx]; tmp[3] = b[3+idx]; 553de6a44a3SBarry Smith for ( i=1; i<n; i++ ) { 554de6a44a3SBarry Smith v = aa + 16*ai[i]; 555de6a44a3SBarry Smith vi = aj + ai[i]; 556de6a44a3SBarry Smith nz = a->diag[i] - ai[i]; 557de6a44a3SBarry Smith idx = 4*(*r++); 558de6a44a3SBarry Smith sum1 = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx]; 559de6a44a3SBarry Smith while (nz--) { 560de6a44a3SBarry Smith idx = 4*(*vi++); 561de6a44a3SBarry Smith x1 = tmp[idx];x2 = tmp[1+idx];x3 = tmp[2+idx];x4 = tmp[3+idx]; 562de6a44a3SBarry Smith sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 563de6a44a3SBarry Smith sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 564de6a44a3SBarry Smith sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 565de6a44a3SBarry Smith sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 566de6a44a3SBarry Smith v += 16; 567de6a44a3SBarry Smith } 568de6a44a3SBarry Smith idx = 4*i; 569de6a44a3SBarry Smith tmp[idx] = sum1;tmp[1+idx] = sum2; 570de6a44a3SBarry Smith tmp[2+idx] = sum3;tmp[3+idx] = sum4; 571de6a44a3SBarry Smith } 572de6a44a3SBarry Smith /* backward solve the upper triangular */ 573de6a44a3SBarry Smith for ( i=n-1; i>=0; i-- ){ 574de6a44a3SBarry Smith v = aa + 16*a->diag[i] + 16; 575de6a44a3SBarry Smith vi = aj + a->diag[i] + 1; 576de6a44a3SBarry Smith nz = ai[i+1] - a->diag[i] - 1; 577de6a44a3SBarry Smith idt = 4*i; 578de6a44a3SBarry Smith sum1 = tmp[idt]; sum2 = tmp[1+idt]; 579de6a44a3SBarry Smith sum3 = tmp[2+idt];sum4 = tmp[3+idt]; 580de6a44a3SBarry Smith while (nz--) { 581de6a44a3SBarry Smith idx = 4*(*vi++); 582de6a44a3SBarry Smith x1 = tmp[idx]; x2 = tmp[1+idx]; 583de6a44a3SBarry Smith x3 = tmp[2+idx]; x4 = tmp[3+idx]; 584de6a44a3SBarry Smith sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 585de6a44a3SBarry Smith sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 586de6a44a3SBarry Smith sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 587de6a44a3SBarry Smith sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 588de6a44a3SBarry Smith v += 16; 589de6a44a3SBarry Smith } 590de6a44a3SBarry Smith idc = 4*(*c--); 591de6a44a3SBarry Smith v = aa + 16*a->diag[i]; 592de6a44a3SBarry Smith x[idc] = tmp[idt] = v[0]*sum1+v[4]*sum2+v[8]*sum3+v[12]*sum4; 593de6a44a3SBarry Smith x[1+idc] = tmp[1+idt] = v[1]*sum1+v[5]*sum2+v[9]*sum3+v[13]*sum4; 594de6a44a3SBarry Smith x[2+idc] = tmp[2+idt] = v[2]*sum1+v[6]*sum2+v[10]*sum3+v[14]*sum4; 595de6a44a3SBarry Smith x[3+idc] = tmp[3+idt] = v[3]*sum1+v[7]*sum2+v[11]*sum3+v[15]*sum4; 596de6a44a3SBarry Smith } 597de6a44a3SBarry Smith break; 598de6a44a3SBarry Smith case 5: 599de6a44a3SBarry Smith /* forward solve the lower triangular */ 600de6a44a3SBarry Smith idx = 5*(*r++); 601de6a44a3SBarry Smith tmp[0] = b[idx]; tmp[1] = b[1+idx]; 602de6a44a3SBarry Smith tmp[2] = b[2+idx]; tmp[3] = b[3+idx]; tmp[4] = b[4+idx]; 603de6a44a3SBarry Smith for ( i=1; i<n; i++ ) { 604de6a44a3SBarry Smith v = aa + 25*ai[i]; 605de6a44a3SBarry Smith vi = aj + ai[i]; 606de6a44a3SBarry Smith nz = a->diag[i] - ai[i]; 607de6a44a3SBarry Smith idx = 5*(*r++); 608de6a44a3SBarry Smith sum1 = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx]; 609de6a44a3SBarry Smith sum5 = b[4+idx]; 610de6a44a3SBarry Smith while (nz--) { 611de6a44a3SBarry Smith idx = 5*(*vi++); 612de6a44a3SBarry Smith x1 = tmp[idx]; x2 = tmp[1+idx];x3 = tmp[2+idx]; 613de6a44a3SBarry Smith x4 = tmp[3+idx];x5 = tmp[4+idx]; 614de6a44a3SBarry Smith sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 615de6a44a3SBarry Smith sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 616de6a44a3SBarry Smith sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 617de6a44a3SBarry Smith sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 618de6a44a3SBarry Smith sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 619de6a44a3SBarry Smith v += 25; 620de6a44a3SBarry Smith } 621de6a44a3SBarry Smith idx = 5*i; 622de6a44a3SBarry Smith tmp[idx] = sum1;tmp[1+idx] = sum2; 623de6a44a3SBarry Smith tmp[2+idx] = sum3;tmp[3+idx] = sum4; tmp[4+idx] = sum5; 624de6a44a3SBarry Smith } 625de6a44a3SBarry Smith /* backward solve the upper triangular */ 626de6a44a3SBarry Smith for ( i=n-1; i>=0; i-- ){ 627de6a44a3SBarry Smith v = aa + 25*a->diag[i] + 25; 628de6a44a3SBarry Smith vi = aj + a->diag[i] + 1; 629de6a44a3SBarry Smith nz = ai[i+1] - a->diag[i] - 1; 630de6a44a3SBarry Smith idt = 5*i; 631de6a44a3SBarry Smith sum1 = tmp[idt]; sum2 = tmp[1+idt]; 632de6a44a3SBarry Smith sum3 = tmp[2+idt];sum4 = tmp[3+idt]; sum5 = tmp[4+idt]; 633de6a44a3SBarry Smith while (nz--) { 634de6a44a3SBarry Smith idx = 5*(*vi++); 635de6a44a3SBarry Smith x1 = tmp[idx]; x2 = tmp[1+idx]; 636de6a44a3SBarry Smith x3 = tmp[2+idx]; x4 = tmp[3+idx]; x5 = tmp[4+idx]; 637de6a44a3SBarry Smith sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 638de6a44a3SBarry Smith sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 639de6a44a3SBarry Smith sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 640de6a44a3SBarry Smith sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 641de6a44a3SBarry Smith sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 642de6a44a3SBarry Smith v += 25; 643de6a44a3SBarry Smith } 644de6a44a3SBarry Smith idc = 5*(*c--); 645de6a44a3SBarry Smith v = aa + 25*a->diag[i]; 646de6a44a3SBarry Smith x[idc] = tmp[idt] = v[0]*sum1+v[5]*sum2+v[10]*sum3+ 647de6a44a3SBarry Smith v[15]*sum4+v[20]*sum5; 648de6a44a3SBarry Smith x[1+idc] = tmp[1+idt] = v[1]*sum1+v[6]*sum2+v[11]*sum3+ 649de6a44a3SBarry Smith v[16]*sum4+v[21]*sum5; 650de6a44a3SBarry Smith x[2+idc] = tmp[2+idt] = v[2]*sum1+v[7]*sum2+v[12]*sum3+ 651de6a44a3SBarry Smith v[17]*sum4+v[22]*sum5; 652de6a44a3SBarry Smith x[3+idc] = tmp[3+idt] = v[3]*sum1+v[8]*sum2+v[13]*sum3+ 653de6a44a3SBarry Smith v[18]*sum4+v[23]*sum5; 654de6a44a3SBarry Smith x[4+idc] = tmp[4+idt] = v[4]*sum1+v[9]*sum2+v[14]*sum3+ 655de6a44a3SBarry Smith v[19]*sum4+v[24]*sum5; 656de6a44a3SBarry Smith } 657de6a44a3SBarry Smith break; 658de6a44a3SBarry Smith default: { 659de6a44a3SBarry Smith /* forward solve the lower triangular */ 660de6a44a3SBarry Smith PetscMemcpy(tmp,b + bs*(*r++), bs*sizeof(Scalar)); 661de6a44a3SBarry Smith for ( i=1; i<n; i++ ) { 662de6a44a3SBarry Smith v = aa + bs2*ai[i]; 663de6a44a3SBarry Smith vi = aj + ai[i]; 664de6a44a3SBarry Smith nz = a->diag[i] - ai[i]; 665de6a44a3SBarry Smith sum = tmp + bs*i; 666de6a44a3SBarry Smith PetscMemcpy(sum,b+bs*(*r++),bs*sizeof(Scalar)); 667de6a44a3SBarry Smith while (nz--) { 668de6a44a3SBarry Smith LAgemv_("N",&bs,&bs,&_DMOne,v,&bs,tmp+bs*(*vi++),&_One,&_DOne,sum,&_One); 669de6a44a3SBarry Smith v += bs2; 670de6a44a3SBarry Smith } 671de6a44a3SBarry Smith } 672de6a44a3SBarry Smith /* backward solve the upper triangular */ 673de6a44a3SBarry Smith lsum = a->solve_work + a->n; 674de6a44a3SBarry Smith for ( i=n-1; i>=0; i-- ){ 675de6a44a3SBarry Smith v = aa + bs2*(a->diag[i] + 1); 676de6a44a3SBarry Smith vi = aj + a->diag[i] + 1; 677de6a44a3SBarry Smith nz = ai[i+1] - a->diag[i] - 1; 678de6a44a3SBarry Smith PetscMemcpy(lsum,tmp+i*bs,bs*sizeof(Scalar)); 679de6a44a3SBarry Smith while (nz--) { 680de6a44a3SBarry Smith LAgemv_("N",&bs,&bs,&_DMOne,v,&bs,tmp+bs*(*vi++),&_One,&_DOne,lsum,&_One); 681de6a44a3SBarry Smith v += bs2; 682de6a44a3SBarry Smith } 683de6a44a3SBarry Smith LAgemv_("N",&bs,&bs,&_DOne,aa+bs2*a->diag[i],&bs,lsum,&_One,&_DZero, 684de6a44a3SBarry Smith tmp+i*bs,&_One); 685de6a44a3SBarry Smith PetscMemcpy(x + bs*(*c--),tmp+i*bs,bs*sizeof(Scalar)); 686de6a44a3SBarry Smith } 687de6a44a3SBarry Smith } 6885d517e7eSBarry Smith } 6895d517e7eSBarry Smith 6905d517e7eSBarry Smith ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 6915d517e7eSBarry Smith ierr = ISRestoreIndices(iscol,&c); CHKERRQ(ierr); 692de6a44a3SBarry Smith PLogFlops(2*bs2*a->nz - a->n); 6935d517e7eSBarry Smith return 0; 6945d517e7eSBarry Smith } 6955d517e7eSBarry Smith 6965d517e7eSBarry Smith /* ----------------------------------------------------------------*/ 697ec3a8b7bSBarry Smith /* 698ec3a8b7bSBarry Smith This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 699ec3a8b7bSBarry Smith except that the data structure of Mat_SeqAIJ is slightly different. 700ec3a8b7bSBarry Smith Not a good example of code reuse. 701ec3a8b7bSBarry Smith */ 702ec3a8b7bSBarry Smith int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,double f,int levels, 703ec3a8b7bSBarry Smith Mat *fact) 7045d517e7eSBarry Smith { 705ec3a8b7bSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b; 7065d517e7eSBarry Smith IS isicol; 707ec3a8b7bSBarry Smith int *r,*ic, ierr, prow, n = a->mbs, *ai = a->i, *aj = a->j; 7085d517e7eSBarry Smith int *ainew,*ajnew, jmax,*fill, *xi, nz, *im,*ajfill,*flev; 7095d517e7eSBarry Smith int *dloc, idx, row,m,fm, nzf, nzi,len, realloc = 0; 710de6a44a3SBarry Smith int incrlev,nnz,i,bs = a->bs; 7115d517e7eSBarry Smith 712ec3a8b7bSBarry Smith if (a->m != a->n) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Matrix must be square"); 713ec3a8b7bSBarry Smith if (!isrow) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Must have row permutation"); 714ec3a8b7bSBarry Smith if (!iscol) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Must have column permutation"); 7155d517e7eSBarry Smith 7165d517e7eSBarry Smith /* special case that simply copies fill pattern */ 7175d517e7eSBarry Smith if (levels == 0 && ISIsIdentity(isrow) && ISIsIdentity(iscol)) { 718ec3a8b7bSBarry Smith ierr = MatConvertSameType_SeqBAIJ(A,fact,DO_NOT_COPY_VALUES); CHKERRQ(ierr); 7195d517e7eSBarry Smith (*fact)->factor = FACTOR_LU; 720ec3a8b7bSBarry Smith b = (Mat_SeqBAIJ *) (*fact)->data; 7215d517e7eSBarry Smith if (!b->diag) { 722ec3a8b7bSBarry Smith ierr = MatMarkDiag_SeqBAIJ(*fact); CHKERRQ(ierr); 7235d517e7eSBarry Smith } 7245d517e7eSBarry Smith b->row = isrow; 7255d517e7eSBarry Smith b->col = iscol; 726de6a44a3SBarry Smith b->solve_work = (Scalar *) PetscMalloc((b->m+1+b->bs)*sizeof(Scalar));CHKPTRQ(b->solve_work); 7275d517e7eSBarry Smith return 0; 7285d517e7eSBarry Smith } 7295d517e7eSBarry Smith 7305d517e7eSBarry Smith ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr); 7315d517e7eSBarry Smith ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr); 7325d517e7eSBarry Smith ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr); 7335d517e7eSBarry Smith 7345d517e7eSBarry Smith /* get new row pointers */ 7355d517e7eSBarry Smith ainew = (int *) PetscMalloc( (n+1)*sizeof(int) ); CHKPTRQ(ainew); 736de6a44a3SBarry Smith ainew[0] = 0; 7375d517e7eSBarry Smith /* don't know how many column pointers are needed so estimate */ 738de6a44a3SBarry Smith jmax = (int) (f*ai[n] + 1); 7395d517e7eSBarry Smith ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajnew); 7405d517e7eSBarry Smith /* ajfill is level of fill for each fill entry */ 7415d517e7eSBarry Smith ajfill = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajfill); 7425d517e7eSBarry Smith /* fill is a linked list of nonzeros in active row */ 7435d517e7eSBarry Smith fill = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(fill); 7445d517e7eSBarry Smith /* im is level for each filled value */ 7455d517e7eSBarry Smith im = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(im); 7465d517e7eSBarry Smith /* dloc is location of diagonal in factor */ 7475d517e7eSBarry Smith dloc = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(dloc); 7485d517e7eSBarry Smith dloc[0] = 0; 7495d517e7eSBarry Smith for ( prow=0; prow<n; prow++ ) { 7505d517e7eSBarry Smith /* first copy previous fill into linked list */ 7515d517e7eSBarry Smith nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 752de6a44a3SBarry Smith xi = aj + ai[r[prow]]; 7535d517e7eSBarry Smith fill[n] = n; 7545d517e7eSBarry Smith while (nz--) { 7555d517e7eSBarry Smith fm = n; 756de6a44a3SBarry Smith idx = ic[*xi++]; 7575d517e7eSBarry Smith do { 7585d517e7eSBarry Smith m = fm; 7595d517e7eSBarry Smith fm = fill[m]; 7605d517e7eSBarry Smith } while (fm < idx); 7615d517e7eSBarry Smith fill[m] = idx; 7625d517e7eSBarry Smith fill[idx] = fm; 7635d517e7eSBarry Smith im[idx] = 0; 7645d517e7eSBarry Smith } 7655d517e7eSBarry Smith nzi = 0; 7665d517e7eSBarry Smith row = fill[n]; 7675d517e7eSBarry Smith while ( row < prow ) { 7685d517e7eSBarry Smith incrlev = im[row] + 1; 7695d517e7eSBarry Smith nz = dloc[row]; 770de6a44a3SBarry Smith xi = ajnew + ainew[row] + nz; 771de6a44a3SBarry Smith flev = ajfill + ainew[row] + nz + 1; 7725d517e7eSBarry Smith nnz = ainew[row+1] - ainew[row] - nz - 1; 773de6a44a3SBarry Smith if (*xi++ != row) { 774ec3a8b7bSBarry Smith SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:zero pivot"); 7755d517e7eSBarry Smith } 7765d517e7eSBarry Smith fm = row; 7775d517e7eSBarry Smith while (nnz-- > 0) { 778de6a44a3SBarry Smith idx = *xi++; 7795d517e7eSBarry Smith if (*flev + incrlev > levels) { 7805d517e7eSBarry Smith flev++; 7815d517e7eSBarry Smith continue; 7825d517e7eSBarry Smith } 7835d517e7eSBarry Smith do { 7845d517e7eSBarry Smith m = fm; 7855d517e7eSBarry Smith fm = fill[m]; 7865d517e7eSBarry Smith } while (fm < idx); 7875d517e7eSBarry Smith if (fm != idx) { 7885d517e7eSBarry Smith im[idx] = *flev + incrlev; 7895d517e7eSBarry Smith fill[m] = idx; 7905d517e7eSBarry Smith fill[idx] = fm; 7915d517e7eSBarry Smith fm = idx; 7925d517e7eSBarry Smith nzf++; 7935d517e7eSBarry Smith } 7945d517e7eSBarry Smith else { 7955d517e7eSBarry Smith if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 7965d517e7eSBarry Smith } 7975d517e7eSBarry Smith flev++; 7985d517e7eSBarry Smith } 7995d517e7eSBarry Smith row = fill[row]; 8005d517e7eSBarry Smith nzi++; 8015d517e7eSBarry Smith } 8025d517e7eSBarry Smith /* copy new filled row into permanent storage */ 8035d517e7eSBarry Smith ainew[prow+1] = ainew[prow] + nzf; 804de6a44a3SBarry Smith if (ainew[prow+1] > jmax) { 8055d517e7eSBarry Smith /* allocate a longer ajnew */ 8065d517e7eSBarry Smith int maxadd; 807de6a44a3SBarry Smith maxadd = (int) (((f*ai[n]+1)*(n-prow+5))/n); 8085d517e7eSBarry Smith if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 8095d517e7eSBarry Smith jmax += maxadd; 8105d517e7eSBarry Smith xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi); 811de6a44a3SBarry Smith PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int)); 8125d517e7eSBarry Smith PetscFree(ajnew); 8135d517e7eSBarry Smith ajnew = xi; 8145d517e7eSBarry Smith /* allocate a longer ajfill */ 8155d517e7eSBarry Smith xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi); 816de6a44a3SBarry Smith PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int)); 8175d517e7eSBarry Smith PetscFree(ajfill); 8185d517e7eSBarry Smith ajfill = xi; 8195d517e7eSBarry Smith realloc++; 8205d517e7eSBarry Smith } 821de6a44a3SBarry Smith xi = ajnew + ainew[prow]; 822de6a44a3SBarry Smith flev = ajfill + ainew[prow]; 8235d517e7eSBarry Smith dloc[prow] = nzi; 8245d517e7eSBarry Smith fm = fill[n]; 8255d517e7eSBarry Smith while (nzf--) { 826de6a44a3SBarry Smith *xi++ = fm; 8275d517e7eSBarry Smith *flev++ = im[fm]; 8285d517e7eSBarry Smith fm = fill[fm]; 8295d517e7eSBarry Smith } 8305d517e7eSBarry Smith } 8315d517e7eSBarry Smith PetscFree(ajfill); 8325d517e7eSBarry Smith ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 8335d517e7eSBarry Smith ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 8345d517e7eSBarry Smith ierr = ISDestroy(isicol); CHKERRQ(ierr); 8355d517e7eSBarry Smith PetscFree(fill); PetscFree(im); 8365d517e7eSBarry Smith 8375d517e7eSBarry Smith PLogInfo((PetscObject)A, 838ec3a8b7bSBarry Smith "Info:MatILUFactorSymbolic_SeqBAIJ:Realloc %d Fill ratio:given %g needed %g\n", 8395d517e7eSBarry Smith realloc,f,((double)ainew[n])/((double)ai[prow])); 8405d517e7eSBarry Smith 8415d517e7eSBarry Smith /* put together the new matrix */ 842ec3a8b7bSBarry Smith ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr); 843ec3a8b7bSBarry Smith b = (Mat_SeqBAIJ *) (*fact)->data; 8445d517e7eSBarry Smith PetscFree(b->imax); 8455d517e7eSBarry Smith b->singlemalloc = 0; 846de6a44a3SBarry Smith len = bs*bs*ainew[n]*sizeof(Scalar); 8475d517e7eSBarry Smith /* the next line frees the default space generated by the Create() */ 8485d517e7eSBarry Smith PetscFree(b->a); PetscFree(b->ilen); 8495d517e7eSBarry Smith b->a = (Scalar *) PetscMalloc( len ); CHKPTRQ(b->a); 8505d517e7eSBarry Smith b->j = ajnew; 8515d517e7eSBarry Smith b->i = ainew; 8525d517e7eSBarry Smith for ( i=0; i<n; i++ ) dloc[i] += ainew[i]; 8535d517e7eSBarry Smith b->diag = dloc; 8545d517e7eSBarry Smith b->ilen = 0; 8555d517e7eSBarry Smith b->imax = 0; 8565d517e7eSBarry Smith b->row = isrow; 8575d517e7eSBarry Smith b->col = iscol; 858de6a44a3SBarry Smith b->solve_work = (Scalar *) PetscMalloc( (bs*n+bs)*sizeof(Scalar)); 8595d517e7eSBarry Smith CHKPTRQ(b->solve_work); 8605d517e7eSBarry Smith /* In b structure: Free imax, ilen, old a, old j. 8615d517e7eSBarry Smith Allocate dloc, solve_work, new a, new j */ 862de6a44a3SBarry Smith PLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs*bs*ainew[n]*sizeof(Scalar)); 863de6a44a3SBarry Smith b->maxnz = b->nz = ainew[n]; 8645d517e7eSBarry Smith (*fact)->factor = FACTOR_LU; 8655d517e7eSBarry Smith return 0; 8665d517e7eSBarry Smith } 8675d517e7eSBarry Smith 8685d517e7eSBarry Smith 8695d517e7eSBarry Smith 8705d517e7eSBarry Smith 871