#ifndef lint
static char vcid[] = "$Id: baijfact.c,v 1.5 1996/02/19 03:51:17 bsmith Exp curfman $";
#endif
/*
    Factorization code for BAIJ format. 
*/

#include "baij.h"

/*
    The symbolic factorization code is identical to that for AIJ format,
  except for very small changes since this is now a SeqBAIJ datastructure.
  NOT good code reuse.
*/
int MatLUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,double f,Mat *B)
{
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b;
  IS          isicol;
  int         *r,*ic, ierr, i, n = a->mbs, *ai = a->i, *aj = a->j;
  int         *ainew,*ajnew, jmax,*fill, *ajtmp, nz, bs = a->bs;
  int         *idnew, idx, row,m,fm, nnz, nzi,len, realloc = 0,nzbd,*im;
 
  if (a->m != a->n) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must be square");
  if (!isrow) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must have row permutation");
  if (!iscol) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must have col. permutation");

  ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
  ISGetIndices(isrow,&r); ISGetIndices(isicol,&ic);

  /* get new row pointers */
  ainew = (int *) PetscMalloc( (n+1)*sizeof(int) ); CHKPTRQ(ainew);
  ainew[0] = 0;
  /* don't know how many column pointers are needed so estimate */
  jmax = (int) (f*ai[n] + 1);
  ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajnew);
  /* fill is a linked list of nonzeros in active row */
  fill = (int *) PetscMalloc( (2*n+1)*sizeof(int)); CHKPTRQ(fill);
  im = fill + n + 1;
  /* idnew is location of diagonal in factor */
  idnew = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(idnew);
  idnew[0] = 0;

  for ( i=0; i<n; i++ ) {
    /* first copy previous fill into linked list */
    nnz     = nz    = ai[r[i]+1] - ai[r[i]];
    ajtmp   = aj + ai[r[i]];
    fill[n] = n;
    while (nz--) {
      fm  = n;
      idx = ic[*ajtmp++];
      do {
        m  = fm;
        fm = fill[m];
      } while (fm < idx);
      fill[m]   = idx;
      fill[idx] = fm;
    }
    row = fill[n];
    while ( row < i ) {
      ajtmp = ajnew + idnew[row] + 1;
      nzbd  = 1 + idnew[row] - ainew[row];
      nz    = im[row] - nzbd;
      fm    = row;
      while (nz-- > 0) {
        idx = *ajtmp++;
        nzbd++;
        if (idx == i) im[row] = nzbd;
        do {
          m  = fm;
          fm = fill[m];
        } while (fm < idx);
        if (fm != idx) {
          fill[m]   = idx;
          fill[idx] = fm;
          fm        = idx;
          nnz++;
        }
      }
      row = fill[row];
    }
    /* copy new filled row into permanent storage */
    ainew[i+1] = ainew[i] + nnz;
    if (ainew[i+1] > jmax+1) {
      /* allocate a longer ajnew */
      int maxadd;
      maxadd = (int) ((f*(ai[n]+1)*(n-i+5))/n);
      if (maxadd < nnz) maxadd = (n-i)*(nnz+1);
      jmax += maxadd;
      ajtmp = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(ajtmp);
      PetscMemcpy(ajtmp,ajnew,ainew[i]*sizeof(int));
      PetscFree(ajnew);
      ajnew = ajtmp;
      realloc++; /* count how many times we realloc */
    }
    ajtmp = ajnew + ainew[i];
    fm    = fill[n];
    nzi   = 0;
    im[i] = nnz;
    while (nnz--) {
      if (fm < i) nzi++;
      *ajtmp++ = fm;
      fm       = fill[fm];
    }
    idnew[i] = ainew[i] + nzi;
  }

  PLogInfo((PetscObject)A,
    "Info:MatLUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",
                             realloc,f,((double)ainew[n])/((double)ai[i]));

  ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);

  PetscFree(fill);

  /* put together the new matrix */
  ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,B); CHKERRQ(ierr);
  PLogObjectParent(*B,isicol); 
  ierr = ISDestroy(isicol); CHKERRQ(ierr);
  b = (Mat_SeqBAIJ *) (*B)->data;
  PetscFree(b->imax);
  b->singlemalloc = 0;
  len             = ainew[n]*sizeof(Scalar);
  /* the next line frees the default space generated by the Create() */
  PetscFree(b->a); PetscFree(b->ilen);
  b->a          = (Scalar *) PetscMalloc( len*bs*bs ); CHKPTRQ(b->a);
  b->j          = ajnew;
  b->i          = ainew;
  b->diag       = idnew;
  b->ilen       = 0;
  b->imax       = 0;
  b->row        = isrow;
  b->col        = iscol;
  b->solve_work = (Scalar *) PetscMalloc( (bs*n+bs)*sizeof(Scalar)); 
  CHKPTRQ(b->solve_work);
  /* In b structure:  Free imax, ilen, old a, old j.  
     Allocate idnew, solve_work, new a, new j */
  PLogObjectMemory(*B,(ainew[n]-n)*(sizeof(int)+sizeof(Scalar)));
  b->maxnz = b->nz = ainew[n];

  return 0; 
}

#include "pinclude/plapack.h"
/* ----------------------------------------------------------- */
int MatLUFactorNumeric_SeqBAIJ_N(Mat A,Mat *B)
{
  Mat             C = *B;
  Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data;
  IS              iscol = b->col, isrow = b->row, isicol;
  int             *r,*ic, ierr, i, j, n = a->mbs, *ai = b->i, *aj = b->j;
  int             *ajtmpold, *ajtmp, nz, row, bslog,info;
  int             *diag_offset=b->diag,diag,bs=a->bs,bs2 = bs*bs,*v_pivots;
  register Scalar *pv,*v,*rtmp,*multiplier,*v_work,*pc,*w;
  Scalar          one = 1.0, zero = 0.0, mone = -1.0;
  register int    *pj;

  ierr  = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
  PLogObjectParent(*B,isicol);
  ierr  = ISGetIndices(isrow,&r); CHKERRQ(ierr);
  ierr  = ISGetIndices(isicol,&ic); CHKERRQ(ierr);
  rtmp  = (Scalar *) PetscMalloc(bs2*(n+1)*sizeof(Scalar));CHKPTRQ(rtmp);

  /* generate work space needed by dense LU factorization */
  v_work     = (Scalar *) PetscMalloc((bs+2*bs2)*sizeof(Scalar));CHKPTRQ(v_work);
  multiplier = v_work + bs2;
  v_pivots   = (int *) (multiplier + bs2);

  /* flops in while loop */
  bslog = 2*bs*bs2;

  for ( i=0; i<n; i++ ) {
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    for  ( j=0; j<nz; j++ ) {
      PetscMemzero(rtmp+bs2*ajtmp[j],bs2*sizeof(Scalar));
    }
    /* load in initial (unfactored row) */
    nz       = a->i[r[i]+1] - a->i[r[i]];
    ajtmpold = a->j + a->i[r[i]];
    v        = a->a + bs2*a->i[r[i]];
    for ( j=0; j<nz; j++ ) {
      PetscMemcpy(rtmp+bs2*ic[ajtmpold[j]],v+bs2*j,bs2*sizeof(Scalar)); 
    }
    row = *ajtmp++;
    while (row < i) {
      pc = rtmp + bs2*row;
/*      if (*pc) { */
        pv = b->a + bs2*diag_offset[row];
        pj = b->j + diag_offset[row] + 1;
        BLgemm_("N","N",&bs,&bs,&bs,&one,pc,&bs,pv,&bs,&zero,
                multiplier,&bs);
        PetscMemcpy(pc,multiplier,bs2*sizeof(Scalar));
        nz = ai[row+1] - diag_offset[row] - 1;
        pv += bs2;
        for (j=0; j<nz; j++) {
          BLgemm_("N","N",&bs,&bs,&bs,&mone,multiplier,&bs,pv+bs2*j,&bs,
                  &one,rtmp+bs2*pj[j],&bs);
        }
        PLogFlops(bslog*(nz+1)-bs);
/*      } */
        row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = b->a + bs2*ai[i];
    pj = b->j + ai[i];
    nz = ai[i+1] - ai[i];
    for ( j=0; j<nz; j++ ) {
      PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(Scalar)); 
    }
    diag = diag_offset[i] - ai[i];
    /* invert diagonal block */
    w = pv + bs2*diag; 
    LAgetrf_(&bs,&bs,w,&bs,v_pivots,&info); CHKERRQ(info); 
    PetscMemzero(v_work,bs2*sizeof(Scalar));  
    for ( j=0; j<bs; j++ ) { v_work[j + bs*j] = 1.0; } 
    LAgetrs_("N",&bs,&bs,w,&bs,v_pivots,v_work,&bs, &info);CHKERRQ(info);
    PetscMemcpy(w,v_work,bs2*sizeof(Scalar)); 
  }

  PetscFree(rtmp); PetscFree(v_work);
  ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
  ierr = ISDestroy(isicol); CHKERRQ(ierr);
  C->factor = FACTOR_LU;
  C->assembled = PETSC_TRUE;
  PLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
  return 0;
}
/* ------------------------------------------------------------*/
/*
      Version for when blocks are 2 by 2
*/
int MatLUFactorNumeric_SeqBAIJ_2(Mat A,Mat *B)
{
  Mat             C = *B;
  Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data;
  IS              iscol = b->col, isrow = b->row, isicol;
  int             *r,*ic, ierr, i, j, n = a->mbs, *ai = b->i, *aj = b->j;
  int             *ajtmpold, *ajtmp, nz, row, info;
  int             *diag_offset=b->diag,*v_pivots,bs = 2,idx;
  register Scalar *pv,*v,*rtmp,m1,m2,m3,m4,*v_work,*pc,*w,*x,x1,x2,x3,x4;
  Scalar          p1,p2,p3,p4;
  register int    *pj;

  ierr  = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
  PLogObjectParent(*B,isicol);
  ierr  = ISGetIndices(isrow,&r); CHKERRQ(ierr);
  ierr  = ISGetIndices(isicol,&ic); CHKERRQ(ierr);
  rtmp  = (Scalar *) PetscMalloc(4*(n+1)*sizeof(Scalar));CHKPTRQ(rtmp);

  /* generate work space needed by dense LU factorization */
  v_work     = (Scalar *) PetscMalloc(6*sizeof(Scalar));CHKPTRQ(v_work);
  v_pivots   = (int *) (v_work + 4);

  for ( i=0; i<n; i++ ) {
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    for  ( j=0; j<nz; j++ ) {
      x = rtmp+4*ajtmp[j]; x[0] = x[1] = x[2] = x[3] = 0.0;
    }
    /* load in initial (unfactored row) */
    idx      = r[i];
    nz       = a->i[idx+1] - a->i[idx];
    ajtmpold = a->j + a->i[idx];
    v        = a->a + 4*a->i[idx];
    for ( j=0; j<nz; j++ ) {
      x    = rtmp+4*ic[ajtmpold[j]];
      x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
      v    += 4;
    }
    row = *ajtmp++;
    while (row < i) {
      pc = rtmp + 4*row;
      p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) { 
        pv = b->a + 4*diag_offset[row];
        pj = b->j + diag_offset[row] + 1;
        x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
        pc[0] = m1 = p1*x1 + p3*x2;
        pc[1] = m2 = p2*x1 + p4*x2;
        pc[2] = m3 = p1*x3 + p3*x4;
        pc[3] = m4 = p2*x3 + p4*x4;
        nz = ai[row+1] - diag_offset[row] - 1;
        pv += 4;
        for (j=0; j<nz; j++) {
          x1   = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
          x    = rtmp + 4*pj[j];
          x[0] -= m1*x1 + m3*x2;
          x[1] -= m2*x1 + m4*x2;
          x[2] -= m1*x3 + m3*x4;
          x[3] -= m2*x3 + m4*x4;
          pv   += 4;
        }
        PLogFlops(16*nz+12);
      } 
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = b->a + 4*ai[i];
    pj = b->j + ai[i];
    nz = ai[i+1] - ai[i];
    for ( j=0; j<nz; j++ ) {
      x     = rtmp+4*pj[j];
      pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
      pv   += 4;
    }
    /* invert diagonal block */
    w = b->a + 4*diag_offset[i];
    LAgetrf_(&bs,&bs,w,&bs,v_pivots,&info); CHKERRQ(info);
    v_work[0] = 1.0; v_work[1] = 0.0; v_work[2] = 0.0; v_work[3] = 1.0; 
    LAgetrs_("N",&bs,&bs,w,&bs,v_pivots,v_work,&bs, &info);CHKERRQ(info);
    w[0] = v_work[0]; w[1] = v_work[1]; w[2] = v_work[2]; w[3] = v_work[3];
  }

  PetscFree(rtmp); PetscFree(v_work);
  ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
  ierr = ISDestroy(isicol); CHKERRQ(ierr);
  C->factor = FACTOR_LU;
  C->assembled = PETSC_TRUE;
  PLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
  return 0;
}

/* ----------------------------------------------------------- */
/*
     Version for when blocks are 1 by 1.
*/
int MatLUFactorNumeric_SeqBAIJ_1(Mat A,Mat *B)
{
  Mat         C = *B;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b = (Mat_SeqBAIJ *)C->data;
  IS          iscol = b->col, isrow = b->row, isicol;
  int         *r,*ic, ierr, i, j, n = a->mbs, *ai = b->i, *aj = b->j;
  int         *ajtmpold, *ajtmp, nz, row;
  int         *diag_offset = b->diag,diag;
  register Scalar *pv,*v,*rtmp,multiplier,*pc;
  register int    *pj;

  ierr  = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
  PLogObjectParent(*B,isicol);
  ierr  = ISGetIndices(isrow,&r); CHKERRQ(ierr);
  ierr  = ISGetIndices(isicol,&ic); CHKERRQ(ierr);
  rtmp  = (Scalar *) PetscMalloc((n+1)*sizeof(Scalar));CHKPTRQ(rtmp);

  for ( i=0; i<n; i++ ) {
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    for  ( j=0; j<nz; j++ ) rtmp[ajtmp[j]] = 0.0;

    /* load in initial (unfactored row) */
    nz       = a->i[r[i]+1] - a->i[r[i]];
    ajtmpold = a->j + a->i[r[i]];
    v        = a->a + a->i[r[i]];
    for ( j=0; j<nz; j++ ) rtmp[ic[ajtmpold[j]]] =  v[j];

    row = *ajtmp++;
    while (row < i) {
      pc = rtmp + row;
      if (*pc != 0.0) {
        pv         = b->a + diag_offset[row];
        pj         = b->j + diag_offset[row] + 1;
        multiplier = *pc * *pv++;
        *pc        = multiplier;
        nz         = ai[row+1] - diag_offset[row] - 1;
        for (j=0; j<nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
        PLogFlops(1+2*nz);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = b->a + ai[i];
    pj = b->j + ai[i];
    nz = ai[i+1] - ai[i];
    for ( j=0; j<nz; j++ ) {pv[j] = rtmp[pj[j]];}
    diag = diag_offset[i] - ai[i];
    /* check pivot entry for current row */
    if (pv[diag] == 0.0) {
      SETERRQ(1,"MatLUFactorNumeric_SeqAIJ:Zero pivot");
    }
    pv[diag] = 1.0/pv[diag];
  }

  PetscFree(rtmp);
  ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
  ierr = ISDestroy(isicol); CHKERRQ(ierr);
  C->factor    = FACTOR_LU;
  C->assembled = PETSC_TRUE;
  PLogFlops(b->n);
  return 0;
}

/* ----------------------------------------------------------- */
int MatLUFactor_SeqBAIJ(Mat A,IS row,IS col,double f)
{
  Mat_SeqBAIJ *mat = (Mat_SeqBAIJ *) A->data;
  int         ierr;
  Mat         C;

  ierr = MatLUFactorSymbolic_SeqBAIJ(A,row,col,f,&C); CHKERRQ(ierr);
  ierr = MatLUFactorNumeric(A,&C); CHKERRQ(ierr);

  /* free all the data structures from mat */
  PetscFree(mat->a); 
  if (!mat->singlemalloc) {PetscFree(mat->i); PetscFree(mat->j);}
  if (mat->diag) PetscFree(mat->diag);
  if (mat->ilen) PetscFree(mat->ilen);
  if (mat->imax) PetscFree(mat->imax);
  if (mat->solve_work) PetscFree(mat->solve_work);
  PetscFree(mat);

  PetscMemcpy(A,C,sizeof(struct _Mat));
  PetscHeaderDestroy(C);
  return 0;
}
/* ----------------------------------------------------------- */
int MatSolve_SeqBAIJ(Mat A,Vec bb, Vec xx)
{
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
  IS          iscol = a->col, isrow = a->row;
  int         *r,*c, ierr, i,  n = a->mbs, *vi, *ai = a->i, *aj = a->j;
  int         nz,bs = a->bs,bs2 = bs*bs,idx,idt,idc, _One = 1;
  Scalar      *xa,*ba,*aa = a->a, *sum, _DOne = 1.0, _DMOne = -1.0;
  Scalar      _DZero = 0.0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
  register Scalar *x, *b, *lsum, *tmp, *v;

  if (A->factor != FACTOR_LU) SETERRQ(1,"MatSolve_SeqBAIJ:Not for unfactored matrix");

  ierr = VecGetArray(bb,&ba); CHKERRQ(ierr); b = ba;
  ierr = VecGetArray(xx,&xa); CHKERRQ(ierr); x = xa;
  tmp  = a->solve_work;

  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); c = c + (n-1);

  switch (bs) {
    case 1: 
      /* forward solve the lower triangular */
      tmp[0] = b[*r++];
      for ( i=1; i<n; i++ ) {
        v     = aa + ai[i];
        vi    = aj + ai[i];
        nz    = a->diag[i] - ai[i];
        sum1  = b[*r++];
        while (nz--) {
          sum1 -= (*v++)*tmp[*vi++];
        }
        tmp[i] = sum1;
      }
      /* backward solve the upper triangular */
      for ( i=n-1; i>=0; i-- ){
        v    = aa + a->diag[i] + 1;
        vi   = aj + a->diag[i] + 1;
        nz   = ai[i+1] - a->diag[i] - 1;
        sum1 = tmp[i];
        while (nz--) {
          sum1 -= (*v++)*tmp[*vi++];
        }
        x[*c--] = tmp[i] = aa[a->diag[i]]*sum1;
      }
      break;
    case 2: 
      /* forward solve the lower triangular */
      idx    = 2*(*r++); 
      tmp[0] = b[idx]; tmp[1] = b[1+idx];
      for ( i=1; i<n; i++ ) {
        v     = aa + 4*ai[i];
        vi    = aj + ai[i];
        nz    = a->diag[i] - ai[i];
        idx   = 2*(*r++); 
        sum1  = b[idx]; sum2 = b[1+idx];
        while (nz--) {
          idx   = 2*(*vi++);
          x1    = tmp[idx]; x2 = tmp[1+idx];
          sum1 -= v[0]*x1 + v[2]*x2;
          sum2 -= v[1]*x1 + v[3]*x2;
          v += 4;
        }
        idx = 2*i;
        tmp[idx] = sum1; tmp[1+idx] = sum2;
      }
      /* backward solve the upper triangular */
      for ( i=n-1; i>=0; i-- ){
        v    = aa + 4*a->diag[i] + 4;
        vi   = aj + a->diag[i] + 1;
        nz   = ai[i+1] - a->diag[i] - 1;
        idt  = 2*i;
        sum1 = tmp[idt]; sum2 = tmp[1+idt];
        while (nz--) {
          idx   = 2*(*vi++);
          x1    = tmp[idx]; x2 = tmp[1+idx];
          sum1 -= v[0]*x1 + v[2]*x2;
          sum2 -= v[1]*x1 + v[3]*x2;
          v += 4;
        }
        idc = 2*(*c--);
        v   = aa + 4*a->diag[i];
        x[idc]   = tmp[idt]   = v[0]*sum1 + v[2]*sum2;
        x[1+idc] = tmp[1+idt] = v[1]*sum1 + v[3]*sum2;
      }
      break;
    case 3: 
      /* forward solve the lower triangular */
      idx    = 3*(*r++); 
      tmp[0] = b[idx]; tmp[1] = b[1+idx]; tmp[2] = b[2+idx];
      for ( i=1; i<n; i++ ) {
        v     = aa + 9*ai[i];
        vi    = aj + ai[i];
        nz    = a->diag[i] - ai[i];
        idx   = 3*(*r++); 
        sum1  = b[idx]; sum2 = b[1+idx]; sum3 = b[2+idx];
        while (nz--) {
          idx   = 3*(*vi++);
          x1    = tmp[idx]; x2 = tmp[1+idx]; x3 = tmp[2+idx];
          sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
          sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
          sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
          v += 9;
        }
        idx = 3*i;
        tmp[idx] = sum1; tmp[1+idx] = sum2; tmp[2+idx] = sum3;
      }
      /* backward solve the upper triangular */
      for ( i=n-1; i>=0; i-- ){
        v    = aa + 9*a->diag[i] + 9;
        vi   = aj + a->diag[i] + 1;
        nz   = ai[i+1] - a->diag[i] - 1;
        idt  = 3*i;
        sum1 = tmp[idt]; sum2 = tmp[1+idt]; sum3 = tmp[2+idt];
        while (nz--) {
          idx   = 3*(*vi++);
          x1    = tmp[idx]; x2 = tmp[1+idx]; x3 = tmp[2+idx];
          sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
          sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
          sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
          v += 9;
        }
        idc = 3*(*c--);
        v   = aa + 9*a->diag[i];
        x[idc]   = tmp[idt]   = v[0]*sum1 + v[3]*sum2 + v[6]*sum3;
        x[1+idc] = tmp[1+idt] = v[1]*sum1 + v[4]*sum2 + v[7]*sum3;
        x[2+idc] = tmp[2+idt] = v[2]*sum1 + v[5]*sum2 + v[8]*sum3;
      }
      break;
    case 4: 
      /* forward solve the lower triangular */
      idx    = 4*(*r++); 
      tmp[0] = b[idx];   tmp[1] = b[1+idx]; 
      tmp[2] = b[2+idx]; tmp[3] = b[3+idx];
      for ( i=1; i<n; i++ ) {
        v     = aa + 16*ai[i];
        vi    = aj + ai[i];
        nz    = a->diag[i] - ai[i];
        idx   = 4*(*r++); 
        sum1  = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx];
        while (nz--) {
          idx   = 4*(*vi++);
          x1    = tmp[idx];x2 = tmp[1+idx];x3 = tmp[2+idx];x4 = tmp[3+idx];
          sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
          sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
          sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
          sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
          v += 16;
        }
        idx = 4*i;
        tmp[idx]   = sum1;tmp[1+idx] = sum2;
        tmp[2+idx] = sum3;tmp[3+idx] = sum4;
      }
      /* backward solve the upper triangular */
      for ( i=n-1; i>=0; i-- ){
        v    = aa + 16*a->diag[i] + 16;
        vi   = aj + a->diag[i] + 1;
        nz   = ai[i+1] - a->diag[i] - 1;
        idt  = 4*i;
        sum1 = tmp[idt];  sum2 = tmp[1+idt]; 
        sum3 = tmp[2+idt];sum4 = tmp[3+idt];
        while (nz--) {
          idx   = 4*(*vi++);
          x1    = tmp[idx];   x2 = tmp[1+idx];
          x3    = tmp[2+idx]; x4 = tmp[3+idx];
          sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
          sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4; 
          sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
          sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
          v += 16;
        }
        idc = 4*(*c--);
        v   = aa + 16*a->diag[i];
        x[idc]   = tmp[idt]   = v[0]*sum1+v[4]*sum2+v[8]*sum3+v[12]*sum4;
        x[1+idc] = tmp[1+idt] = v[1]*sum1+v[5]*sum2+v[9]*sum3+v[13]*sum4;
        x[2+idc] = tmp[2+idt] = v[2]*sum1+v[6]*sum2+v[10]*sum3+v[14]*sum4;
        x[3+idc] = tmp[3+idt] = v[3]*sum1+v[7]*sum2+v[11]*sum3+v[15]*sum4;
      }
      break;
    case 5: 
      /* forward solve the lower triangular */
      idx    = 5*(*r++); 
      tmp[0] = b[idx];   tmp[1] = b[1+idx]; 
      tmp[2] = b[2+idx]; tmp[3] = b[3+idx]; tmp[4] = b[4+idx];
      for ( i=1; i<n; i++ ) {
        v     = aa + 25*ai[i];
        vi    = aj + ai[i];
        nz    = a->diag[i] - ai[i];
        idx   = 5*(*r++); 
        sum1  = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx];
        sum5  = b[4+idx];
        while (nz--) {
          idx   = 5*(*vi++);
          x1    = tmp[idx];  x2 = tmp[1+idx];x3 = tmp[2+idx];
          x4    = tmp[3+idx];x5 = tmp[4+idx];
          sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
          sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
          sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
          sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
          sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
          v += 25;
        }
        idx = 5*i;
        tmp[idx]   = sum1;tmp[1+idx] = sum2;
        tmp[2+idx] = sum3;tmp[3+idx] = sum4; tmp[4+idx] = sum5;
      }
      /* backward solve the upper triangular */
      for ( i=n-1; i>=0; i-- ){
        v    = aa + 25*a->diag[i] + 25;
        vi   = aj + a->diag[i] + 1;
        nz   = ai[i+1] - a->diag[i] - 1;
        idt  = 5*i;
        sum1 = tmp[idt];  sum2 = tmp[1+idt]; 
        sum3 = tmp[2+idt];sum4 = tmp[3+idt]; sum5 = tmp[4+idt];
        while (nz--) {
          idx   = 5*(*vi++);
          x1    = tmp[idx];   x2 = tmp[1+idx];
          x3    = tmp[2+idx]; x4 = tmp[3+idx]; x5 = tmp[4+idx];
          sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
          sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 
          sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
          sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
          sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
          v += 25;
        }
        idc = 5*(*c--);
        v   = aa + 25*a->diag[i];
        x[idc]   = tmp[idt]   = v[0]*sum1+v[5]*sum2+v[10]*sum3+
                                v[15]*sum4+v[20]*sum5;
        x[1+idc] = tmp[1+idt] = v[1]*sum1+v[6]*sum2+v[11]*sum3+
                                v[16]*sum4+v[21]*sum5;
        x[2+idc] = tmp[2+idt] = v[2]*sum1+v[7]*sum2+v[12]*sum3+
                                v[17]*sum4+v[22]*sum5;
        x[3+idc] = tmp[3+idt] = v[3]*sum1+v[8]*sum2+v[13]*sum3+
                                v[18]*sum4+v[23]*sum5;
        x[4+idc] = tmp[4+idt] = v[4]*sum1+v[9]*sum2+v[14]*sum3+
                                v[19]*sum4+v[24]*sum5;
      }
      break;
    default: {
      /* forward solve the lower triangular */
      PetscMemcpy(tmp,b + bs*(*r++), bs*sizeof(Scalar));
      for ( i=1; i<n; i++ ) {
        v   = aa + bs2*ai[i];
        vi  = aj + ai[i];
        nz  = a->diag[i] - ai[i];
        sum = tmp + bs*i;
        PetscMemcpy(sum,b+bs*(*r++),bs*sizeof(Scalar));
        while (nz--) {
          LAgemv_("N",&bs,&bs,&_DMOne,v,&bs,tmp+bs*(*vi++),&_One,&_DOne,sum,&_One);
          v += bs2;
        }
      }
      /* backward solve the upper triangular */
      lsum = a->solve_work + a->n;
      for ( i=n-1; i>=0; i-- ){
        v   = aa + bs2*(a->diag[i] + 1);
        vi  = aj + a->diag[i] + 1;
        nz  = ai[i+1] - a->diag[i] - 1;
        PetscMemcpy(lsum,tmp+i*bs,bs*sizeof(Scalar));
        while (nz--) {
          LAgemv_("N",&bs,&bs,&_DMOne,v,&bs,tmp+bs*(*vi++),&_One,&_DOne,lsum,&_One);
          v += bs2;
        }
        LAgemv_("N",&bs,&bs,&_DOne,aa+bs2*a->diag[i],&bs,lsum,&_One,&_DZero,
                tmp+i*bs,&_One);
        PetscMemcpy(x + bs*(*c--),tmp+i*bs,bs*sizeof(Scalar));
      }
    }
  }

  ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&c); CHKERRQ(ierr);
  PLogFlops(2*bs2*a->nz - a->n);
  return 0;
}

/* ----------------------------------------------------------------*/
/*
     This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
   except that the data structure of Mat_SeqAIJ is slightly different.
   Not a good example of code reuse.
   */
int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,double f,int levels,
                                 Mat *fact)
{
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b;
  IS          isicol;
  int         *r,*ic, ierr, prow, n = a->mbs, *ai = a->i, *aj = a->j;
  int         *ainew,*ajnew, jmax,*fill, *xi, nz, *im,*ajfill,*flev;
  int         *dloc, idx, row,m,fm, nzf, nzi,len,  realloc = 0;
  int         incrlev,nnz,i,bs = a->bs;
 
  if (a->m != a->n) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Matrix must be square");
  if (!isrow) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Must have row permutation");
  if (!iscol) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Must have column permutation");

  /* special case that simply copies fill pattern */
  if (levels == 0 && ISIsIdentity(isrow) && ISIsIdentity(iscol)) {
    ierr = MatConvertSameType_SeqBAIJ(A,fact,DO_NOT_COPY_VALUES); CHKERRQ(ierr);
    (*fact)->factor = FACTOR_LU;
    b               = (Mat_SeqBAIJ *) (*fact)->data;
    if (!b->diag) {
      ierr = MatMarkDiag_SeqBAIJ(*fact); CHKERRQ(ierr);
    }
    b->row        = isrow;
    b->col        = iscol;
    b->solve_work = (Scalar *) PetscMalloc((b->m+1+b->bs)*sizeof(Scalar));CHKPTRQ(b->solve_work);
    return 0;
  }

  ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
  ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr);

  /* get new row pointers */
  ainew = (int *) PetscMalloc( (n+1)*sizeof(int) ); CHKPTRQ(ainew);
  ainew[0] = 0;
  /* don't know how many column pointers are needed so estimate */
  jmax = (int) (f*ai[n] + 1);
  ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajnew);
  /* ajfill is level of fill for each fill entry */
  ajfill = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajfill);
  /* fill is a linked list of nonzeros in active row */
  fill = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(fill);
  /* im is level for each filled value */
  im = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(im);
  /* dloc is location of diagonal in factor */
  dloc = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(dloc);
  dloc[0]  = 0;
  for ( prow=0; prow<n; prow++ ) {
    /* first copy previous fill into linked list */
    nzf     = nz  = ai[r[prow]+1] - ai[r[prow]];
    xi      = aj + ai[r[prow]];
    fill[n] = n;
    while (nz--) {
      fm  = n;
      idx = ic[*xi++];
      do {
        m  = fm;
        fm = fill[m];
      } while (fm < idx);
      fill[m]   = idx;
      fill[idx] = fm;
      im[idx]   = 0;
    }
    nzi = 0;
    row = fill[n];
    while ( row < prow ) {
      incrlev = im[row] + 1;
      nz      = dloc[row];
      xi      = ajnew  + ainew[row] + nz;
      flev    = ajfill + ainew[row] + nz + 1;
      nnz     = ainew[row+1] - ainew[row] - nz - 1;
      if (*xi++ != row) {
        SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:zero pivot");
      }
      fm      = row;
      while (nnz-- > 0) {
        idx = *xi++;
        if (*flev + incrlev > levels) {
          flev++;
          continue;
        }
        do {
          m  = fm;
          fm = fill[m];
        } while (fm < idx);
        if (fm != idx) {
          im[idx]   = *flev + incrlev;
          fill[m]   = idx;
          fill[idx] = fm;
          fm        = idx;
          nzf++;
        }
        else {
          if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
        }
        flev++;
      }
      row = fill[row];
      nzi++;
    }
    /* copy new filled row into permanent storage */
    ainew[prow+1] = ainew[prow] + nzf;
    if (ainew[prow+1] > jmax) {
      /* allocate a longer ajnew */
      int maxadd;
      maxadd = (int) (((f*ai[n]+1)*(n-prow+5))/n);
      if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
      jmax += maxadd;
      xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi);
      PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));
      PetscFree(ajnew);
      ajnew = xi;
      /* allocate a longer ajfill */
      xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi);
      PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));
      PetscFree(ajfill);
      ajfill = xi;
      realloc++;
    }
    xi          = ajnew + ainew[prow];
    flev        = ajfill + ainew[prow];
    dloc[prow]  = nzi;
    fm          = fill[n];
    while (nzf--) {
      *xi++   = fm;
      *flev++ = im[fm];
      fm      = fill[fm];
    }
  }
  PetscFree(ajfill); 
  ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
  ierr = ISDestroy(isicol); CHKERRQ(ierr);
  PetscFree(fill); PetscFree(im);

  PLogInfo((PetscObject)A,
    "Info:MatILUFactorSymbolic_SeqBAIJ:Realloc %d Fill ratio:given %g needed %g\n",
                             realloc,f,((double)ainew[n])/((double)ai[prow]));

  /* put together the new matrix */
  ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr);
  b = (Mat_SeqBAIJ *) (*fact)->data;
  PetscFree(b->imax);
  b->singlemalloc = 0;
  len = bs*bs*ainew[n]*sizeof(Scalar);
  /* the next line frees the default space generated by the Create() */
  PetscFree(b->a); PetscFree(b->ilen);
  b->a          = (Scalar *) PetscMalloc( len ); CHKPTRQ(b->a);
  b->j          = ajnew;
  b->i          = ainew;
  for ( i=0; i<n; i++ ) dloc[i] += ainew[i];
  b->diag       = dloc;
  b->ilen       = 0;
  b->imax       = 0;
  b->row        = isrow;
  b->col        = iscol;
  b->solve_work = (Scalar *) PetscMalloc( (bs*n+bs)*sizeof(Scalar)); 
  CHKPTRQ(b->solve_work);
  /* In b structure:  Free imax, ilen, old a, old j.  
     Allocate dloc, solve_work, new a, new j */
  PLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs*bs*ainew[n]*sizeof(Scalar));
  b->maxnz          = b->nz = ainew[n];
  (*fact)->factor   = FACTOR_LU;
  return 0; 
}




