#ifndef lint
static char vcid[] = "$Id: baijfact.c,v 1.1 1996/02/13 22:33:11 bsmith Exp bsmith $";
#endif
/*
    Factorization code for BAIJ format. 
*/

#include "baij.h"

/*
    The symbolic factorization code is identical to that for AIJ format,
  except for very small changes since this is now a SeqBAIJ datastructure.
  NOT good code reuse.
*/
int MatLUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,double f,Mat *B)
{
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b;
  IS          isicol;
  int         *r,*ic, ierr, i, n = a->mbs, *ai = a->i, *aj = a->j;
  int         *ainew,*ajnew, jmax,*fill, *ajtmp, nz,shift = a->indexshift;
  int         *idnew, idx, row,m,fm, nnz, nzi,len, realloc = 0,nzbd,*im;
  int         bs = a->bs;
 
  if (a->m != a->n) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must be square");
  if (!isrow) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must have row permutation");
  if (!iscol) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must have col. permutation");

  ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
  ISGetIndices(isrow,&r); ISGetIndices(isicol,&ic);

  /* get new row pointers */
  ainew = (int *) PetscMalloc( (n+1)*sizeof(int) ); CHKPTRQ(ainew);
  ainew[0] = -shift;
  /* don't know how many column pointers are needed so estimate */
  jmax = (int) (f*ai[n]+(!shift));
  ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajnew);
  /* fill is a linked list of nonzeros in active row */
  fill = (int *) PetscMalloc( (2*n+1)*sizeof(int)); CHKPTRQ(fill);
  im = fill + n + 1;
  /* idnew is location of diagonal in factor */
  idnew = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(idnew);
  idnew[0] = -shift;

  for ( i=0; i<n; i++ ) {
    /* first copy previous fill into linked list */
    nnz     = nz    = ai[r[i]+1] - ai[r[i]];
    ajtmp   = aj + ai[r[i]] + shift;
    fill[n] = n;
    while (nz--) {
      fm  = n;
      idx = ic[*ajtmp++ + shift];
      do {
        m  = fm;
        fm = fill[m];
      } while (fm < idx);
      fill[m]   = idx;
      fill[idx] = fm;
    }
    row = fill[n];
    while ( row < i ) {
      ajtmp = ajnew + idnew[row] + (!shift);
      nzbd  = 1 + idnew[row] - ainew[row];
      nz    = im[row] - nzbd;
      fm    = row;
      while (nz-- > 0) {
        idx = *ajtmp++ + shift;
        nzbd++;
        if (idx == i) im[row] = nzbd;
        do {
          m  = fm;
          fm = fill[m];
        } while (fm < idx);
        if (fm != idx) {
          fill[m]   = idx;
          fill[idx] = fm;
          fm        = idx;
          nnz++;
        }
      }
      row = fill[row];
    }
    /* copy new filled row into permanent storage */
    ainew[i+1] = ainew[i] + nnz;
    if (ainew[i+1] > jmax+1) {
      /* allocate a longer ajnew */
      int maxadd;
      maxadd = (int) ((f*(ai[n]+(!shift))*(n-i+5))/n);
      if (maxadd < nnz) maxadd = (n-i)*(nnz+1);
      jmax += maxadd;
      ajtmp = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(ajtmp);
      PetscMemcpy(ajtmp,ajnew,(ainew[i]+shift)*sizeof(int));
      PetscFree(ajnew);
      ajnew = ajtmp;
      realloc++; /* count how many times we realloc */
    }
    ajtmp = ajnew + ainew[i] + shift;
    fm    = fill[n];
    nzi   = 0;
    im[i] = nnz;
    while (nnz--) {
      if (fm < i) nzi++;
      *ajtmp++ = fm - shift;
      fm       = fill[fm];
    }
    idnew[i] = ainew[i] + nzi;
  }

  PLogInfo((PetscObject)A,
    "Info:MatLUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",
                             realloc,f,((double)ainew[n])/((double)ai[i]));

  ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);

  PetscFree(fill);

  /* put together the new matrix */
  ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,B); CHKERRQ(ierr);
  PLogObjectParent(*B,isicol); 
  ierr = ISDestroy(isicol); CHKERRQ(ierr);
  b = (Mat_SeqBAIJ *) (*B)->data;
  PetscFree(b->imax);
  b->singlemalloc = 0;
  len             = (ainew[n] + shift)*sizeof(Scalar);
  /* the next line frees the default space generated by the Create() */
  PetscFree(b->a); PetscFree(b->ilen);
  b->a          = (Scalar *) PetscMalloc( len*bs*bs ); CHKPTRQ(b->a);
  b->j          = ajnew;
  b->i          = ainew;
  b->diag       = idnew;
  b->ilen       = 0;
  b->imax       = 0;
  b->row        = isrow;
  b->col        = iscol;
  b->solve_work = (Scalar *) PetscMalloc( n*sizeof(Scalar)); 
  CHKPTRQ(b->solve_work);
  /* In b structure:  Free imax, ilen, old a, old j.  
     Allocate idnew, solve_work, new a, new j */
  PLogObjectMemory(*B,(ainew[n]+shift-n)*(sizeof(int)+sizeof(Scalar)));
  b->maxnz = b->nz = ainew[n] + shift;

  return 0; 
}

#include "pinclude/plapack.h"

#define BlockZero(v,idx) {PetscMemzero(v+bs2*(idx),bs2*sizeof(Scalar));}

#define BlockCopy(v_in,v_out,idx_in,idx_out) \
  {PetscMemcpy(v_out + bs2*(idx_out),v_in + bs2*(idx_in),bs2*sizeof(Scalar));}

#define BlockInvert(vv,idx) \
  { int _i,info; Scalar *w = vv + bs2*idx; \
printf("vv idx %g %d \n",*vv,idx); \
    LAgetrf_(&bs,&bs,w,&bs,v_pivots,&info); CHKERRQ(info); \
printf("bs %d w %g\n",bs,*w);\
    PetscMemzero(v_work,bs2*sizeof(Scalar));  \
printf("vwork %g\n",*v_work);\
    for ( _i=0; _i<bs; _i++ ) { v_work[_i + bs*_i] = 1.0; } \
printf("vwork %g\n",*v_work);\
    LAgetrs_("N",&bs,&bs,w,&bs,v_pivots,v_work,&bs, &info);CHKERRQ(info);\
printf("w %g vwork %g\n",*w,*v_work);\
    PetscMemcpy(w,v_work,bs2*sizeof(Scalar)); \
printf("w %g vwork %g\n",*w,*v_work);\
  }

#define BlockMult(v1,v2,v3
/* ----------------------------------------------------------- */
int MatLUFactorNumeric_SeqBAIJ(Mat A,Mat *B)
{
  Mat         C = *B;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b = (Mat_SeqBAIJ *)C->data;
  IS          iscol = b->col, isrow = b->row, isicol;
  int         *r,*ic, ierr, i, j, n = a->mbs, *ai = b->i, *aj = b->j;
  int         *ajtmpold, *ajtmp, nz, row, *ics, shift = a->indexshift;
  int         *diag_offset = b->diag,diag,bs = a->bs,bs2 = bs*bs, *v_pivots;
  Scalar      *rtmp,*v, *pc, multiplier,*v_work;
  /* These declarations are for optimizations.  They reduce the number of
     memory references that are made by locally storing information; the
     word "register" used here with pointers can be viewed as "private" or 
     "known only to me"
   */
  register Scalar *pv, *rtmps;
  register int    *pj;

  ierr  = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
  PLogObjectParent(*B,isicol);
  ierr  = ISGetIndices(isrow,&r); CHKERRQ(ierr);
  ierr  = ISGetIndices(isicol,&ic); CHKERRQ(ierr);
  rtmp  = (Scalar *) PetscMalloc(bs2*(n+1)*sizeof(Scalar));CHKPTRQ(rtmp);
  rtmps = rtmp + shift; ics = ic + shift;

  /* generate work space needed by dense LU factorization */
  v_work   = (Scalar *) PetscMalloc(bs2*sizeof(Scalar)); CHKPTRQ(v_work);
  v_pivots = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(v_pivots);

  for ( i=0; i<n; i++ ) {
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i] + shift;
    for  ( j=0; j<nz; j++ ) BlockZero(rtmps,ajtmp[j]);
malloc_verify();
    /* load in initial (unfactored row) */
    nz       = a->i[r[i]+1] - a->i[r[i]];
    ajtmpold = a->j + a->i[r[i]] + shift;
    v        = a->a + bs2*a->i[r[i]] + shift;
    for ( j=0; j<nz; j++ ) BlockCopy(v,rtmp,j,ics[ajtmpold[j]]);
malloc_verify();

    row = *ajtmp++ + shift;
    while (row < i) {
      pc = rtmp + row;
      if (*pc != 0.0) {
        pv         = b->a + bs2*diag_offset[row] + shift;
        pj         = b->j + diag_offset[row] + (!shift);
        multiplier = *pc * (*pv);
        *pc        = multiplier;
        nz         = ai[row+1] - diag_offset[row] - 1;
        for (j=0; j<nz; j++) rtmps[pj[j]] -= multiplier * pv[j];
        pv        += bs2;
        PLogFlops(2*nz);
      }
      row = *ajtmp++ + shift;
    }
    /* finished row so stick it into b->a */
    pv = b->a + bs2*ai[i] + shift;
    pj = b->j + ai[i] + shift;
    nz = ai[i+1] - ai[i];
    for ( j=0; j<nz; j++ ) BlockCopy(rtmps,pv,pj[j],j);
malloc_verify();
    diag = diag_offset[i] - ai[i];
    /* invert diagonal block */
    BlockInvert(pv,diag);
malloc_verify();
  }

  PetscFree(rtmp);
  ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
  ierr = ISDestroy(isicol); CHKERRQ(ierr);
  C->factor = FACTOR_LU;
  C->assembled = PETSC_TRUE;
  PLogFlops(b->n);
  return 0;
}
/* ----------------------------------------------------------- */
int MatLUFactor_SeqBAIJ(Mat A,IS row,IS col,double f)
{
  Mat_SeqBAIJ *mat = (Mat_SeqBAIJ *) A->data;
  int         ierr;
  Mat         C;

  ierr = MatLUFactorSymbolic_SeqBAIJ(A,row,col,f,&C); CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqBAIJ(A,&C); CHKERRQ(ierr);

  /* free all the data structures from mat */
  PetscFree(mat->a); 
  if (!mat->singlemalloc) {PetscFree(mat->i); PetscFree(mat->j);}
  if (mat->diag) PetscFree(mat->diag);
  if (mat->ilen) PetscFree(mat->ilen);
  if (mat->imax) PetscFree(mat->imax);
  if (mat->solve_work) PetscFree(mat->solve_work);
  PetscFree(mat);

  PetscMemcpy(A,C,sizeof(struct _Mat));
  PetscHeaderDestroy(C);
  return 0;
}
/* ----------------------------------------------------------- */
int MatSolve_SeqBAIJ(Mat A,Vec bb, Vec xx)
{
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
  IS          iscol = a->col, isrow = a->row;
  int         *r,*c, ierr, i,  n = a->m, *vi, *ai = a->i, *aj = a->j;
  int         nz,shift = a->indexshift;
  Scalar      *x,*b,*tmp, *tmps, *aa = a->a, sum, *v;

  if (A->factor != FACTOR_LU) SETERRQ(1,"MatSolve_SeqBAIJ:Not for unfactored matrix");

  ierr = VecGetArray(bb,&b); CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x); CHKERRQ(ierr);
  tmp  = a->solve_work;

  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); c = c + (n-1);

  /* forward solve the lower triangular */
  tmp[0] = b[*r++];
  tmps   = tmp + shift;
  for ( i=1; i<n; i++ ) {
    v   = aa + ai[i] + shift;
    vi  = aj + ai[i] + shift;
    nz  = a->diag[i] - ai[i];
    sum = b[*r++];
    while (nz--) sum -= *v++ * tmps[*vi++];
    tmp[i] = sum;
  }

  /* backward solve the upper triangular */
  for ( i=n-1; i>=0; i-- ){
    v   = aa + a->diag[i] + (!shift);
    vi  = aj + a->diag[i] + (!shift);
    nz  = ai[i+1] - a->diag[i] - 1;
    sum = tmp[i];
    while (nz--) sum -= *v++ * tmps[*vi++];
    x[*c--] = tmp[i] = sum*aa[a->diag[i]+shift];
  }

  ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&c); CHKERRQ(ierr);
  PLogFlops(2*a->nz - a->n);
  return 0;
}

/* ----------------------------------------------------------------*/
/*
     This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
   except that the data structure of Mat_SeqAIJ is slightly different.
   Not a good example of code reuse.
   */
int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,double f,int levels,
                                 Mat *fact)
{
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b;
  IS          isicol;
  int         *r,*ic, ierr, prow, n = a->mbs, *ai = a->i, *aj = a->j;
  int         *ainew,*ajnew, jmax,*fill, *xi, nz, *im,*ajfill,*flev;
  int         *dloc, idx, row,m,fm, nzf, nzi,len,  realloc = 0;
  int         incrlev,nnz,i,shift = a->indexshift,bs = a->bs;
 
  if (a->m != a->n) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Matrix must be square");
  if (!isrow) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Must have row permutation");
  if (!iscol) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Must have column permutation");

  /* special case that simply copies fill pattern */
  if (levels == 0 && ISIsIdentity(isrow) && ISIsIdentity(iscol)) {
    ierr = MatConvertSameType_SeqBAIJ(A,fact,DO_NOT_COPY_VALUES); CHKERRQ(ierr);
    (*fact)->factor = FACTOR_LU;
    b               = (Mat_SeqBAIJ *) (*fact)->data;
    if (!b->diag) {
      ierr = MatMarkDiag_SeqBAIJ(*fact); CHKERRQ(ierr);
    }
    b->row        = isrow;
    b->col        = iscol;
    b->solve_work = (Scalar *) PetscMalloc((b->m+1)*sizeof(Scalar));CHKPTRQ(b->solve_work);
    return 0;
  }

  ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
  ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr);

  /* get new row pointers */
  ainew = (int *) PetscMalloc( (n+1)*sizeof(int) ); CHKPTRQ(ainew);
  ainew[0] = -shift;
  /* don't know how many column pointers are needed so estimate */
  jmax = (int) (f*(ai[n]+!shift));
  ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajnew);
  /* ajfill is level of fill for each fill entry */
  ajfill = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajfill);
  /* fill is a linked list of nonzeros in active row */
  fill = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(fill);
  /* im is level for each filled value */
  im = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(im);
  /* dloc is location of diagonal in factor */
  dloc = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(dloc);
  dloc[0]  = 0;
  for ( prow=0; prow<n; prow++ ) {
    /* first copy previous fill into linked list */
    nzf     = nz  = ai[r[prow]+1] - ai[r[prow]];
    xi      = aj + ai[r[prow]] + shift;
    fill[n] = n;
    while (nz--) {
      fm  = n;
      idx = ic[*xi++ + shift];
      do {
        m  = fm;
        fm = fill[m];
      } while (fm < idx);
      fill[m]   = idx;
      fill[idx] = fm;
      im[idx]   = 0;
    }
    nzi = 0;
    row = fill[n];
    while ( row < prow ) {
      incrlev = im[row] + 1;
      nz      = dloc[row];
      xi      = ajnew  + ainew[row] + shift + nz;
      flev    = ajfill + ainew[row] + shift + nz + 1;
      nnz     = ainew[row+1] - ainew[row] - nz - 1;
      if (*xi++ + shift != row) {
        SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:zero pivot");
      }
      fm      = row;
      while (nnz-- > 0) {
        idx = *xi++ + shift;
        if (*flev + incrlev > levels) {
          flev++;
          continue;
        }
        do {
          m  = fm;
          fm = fill[m];
        } while (fm < idx);
        if (fm != idx) {
          im[idx]   = *flev + incrlev;
          fill[m]   = idx;
          fill[idx] = fm;
          fm        = idx;
          nzf++;
        }
        else {
          if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
        }
        flev++;
      }
      row = fill[row];
      nzi++;
    }
    /* copy new filled row into permanent storage */
    ainew[prow+1] = ainew[prow] + nzf;
    if (ainew[prow+1] > jmax-shift) {
      /* allocate a longer ajnew */
      int maxadd;
      maxadd = (int) ((f*(ai[n]+!shift)*(n-prow+5))/n);
      if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
      jmax += maxadd;
      xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi);
      PetscMemcpy(xi,ajnew,(ainew[prow]+shift)*sizeof(int));
      PetscFree(ajnew);
      ajnew = xi;
      /* allocate a longer ajfill */
      xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi);
      PetscMemcpy(xi,ajfill,(ainew[prow]+shift)*sizeof(int));
      PetscFree(ajfill);
      ajfill = xi;
      realloc++;
    }
    xi          = ajnew + ainew[prow] + shift;
    flev        = ajfill + ainew[prow] + shift;
    dloc[prow]  = nzi;
    fm          = fill[n];
    while (nzf--) {
      *xi++   = fm - shift;
      *flev++ = im[fm];
      fm      = fill[fm];
    }
  }
  PetscFree(ajfill); 
  ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
  ierr = ISDestroy(isicol); CHKERRQ(ierr);
  PetscFree(fill); PetscFree(im);

  PLogInfo((PetscObject)A,
    "Info:MatILUFactorSymbolic_SeqBAIJ:Realloc %d Fill ratio:given %g needed %g\n",
                             realloc,f,((double)ainew[n])/((double)ai[prow]));

  /* put together the new matrix */
  ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr);
  b = (Mat_SeqBAIJ *) (*fact)->data;
  PetscFree(b->imax);
  b->singlemalloc = 0;
  len = (ainew[n] + shift)*sizeof(Scalar);
  /* the next line frees the default space generated by the Create() */
  PetscFree(b->a); PetscFree(b->ilen);
  b->a          = (Scalar *) PetscMalloc( len ); CHKPTRQ(b->a);
  b->j          = ajnew;
  b->i          = ainew;
  for ( i=0; i<n; i++ ) dloc[i] += ainew[i];
  b->diag       = dloc;
  b->ilen       = 0;
  b->imax       = 0;
  b->row        = isrow;
  b->col        = iscol;
  b->solve_work = (Scalar *) PetscMalloc( (n+1)*sizeof(Scalar)); 
  CHKPTRQ(b->solve_work);
  /* In b structure:  Free imax, ilen, old a, old j.  
     Allocate dloc, solve_work, new a, new j */
  PLogObjectMemory(*fact,(ainew[n]+shift-n) * (sizeof(int)+sizeof(Scalar)));
  b->maxnz          = b->nz = ainew[n] + shift;
  (*fact)->factor   = FACTOR_LU;
  return 0; 
}




