!  Program usage: mpiexec -n 1 chwirut1f [-help] [all TAO options]
!
!  Description:  This example demonstrates use of the TAO package to solve a
!  nonlinear least-squares problem on a single processor.  We minimize the
!  Chwirut function:
!       sum_{i=0}^{n/2-1} ( alpha*(x_{2i+1}-x_{2i}^2)^2 + (1-x_{2i})^2)
!
!  The C version of this code is chwirut1.c
!
module chwirut2fmodule
  use petscmpi              ! or mpi or mpi_f08
  use petsctao
#include <petsc/finclude/petsctao.h>
  PetscReal t(0:213)
  PetscReal y(0:213)
  PetscInt m, n
  PetscMPIInt nn
  PetscMPIInt rank
  PetscMPIInt size
  PetscMPIInt idle_tag, die_tag
  PetscMPIInt zero, one
  parameter(m=214)
  parameter(n=3)
  parameter(nn=n)
  parameter(idle_tag=2000)
  parameter(die_tag=3000)
  parameter(zero=0, one=1)
end module chwirut2fmodule

program main
  use chwirut2fmodule

! - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
!                   Variable declarations
! - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
!
!  See additional variable declarations in the file chwirut2f.h

  PetscErrorCode ierr    ! used to check for functions returning nonzeros
  Vec x       ! solution vector
  Vec f       ! vector of functions
  Tao ta     ! Tao context

!  Note: Any user-defined Fortran routines (such as FormGradient)
!  MUST be declared as external.

  external FormFunction

!  Initialize TAO and PETSc
  PetscCallA(PetscInitialize(ierr))
  PetscCallMPIA(MPI_Comm_size(PETSC_COMM_WORLD, size, ierr))
  PetscCallMPIA(MPI_Comm_rank(PETSC_COMM_WORLD, rank, ierr))

!  Initialize problem parameters
  call InitializeData()

  if (rank == 0) then
!  Allocate vectors for the solution and gradient
    PetscCallA(VecCreateSeq(PETSC_COMM_SELF, n, x, ierr))
    PetscCallA(VecCreateSeq(PETSC_COMM_SELF, m, f, ierr))

!     The TAO code begins here

!     Create TAO solver
    PetscCallA(TaoCreate(PETSC_COMM_SELF, ta, ierr))
    PetscCallA(TaoSetType(ta, TAOPOUNDERS, ierr))

!     Set routines for function, gradient, and hessian evaluation
    PetscCallA(TaoSetResidualRoutine(ta, f, FormFunction, 0, ierr))

!     Optional: Set initial guess
    call FormStartingPoint(x)
    PetscCallA(TaoSetSolution(ta, x, ierr))

!     Check for TAO command line options
    PetscCallA(TaoSetFromOptions(ta, ierr))
!     SOLVE THE APPLICATION
    PetscCallA(TaoSolve(ta, ierr))

!     Free TAO data structures
    PetscCallA(TaoDestroy(ta, ierr))

!     Free PETSc data structures
    PetscCallA(VecDestroy(x, ierr))
    PetscCallA(VecDestroy(f, ierr))
    PetscCallA(StopWorkers(ierr))

  else
    PetscCallA(TaskWorker(ierr))
  end if

  PetscCallA(PetscFinalize(ierr))
end

! --------------------------------------------------------------------
!  FormFunction - Evaluates the function f(X) and gradient G(X)
!
!  Input Parameters:
!  tao - the Tao context
!  X   - input vector
!  dummy - not used
!
!  Output Parameters:
!  f - function vector

subroutine FormFunction(ta, x, f, dummy, ierr)
  use chwirut2fmodule

  Tao ta
  Vec x, f
  PetscErrorCode ierr

  PetscInt i, checkedin
  PetscInt finished_tasks
  PetscMPIInt next_task
  PetscMPIInt status(MPI_STATUS_SIZE), tag, source
  PetscInt dummy

  PetscReal, pointer :: f_v(:), x_v(:)
  PetscReal fval(1)

  ierr = 0

!     Get pointers to vector data
  PetscCall(VecGetArrayRead(x, x_v, ierr))
  PetscCall(VecGetArray(f, f_v, ierr))

!     Compute F(X)
  if (size == 1) then
    ! Single processor
    do i = 1, m
      PetscCall(RunSimulation(x_v, i, f_v(i), ierr))
    end do
  else
    ! Multiprocessor main
    next_task = zero
    finished_tasks = 0
    checkedin = 0

    do while (finished_tasks < m .or. checkedin < size - 1)
      PetscCallMPI(MPI_Recv(fval, one, MPIU_SCALAR, MPI_ANY_SOURCE, MPI_ANY_TAG, PETSC_COMM_WORLD, status, ierr))
      tag = status(MPI_TAG)
      source = status(MPI_SOURCE)
      if (tag == IDLE_TAG) then
        checkedin = checkedin + 1
      else
        f_v(tag + 1) = fval(1)
        finished_tasks = finished_tasks + 1
      end if
      if (next_task < m) then
        ! Send task to worker
        PetscCallMPI(MPI_Send(x_v, nn, MPIU_SCALAR, source, next_task, PETSC_COMM_WORLD, ierr))
        next_task = next_task + one
      else
        ! Send idle message to worker
        PetscCallMPI(MPI_Send(x_v, nn, MPIU_SCALAR, source, IDLE_TAG, PETSC_COMM_WORLD, ierr))
      end if
    end do
  end if

!     Restore vectors
  PetscCall(VecRestoreArrayRead(x, x_v, ierr))
  PetscCall(VecRestoreArray(F, f_v, ierr))
end

subroutine FormStartingPoint(x)
  use chwirut2fmodule

  Vec x
  PetscReal, pointer :: x_v(:)
  PetscErrorCode ierr

  PetscCall(VecGetArray(x, x_v, ierr))
  x_v(1) = 0.15
  x_v(2) = 0.008
  x_v(3) = 0.01
  PetscCall(VecRestoreArray(x, x_v, ierr))
end

subroutine InitializeData()
  use chwirut2fmodule

  PetscInt i
  i = 0
  y(i) = 92.9000; t(i) = 0.5000; i = i + 1
  y(i) = 78.7000; t(i) = 0.6250; i = i + 1
  y(i) = 64.2000; t(i) = 0.7500; i = i + 1
  y(i) = 64.9000; t(i) = 0.8750; i = i + 1
  y(i) = 57.1000; t(i) = 1.0000; i = i + 1
  y(i) = 43.3000; t(i) = 1.2500; i = i + 1
  y(i) = 31.1000; t(i) = 1.7500; i = i + 1
  y(i) = 23.6000; t(i) = 2.2500; i = i + 1
  y(i) = 31.0500; t(i) = 1.7500; i = i + 1
  y(i) = 23.7750; t(i) = 2.2500; i = i + 1
  y(i) = 17.7375; t(i) = 2.7500; i = i + 1
  y(i) = 13.8000; t(i) = 3.2500; i = i + 1
  y(i) = 11.5875; t(i) = 3.7500; i = i + 1
  y(i) = 9.4125; t(i) = 4.2500; i = i + 1
  y(i) = 7.7250; t(i) = 4.7500; i = i + 1
  y(i) = 7.3500; t(i) = 5.2500; i = i + 1
  y(i) = 8.0250; t(i) = 5.7500; i = i + 1
  y(i) = 90.6000; t(i) = 0.5000; i = i + 1
  y(i) = 76.9000; t(i) = 0.6250; i = i + 1
  y(i) = 71.6000; t(i) = 0.7500; i = i + 1
  y(i) = 63.6000; t(i) = 0.8750; i = i + 1
  y(i) = 54.0000; t(i) = 1.0000; i = i + 1
  y(i) = 39.2000; t(i) = 1.2500; i = i + 1
  y(i) = 29.3000; t(i) = 1.7500; i = i + 1
  y(i) = 21.4000; t(i) = 2.2500; i = i + 1
  y(i) = 29.1750; t(i) = 1.7500; i = i + 1
  y(i) = 22.1250; t(i) = 2.2500; i = i + 1
  y(i) = 17.5125; t(i) = 2.7500; i = i + 1
  y(i) = 14.2500; t(i) = 3.2500; i = i + 1
  y(i) = 9.4500; t(i) = 3.7500; i = i + 1
  y(i) = 9.1500; t(i) = 4.2500; i = i + 1
  y(i) = 7.9125; t(i) = 4.7500; i = i + 1
  y(i) = 8.4750; t(i) = 5.2500; i = i + 1
  y(i) = 6.1125; t(i) = 5.7500; i = i + 1
  y(i) = 80.0000; t(i) = 0.5000; i = i + 1
  y(i) = 79.0000; t(i) = 0.6250; i = i + 1
  y(i) = 63.8000; t(i) = 0.7500; i = i + 1
  y(i) = 57.2000; t(i) = 0.8750; i = i + 1
  y(i) = 53.2000; t(i) = 1.0000; i = i + 1
  y(i) = 42.5000; t(i) = 1.2500; i = i + 1
  y(i) = 26.8000; t(i) = 1.7500; i = i + 1
  y(i) = 20.4000; t(i) = 2.2500; i = i + 1
  y(i) = 26.8500; t(i) = 1.7500; i = i + 1
  y(i) = 21.0000; t(i) = 2.2500; i = i + 1
  y(i) = 16.4625; t(i) = 2.7500; i = i + 1
  y(i) = 12.5250; t(i) = 3.2500; i = i + 1
  y(i) = 10.5375; t(i) = 3.7500; i = i + 1
  y(i) = 8.5875; t(i) = 4.2500; i = i + 1
  y(i) = 7.1250; t(i) = 4.7500; i = i + 1
  y(i) = 6.1125; t(i) = 5.2500; i = i + 1
  y(i) = 5.9625; t(i) = 5.7500; i = i + 1
  y(i) = 74.1000; t(i) = 0.5000; i = i + 1
  y(i) = 67.3000; t(i) = 0.6250; i = i + 1
  y(i) = 60.8000; t(i) = 0.7500; i = i + 1
  y(i) = 55.5000; t(i) = 0.8750; i = i + 1
  y(i) = 50.3000; t(i) = 1.0000; i = i + 1
  y(i) = 41.0000; t(i) = 1.2500; i = i + 1
  y(i) = 29.4000; t(i) = 1.7500; i = i + 1
  y(i) = 20.4000; t(i) = 2.2500; i = i + 1
  y(i) = 29.3625; t(i) = 1.7500; i = i + 1
  y(i) = 21.1500; t(i) = 2.2500; i = i + 1
  y(i) = 16.7625; t(i) = 2.7500; i = i + 1
  y(i) = 13.2000; t(i) = 3.2500; i = i + 1
  y(i) = 10.8750; t(i) = 3.7500; i = i + 1
  y(i) = 8.1750; t(i) = 4.2500; i = i + 1
  y(i) = 7.3500; t(i) = 4.7500; i = i + 1
  y(i) = 5.9625; t(i) = 5.2500; i = i + 1
  y(i) = 5.6250; t(i) = 5.7500; i = i + 1
  y(i) = 81.5000; t(i) = .5000; i = i + 1
  y(i) = 62.4000; t(i) = .7500; i = i + 1
  y(i) = 32.5000; t(i) = 1.5000; i = i + 1
  y(i) = 12.4100; t(i) = 3.0000; i = i + 1
  y(i) = 13.1200; t(i) = 3.0000; i = i + 1
  y(i) = 15.5600; t(i) = 3.0000; i = i + 1
  y(i) = 5.6300; t(i) = 6.0000; i = i + 1
  y(i) = 78.0000; t(i) = .5000; i = i + 1
  y(i) = 59.9000; t(i) = .7500; i = i + 1
  y(i) = 33.2000; t(i) = 1.5000; i = i + 1
  y(i) = 13.8400; t(i) = 3.0000; i = i + 1
  y(i) = 12.7500; t(i) = 3.0000; i = i + 1
  y(i) = 14.6200; t(i) = 3.0000; i = i + 1
  y(i) = 3.9400; t(i) = 6.0000; i = i + 1
  y(i) = 76.8000; t(i) = .5000; i = i + 1
  y(i) = 61.0000; t(i) = .7500; i = i + 1
  y(i) = 32.9000; t(i) = 1.5000; i = i + 1
  y(i) = 13.8700; t(i) = 3.0000; i = i + 1
  y(i) = 11.8100; t(i) = 3.0000; i = i + 1
  y(i) = 13.3100; t(i) = 3.0000; i = i + 1
  y(i) = 5.4400; t(i) = 6.0000; i = i + 1
  y(i) = 78.0000; t(i) = .5000; i = i + 1
  y(i) = 63.5000; t(i) = .7500; i = i + 1
  y(i) = 33.8000; t(i) = 1.5000; i = i + 1
  y(i) = 12.5600; t(i) = 3.0000; i = i + 1
  y(i) = 5.6300; t(i) = 6.0000; i = i + 1
  y(i) = 12.7500; t(i) = 3.0000; i = i + 1
  y(i) = 13.1200; t(i) = 3.0000; i = i + 1
  y(i) = 5.4400; t(i) = 6.0000; i = i + 1
  y(i) = 76.8000; t(i) = .5000; i = i + 1
  y(i) = 60.0000; t(i) = .7500; i = i + 1
  y(i) = 47.8000; t(i) = 1.0000; i = i + 1
  y(i) = 32.0000; t(i) = 1.5000; i = i + 1
  y(i) = 22.2000; t(i) = 2.0000; i = i + 1
  y(i) = 22.5700; t(i) = 2.0000; i = i + 1
  y(i) = 18.8200; t(i) = 2.5000; i = i + 1
  y(i) = 13.9500; t(i) = 3.0000; i = i + 1
  y(i) = 11.2500; t(i) = 4.0000; i = i + 1
  y(i) = 9.0000; t(i) = 5.0000; i = i + 1
  y(i) = 6.6700; t(i) = 6.0000; i = i + 1
  y(i) = 75.8000; t(i) = .5000; i = i + 1
  y(i) = 62.0000; t(i) = .7500; i = i + 1
  y(i) = 48.8000; t(i) = 1.0000; i = i + 1
  y(i) = 35.2000; t(i) = 1.5000; i = i + 1
  y(i) = 20.0000; t(i) = 2.0000; i = i + 1
  y(i) = 20.3200; t(i) = 2.0000; i = i + 1
  y(i) = 19.3100; t(i) = 2.5000; i = i + 1
  y(i) = 12.7500; t(i) = 3.0000; i = i + 1
  y(i) = 10.4200; t(i) = 4.0000; i = i + 1
  y(i) = 7.3100; t(i) = 5.0000; i = i + 1
  y(i) = 7.4200; t(i) = 6.0000; i = i + 1
  y(i) = 70.5000; t(i) = .5000; i = i + 1
  y(i) = 59.5000; t(i) = .7500; i = i + 1
  y(i) = 48.5000; t(i) = 1.0000; i = i + 1
  y(i) = 35.8000; t(i) = 1.5000; i = i + 1
  y(i) = 21.0000; t(i) = 2.0000; i = i + 1
  y(i) = 21.6700; t(i) = 2.0000; i = i + 1
  y(i) = 21.0000; t(i) = 2.5000; i = i + 1
  y(i) = 15.6400; t(i) = 3.0000; i = i + 1
  y(i) = 8.1700; t(i) = 4.0000; i = i + 1
  y(i) = 8.5500; t(i) = 5.0000; i = i + 1
  y(i) = 10.1200; t(i) = 6.0000; i = i + 1
  y(i) = 78.0000; t(i) = .5000; i = i + 1
  y(i) = 66.0000; t(i) = .6250; i = i + 1
  y(i) = 62.0000; t(i) = .7500; i = i + 1
  y(i) = 58.0000; t(i) = .8750; i = i + 1
  y(i) = 47.7000; t(i) = 1.0000; i = i + 1
  y(i) = 37.8000; t(i) = 1.2500; i = i + 1
  y(i) = 20.2000; t(i) = 2.2500; i = i + 1
  y(i) = 21.0700; t(i) = 2.2500; i = i + 1
  y(i) = 13.8700; t(i) = 2.7500; i = i + 1
  y(i) = 9.6700; t(i) = 3.2500; i = i + 1
  y(i) = 7.7600; t(i) = 3.7500; i = i + 1
  y(i) = 5.4400; t(i) = 4.2500; i = i + 1
  y(i) = 4.8700; t(i) = 4.7500; i = i + 1
  y(i) = 4.0100; t(i) = 5.2500; i = i + 1
  y(i) = 3.7500; t(i) = 5.7500; i = i + 1
  y(i) = 24.1900; t(i) = 3.0000; i = i + 1
  y(i) = 25.7600; t(i) = 3.0000; i = i + 1
  y(i) = 18.0700; t(i) = 3.0000; i = i + 1
  y(i) = 11.8100; t(i) = 3.0000; i = i + 1
  y(i) = 12.0700; t(i) = 3.0000; i = i + 1
  y(i) = 16.1200; t(i) = 3.0000; i = i + 1
  y(i) = 70.8000; t(i) = .5000; i = i + 1
  y(i) = 54.7000; t(i) = .7500; i = i + 1
  y(i) = 48.0000; t(i) = 1.0000; i = i + 1
  y(i) = 39.8000; t(i) = 1.5000; i = i + 1
  y(i) = 29.8000; t(i) = 2.0000; i = i + 1
  y(i) = 23.7000; t(i) = 2.5000; i = i + 1
  y(i) = 29.6200; t(i) = 2.0000; i = i + 1
  y(i) = 23.8100; t(i) = 2.5000; i = i + 1
  y(i) = 17.7000; t(i) = 3.0000; i = i + 1
  y(i) = 11.5500; t(i) = 4.0000; i = i + 1
  y(i) = 12.0700; t(i) = 5.0000; i = i + 1
  y(i) = 8.7400; t(i) = 6.0000; i = i + 1
  y(i) = 80.7000; t(i) = .5000; i = i + 1
  y(i) = 61.3000; t(i) = .7500; i = i + 1
  y(i) = 47.5000; t(i) = 1.0000; i = i + 1
  y(i) = 29.0000; t(i) = 1.5000; i = i + 1
  y(i) = 24.0000; t(i) = 2.0000; i = i + 1
  y(i) = 17.7000; t(i) = 2.5000; i = i + 1
  y(i) = 24.5600; t(i) = 2.0000; i = i + 1
  y(i) = 18.6700; t(i) = 2.5000; i = i + 1
  y(i) = 16.2400; t(i) = 3.0000; i = i + 1
  y(i) = 8.7400; t(i) = 4.0000; i = i + 1
  y(i) = 7.8700; t(i) = 5.0000; i = i + 1
  y(i) = 8.5100; t(i) = 6.0000; i = i + 1
  y(i) = 66.7000; t(i) = .5000; i = i + 1
  y(i) = 59.2000; t(i) = .7500; i = i + 1
  y(i) = 40.8000; t(i) = 1.0000; i = i + 1
  y(i) = 30.7000; t(i) = 1.5000; i = i + 1
  y(i) = 25.7000; t(i) = 2.0000; i = i + 1
  y(i) = 16.3000; t(i) = 2.5000; i = i + 1
  y(i) = 25.9900; t(i) = 2.0000; i = i + 1
  y(i) = 16.9500; t(i) = 2.5000; i = i + 1
  y(i) = 13.3500; t(i) = 3.0000; i = i + 1
  y(i) = 8.6200; t(i) = 4.0000; i = i + 1
  y(i) = 7.2000; t(i) = 5.0000; i = i + 1
  y(i) = 6.6400; t(i) = 6.0000; i = i + 1
  y(i) = 13.6900; t(i) = 3.0000; i = i + 1
  y(i) = 81.0000; t(i) = .5000; i = i + 1
  y(i) = 64.5000; t(i) = .7500; i = i + 1
  y(i) = 35.5000; t(i) = 1.5000; i = i + 1
  y(i) = 13.3100; t(i) = 3.0000; i = i + 1
  y(i) = 4.8700; t(i) = 6.0000; i = i + 1
  y(i) = 12.9400; t(i) = 3.0000; i = i + 1
  y(i) = 5.0600; t(i) = 6.0000; i = i + 1
  y(i) = 15.1900; t(i) = 3.0000; i = i + 1
  y(i) = 14.6200; t(i) = 3.0000; i = i + 1
  y(i) = 15.6400; t(i) = 3.0000; i = i + 1
  y(i) = 25.5000; t(i) = 1.7500; i = i + 1
  y(i) = 25.9500; t(i) = 1.7500; i = i + 1
  y(i) = 81.7000; t(i) = .5000; i = i + 1
  y(i) = 61.6000; t(i) = .7500; i = i + 1
  y(i) = 29.8000; t(i) = 1.7500; i = i + 1
  y(i) = 29.8100; t(i) = 1.7500; i = i + 1
  y(i) = 17.1700; t(i) = 2.7500; i = i + 1
  y(i) = 10.3900; t(i) = 3.7500; i = i + 1
  y(i) = 28.4000; t(i) = 1.7500; i = i + 1
  y(i) = 28.6900; t(i) = 1.7500; i = i + 1
  y(i) = 81.3000; t(i) = .5000; i = i + 1
  y(i) = 60.9000; t(i) = .7500; i = i + 1
  y(i) = 16.6500; t(i) = 2.7500; i = i + 1
  y(i) = 10.0500; t(i) = 3.7500; i = i + 1
  y(i) = 28.9000; t(i) = 1.7500; i = i + 1
  y(i) = 28.9500; t(i) = 1.7500; i = i + 1

end

subroutine TaskWorker(ierr)
  use chwirut2fmodule

  PetscErrorCode ierr
  PetscReal x(n), f(1)
  PetscMPIInt tag
  PetscInt index
  PetscMPIInt status(MPI_STATUS_SIZE)

  tag = IDLE_TAG
  f = 0.0
  ! Send check-in message to rank-0
  PetscCallMPI(MPI_Send(f, one, MPIU_SCALAR, zero, IDLE_TAG, PETSC_COMM_WORLD, ierr))
  do while (tag /= DIE_TAG)
    PetscCallMPI(MPI_Recv(x, nn, MPIU_SCALAR, zero, MPI_ANY_TAG, PETSC_COMM_WORLD, status, ierr))
    tag = status(MPI_TAG)
    if (tag == IDLE_TAG) then
      PetscCallMPI(MPI_Send(f, one, MPIU_SCALAR, zero, IDLE_TAG, PETSC_COMM_WORLD, ierr))
    else if (tag /= DIE_TAG) then
      index = tag
      ! Compute local part of residual
      PetscCall(RunSimulation(x, index, f(1), ierr))

      ! Return residual to rank-0
      PetscCallMPI(MPI_Send(f, one, MPIU_SCALAR, zero, tag, PETSC_COMM_WORLD, ierr))
    end if
  end do
  ierr = 0
end

subroutine RunSimulation(x, i, f, ierr)
  use chwirut2fmodule

  PetscReal x(n), f
  PetscInt i
  PetscErrorCode ierr
  f = y(i) - exp(-x(1)*t(i))/(x(2) + x(3)*t(i))
  ierr = 0
end

subroutine StopWorkers(ierr)
  use chwirut2fmodule

  integer checkedin
  PetscMPIInt status(MPI_STATUS_SIZE)
  PetscMPIInt source
  PetscReal f(1), x(n)
  PetscErrorCode ierr
  PetscInt i

  checkedin = 0
  do while (checkedin < size - 1)
    PetscCallMPI(MPI_Recv(f, one, MPIU_SCALAR, MPI_ANY_SOURCE, MPI_ANY_TAG, PETSC_COMM_WORLD, status, ierr))
    checkedin = checkedin + 1
    source = status(MPI_SOURCE)
    do i = 1, n
      x(i) = 0.0
    end do
    PetscCallMPI(MPI_Send(x, nn, MPIU_SCALAR, source, DIE_TAG, PETSC_COMM_WORLD, ierr))
  end do
  ierr = 0
end

!/*TEST
!
!   build:
!      requires: !complex
!
!   test:
!      nsize: 3
!      args: -tao_monitor_short -tao_max_it 100 -tao_type pounders -tao_gatol 1.e-5
!      requires: !single
!
!
!TEST*/
