program mpi2

  use DQMC_SHEET
  use DQMC_HUBBARD
  use DQMC_CONFIG
  use DQMC_GTAU
  
  implicit none
  include 'mpif.h'

  integer, parameter  :: nSimulator = 10

  ! variables
  type(config)  :: cfg
  type(Hubbard) :: Hub                     ! Hubbard model
  type(GTAU)    :: tau
  integer       :: nx, ny, nz, n, L, nxy, nzl
  integer       :: nBin, nIter, ntausk
  integer       :: iBin, cnt, avg, err
  integer       :: i, j, k
  real(wp), pointer :: Gup(:,:), Gdn(:,:), Gavg(:,:), sgn(:)
  real(wp)          :: tt1, tt2
  real(wp), pointer  :: Aup(:,:), Adn(:,:)
  integer       :: fdebug, stat
  character(50) :: fname
  character(8)  :: date
  character(10) :: time
  integer       :: fhandle

  ! var for mpi
  integer       :: g_rank, aggr_rank
  integer       :: g_ntask
  integer       :: status(MPI_STATUS_SIZE)
  integer       :: orig_group, aggr_group 
  integer       :: aggr_comm
  integer       :: ranks(nSimulator)
  integer       :: simSize                 ! no of nodes in a simulator
  integer       :: aggregator 
  real, pointer :: data(:,:)
  integer       :: ierr, rc
  integer       :: nData, nG, nClass
  

  ! --------------- !
  ! Config MPI      !
  ! --------------- !
  ! Initialize MPI
  call MPI_INIT(ierr)
  if (ierr .ne. MPI_SUCCESS) goto 1
   
  call MPI_COMM_RANK(MPI_COMM_WORLD, g_rank, ierr)
  call MPI_COMM_SIZE(MPI_COMM_WORLD, g_ntask, ierr)
  
  ! We use static node mapping here. change it to dynamic later.
  
  call MPI_COMM_GROUP(MPI_COMM_WORLD, orig_group, ierr)

  ! fill in aggr_rank
  simSize = g_ntask / nSimulator
  ranks(1) = 0
  do i = 2, nSimulator
     ranks(i) = ranks(i-1) + simSize
  end do
  
  aggr_rank = -1
  aggregator = 0
  if (mod(g_rank,simSize) .eq. 0) then
     call MPI_GROUP_INCL(orig_group, nSimulator, ranks, aggr_group, ierr)
     call MPI_COMM_CREATE(MPI_COMM_WORLD, aggr_group, aggr_comm, ierr)
     call MPI_GROUP_RANK(aggr_group, aggr_rank, ierr)  
  endif

  if (ierr .ne. MPI_SUCCESS) goto 1

  ! --------------- !
  ! Configuration   !
  ! --------------- !
  tt1 = MPI_WTIME()
  ! read parameters from config file
  call Get_command_Argument(1, fName)
  fhandle = 14
  open (unit = fhandle, file = fName)
  call DQMC_Config_Read(cfg, fhandle, CONFIG_CONFIG)

  ! Initialize the geometry
  nx = CFG_Get(cfg, PARAM_NX)
  ny = CFG_Get(cfg, PARAM_NY)
  nz = CFG_Get(cfg, PARAM_NZ)
  nxy = nx*ny
  n  = nxy*nz
  call DQMC_Init_Sheet(nx, ny, nz, Hub%S)  

  ! Read HSF from a file
  call Get_command_Argument(2, fName)
  fhandle = 30
  open (unit = fhandle, file = fName)
  call CFG_Set(cfg, PARAM_HSF, fhandle)

  ! Initialize the rest data
  call DQMC_Hub_Config(Hub, cfg)
  Hub%meas2 = .false.
  print *, "Machine ", g_rank, ", random seed = ", Hub%idum

  L      = Hub%L
  nClass = Hub%S%nClass
  nG     = (L+1)*nz
  nData  = P0_N + 5*nClass + 3*nG + 1

  call DQMC_Gtau_Init(n, L, TAU_T0, &
       Hub%SB%nOrth, Hub%G_up%nWrap, tau, Hub%B, Hub%WS)

  if (g_rank .eq. aggregator) then
     nBin   = nSimulator
  else
     nBin   = 1
  end if
  iBin   = 1
  cnt    = 0
  avg    = nBin + 1
  err    = nBin + 2

  ! Initialize meas data
  nzl = nz*L+nz
#ifdef _BIGG
  allocate(Adn(n*L,n*L))
  allocate(Aup(n*L,n*L))
#endif

  allocate(Gup (nzl, nBin+2))
  allocate(Gdn (nzl, nBin+2))
  allocate(Gavg(nzl, nBin+2))
  allocate(sgn(nBin+2))

  Gup  = ZERO
  Gdn  = ZERO
  Gavg = ZERO
  sgn  = ZERO

#ifdef _DEBUG1
  fdebug = 30
  call date_and_time(date,time)
  write(fname,*) date//time(1:6)//".sheet.log"
  open (unit=fdebug, iostat=stat, file=fname, status='new', action='write')
  write(fdebug,*) "Debug file for sheet code at ", time(1:6), " on ", date
#endif

  ! ---------- !
  ! Execution  !
  ! ---------- !
  ! Execution MC loop
  ! Warmup sweep

!  do i = 1, Hub%nWarm
     ! The second parameter means no measurement should be made.
!     call DQMC_Hub_Sweep(Hub, NO_MEAS0)
!  end do
  
  ! We divide all the measurement into nBin,
  ! each having nPass/nBin pass.
  ntausk = CFG_Get(cfg, PARAM_tausk)
  nIter  = Hub%nPass/ntausk
  do j = 1, nIter
     do k = 1, ntausk
        call DQMC_Hub_Sweep(Hub, Hub%nMeas)
     end do

#ifdef _BIGG
     call MeasureWithBigG()
#else
     call Measure()
#endif

  end do
     ! Accumulate results for each bin
  call Average() 
  call DQMC_Phy0_Avg(Hub%P0)

  ! -------------- !
  ! Postprocessing !
  ! -------------- !
  call Bundle_data

  if (aggr_rank .ge. 0) then
     call MPI_GATHER(data(:,1), nData, MPI_REAL, data(:,1:nSimulator), &
          nData, MPI_REAL, aggregator, aggr_COMM, ierr)
     if (ierr .ne. MPI_SUCCESS) goto 1
  end if
  
  if (g_rank .eq. aggregator) then
     call Unpack_data
     call GetError()
     
     call DQMC_Phy0_GetErr(Hub%P0)
     call DQMC_Hub_OutputParam(Hub, STDOUT)
     write(STDOUT, FMT_DBLINE) 
     call DQMC_Phy0_Print(Hub%P0, Hub%S, STDOUT)
     write(STDOUT, FMT_DBLINE)
     call PrintOut(STDOUT, Gup,  "sheet, tau, Gup (sheet, tau) +- error")
     call PrintOut(STDOUT, Gdn,  "sheet, tau, Gdn (sheet, tau) +- error")
     call PrintOut(STDOUT, Gavg, "sheet, tau, Gavg(sheet, tau) +- error")
     tt2 = MPI_WTIME()
     write(STDOUT, *) "Execution time=", tt2-tt1
  end if

  ! Clean up the used storage
  call DQMC_Hub_Free(Hub)
  call DQMC_Config_Free(cfg)

#ifdef _DEBUG1
  close(fdebug)
#endif

  call MPI_FINALIZE(ierr)

  if (ierr .eq. MPI_SUCCESS) stop

  ! Error handler
1 print *,'Error starting MPI program. Terminating.'
  call MPI_ABORT(MPI_COMM_WORLD, rc, ierr)

2 print *, "Error in reading config file", g_rank

contains

  ! -------------------- !
  ! supporting functios  !
  ! ---------------------!

  ! ----------------------------------------------------------------- !

  subroutine MeasureWithBigG()
    ! Purpose
    ! =======
    !    compute G(i, tau) for time dependent G_up and down
    !
    ! ... Local variables ...
    integer :: li, lj, dl, j, k, gi, idx
    real(wp) ::factor, s1(4), s2(4)
    real(wp), pointer :: up(:,:), dn(:,:)

    ! ... Executable ...
    ! compute the first row of G tau

    factor = ONE/nxy/L
    Gup(:,avg) = ZERO
    Gdn(:,avg) = ZERO
    Gavg(:,avg) = ZERO
    
    call DQMC_Gtau_Big(tau, Aup, Adn, Hub%G_up, Hub%G_dn)

    s1 = ZERO
    s2 = ZERO

    up => Aup(1:n,n+1:2*n)
    dn => Adn(1:n,n+1:2*n)
    idx = 1
    do j = 1, nz
       ! each site
       do k = 1, nxy
          s1(j)  = s1(j)  + up(idx,idx)
          s2(j)  = s2(j)  + dn(idx,idx)
          idx = idx + 1
       end do
    end do
    
    write(*,"(8f12.8)") s1+s2


    ! For each time slice
    do li = 0, L-1
       do lj = 0, L-1
          up => Aup(n*lj+1:n*(lj+1),n*li+1:n*(li+1)) 
          dn => Adn(n*lj+1:n*(lj+1),n*li+1:n*(li+1))
          dl = mod(L+li-lj,L)+1
          !          print *, n*lj+1, n*li+1, dl
          ! in each submatrix
          ! for each layer
          idx = 1
          do j = 1, nz
             gi = dl + (j-1)*L
             ! each site
             do k = 1, nxy
                Gup(gi, avg)  = Gup(gi, avg)  + up(idx,idx)
                Gdn(gi, avg)  = Gdn(gi, avg)  + dn(idx,idx)
                idx = idx + 1
             end do
          end do
       end do
    end do

    Gup(:, iBin)  = Gup(:, iBin)  + Gup(:,avg)*factor
    Gdn(:, iBin)  = Gdn(:, iBin)  + Gdn(:,avg)*factor
    factor = factor/2
    Gavg(:, iBin) = Gavg(:, iBin) +  (Gup(:,avg)+Gdn(:,avg))*factor
    cnt = cnt + 1
    
  end subroutine MeasureWithBigG

  ! ----------------------------------------------------------------- !

  subroutine Measure()
    ! Purpose
    ! =======
    !    compute G(i, tau) for time dependent G_up and down
    !
    ! ... Local variables ...
    integer :: i, j, k, gi, idx, L1
    real(wp) ::factor

    ! ... Executable ...
    ! compute the first row of G tau

    L1 = L + 1
    sgn(avg) = Hub%G_up%sgn*Hub%G_dn%sgn
    factor = sgn(avg)/nxy
    Gup(:,avg) = ZERO
    Gdn(:,avg) = ZERO
    Gavg(:,avg) = ZERO

    ! For each time slice
    do i = 1, L
       call DQMC_MakeGtau(tau, Hub%G_up, Hub%G_dn, 1, i-1)
       idx = 0
       ! For each sheet j
       do j = 1, nz
          gi = i+(j-1)*L1
          ! For each site k on sheet j 
          do k = idx+1, idx+nxy
             Gup(gi, avg)  = Gup(gi, avg)  + tau%upt0(k, k)
             Gdn(gi, avg)  = Gdn(gi, avg)  + tau%dnt0(k, k)
          end do
          idx = idx + nxy
       end do
    end do

    ! special case for L+1
    do i = L1, nzl, L1
       Gup(i,avg) = Gup(i,avg)  + nxy - Gup(i-L1+1,avg)
       Gdn(i,avg) = Gdn(i,avg)  + nxy - Gdn(i-L1+1,avg)
    end do

#ifdef _DEBUG1
    idx = 1
    write(fdebug, FMT_DBLINE)
    do i = 1, nz
       write(fdebug, "('Layer',i3,'       G_up           G_dn ')") i
       do j = 0, L
          write(fdebug, "(i8,2f15.10)") j, Gup(idx,avg), Gdn(idx,avg)
          idx = idx + 1
       end do
    end do
#endif

    ! average
    Gup(:, iBin)  = Gup(:, iBin)  + Gup(:,avg)*factor
    Gdn(:, iBin)  = Gdn(:, iBin)  + Gdn(:,avg)*factor
    factor = factor*HALF
    Gavg(:, iBin) = Gavg(:, iBin) +  (Gup(:,avg)+Gdn(:,avg))*factor

    sgn(iBin)     = sgn(iBin) + sgn(avg)
    cnt = cnt + 1
    
  end subroutine Measure

  ! ----------------------------------------------------------------- !

  subroutine Average()
    ! Purpose
    ! =======
    !    partially averge Green's function
    !
    ! ... Local variables ...
    real(wp) :: factor

    ! ... Executable ...
    
    ! average all the values
    factor = ONE/cnt
    Gup (:, iBin) = Gup (:, iBin)*factor
    Gdn (:, iBin) = Gdn (:, iBin)*factor
    Gavg(:, iBin) = Gavg(:, iBin)*factor
    sgn (ibin)    = sgn(iBin)*factor

    ! reset counter and adavence to next bin
    cnt = 0
    iBin = iBin + 1

  end subroutine Average

  ! ----------------------------------------------------------------- !

  subroutine GetError()
    ! Purpose
    ! =======
    !    Compute std err and average
    !
    ! ... Local variables ...
    integer  :: i
    real(wp) :: data(nBin)
    real(wp) :: sum_sgn, z(nBin), y(nBin)

    ! ... Executable ...
    
    call DQMC_JackKnife(nBin, sgn(avg), sgn(err), sgn(1:nBin), &
         y, z, sum_sgn)

    do i = 1, nzl
       ! average all the values
       call DQMC_SignJackKnife(nBin, Gup(i, avg), Gup(i, err), &
            Gup(i, 1:nBin), y, z, sum_sgn)
       call DQMC_SignJackKnife(nBin, Gdn(i, avg), Gdn(i, err), &
            Gdn(i, 1:nBin), y, z, sum_sgn)
       call DQMC_SignJackKnife(nBin, Gavg(i, avg), Gavg(i, err), &
            Gavg(i, 1:nBin), y, z, sum_sgn)
    end do

  end subroutine GetError

  
  ! ----------------------------------------------------------------- !
  
  subroutine PrintOut(OPT, G, title)
    ! Purpose
    ! =======
    !    Print out values
    !
    ! Argument
    ! ========
    integer, intent(in)  :: OPT
    real(wp), intent(in) :: G(:,:)
    character(*), intent(in) :: title

    ! ... Local variables ...
    integer :: i, j, idx
    character(*), parameter :: OPTFMT = "(i5,i5,f20.16,'   +- ', f20.16)"

    write(OPT,*) title
    idx = 1
    do j = 1, nz
       do i = 0, L
          write(OPT,OPTFMT) j, i, G(idx,avg), G(idx,err)
          idx = idx + 1
       end do
    end do
    write(STDOUT, FMT_DBLINE)

  end subroutine PrintOut
  
  ! ----------------------------------------------------------------- !

  subroutine Bundle_data
    ! 
    ! This subroutine bundles data into an array.
    !
    ! ... Local variables...
    integer :: i, offset

    ! ... Executable ...

    ! allocate space for data transmition
    if (g_rank .eq. aggregator) then
       allocate(data(nData, nSimulator+2))
    else
       allocate(data(nData, 1))
    end if

    data = 0.0E0
    ! marshal data into array
    nClass = Hub%S%nClass
    data(1:P0_N,1) = Hub%P0%S(1:P0_N,1)
    offset = P0_N
    data(1+offset:nClass+offset, 1) =  Hub%P0%G_fun(1:nClass,1)
    offset = offset + nClass
    data(1+offset:nClass+offset, 1) =  Hub%P0%Den0(1:nClass,1)
    offset = offset + nClass
    data(1+offset:nClass+offset, 1) =  Hub%P0%Den1(1:nClass,1)
    offset = offset + nClass
    data(1+offset:nClass+offset, 1) =  Hub%P0%SpinXX(1:nClass,1)
    offset = offset + nClass
    data(1+offset:nClass+offset, 1) =  Hub%P0%SpinZZ(1:nClass,1)

    ! unequal time measurements
    offset = offset + nClass
    data(1+offset:nG+offset, 1) = Gup(1:nG, 1)
    offset = offset + nG
    data(1+offset:nG+offset, 1) = Gdn(1:nG, 1)
    offset = offset + nG
    data(1+offset:nG+offset, 1) = Gavg(1:nG, 1)
    offset = offset + nG
    data(1+offset, 1)           = sgn(1)

  end subroutine Bundle_data

  ! ----------------------------------------------------------------- !

  subroutine Unpack_data
    ! 
    ! This subroutine bundles data into an array.
    !
    ! ... Local variables...
    integer :: i, offset, nS

    ! ... Executable ...
    nS = nSimulator

    ! allocate space for data transmition

    ! marshal data into array
    Hub%P0%S(1:P0_N,1:nS)        = data(1:P0_N,1:nS)
    offset = P0_N
    Hub%P0%G_fun(1:nClass,1:nS)  = data(1+offset:nClass+offset, 1:nS) 
    offset = offset + nClass
    Hub%P0%Den0(1:nClass,1:nS)   = data(1+offset:nClass+offset, 1:nS)
    offset = offset + nClass
    Hub%P0%Den1(1:nClass,1:nS)   = data(1+offset:nClass+offset, 1:nS) 
    offset = offset + nClass
    Hub%P0%SpinXX(1:nClass,1:nS) = data(1+offset:nClass+offset, 1:nS)
    offset = offset + nClass
    Hub%P0%SpinZZ(1:nClass,1:nS) = data(1+offset:nClass+offset, 1:nS)

    ! unequal time measurements
    offset = offset + nClass
    Gup(1:nG, 1:nS)    = data(1+offset:nG+offset, 1:nS)
    offset = offset + nG
    Gdn(1:nG, 1:nS)    = data(1+offset:nG+offset, 1:nS)
    offset = offset + nG
    Gavg(1:nG, 1:nS)   = data(1+offset:nG+offset, 1:nS)
    offset = offset + nG
    sgn(1:nS)          = data(1+offset, 1:nS)

  end subroutine Unpack_data

  ! ----------------------------------------------------------------- !

end program mpi2
