module DQMC_GTAU
#include "dqmc_include.h"

  use DQMC_UTIL
  use DQMC_WSPACE
  use _DQMC_MATB
  use DQMC_SEQB
  use DQMC_GFUN

  implicit none 
  
  !
  ! This module is designed for the computation of time dependent 
  ! measurement (TDM), which requires the unequal time Green's 
  ! function Gup_tau (Gdn_tau).
  ! Mathematically, Gup_tau(Gdn_tau) is the inverse of the following
  ! matrix.
  !
  !                  [  I                   B_L] 
  !                  [-B_1   I                 ]
  !              M = [     -B_2  I             ]
  !                  [          ...   ...      ]
  !                  [             -B_{L-1}  I ]
  !                   
  !
  ! The current implementation only considers to return the ith row
  ! of Gup_tau (Gdn_tau). Let Gup_ij be the (i,j) block matrix of 
  ! Gup_tau.
  !
  !     Gup_ii = inv(I+B_iB_{i-1}...B_1B_L...B_{i+1})   
  !     Gup_ij = -Gup_ii(B_iB_{i-1}...B_1B_L...B_{j+1})  for j = i...L  
  !     Gup_ij =  Gup_ii(B_iB_{i-1}...B_1B_L...B_{j+1})  for j = 1...i-1  
  !
  ! The corresponding Gdn_ij has the same structure.
  !
  ! [1] Z. Bai, W.Chen, R. Scalettar, I. Yamazaki, "Lecture Notes 
  !     on Advances of Numerical Methods for Hubbard Quantum Monte
  !     Carlo Simulation." 
  !
  type Gtau
     integer  :: n                           ! dimension of G_j
     integer  :: L                           ! number of block columns
     integer  :: ii, ib                      ! index of the block
     integer  :: sfc                         ! safe count
     integer  :: nWrap                       ! safe wrapping
     integer  :: which                       ! which part of gtau
                                             ! should be perfomed

     ! working space for constructing unequal time Green's function
     real(wp), pointer :: upt0(:,:)          ! used for UDT decomp
     real(wp), pointer :: up0t(:,:)
     real(wp), pointer :: dnt0(:,:)
     real(wp), pointer :: dn0t(:,:)
     real(wp), pointer :: U(:,:)             ! used in UDT decomp
     real(wp), pointer :: T(:,:)
     real(wp), pointer :: D(:) 
     real(wp), pointer :: v1(:), v2(:)       ! used in G computing
     real(wp), pointer :: v3(:), v4(:)
     real(wp), pointer :: W1(:,:)

     type(SeqB) :: SB1_up, SB2_up, SB1_dn, SB2_dn
     type(MatB), pointer :: B_up, B_dn
     
     real(wp) :: sgnup, sgndn

  end type Gtau
  
  integer, parameter :: TAU_T0   =  0   ! Column
  integer, parameter :: TAU_BOTH =  1   ! column and row
  integer, parameter :: TAU_0T   =  2   ! ROW

  integer, parameter :: TAU_UP   =  1   ! Spin up
  integer, parameter :: TAU_DN   = -1  ! Spin down

contains

  ! Subroutines
  ! ==================================================================

  subroutine DQMC_Gtau_Init(n, L, which, nOrth, nWrap, tau, B_up, B_dn, WS)
    !
    ! Purpose
    ! =======
    !    This subroutine initializes Phy2.
    !
    ! Arguments
    ! =========
    !
    integer, intent(in)       :: n, L
    integer, intent(in)       :: which   ! 
    integer, intent(in)       :: nOrth
    integer, intent(in)       :: nWrap
    type(Gtau), intent(inout) :: tau     ! time dependent measurement
    type(WSpace), intent(in)  :: WS
    type(MatB), target, intent(in)    :: B_up
    type(MatB), target, intent(in)    :: B_dn

    ! ... Executable ...

    tau%n     = n
    tau%L     = L
    tau%which = which
    tau%nWrap = nWrap
    tau%sfc   = 0
    tau%ii    = 0
    tau%ib    = 0

    ! Allocate storages
    allocate(tau%upt0(n,n))
    allocate(tau%dnt0(n,n))
    allocate(tau%up0t(n,n))
    allocate(tau%dn0t(n,n))
    allocate(tau%U(n,n))
    allocate(tau%T(n,n))
    allocate(tau%D(n))
    allocate(tau%v1(n))
    allocate(tau%v2(n))
    allocate(tau%v3(n))
    allocate(tau%v4(n))

    call DQMC_SeqB_Init(n, L, nOrth, B_up, tau%SB1_up, WS)
    call DQMC_SeqB_Init2(n, L, nOrth, B_up, &
         tau%SB2_up, tau%U, tau%D, tau%T, WS)

    call DQMC_SeqB_Init(n, L, nOrth, B_dn, tau%SB1_dn, WS)
    call DQMC_SeqB_Init2(n, L, nOrth, B_dn, &
         tau%SB2_dn, tau%U, tau%D, tau%T, WS)

    tau%SB2_up%piv => WS%I2
    tau%SB2_dn%piv => WS%I2
    tau%W1         => WS%R1
    tau%B_up       => B_up
    tau%B_dn       => B_dn

  end subroutine DQMC_Gtau_Init

  !--------------------------------------------------------------------!
  
  subroutine DQMC_Gtau_Init2(n, L, which, nOrth, nWrap, tau, B_up, B_dn, WS)
    !
    ! Purpose
    ! =======
    !    This subroutine initializes Phy2.
    !
    ! Arguments
    ! =========
    !
    integer, intent(in)       :: n, L
    integer, intent(in)       :: which   ! 
    integer, intent(in)       :: nOrth
    integer, intent(in)       :: nWrap
    type(Gtau), intent(inout) :: tau     ! time dependent measurement
    type(WSpace), intent(in)  :: WS
    type(MatB), target, intent(in)    :: B_up
    type(MatB), target, intent(in)    :: B_dn

    integer :: dum

    ! ... Executable ...

    tau%n     = n
    tau%L     = L
    tau%which = which
    tau%nWrap = north
    tau%sfc   = 0
    tau%ii    = 0
    tau%ib    = 0
    dum       = nwrap

    ! Allocate storages
    allocate(tau%upt0(n,n))
    allocate(tau%dnt0(n,n))
    allocate(tau%up0t(n,n))
    allocate(tau%dn0t(n,n))
    allocate(tau%U(n,n))
    allocate(tau%T(n,n))
    allocate(tau%D(n))
    allocate(tau%v1(n))
    allocate(tau%v2(n))
    allocate(tau%v3(n))
    allocate(tau%v4(n))

    call DQMC_SeqB_Init(n, L, nOrth, B_up, tau%SB1_up, WS)
    call DQMC_SeqB_Init2(n, L, nOrth, B_up, &
         tau%SB2_up, tau%U, tau%D, tau%T, WS)

    call DQMC_SeqB_Init(n, L, nOrth, B_dn, tau%SB1_dn, WS)
    call DQMC_SeqB_Init2(n, L, nOrth, B_dn, &
         tau%SB2_dn, tau%U, tau%D, tau%T, WS)

    tau%SB2_up%piv => WS%I2
    tau%SB2_dn%piv => WS%I2
    tau%W1         => WS%R1
    tau%B_up       => B_up
    tau%B_dn       => B_dn

  end subroutine DQMC_Gtau_Init2

  !--------------------------------------------------------------------!

  subroutine DQMC_Gtau_Free(tau)
    !
    ! Purpose
    ! =======
    !    This subroutine frees TDM.
    !
    ! Arguments
    ! =========
    !
    type(Gtau), intent(inout) :: tau      ! TDM to be initialized

    ! ... Executable ...

    deallocate(tau%upt0, tau%up0t, tau%dnt0, tau%dn0t)
    deallocate(tau%U, tau%D, tau%T, tau%v1, tau%v2, tau%v3, tau%v4)

    call DQMC_SeqB_Free(tau%SB1_up)
    call DQMC_SeqB_Free(tau%SB2_up)
    call DQMC_SeqB_Free(tau%SB1_dn)
    call DQMC_SeqB_Free(tau%SB2_dn)
 
  end subroutine DQMC_Gtau_Free

  !--------------------------------------------------------------------!
  !
  ! Probably bugged. Use DQMC_MakeGtau2.
  !
  subroutine DQMC_MakeGtau(tau, G_up, G_dn, ii, ib)
    !
    ! Purpose
    ! =======
    !    This subroutine generates Gtau.
    !
    ! Arguments
    ! =========
    !
    type(Gtau), intent(inout)    :: tau
    type(G_fun), intent(inout)   :: G_up, G_dn
    integer, intent(in)          :: ii, ib     ! the index of block matrix
    
    ! ... local scalar
    integer  :: n, idx, i
    logical  :: recompute

    ! ... executable ...
    !
    !  meaning of indices
    !     ii: the ii-th block row or column
    !     ib: the block offset
    !
    !     id: 

    ! initialization
    n = tau%n
    recompute = .false.
    
    ! start computation
    if (ib .gt. 0) then ! Off diagoanl block

       ! If the current Gtau is not a neighbor of requested one,
       ! set count=0 for recomputing.
       if (ii .eq. tau%ii .and. ib .eq. tau%ib+1) then
          ! recude safe count
          tau%sfc = tau%sfc - 1
       else
          ! recompute if cannot use update
          tau%sfc = 0
       end if

       ! idx is the absolute row/column index
       idx = ii+ib
       if (idx .gt. tau%L) then
          idx = idx - tau%L
       end if


       ! compute Gtau
       if (tau%sfc .ne. 0) then 
          ! Within a toerlable range, the Gtau can be updated
          ! by simple matrix multiplication.
          ! see [1] for more details.
          
          ! use update
          if (tau%which .le. TAU_BOTH) then
             
             ! column             
             call DQMC_MultB_Left(n, tau%upt0, tau%B_up, G_up%V(:,idx), &
                  tau%W1)
             call DQMC_MultB_Left(n, tau%dnt0, tau%B_dn, G_dn%V(:,idx), &
                  tau%W1)
          end if
          
          if (tau%which .ge. TAU_BOTH) then
             ! row
             call DQMC_MultBi_Right(n, tau%up0t, tau%B_up, G_up%V(:,idx), &
                  tau%W1)
             call DQMC_MultBi_Right(n, tau%dn0t, tau%B_dn, G_dn%V(:,idx), &
                  tau%W1)
          end if
       else
          ! we need to change index, becuse the index system used in get
          ! gtau is absolute. (ib,jb)
          ! Recompute Gtau from scratch
          
          call DQMC_GetGtau(ii, idx, TAU_UP, tau%upt0, tau%up0t, G_up%V, tau)
          call DQMC_GetGtau(ii, idx, TAU_DN, tau%dnt0, tau%dn0t, G_dn%V, tau)
          recompute = .true.
          tau%sfc = tau%nWrap

       end if

    else ! The initial ii block
       ! Construct G from getG function
       ! In this case, upt0 = -up0t and dnt0 = -dn0t       
       call DQMC_GetG(ii, G_up, tau%SB1_up)
       tau%sgnup = G_up%sgn
       tau%upt0  = G_up%G
       
       call DQMC_GetG(ii, G_dn, tau%SB1_dn)
       tau%sgndn = G_dn%sgn
       tau%dnt0  = G_dn%G
       
       !Where is tau%SB2 defined here?
       
       ! if compute both, then use up0t as companion matrices
       if (tau%which .eq. TAU_BOTH) then
          tau%up0t = tau%upt0
          tau%dn0t = tau%dnt0
          do i = 1, n
             tau%up0t(i,i) = tau%up0t(i,i) - ONE
             tau%dn0t(i,i) = tau%dn0t(i,i) - ONE
          end do
       end if
       
       ! set the counter
       tau%sfc = tau%nWrap
       recompute = .true.
    end if
    
    ! determine the sign
    if ((recompute .and. ii+ib.gt.tau%L).or.(ii+ib.eq.tau%L+1))then
       tau%upt0 = -tau%upt0
       tau%dnt0 = -tau%dnt0
       tau%up0t = -tau%up0t
       tau%dn0t = -tau%dn0t
    end if
    
    tau%ii = ii
    tau%ib = ib

  end subroutine DQMC_MakeGtau

  !--------------------------------------------------------------------!
  !
  ! Probably bugged. Use DQMC_GetGtau2.
  !
  subroutine DQMC_GetGtau(ib, jb, spin, G_ij, G_ji, V, tau)
    !
    ! Purpose
    ! =======
    !    
    !    This subroutine computes the (i,j) submatrix of Gtau if which
    !    equals to'R'ow or 'B'oth. and computes the (j,i) submatrix of 
    !    Gtau if which equals to 'C'olumn or 'B'oth 
    !
    ! Mathematically, Gtau(Gdn_tau) is the inverse of 
    !
    !                  [  I                   B_1] 
    !                  [-B_2   I                 ]
    !              M = [     -B_3  I             ]
    !                  [          ...   ...      ]
    !                  [             -B_{L}    I ]
    !                   
    !
    ! The (i,j) submatrix of Gtau is given as
    !
    !     G_ii =    inv(I+B_iB_{i-1}...B_1B_L...B_{i+1})   
    !     G_ij = -Gtau_ii(B_iB_{i-1}...B_1B_L...B_{j+1})  for j = i+1...L  
    !     G_ij =  Gtau_ii(B_iB_{i-1}...B_{j+1})           for j = 1...i-1 
    !
    ! In general, we can write G_ij as
    !
    !         G_ij = (+/-) inv(I+A_1A_2)A_1
    !              = (+/-) inv(inv(A_1)+A_2)
    ! where
    !
    !          A_1 = B_{i}...B_{j+1} and
    !          A_2 = B_{j}...B_{i+1}
    !
    ! The following procedure compute G_ij in a stable way
    ! 
    ! 1. Perform UDT decomposition on inv(A_1) and A_2
    !    
    !       inv(A_1) = U_1D_1T_1
    !           A_2  = U_2D_2T_2
    !
    !    See the DQMC_UDTD in DQMC_B.F90 for detail of UDT decomposition.
    !
    ! 2. Decompose D_1 = barD_1*hatD_1
    !              D_2 = barD_2*hatD_2
    !    where
    !           barD_1(i,i) = max(1, D_1(i,i)) and
    !           hatD_1(i,i) = min(1, D_1(i,i))
    !
    ! 3. Compute
    !
    !    C = hatD_2*T_2*inv(T_1)*inv(barD_1)+inv(barD_2)*inv(U_2)*U_1*hatD_1
    !    
    ! 4. Assemble G as 
    !
    !    G = inv(T_1)*inv(barD_1)*inv(C)*inv(barD_2)*inv(U_2)
    !      = inv(T_1)*inv(D_2T_2inv(T_1)+inv(U_2)U_1D_1)*inv(U_2)
    !      = inv(U_1D_1T_1+U_2D_2T_2)
    !      = inv(inv(A_1)+A_2)
    !      = inv(I+A_1A_2)A_1
    !      = inv(I+B_{i}...B_1*B_l...B_{i-1})B_i...B_{j+1}
    !
    ! Matrix G_ji has very similar structure with G_ij.
    !
    !     G_jj =    inv(I+B_jB_{j-1}...B_1B_L...B_{j+1})   
    !     G_ji = -Gtau_jj(B_jB_{j-1}...B_1B_L...B_{i+1})  for i = j+1...L  
    !     G_ji =  Gtau_jj(B_jB_{j-1}...B_{i+1})           for i = 1...j-1 
    !
    ! For a fixed i and j,
    !
    !         G_ji = (+/-) inv(I+A_2A_1)A_2
    !              = (+/-) inv(inv(A_2)+A_1)
    ! 
    ! where A_1 and A_2 are as defined before.
    ! Therefore, 
    !
    !         G_ji = inv(inv(U_1D_1T_1)+inv(U_2D_2T_2))
    !              = inv(inv(T_1)inv(D_1)inv(U_1)+inv(T_2)inv(D_2)inv(U_2))
    !              = U_2*inv(inv(D_1)inv(U_1)U_2+T_1*inv(T_2)*inv(D_2))*T_1
    !
    ! The same trick of bar and hat is also applied to inv(D_1) and inv(D_2).
    !        
    !         G_ji = U_2*inv(barD_2)*inv(...)*inv(barD_1)*T_1
    !
    ! where (...) = hatD_1*inv(U_1)U_2*inv(barD_2)+
    !               inv(barD_1)T_1*inv(T_2)hatD_2
    !
    ! NOTE: the hatD_1, barD_1, hatD_2, and barD_2 here are different from
    !       the previous ones.
    !
    ! See working notes for more detail.
    !
    ! Arguments
    ! =========
    !
    integer,  intent(in)       :: ib, jb        ! block indices
    integer,  intent(in)       :: spin
    real(wp), intent(inout)    :: G_ij(:,:)     ! submatrix of Gtau
    real(wp), intent(inout)    :: G_ji(:,:)     ! submatrix of Gtau
    real(wp), intent(inout)    :: V(:,:)        ! HSF

    type(Gtau), intent(inout), target  :: tau

    ! ... local scalars    ...
    integer :: info           ! parameters for lapack's sub
    integer :: i              ! iterator
    integer :: n

    real(wp), pointer :: U1(:,:)       ! 
    real(wp), pointer :: D1(:)         ! 
    real(wp), pointer :: T1(:,:)       ! 
    real(wp), pointer :: U2(:,:)       ! 
    real(wp), pointer :: D2(:)         ! 
    real(wp), pointer :: T2(:,:)       ! 
    real(wp), pointer :: W1(:,:)       ! working space
    real(wp), pointer :: W2(:,:)       !
    real(wp), pointer :: rw(:)         ! working space
    integer,  pointer :: lw(:)         !
    integer,  pointer :: pvt1(:)       !
    integer,  pointer :: pvt2(:)       !

    real(wp), pointer :: bar1i(:)      ! 
    real(wp), pointer :: bar2i(:)      ! 
    real(wp), pointer :: hat1(:)       ! 
    real(wp), pointer :: hat2(:)       ! 
    
     type(SeqB), pointer :: SB1, SB2

    ! ... Executable ...

    ! STEP 0. Initialization
    n = tau%n
    bar1i => tau%v1
    bar2i => tau%v2
    hat1  => tau%v3
    hat2  => tau%v4

    if(spin .eq. TAU_UP) then
       SB1 => tau%SB1_up
       SB2 => tau%SB2_up
    else
       SB1 => tau%SB1_dn
       SB2 => tau%SB2_dn
    endif
    
    U1 => SB1%U
    D1 => SB1%D
    T1 => SB1%T

    U2 => SB2%U
    D2 => SB2%D
    T2 => SB2%T

    W1 => SB1%W1
    W2 => SB1%W2
    rw => SB1%rw
    lw => SB1%lw
    pvt1 => SB1%piv
    pvt2 => SB2%piv

    info = 0

    ! STEP 1. Cmpute UDT decomposition of 
    !         inv(A_1) = inv(B_{i}...B_{j+1})
    !         and A_2  = B_j...B_{i+1}.
    ! ==========================================
    ! W1, W2, rw, lwork, tau, pvt1 can be reused.

    call DQMC_SeqMultB (ib, jb+1, SB1, V)
    call DQMC_SeqMultBi(jb, ib+1, SB2, V)
    
    if (tau%which .eq. TAU_T0 .or. tau%which .eq. TAU_BOTH) then
       !
       ! STEP 2.  D_1 = inv(barD_1)*hatD_1
       !          D_2 = inv(barD_2)*hatD_2
       ! ==================================
       
       do i = 1, n
          bar1i(i) = ONE / max(ONE, abs(D1(i)))
          hat1(i)  = D1(i) * bar1i(i)
          bar2i(i) = ONE / max(ONE, abs(D2(i)))
          hat2(i)  = D2(i) * bar2i(i)
       end do

       !   
       ! STEP 3. Compute C = hatD_2*T_2*inv(T_1)*inv(barD_1)+
       !                     inv(barD_2)*inv(U_2)*U_1*hatD_1
       ! =======================================================   
       
       !! Compute  T_2*inv(T_1)
       ! copy T_1 to W_2, because we may need T_1 later
       call blas_dcopy(n*n,T1,1,W2,1)
       
       ! W_1 = T_2'
       call DQMC_trans(n, W1, T2)

       ! W_1 = inv(W_2')*W_1 = inv(T_1')*T_2'
       call lapack_dgetrf(n, n, W2, n, pvt1, info)
       call lapack_dgetrs('T', n, n, W2, n, pvt1, W1, n, info)
       if (info .ne. 0) then
          call DQMC_Error("Error: dgetrs(1) in dqmc_getgtau.", info)
       end if

       ! T_2 = transpose(W_1) = transpose(inv(T_1')*T_2') = T_2*inv(T_1)
       call DQMC_trans(n, T2, W1)
       
       if (tau%which.eq.TAU_T0) then
          ! if only Row is computed, then T1 is not reference, reuse it 
          call blas_dcopy(n*n,G_ij,1,T1,1)
       end if
       
       ! U_1 = G_ij = U_2'*U_1
       ! ** G_ij here is used as a temp variable
       call blas_dgemm('T', 'N', n, n, n, ONE, U2, n, U1, n, ZERO, G_ij, n)
       call blas_dcopy(n*n,G_ij,1,U1,1)
       
       !! *** We need to keep T2 and U1 for later use.
       
       ! compute U_1 = barD_2*U_2'*U_1*hatD_1
       call DQMC_ScaleRow(n, G_ij, bar2i)
       call DQMC_ScaleCol(n, G_ij, hat1)
       
       ! compute W_1 = hatD_2*T_2*inv(T_1)*barD_1
       call blas_dcopy(n*n,T2,1,W1,1)
       call DQMC_ScaleRow(n, W1, hat2)
       call DQMC_ScaleCol(n, W1, bar1i)
       
       ! W_1 = W_1 + G_ij
       call blas_daxpy(n*n, ONE, G_ij, 1, W1, 1)
       
       !   
       ! STEP 4. Compute inv(T_1)*inv(barD_1)*inv(C)*inv(barD_2)*inv(U_2)
       ! =================================================================   
       
       ! Let G_ij = inv(barD_2) * inv(U2)
       call DQMC_trans(n, G_ij, U2)
       call DQMC_ScaleRow(n, G_ij, bar2i)

       ! G_ij = inv(W_1)*inv(barD_2)*inv(U_2)
       call lapack_dgesv(n, n, W1, n, pvt2, G_ij, n, info)

       if (info .ne. 0) then
          call DQMC_Error("Error: dgesv(2) in dqmc_getgtau.", info)
       end if
       
       ! G_ij = inv(barD_1)*G_ij = inv(barD_1)*inv(W_1)*inv(barD_2)*inv(U_2)
       call DQMC_ScaleRow(n, G_ij, bar1i)
       
       ! G_ij = inv(T_1)*G_ij
       !      = inv(T_1)*inv(barD_1)*inv(C)*inv(barD_2)*inv(U_2)
       call lapack_dgetrs('N', n, n, W2, n, pvt1, G_ij, n, info)
       if (info .ne. 0) then
          call DQMC_Error("Error: dgetrs(1) in dqmc_getgtau.", info)
       end if

    end if

    !
    ! Compute G_ji, repeat step 2, 3, 4 for Gji
    ! ==========================================

    if (tau%which.eq.TAU_0T .or. tau%which .eq. TAU_BOTH) then       
       !
       ! STEP 5.  inv(D_1) = barD_1*hatD_1
       !          inv(D_2) = barD_2*hatD_2
       ! ======================================
       !
       do i = 1, n
          if (D1(i) .eq. ZERO) then
             call DQMC_Error("Error: in dqmc_getgtau, D1(i)=0.0, i=", i)
          end if
          D1(i) = ONE / D1(i)
          bar1i(i) = ONE / max(ONE, D1(i))
          hat1(i) = D1(i) * bar1i(i)

          if (D2(i) .eq. ZERO) then
             call DQMC_Error("Error: in dqmc_getgtau, D2(i)=0.0, i=", i)
          end if
          D2(i) = ONE / D2(i)
          bar2i(i) = ONE / max(ONE, D2(i))
          hat2(i) = D2(i) * bar2i(i)
       end do
       
       !   
       ! STEP 6. Compute G_ji = hatD_1*inv(U_1)U_2*inv(barD_2)+
       !                        inv(barD_1)T_1*inv(T_2)hatD_2
       ! =======================================================   
       if (tau%which .eq. TAU_BOTH) then
          ! Previously, T_2 = T_2*inv(T_1)
          !             U_1 = inv(U_2)*U_1
          ! Therefore, we only need to invert them.
          
          ! first, compute inv(barD_1)T_1*inv(T_2)hatD_2          
          call lapack_dgetrf(n, n, T2, n, pvt1, info)
          if (info .ne. 0) then
             call DQMC_Error("Error: dgetrf(1) in dqmc_getgtau.", info)
          end if
          call lapack_dgetri(n, T2, n, pvt1, rw, lw(LA_GETRI), info)
          
          ! W1 = U1' = inv(inv(U_2)*U_1) = inv(U_1)*U_2
          call DQMC_Trans(n, W1, U1)
           
       else
          ! No previous computed results. Compute them from scratch.
          !
          ! (1) Compute T_1*inv(T_2) 
          !     Let W_1 = T_1'
          call DQMC_trans(n, W1, T1)

          !     W_1 = inv(T_2')*W_1 = inv(T_2')*T_1'
          call lapack_dgetrf(n, n, T2, n, pvt1, info)
          call lapack_dgetrs('T', n, n, T2, n, pvt1, W1, n, info)
          if (info .ne. 0) then
             call DQMC_Error("Error: dgetrs(1) in dqmc_getgtau.", info)
          end if
          !     T_2 = W_1' = (inv(T_2')*T_1')' = T_1*inv(T_2)
          call DQMC_trans(n, T2, W1)
          
          ! (2) Compute W_1 = U_2'*U_1
          call blas_dgemm('T', 'N', n, n, n, ONE, U1, n, U2, n, ZERO, W1, n)
       end if


       ! Compute inv(barD_1)T_1*inv(T_2)hatD_2
       call DQMC_ScaleRow(n, T2, bar1i)
       call DQMC_ScaleCol(n, T2, hat2)
          
       ! Compute hatD_1*inv(U_1)U_2*inv(barD_2)
       call DQMC_ScaleRow(n, W1, hat1)
       call DQMC_ScaleCol(n, W1, bar2i)
       

       ! W1 = W1 + T2
       call blas_daxpy(n*n, ONE, T2, 1, W1, 1)

       !
       ! STEP 7. Compute U_2*inv(barD_2)*inv(...)*inv(barD_1)*T_1
       ! =========================================================
       !
       call DQMC_ScaleRow(n, T1, bar1i)
       call lapack_dgesv(n, n, W1, n, pvt1, T1, n, info)
       if (info .ne. 0) then
          call DQMC_Error("Error: dgesv(3) in dqmc_getgtau.", info)
       end if
       
       ! inv(barD_2)*inv(...)*inv(barD_1)*T_1
       call DQMC_ScaleRow(n, T1, bar2i)

       ! copy the previous result
       call blas_dcopy(n*n,G_ij,1,W2,1)

       ! multiply -U2, the sign is negative
       call blas_dgemm('N', 'N', n, n, n, -ONE, U2, n, T1, n, ZERO, G_ji, n)

    end if
    
  end subroutine DQMC_GetGtau

  !--------------------------------------------------------------------!

  subroutine DQMC_Gtau_Part(isl, SB, A, G, sgn)
    !
    ! This solves the gtau explicitly by Lapack.
    !

    integer, intent(in) :: isl
    type(SeqB), intent(inout)    :: SB
    type(G_fun), intent(in)   :: G
    real(wp), intent(out), target :: A(:,:)
    real(wp), intent(out) :: sgn
    
    ! ... Local var ...
    integer, parameter :: BS = 64
    integer  :: north
    integer  :: i, j, m, n, L, nL
    integer  :: lwork, info 
    integer  :: ipiv(SB%n*SB%L/SB%north)
    real(wp) :: work(BS*SB%n*SB%L/SB%north)

    real(wp), pointer :: up(:,:)
    
    ! ... Executable ...
    
    north = SB%north
    n     = SB%n
    L     = SB%L/SB%north
    nL    = n*L

    A    = ZERO
    ipiv = 0
    info = 0
    work = ZERO

    ! making A
    do i = 1, nL
       A(i,i) = ONE
    end do

    !up => A(1:n, nL-n+1:nL)
    !
    !m = SB%L - (north - 1)/2 + 1
    !call DQMC_GetB(n, up, SB%B, G%V(:,m), SB%W1)
    !do j = m+1, SB%L
    !   call DQMC_MultB_Left(n, up, SB%B, G%V(:,j), SB%W1)
    !enddo
    !do j = 1, (north+1)/2
    !   call DQMC_MultB_Left(n, up, SB%B, G%V(:,j), SB%W1)
    !enddo

    !m = (north + 1)/2 + 1
    !do i = 1, L - 1
    !   up =>  A(i*n+1:(i+1)*n, (i-1)*n+1:i*n)
    !   call DQMC_GetB(n, up, SB%B, G%V(:,m), SB%%W1)
    !   do j = m+1, m+north-1
    !      call DQMC_MultB_Left(n, up, SB%B, G%V(:,j), SB%W1)
    !   enddo
    !   up = -up
    !   m = north + m
    !end do

    m = isl - north + 1
    if (m .le. 0) m = m + SB%L
    do j = 1, SB%L 
       ipiv(j) = m
       m = m + 1
       if (m .gt. SB%L) m = m - SB%L
    enddo

    up => A(1:n, nL-n+1:nL)
    call DQMC_GetB(n, up, SB%B, G%V(:,ipiv(1)), SB%W1)
    do j = 2, north
       call DQMC_MultB_Left(n, up, SB%B, G%V(:,ipiv(j)), SB%W1)
    enddo

    m = north
    do i = 1, L - 1
       up =>  A(i*n+1:(i+1)*n, (i-1)*n+1:i*n)
       m = m + 1
       call DQMC_GetB(n, up, SB%B, G%V(:,ipiv(m)), SB%W1)
       do j = 2, north
          m = m + 1
          call DQMC_MultB_Left(n, up, SB%B, G%V(:,ipiv(m)), SB%W1)
       enddo
       up = -up
    end do

    ! invert A
    lwork = BS*nL
    sgn   = ONE
    call lapack_dgetrf(nL, nL, A, nL, ipiv, info)
    do i = 1, nL
      if (ipiv(i).ne. i) sgn = -sgn
      if (A(i,i) .lt. ZERO) sgn = -sgn
    enddo
    call lapack_dgetri(nL, A, nL, ipiv, work, lwork, info)

  end subroutine DQMC_Gtau_Part
 
  !---------------------------------------------------------!

  subroutine DQMC_Gtau_Big(tau, Aup, Adn, G_up, G_dn, sgn_up, sgn_dn)
    !
    ! This solves the gtau explicitly by Lapack.
    !

    type(Gtau), intent(inout)    :: tau
    type(G_fun), intent(inout)   :: G_up, G_dn
    real(wp), intent(inout), target :: Aup(:,:), Adn(:,:)
    real(wp), intent(inout) :: sgn_up, sgn_dn
    
    ! ... Local var ...
    integer, parameter :: BS = 64
    integer  :: i, n, L, nL, lwork, info, ipiv(tau%n*tau%L)
    real(wp) :: work(BS*tau%n*tau%L)
    real(wp), pointer :: up(:,:), dn(:,:)
    
    ! ... Executable ...
    
    n   = tau%n
    L   = tau%L
    nL  = n*L
    Aup = ZERO
    Adn = ZERO
    ipiv = 0
    info = 0
    work = ZERO

    ! making A
    do i = 1, nL
       Aup(i,i) = ONE
       Adn(i,i) = ONE
    end do

    up => Aup(1:n, nL-n+1:nL)
    dn => Adn(1:n, nL-n+1:nL)

    call DQMC_GetB(n, up, tau%B_up, G_up%V(:,1), tau%W1)
    call DQMC_GetB(n, dn, tau%B_dn, G_dn%V(:,1), tau%W1)

    do i = 1, L-1
       up =>  Aup(i*n+1:(i+1)*n, (i-1)*n+1:i*n)
       dn =>  Adn(i*n+1:(i+1)*n, (i-1)*n+1:i*n)
       call DQMC_GetB(n, up, tau%B_up, G_up%V(:,i+1), tau%W1)
       call DQMC_GetB(n, dn, tau%B_dn, G_dn%V(:,i+1), tau%W1)
       up = -up
       dn = -dn
    end do

    ! invert A
    lwork = BS*nL
    call lapack_dgetrf(nL, nL, Aup, nL, ipiv, info)
    do i = 1, nL
      if (ipiv(i).ne. i) sgn_up = -sgn_up
      if (Aup(i,i) .lt. ZERO) sgn_up = -sgn_up
    enddo
    call lapack_dgetri(nL, Aup, nL, ipiv, work, lwork, info)
    call lapack_dgetrf(nL, nL, Adn, nL, ipiv, info)
    do i = 1, nL
      if (ipiv(i).ne. i) sgn_dn = -sgn_dn
      if (Adn(i,i) .lt. ZERO) sgn_dn = -sgn_dn
    enddo
    call lapack_dgetri(nL, Adn, nL, ipiv, work, lwork, info)
  end subroutine DQMC_Gtau_Big

  !--------------------------------------------------------------------!

  subroutine DQMC_MakeGtau2(tau, G_up, G_dn, ii, jj)
    !
    ! Purpose
    ! =======
    !    This subroutine generates Gtau.
    !
    ! Arguments
    ! =========
    !
    type(Gtau), intent(inout)    :: tau
    type(G_fun), intent(inout)   :: G_up, G_dn
    integer, intent(in)          :: ii, jj    
    
    ! ... local scalar
    integer  :: n, idx, inc_ii, inc_jj
    logical  :: recompute

    ! ... executable ...
    !
    !  meaning of indices
    !     ii: the ii-th block row or column
    !     ib: the block offset
    !
    !     id: 

    ! initialization
    n = tau%n
    recompute = .false.
    
    ! start computation

    inc_jj = jj - tau%ib
    if( abs(inc_jj) > tau%L/2 ) inc_jj = inc_jj - sign( tau%L, inc_jj)

    inc_ii = ii - tau%ii
    if( abs(inc_ii) > tau%L/2 ) inc_ii = inc_ii - sign( tau%L, inc_ii)

    if (abs(inc_jj) + abs(inc_ii) ==  1) then

       ! reduce safe count
       tau%sfc = tau%sfc - 1

       ! Map the increment in a direction of motion
       if (inc_ii == 1) then
          idx = 1
       elseif (inc_ii == -1) then
          idx = 2
       elseif (inc_jj ==  1) then
          idx = 3
       elseif (inc_jj == -1) then
          idx = 4
       endif
    else

       ! recompute if cannot use update
       tau%sfc = 0
    end if

    ! compute Gtau
    if (tau%sfc .ne. 0) then 

       ! Update Gtau 
       call DQMC_change_gtau_time(idx, tau, G_up, G_dn)
    else

       ! Recompute Gtau from scratch
       call DQMC_GetGtau2(ii, jj, TAU_UP, tau%upt0, tau%up0t, G_up%V, tau)
       call DQMC_GetGtau2(ii, jj, TAU_DN, tau%dnt0, tau%dn0t, G_dn%V, tau)
       recompute = .true.
       tau%sfc = tau%nWrap
    end if

  end subroutine DQMC_MakeGtau2

  !-----------------------------------------------------!

  subroutine DQMC_GetGtau2(ib, jb, spin, G_ij, G_ji, V, tau)
    !
    ! Derived from DQMC_GetGtau with a few bug fixes from Simone
    !
    ! Purpose
    ! =======
    !    
    !    This subroutine computes the (i,j) submatrix of Gtau if which
    !    equals to'R'ow or 'B'oth. and computes the (j,i) submatrix of 
    !    Gtau if which equals to 'C'olumn or 'B'oth 
    !
    ! Mathematically, Gtau(Gdn_tau) is the inverse of 
    !
    !                  [  I                   B_1] 
    !                  [-B_2   I                 ]
    !              M = [     -B_3  I             ]
    !                  [          ...   ...      ]
    !                  [             -B_{L}    I ]
    !                   
    !
    ! The (i,j) submatrix of Gtau is given as
    !
    !     G_ii =    inv(I+B_iB_{i-1}...B_1B_L...B_{i+1})   
    !     G_ij = -Gtau_ii(B_iB_{i-1}...B_1B_L...B_{j+1})  for j = i+1...L  
    !     G_ij =  Gtau_ii(B_iB_{i-1}...B_{j+1})           for j = 1...i-1 
    !
    ! In general, we can write G_ij as
    !
    !         G_ij = (+/-) inv(I+A_1A_2)A_1
    !              = (+/-) inv(inv(A_1)+A_2)
    ! where
    !
    !          A_1 = B_{i}...B_{j+1} and
    !          A_2 = B_{j}...B_{i+1}
    !
    ! The following procedure compute G_ij in a stable way
    ! 
    ! 1. Perform UDT decomposition on inv(A_1) and A_2
    !    
    !       inv(A_1) = U_1D_1T_1
    !           A_2  = U_2D_2T_2
    !
    !    See the DQMC_UDTD in DQMC_B.F90 for detail of UDT decomposition.
    !
    ! 2. Decompose D_1 = barD_1*hatD_1
    !              D_2 = barD_2*hatD_2
    !    where
    !           barD_1(i,i) = max(1, D_1(i,i)) and
    !           hatD_1(i,i) = min(1, D_1(i,i))
    !
    ! 3. Compute
    !
    !    C = hatD_2*T_2*inv(T_1)*inv(barD_1)+inv(barD_2)*inv(U_2)*U_1*hatD_1
    !    
    ! 4. Assemble G as 
    !
    !    G = inv(T_1)*inv(barD_1)*inv(C)*inv(barD_2)*inv(U_2)
    !      = inv(T_1)*inv(D_2T_2inv(T_1)+inv(U_2)U_1D_1)*inv(U_2)
    !      = inv(U_1D_1T_1+U_2D_2T_2)
    !      = inv(inv(A_1)+A_2)
    !      = inv(I+A_1A_2)A_1
    !      = inv(I+B_{i}...B_1*B_l...B_{i-1})B_i...B_{j+1}
    !
    ! Matrix G_ji has very similar structure with G_ij.
    !
    !     G_jj =    inv(I+B_jB_{j-1}...B_1B_L...B_{j+1})   
    !     G_ji = -Gtau_jj(B_jB_{j-1}...B_1B_L...B_{i+1})  for i = j+1...L  
    !     G_ji =  Gtau_jj(B_jB_{j-1}...B_{i+1})           for i = 1...j-1 
    !
    ! For a fixed i and j,
    !
    !         G_ji = (+/-) inv(I+A_2A_1)A_2
    !              = (+/-) inv(inv(A_2)+A_1)
    ! 
    ! where A_1 and A_2 are as defined before.
    ! Therefore, 
    !
    !         G_ji = inv(inv(U_1D_1T_1)+inv(U_2D_2T_2))
    !              = inv(inv(T_1)inv(D_1)inv(U_1)+inv(T_2)inv(D_2)inv(U_2))
    !              = U_2*inv(inv(D_1)inv(U_1)U_2+T_1*inv(T_2)*inv(D_2))*T_1
    !
    ! The same trick of bar and hat is also applied to inv(D_1) and inv(D_2).
    !        
    !         G_ji = U_2*inv(barD_2)*inv(...)*inv(barD_1)*T_1
    !
    ! where (...) = hatD_1*inv(U_1)U_2*inv(barD_2)+
    !               inv(barD_1)T_1*inv(T_2)hatD_2
    !
    ! NOTE: the hatD_1, barD_1, hatD_2, and barD_2 here are different from
    !       the previous ones.
    !
    ! See working notes for more detail.
    !
    ! Arguments
    ! =========
    !
    integer,  intent(in)       :: ib, jb        ! block indices
    integer,  intent(in)       :: spin
    real(wp), intent(inout)    :: G_ij(:,:)     ! submatrix of Gtau
    real(wp), intent(inout)    :: G_ji(:,:)     ! submatrix of Gtau
    real(wp), intent(inout)    :: V(:,:)        ! HSF
    type(Gtau), intent(inout), target  :: tau

    ! ... local scalars    ...
    integer :: info           ! parameters for lapack's sub
    integer :: i              ! iterator
    integer :: n
    integer :: which

    real(wp), pointer :: U1(:,:)       ! 
    real(wp), pointer :: D1(:)         ! 
    real(wp), pointer :: T1(:,:)       ! 
    real(wp), pointer :: U2(:,:)       ! 
    real(wp), pointer :: D2(:)         ! 
    real(wp), pointer :: T2(:,:)       ! 
    real(wp), pointer :: W1(:,:)       ! working space
    real(wp), pointer :: W2(:,:)       !
    real(wp), pointer :: rw(:)         ! working space
    integer,  pointer :: lw(:)         !
    integer,  pointer :: pvt1(:)       !
    integer,  pointer :: pvt2(:)       !

    real(wp), pointer :: bar1i(:)      ! 
    real(wp), pointer :: bar2i(:)      ! 
    real(wp), pointer :: hat1(:)       ! 
    real(wp), pointer :: hat2(:)       ! 
    
    type(SeqB), pointer :: SB1, SB2

    ! ... Executable ...

    ! STEP 0. Initialization
    n = tau%n
    bar1i => tau%v1
    bar2i => tau%v2
    hat1  => tau%v3
    hat2  => tau%v4

    if(spin .eq. TAU_UP) then
       SB1 => tau%SB1_up
       SB2 => tau%SB2_up
    else
       SB1 => tau%SB1_dn
       SB2 => tau%SB2_dn
    endif
    
    U1 => SB1%U
    D1 => SB1%D
    T1 => SB1%T

    U2 => SB2%U
    D2 => SB2%D
    T2 => SB2%T

    W1 => SB1%W1
    W2 => SB1%W2
    rw => SB1%rw
    lw => SB1%lw
    pvt1 => SB1%piv
    pvt2 => SB2%piv

    info = 0

    if(ib<1.or.ib>tau%L.or.jb<1.or.jb>tau%L)then
      write(*,'(A)')"GetGtau2 can only work with indices in [1,L]. Stop."
      stop
    endif
    

    ! STEP 1. Cmpute UDT decomposition of 
    !         inv(A_1) = inv(B_{i}...B_{j+1})
    !         and A_2  = B_j...B_{i+1}.
    ! ==========================================
    ! W1, W2, rw, lwork, tau, pvt1 can be reused.

    call DQMC_SeqMultB (jb, ib+1, SB1, V)
    if ( ib /= jb ) then
       call DQMC_SeqMultBi(ib, jb+1, SB2, V)
       which = tau%which
    else
       D2 = ONE
       call DQMC_Eye(n, U2)
       call DQMC_Eye(n, T2)
       which = TAU_T0
    endif
    
    if (which .eq. TAU_T0 .or. which .eq. TAU_BOTH) then
       !
       ! STEP 2.  D_1 = inv(barD_1)*hatD_1
       !          D_2 = inv(barD_2)*hatD_2
       ! ==================================
       
       do i = 1, n
          bar1i(i) = ONE / max(ONE, abs(D1(i)))
          hat1(i)  = D1(i) * bar1i(i)
          bar2i(i) = ONE / max(ONE, abs(D2(i)))
          hat2(i)  = D2(i) * bar2i(i)
       end do

       !   
       ! STEP 3. Compute C = hatD_2*T_2*inv(T_1)*inv(barD_1)+
       !                     inv(barD_2)*inv(U_2)*U_1*hatD_1
       ! =======================================================   
       
       !! Compute  T_2*inv(T_1)
       ! copy T_1 to W_2, because we may need T_1 later
       call blas_dcopy(n*n,T1,1,W2,1)
       
       ! W_1 = T_2'
       call DQMC_trans(n, W1, T2)

       ! W_1 = inv(W_2')*W_1 = inv(T_1')*T_2'
       call lapack_dgetrf(n, n, W2, n, pvt1, info)
       call lapack_dgetrs('T', n, n, W2, n, pvt1, W1, n, info)
       if (info .ne. 0) then
          call DQMC_Error("Error: dgetrs(1) in dqmc_getgtau.", info)
       end if

       ! T_2 = transpose(W_1) = transpose(inv(T_1')*T_2') = T_2*inv(T_1)
       call DQMC_trans(n, T2, W1)
       
       if (tau%which.eq.TAU_T0) then
          ! if only Row is computed, then T1 is not reference, reuse it 
          call blas_dcopy(n*n,G_ij,1,T1,1)
       end if
       
       ! U_1 = G_ij = U_2'*U_1
       ! ** G_ij here is used as a temp variable
       call blas_dgemm('T', 'N', n, n, n, ONE, U2, n, U1, n, ZERO, G_ij, n)
       call blas_dcopy(n*n,G_ij,1,U1,1)
       
       !! *** We need to keep T2 and U1 for later use.
       
       ! compute U_1 = barD_2*U_2'*U_1*hatD_1
       call DQMC_ScaleRow(n, G_ij, bar2i)
       call DQMC_ScaleCol(n, G_ij, hat1)
       
       ! compute W_1 = hatD_2*T_2*inv(T_1)*barD_1
       call blas_dcopy(n*n,T2,1,W1,1)
       call DQMC_ScaleRow(n, W1, hat2)
       call DQMC_ScaleCol(n, W1, bar1i)
       
       ! W_1 = W_1 + G_ij (This is called "C" where STEP 3 is defined)
       call blas_daxpy(n*n, ONE, G_ij, 1, W1, 1)
       
       !   
       ! STEP 4. Compute inv(T_1)*inv(barD_1)*inv(C)*inv(barD_2)*inv(U_2)
       ! =================================================================   
       
       ! Let G_ij = inv(barD_2) * inv(U2)
       call DQMC_trans(n, G_ij, U2)
       call DQMC_ScaleRow(n, G_ij, bar2i)

       ! Straight inversion of "C" using LU (dgesv calls dgetrf).
       ! To be modified using the safer alternative of a further
       ! UDT decomposition followed by inversion?
       ! G_ij = inv(W_1)*inv(barD_2)*inv(U_2)
       call lapack_dgesv(n, n, W1, n, pvt2, G_ij, n, info)

       if (info .ne. 0) then
          call DQMC_Error("Error: dgesv(2) in dqmc_getgtau.", info)
       end if
       
       ! G_ij = inv(barD_1)*G_ij = inv(barD_1)*inv(W_1)*inv(barD_2)*inv(U_2)
       call DQMC_ScaleRow(n, G_ij, bar1i)
       
       ! G_ij = inv(T_1)*G_ij
       !      = inv(T_1)*inv(barD_1)*inv(C)*inv(barD_2)*inv(U_2)
       call lapack_dgetrs('N', n, n, W2, n, pvt1, G_ij, n, info)
       if (info .ne. 0) then
          call DQMC_Error("Error: dgetrs(1) in dqmc_getgtau.", info)
       end if

    end if

    !
    ! Compute G_ji, repeat step 2, 3, 4 for Gji
    ! ==========================================

    if (which.eq.TAU_0T .or. which .eq. TAU_BOTH) then       
       !
       ! STEP 5.  inv(D_1) = barD_1*hatD_1
       !          inv(D_2) = barD_2*hatD_2
       ! ======================================
       !
       do i = 1, n
          if (D1(i) .eq. ZERO) then
             call DQMC_Error("Error: in dqmc_getgtau, D1(i)=0.0, i=", i)
          end if
          D1(i) = ONE / D1(i)
          bar1i(i) = ONE / max(ONE, D1(i))
          hat1(i) = D1(i) * bar1i(i)

          if (D2(i) .eq. ZERO) then
             call DQMC_Error("Error: in dqmc_getgtau, D2(i)=0.0, i=", i)
          end if
          D2(i) = ONE / D2(i)
          bar2i(i) = ONE / max(ONE, D2(i))
          hat2(i) = D2(i) * bar2i(i)
       end do
       
       !   
       ! STEP 6. Compute G_ji = hatD_1*inv(U_1)U_2*inv(barD_2)+
       !                        inv(barD_1)T_1*inv(T_2)hatD_2
       ! =======================================================   
       if (tau%which .eq. TAU_BOTH) then
          ! Previously, T_2 = T_2*inv(T_1)
          !             U_1 = inv(U_2)*U_1
          ! Therefore, we only need to invert them.
          
          ! first, compute inv(barD_1)T_1*inv(T_2)hatD_2          
          call lapack_dgetrf(n, n, T2, n, pvt1, info)
          if (info .ne. 0) then
             call DQMC_Error("Error: dgetrf(1) in dqmc_getgtau.", info)
          end if
          call lapack_dgetri(n, T2, n, pvt1, rw, lw(LA_GETRI), info)
          
          ! W1 = U1' = inv(inv(U_2)*U_1) = inv(U_1)*U_2
          call DQMC_Trans(n, W1, U1)
           
       else
          ! No previous computed results. Compute them from scratch.
          !
          ! (1) Compute T_1*inv(T_2) 
          !     Let W_1 = T_1'
          call DQMC_trans(n, W1, T1)

          !     W_1 = inv(T_2')*W_1 = inv(T_2')*T_1'
          call lapack_dgetrf(n, n, T2, n, pvt1, info)
          call lapack_dgetrs('T', n, n, T2, n, pvt1, W1, n, info)
          if (info .ne. 0) then
             call DQMC_Error("Error: dgetrs(1) in dqmc_getgtau.", info)
          end if
          !     T_2 = W_1' = (inv(T_2')*T_1')' = T_1*inv(T_2)
          call DQMC_trans(n, T2, W1)
          
          ! (2) Compute W_1 = U_2'*U_1
          call blas_dgemm('T', 'N', n, n, n, ONE, U1, n, U2, n, ZERO, W1, n)
       end if


       ! Compute inv(barD_1)T_1*inv(T_2)hatD_2
       call DQMC_ScaleRow(n, T2, bar1i)
       call DQMC_ScaleCol(n, T2, hat2)
          
       ! Compute hatD_1*inv(U_1)U_2*inv(barD_2)
       call DQMC_ScaleRow(n, W1, hat1)
       call DQMC_ScaleCol(n, W1, bar2i)
       

       ! W1 = W1 + T2
       call blas_daxpy(n*n, ONE, T2, 1, W1, 1)

       !
       ! STEP 7. Compute U_2*inv(barD_2)*inv(...)*inv(barD_1)*T_1
       ! =========================================================
       !
       call DQMC_ScaleRow(n, T1, bar1i)
       call lapack_dgesv(n, n, W1, n, pvt1, T1, n, info)
       if (info .ne. 0) then
          call DQMC_Error("Error: dgesv(3) in dqmc_getgtau.", info)
       end if
       
       ! inv(barD_2)*inv(...)*inv(barD_1)*T_1
       call DQMC_ScaleRow(n, T1, bar2i)

       ! copy the previous result
       call blas_dcopy(n*n,G_ij,1,W2,1)

       ! multiply -U2, the sign is negative
       call blas_dgemm('N', 'N', n, n, n, -ONE, U2, n, T1, n, ZERO, G_ji, n)

    end if

    if( ib == jb .and. tau%which > TAU_T0 )then
      G_ji = G_ij
    endif

    if ( jb > ib ) then
      if ( tau%which < TAU_0T ) G_ij=-G_ij
      if ( tau%which > TAU_T0 ) G_ji=-G_ji
    endif

    tau%ii = ib
    tau%ib = jb
    
  end subroutine DQMC_GetGtau2

  !--------------------------------------------------------------------!

  subroutine DQMC_change_gtau_time(idir, tau, G_up, G_dn)
    !
    ! Purpose
    ! =======
    ! This subroutine computes a new Gtau which is adjecent 
    ! to the one stored in tau using the recursion relations. 
    ! idir specifies which one of the adjecent four G has to be 
    ! computed. 
    !
    ! tau contains (or may contain) two blocks : 
    ! G(i,j) and G(j,i) where i and j are time indices. tau%ii
    ! and tau%ib are assumed to contain the indices i and j
    ! (Note that somewhere else tau%ib contains the displacement
    ! from tau%ii instead). The variable tau%which says whether
    ! G(i,j) and/or G(j,i) are stored. 
    !
    ! This routine applies the following transformation:
    !   if (idir == 1) G(i,j)=>G(i+1,j) and/or G(j,i)=>G(j,i+1)
    !   if (idir == 2) G(i,j)=>G(i-1,j) and/or G(j,i)=>G(j,i-1)
    !   if (idir == 3) G(i,j)=>G(i,j+1) and/or G(j,i)=>G(j+1,i)
    !   if (idir == 4) G(i,j)=>G(i,j-1) and/or G(j,i)=>G(j-1,i)
    ! keeping correctly track of the case where i==j (either 
    ! initially or after the transformation). 
    ! i and j are always kept between 1 and L.
    !
    ! Arguments
    ! =========
    !
    type(Gtau),  intent(inout) :: tau
    type(G_fun), intent(in)    :: G_up, G_dn
    integer,     intent(in)    :: idir
  
    ! ... local ...
    integer :: i, j, id, n, L

    ! ... aliases ...
    real(wp), pointer :: up(:,:)          
    real(wp), pointer :: dn(:,:)
    real(wp), pointer :: W(:,:)
    type(MatB), pointer :: B_up, B_dn

    B_up => tau%B_up
    B_dn => tau%B_dn
    W => tau%W1

    n = tau%n
    L = tau%L

    if(tau%which .le. TAU_BOTH) then

       up => tau%upt0
       dn => tau%dnt0

       select case (idir)

       case (1) ! G(i,j)=> G(i+1,j) 

          i = tau%ii + 1
          if(i > L) i = 1

          !Multiply by B_{ii+1} 
          call DQMC_MultB_Left  (n, up, B_up, G_up%V(:,i), W)
          call DQMC_MultB_Left  (n, dn, B_dn, G_dn%V(:,i), W)

          !Time wrapped through beta. Need to change sign.
          if(i == 1)then
            up = -up
            dn = -dn
          endif

          !Final G is equal time. Handle G(i,j) properly.
          if (tau%ib == i) then
             do id = 1, n
                up(id,id) = 1.d0 + up(id,id)
                dn(id,id) = 1.d0 + dn(id,id)
             enddo
          endif

       case (2) ! G(i,j)=> G(i-1,j) 

          i = tau%ii

          !Initial G is equal time. Handle G(i,j) properly.
          if (tau%ib == tau%ii) then
             do id = 1, n
                up(id,id) = -1.d0 + up(id,id)
                dn(id,id) = -1.d0 + dn(id,id)
             enddo
          endif

          call DQMC_MultBi_Left(n, up, B_up, G_up%V(:,i), W)
          call DQMC_MultBi_Left(n, dn, B_dn, G_dn%V(:,i), W)

          !Time wrapped through zero. Need to change sign.
          if(i == 1)then
            up = -up
            dn = -dn
            i = L
          else
            i = i -1
          endif

       case (3) !G(i,j)=> G(i,j+1) 

          j = tau%ib + 1
          if (j > L) j = 1

          if (tau%ib == tau%ii) then
             do id = 1, n
                up(id,id) = -1.d0 + up(id,id)
                dn(id,id) = -1.d0 + dn(id,id)
             enddo
          endif

          call DQMC_MultBi_Right(n, up, B_up, G_up%V(:,j), W)
          call DQMC_MultBi_Right(n, dn, B_dn, G_dn%V(:,j), W)

          !Time wrapped through beta. Need to change sign.
          if(j == 1)then
            up = -up
            dn = -dn
          endif

       case(4) !G(i,j)=> G(i,j-1) 

          j = tau%ib

          call DQMC_MultB_Right(n, up, B_up, G_up%V(:,j), W)
          call DQMC_MultB_Right(n, dn, B_dn, G_dn%V(:,j), W)

          !Time wrapped through zero. Need to change sign.
          if(j == 1)then
            up = -up
            dn = -dn
            j = L
          else
            j = j - 1
          endif

          !Final G is equal time. Treat G(i,j) properly.
          if(tau%ii == j)then
             do id = 1, n
                up(id,id) = 1.d0 + up(id,id)
                dn(id,id) = 1.d0 + dn(id,id)
             enddo
          endif

       end select

    endif

    if(tau%which .ge. TAU_BOTH) then

       up => tau%up0t
       dn => tau%dn0t

       select case (idir)

       case (1) ! G(j,i)=>G(j,i+1)

          i = tau%ii+1
          if(i > L) i = 1

          !initial G is equal time. Handle G(j,i) properly.
          if (tau%ib == tau%ii) then
             do id = 1, n
                up(id,id) = -1.d0 + up(id,id)
                dn(id,id) = -1.d0 + dn(id,id)
             enddo
          endif

          !Multiply by B_{i+1} and its inverse
          call DQMC_MultBi_Right(n, up, B_up, G_up%V(:,i), W)
          call DQMC_MultBi_Right(n, dn, B_dn, G_dn%V(:,i), W)

          !Time wrapped through beta. Need to change sign.
          if(i == 1)then
            up = -up
            dn = -dn
          endif

       case(2) ! G(j,i)=>G(j,i-1)

          i = tau%ii

          call DQMC_MultB_Right(n, up, B_up, G_up%V(:,i), W)
          call DQMC_MultB_Right(n, dn, B_dn, G_dn%V(:,i), W)

          !Time wrapped through zero. Need to change sign.
          if(i == 1)then
            up = -up
            dn = -dn
            i = L
          else
            i = i -1
          endif

          !Final G is equal time. Handle G(j,i) properly.
          if(tau%ib == i)then
             do id=1,n
                up(id,id) = 1.d0 + up(id,id)
                dn(id,id) = 1.d0 + dn(id,id)
             enddo
          endif

       case(3) !G(j,i)=>G(j+1,i)

          j = tau%ib + 1
          if(j > L) j = 1

          call DQMC_MultB_Left  (n, up, B_up, G_up%V(:,j), W)
          call DQMC_MultB_Left  (n, dn, B_dn, G_dn%V(:,j), W)

          !Time wrapped through beta. Need to change sign.
          if(j == 1)then
            up = -up
            dn = -dn
          endif

          !Final G is equal time. Handle G(j,i) properly.
          if(tau%ii == j)then
             do id=1,n
                up(id,id) = 1.d0 + up(id,id)
                dn(id,id) = 1.d0 + dn(id,id)
             enddo
          endif

       case(4) ! G(j,i)=>G(j-1,i)

          j = tau%ib

          if (tau%ii == tau%ib) then
             do id = 1, n
                up(id,id) = -1.d0 + up(id,id)
                dn(id,id) = -1.d0 + dn(id,id)
             enddo
          endif

          call DQMC_MultBi_Left(n, up, B_up, G_up%V(:,j), W)
          call DQMC_MultBi_Left(n, dn, B_dn, G_dn%V(:,j), W)

          !Time wrapped through zero. Need to change sign.
          if(j == 1)then
             up = -up
             dn = -dn
             j = L
          else
             j = j - 1
          endif

       end select

    endif

    !Update block index
    select case (idir)
      case(1, 2)
         tau%ii = i
      case(3, 4)
         tau%ib = j
    end select 

  end subroutine DQMC_change_gtau_time

end module DQMC_GTAU
