
    &`i6?                         d Z ddlZddlmZ ddlmZ  e            \  ZZZ ej	        dg d          Z
 ej	        dd          Zedfd	Zd
 ZeddddfdZ	 	 	 	 ddZ	 	 	 ddZd ZdS )a   Functions to compute V-trace off-policy actor critic targets.

For details and theory see:

"IMPALA: Scalable Distributed Deep-RL with
Importance Weighted Actor-Learner Architectures"
by Espeholt, Soyer, Munos et al.

See https://arxiv.org/abs/1802.01561 for the full paper.

In addition to the original paper's code, changes have been made
to support MultiDiscrete action spaces. behaviour_policy_logits,
target_policy_logits and actions parameters in the entry point
multi_from_logits method accepts lists of tensors instead of just
tensors.
    N)Categorical)try_import_tfVTraceFromLogitsReturnsvspg_advantageslog_rhosbehaviour_action_log_probstarget_action_log_probsVTraceReturnszvs pg_advantagesc                 6    t          | g|g||          d         S )Nr   )'multi_log_probs_from_logits_and_actions)policy_logitsactions
dist_classmodels       y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/algorithms/impala/vtrace_tf.py!log_probs_from_logits_and_actionsr   4   s+     3	'J 	 	    c           
         g }t          t          |                     D ]"}t                              | |                   }t                              ||                   }t                              | |         t                              dg|dd         gd                    }t                              ||         t                              dg|dd         gd                    }	|                    t                               |||                              |	          |dd                              $|S )a  Computes action log-probs from policy logits and actions.

    In the notation used throughout documentation and comments, T refers to the
    time dimension ranging from 0 to T-1. B refers to the batch size and
    ACTION_SPACE refers to the list of numbers each representing a number of
    actions.

    Args:
        policy_logits: A list with length of ACTION_SPACE of float32
            tensors of shapes [T, B, ACTION_SPACE[0]], ...,
            [T, B, ACTION_SPACE[-1]] with un-normalized log-probabilities
            parameterizing a softmax policy.
        actions: A list with length of ACTION_SPACE of tensors of shapes
            [T, B, ...], ..., [T, B, ...]
            with actions.
        dist_class: Python class of the action distribution.

    Returns:
        A list with length of ACTION_SPACE of float32 tensors of shapes
            [T, B], ..., [T, B] corresponding to the sampling log probability
            of the chosen action w.r.t. the policy.
       Nr   axis)rangelentfshapereshapeconcatappendlogp)
r   r   r   r   	log_probsip_shapea_shapepolicy_logits_flatactions_flats
             r   r   r   <   s*   . I3}%%&& 
 
((=+,,((71:&&ZZ!bii"wqrr{(;!iDD
 
 zz'!*bii"wqrr{8KRSi.T.TUUJJ
-u55::<HH'RTSTRT+ 	
 	
 	
 	
 r         ?vtrace_from_logitsc                     t          | g|g|g|||||||	|
|          }t          |j        |j        |j        t
                              |j        d          t
                              |j        d                    S )z-multi_from_logits wrapper used only for tests)clip_rho_thresholdclip_pg_rho_thresholdnamer   r   r   )	multi_from_logitsr   r   r   r	   r   squeezer
   r   )behaviour_policy_logitstarget_policy_logitsr   	discountsrewardsvaluesbootstrap_valuer   r   r,   r-   r.   ress                r   from_logitsr8   d   s      	 !			-3  C #6'#%::c.LST:#U#U "

3+FQ
 O O   r   c           
         t          t          |                     D ]}t                              | |         t          j                  | |<   t                              ||         t          j                  ||<   | |         j                            d           ||         j                            d           t                              || ||||||g          5  t          ||||          }t          |           dk    s|	t          | |||          }	t          ||	          }t          ||||||
|          }t          d||	|d|                                cddd           S # 1 swxY w Y   dS )	a  V-trace for softmax policies.

    Calculates V-trace actor critic targets for softmax polices as described in

    "IMPALA: Scalable Distributed Deep-RL with
    Importance Weighted Actor-Learner Architectures"
    by Espeholt, Soyer, Munos et al.

    Target policy refers to the policy we are interested in improving and
    behaviour policy refers to the policy that generated the given
    rewards and actions.

    In the notation used throughout documentation and comments, T refers to the
    time dimension ranging from 0 to T-1. B refers to the batch size and
    ACTION_SPACE refers to the list of numbers each representing a number of
    actions.

    Args:
      behaviour_policy_logits: A list with length of ACTION_SPACE of float32
        tensors of shapes
        [T, B, ACTION_SPACE[0]],
        ...,
        [T, B, ACTION_SPACE[-1]]
        with un-normalized log-probabilities parameterizing the softmax behaviour
        policy.
      target_policy_logits: A list with length of ACTION_SPACE of float32
        tensors of shapes
        [T, B, ACTION_SPACE[0]],
        ...,
        [T, B, ACTION_SPACE[-1]]
        with un-normalized log-probabilities parameterizing the softmax target
        policy.
      actions: A list with length of ACTION_SPACE of
        tensors of shapes
        [T, B, ...],
        ...,
        [T, B, ...]
        with actions sampled from the behaviour policy.
      discounts: A float32 tensor of shape [T, B] with the discount encountered
        when following the behaviour policy.
      rewards: A float32 tensor of shape [T, B] with the rewards generated by
        following the behaviour policy.
      values: A float32 tensor of shape [T, B] with the value function estimates
        wrt. the target policy.
      bootstrap_value: A float32 of shape [B] with the value function estimate at
        time T.
      dist_class: action distribution class for the logits.
      model: backing ModelV2 instance
      behaviour_action_log_probs: precalculated values of the behaviour actions
      clip_rho_threshold: A scalar float32 tensor with the clipping threshold for
        importance weights (rho) when calculating the baseline targets (vs).
        rho^bar in the paper.
      clip_pg_rho_threshold: A scalar float32 tensor with the clipping threshold
        on rho_s in \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)).
      name: The name scope that all V-trace operations will be created in.

    Returns:
      A `VTraceFromLogitsReturns` namedtuple with the following fields:
        vs: A float32 tensor of shape [T, B]. Can be used as target to train a
            baseline (V(x_t) - vs_t)^2.
        pg_advantages: A float 32 tensor of shape [T, B]. Can be used as an
          estimate of the advantage in the calculation of policy gradients.
        log_rhos: A float32 tensor of shape [T, B] containing the log importance
          sampling weights (log rhos).
        behaviour_action_log_probs: A float32 tensor of shape [T, B] containing
          behaviour policy action log probabilities (log \mu(a_t)).
        target_action_log_probs: A float32 tensor of shape [T, B] containing
          target policy action probabilities (log \pi(a_t)).
    dtype   r5      N)r	   r3   r4   r5   r6   r,   r-   )r	   r
   r    )r   r   r   convert_to_tensorfloat32r   assert_has_ranktf1
name_scoper   get_log_rhosfrom_importance_weightsr   _asdict)r1   r2   r   r3   r4   r5   r6   r   r   r
   r,   r-   r.   r$   r   r	   vtrace_returnss                    r   r/   r/      s   j 3.//00 9 9%'%9%9#A&bj &: &
 &
" #%"6"6 #2: #7 #
 #
Q 	 "(88;;;Q%55a8888	# 
 
 
 
 )
 )
 #J ':u#
 #
 &''!++/I/Q *Q'*e* *&   79STT0+1"7
 
 
 ' 
'A$;
 
 $$&&	
 
I)
 )
 )
 )
 )
 )
 )
 )
 )
 )
 )
 )
 )
 )
 )
 )
 )
 )
s   !BE//E36E3vtrace_from_importance_weightsc                    t                               | t           j                  } t                               |t           j                  }t                               |t           j                  }t                               |t           j                  }t                               |t           j                  }|&t                               |t           j                  }|&t                               |t           j                  }| j        j        }|j                            |           |j                            |dz
             |j                            |           |j                            |           ||j                            d           ||j                            d           t                              || ||||g          5  t           j        	                    |           }	|t           
                    ||	d          }
n|	}
t           
                    d|	d	          }t                               |dd         t                               |d          gd
          }|
|||z  z   |z
  z  }t                               |dg
          t                               |dg
          t                               |dg
          f}d }t                               |          }t           j                            t           j        t                               |||dd                    }t                               |dgd          }t                               ||d          }t                               |dd         t                               |d          gd
          }|t           
                    ||	d          }n|	}||||z  z   |z
  z  }t)          t                               |          t                               |                    cddd           S # 1 swxY w Y   dS )aF  V-trace from log importance weights.

    Calculates V-trace actor critic targets as described in

    "IMPALA: Scalable Distributed Deep-RL with
    Importance Weighted Actor-Learner Architectures"
    by Espeholt, Soyer, Munos et al.

    In the notation used throughout documentation and comments, T refers to the
    time dimension ranging from 0 to T-1. B refers to the batch size. This code
    also supports the case where all tensors have the same number of additional
    dimensions, e.g., `rewards` is [T, B, C], `values` is [T, B, C],
    `bootstrap_value` is [B, C].

    Args:
      log_rhos: A float32 tensor of shape [T, B] representing the
        log importance sampling weights, i.e.
        log(target_policy(a) / behaviour_policy(a)). V-trace performs operations
        on rhos in log-space for numerical stability.
      discounts: A float32 tensor of shape [T, B] with discounts encountered when
        following the behaviour policy.
      rewards: A float32 tensor of shape [T, B] containing rewards generated by
        following the behaviour policy.
      values: A float32 tensor of shape [T, B] with the value function estimates
        wrt. the target policy.
      bootstrap_value: A float32 of shape [B] with the value function estimate at
        time T.
      clip_rho_threshold: A scalar float32 tensor with the clipping threshold for
        importance weights (rho) when calculating the baseline targets (vs).
        rho^bar in the paper. If None, no clipping is applied.
      clip_pg_rho_threshold: A scalar float32 tensor with the clipping threshold
        on rho_s in \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)). If
        None, no clipping is applied.
      name: The name scope that all V-trace operations will be created in.

    Returns:
      A VTraceReturns namedtuple (vs, pg_advantages) where:
        vs: A float32 tensor of shape [T, B]. Can be used as target to
          train a baseline (V(x_t) - vs_t)^2.
        pg_advantages: A float32 tensor of shape [T, B]. Can be used as the
          advantage in the calculation of policy gradients.
    r:   Nr>   r   r=   clipped_rhos)r.   r)   csr   c                 $    |\  }}}|||z  | z  z   S )Nr?   )accsequence_item
discount_tc_tdelta_ts        r   scanfuncz)from_importance_weights.<locals>.scanfunc~  s#    '4$JWZ#-333r   scan)fnelemsinitializerparallel_iterationsr.   vs_minus_v_xsr   clipped_pg_rhos)r   r   )r   r@   rA   r   ndimsrB   rC   rD   mathexpminimumr    expand_dimsreverse
zeros_likenestmap_structurestop_gradientrT   addr   )r	   r3   r4   r5   r6   r,   r-   r.   rho_rankrhosrK   rL   values_t_plus_1deltas	sequencesrS   initial_valuesrY   r   vs_t_plus_1rZ   r   s                         r   rF   rF     s6   h ##HBJ#??H$$Ybj$AAI""7"*"==G!!&
!;;F**?"**MMO%112DBJ1WW( " 4 4! !5 !
 !

 ~#H
L  ***))(Q,777O##H---M!!(+++% 00333(#33A666	h	7FOL 
 
 
 ;
 ;
 w{{8$$)::&8$^:TTLLLZZTZ--))ABBZ;;<1 $ 
 
 9+F!F!OP JJysJ++JJrJ$$JJvQCJ((
		4 	4 	4 77--GG*$%   	
 	
 

=1#O
LL VVM6V55 iiABB)K)K LSTiUU , jj%t2C )  OO #O'7Y5L+Lv+UV ##23C3CM3R3R
 
 
s;
 ;
 ;
 ;
 ;
 ;
 ;
 ;
 ;
 ;
 ;
 ;
 ;
 ;
 ;
 ;
 ;
 ;
s   IP<<Q Q c                     t                               |           }t                               |          }t                               ||z
  d          }|S )zWith the selected log_probs for multi-discrete actions of behaviour
    and target policies we compute the log_rhos for calculating the vtrace.r   r   )r   stack
reduce_sum)r   r
   tbr	   s        r   rE   rE     sF     	())A
+,,A}}QU}++HOr   )Nr)   r)   r*   )r)   r)   rI   )__doc__collections"ray.rllib.models.tf.tf_action_distr   ray.rllib.utils.frameworkr   rC   r   tfv
namedtupler   r   r   r   r8   r/   rF   rE   r?   r   r   <module>rx      sE   "     : : : : : : 3 3 3 3 3 3}R0+0  	 	  '&8JKK (3$	 	 	 	% % %` 
	% % % %d  $	K
 K
 K
 K
h 	)F
 F
 F
 F
R    r   