
    &`i	                        d dl mZmZmZmZmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZmZ erd d
lmZ d dlmZ  e            \  ZZZ e            \  ZZe G d d                      ZdS )    )TYPE_CHECKINGDictListOptionalUnion)Space)BaseEnv)ActionDistribution)ModelV2)SampleBatch)OldAPIStack)
TensorTypetry_import_torch)AlgorithmConfigDictLocalOptimizer)Policy)try_import_tfc                      e Zd ZdZdedededededefdZ	d	d	d	d
de
eeef                  de
eeef                  de
d         fdZdddedeeef         defdZd	d	d	ddddedede
d         fdZd	d	d	ddddedede
d         fdZ	 d$dddede
d         fdZdee         dee         fdZd$d e
d         deeef         fd!Zd$d"ed e
d         dd	fd#Zd	S )%ExplorationzImplements an exploration strategy for Policies.

    An Exploration takes model outputs, a distribution, and a timestep from
    the agent and computes an action to apply to the environment using an
    implemented exploration schema.
    action_space	frameworkpolicy_configmodelnum_workersworker_indexc                "   || _         || _        || _        || _        || _        || _        d| _        t          | j        t          j	                  r<t          | j                                                  }|r|d         j        | _        dS dS dS )ac  
        Args:
            action_space: The action space in which to explore.
            framework: One of "tf" or "torch".
            policy_config: The Policy's config dict.
            model: The Policy's model.
            num_workers: The overall number of workers used.
            worker_index: The index of the worker using this class.
        Nr   )r   r   r   r   r   r   device
isinstancennModulelist
parameters)selfr   r   r   r   r   r   paramss           {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/utils/exploration/exploration.py__init__zExploration.__init__   s    & )*
&(" dj"),, 	/$*//1122F /$Qi.	/ 	// /    N)timestepexploretf_sessr(   r)   r*   z
tf.Sessionc                    dS )a)  Hook for preparations before policy.compute_actions() is called.

        Args:
            timestep: An optional timestep tensor.
            explore: An optional explore boolean flag.
            tf_sess: The tf-session object to use.
            **kwargs: Forward compatibility kwargs.
        N )r#   r(   r)   r*   kwargss        r%   before_compute_actionsz"Exploration.before_compute_actions@   	      	r'   T)r)   action_distributionc                    dS )aa  Returns a (possibly) exploratory action and its log-likelihood.

        Given the Model's logits outputs and action distribution, returns an
        exploratory action.

        Args:
            action_distribution: The instantiated
                ActionDistribution object to work with when creating
                exploration actions.
            timestep: The current sampling time step. It can be a tensor
                for TF graph mode, otherwise an integer.
            explore: True: "Normal" exploration behavior.
                False: Suppress all exploratory behavior and return
                a deterministic action.

        Returns:
            A tuple consisting of 1) the chosen exploration action or a
            tf-op to fetch the exploration action from the graph and
            2) the log-likelihood of the exploration action.
        Nr,   )r#   r0   r(   r)   s       r%   get_exploration_actionz"Exploration.get_exploration_actionU   s	    2 	r'   )environmentepisoder*   policyr   r3   r4   c                    dS )aY  Handles necessary exploration logic at the beginning of an episode.

        Args:
            policy: The Policy object that holds this Exploration.
            environment: The environment object we are acting in.
            episode: The number of the episode that is starting.
            tf_sess: In case of tf, the session object.
        Nr,   r#   r5   r3   r4   r*   s        r%   on_episode_startzExploration.on_episode_starts   r/   r'   c                    dS )aS  Handles necessary exploration logic at the end of an episode.

        Args:
            policy: The Policy object that holds this Exploration.
            environment: The environment object we are acting in.
            episode: The number of the episode that is starting.
            tf_sess: In case of tf, the session object.
        Nr,   r7   s        r%   on_episode_endzExploration.on_episode_end   r/   r'   sample_batchc                     |S )a}  Handles post-processing of done episode trajectories.

        Changes the given batch in place. This callback is invoked by the
        sampler after policy.postprocess_trajectory() is called.

        Args:
            policy: The owning policy object.
            sample_batch: The SampleBatch object to post-process.
            tf_sess: An optional tf.Session object.
        r,   )r#   r5   r;   r*   s       r%   postprocess_trajectoryz"Exploration.postprocess_trajectory   s
      r'   
optimizersreturnc                     |S )a  May add optimizer(s) to the Policy's own `optimizers`.

        The number of optimizers (Policy's plus Exploration's optimizers) must
        match the number of loss terms produced by the Policy's loss function
        and the Exploration component's loss terms.

        Args:
            optimizers: The list of the Policy's local optimizers.

        Returns:
            The updated list of local optimizers to use on the different
            loss terms.
        r,   )r#   r>   s     r%   get_exploration_optimizerz%Exploration.get_exploration_optimizer   s
      r'   sessc                     i S )zReturns the current exploration state.

        Args:
            sess: An optional tf Session object to use.

        Returns:
            The Exploration object's current state.
        r,   )r#   rB   s     r%   	get_statezExploration.get_state   s	     	r'   statec                     dS )a  Sets the Exploration object's state to the given values.

        Note that some exploration components are stateless, even though they
        decay some values over time (e.g. EpsilonGreedy). However the decay is
        only dependent on the current global timestep of the policy and we
        therefore don't need to keep track of it.

        Args:
            state: The state to set this Exploration to.
            sess: An optional tf Session object to use.
        Nr,   )r#   rE   rB   s      r%   	set_statezExploration.set_state   s	     	r'   )N)__name__
__module____qualname____doc__r   strr   r   intr&   r   r   r   boolr.   r
   r2   r	   r8   r:   r   r=   r   r   rA   r   rD   objectrG   r,   r'   r%   r   r      s        // 	/
 +/ / / / / / /H 6:59*.   5S12 %
D 012	
 ,'   2 04	  4F */z3)? )-	   D  $*.   	
  ,'   ,  $*.   	
  ,'   , +/	  " ,'	   $~.	n	   $	 	h|4 	S*_@U 	 	 	 	 v Xl-C t      r'   r   N) typingr   r   r   r   r   gymnasium.spacesr   ray.rllib.env.base_envr	   ray.rllib.models.action_distr
   ray.rllib.models.modelv2r   ray.rllib.policy.sample_batchr   ray.rllib.utils.annotationsr   ray.rllib.utils.frameworkr   r   ray.rllib.utils.typingr   r   ray.rllib.policy.policyr   ray.rllib.utilsr   _tfr   r   r,   r'   r%   <module>r]      sb   = = = = = = = = = = = = = = " " " " " " * * * * * * ; ; ; ; ; ; , , , , , , 5 5 5 5 5 5 3 3 3 3 3 3 B B B B B B B B F F F F F F F F ......------}HAr12 { { { { { { { { { {r'   