
    &`i)                        d dl mZmZmZ d dlZd dlmZmZm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZmZm Z   e            \  Z!Z"Z# G d d          Z$ede%de&de&de'de&f
d            Z(edej)        de'de'dej)        fd            Z*e G d de                      Z+dS )    )ListOptionalUnionN)BoxDiscreteSpace)ActionDistribution)ModelCatalog)ModelV2)SampleBatch)OldAPIStackoverride)Exploration)try_import_tf)from_config)get_placeholder)FromConfigSpecModelConfigDict
TensorTypec                       e Zd ZdZddedeee                  fdZde	j
        de	j
        fd	Zd
edededdfdZedefd            ZdS )_MovingMeanStdz!Track moving mean, std and count.-C6?Nepsilonshapec                     |sg }t          j        |t           j                  | _        t          j        |t           j                  | _        || _        dS )zInitialize object.

        Args:
            epsilon: Initial count.
            shape: Shape of the trackables mean and std.
        )dtypeN)npzerosfloat32meanonesvarcount)selfr   r   s      ~/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/utils/exploration/random_encoder.py__init__z_MovingMeanStd.__init__   sL      	EHU"*555	75
333


    inputsreturnc                     t          j        |d          }t          j        |d          }|j        d         }|                     |||           t          j        || j        z  dz             S )zNormalize input batch using moving mean and std.

        Args:
            inputs: Input batch to normalize.

        Returns:
            Logarithmic scaled normalized output.
        r   axis   )r   r    r"   r   update_paramslogstd)r$   r(   
batch_mean	batch_varbatch_counts        r%   __call__z_MovingMeanStd.__call__$   sk     WV!,,,
F6***	l1o:y+>>>vftx'!+,,,r'   r1   r2   r3   c                     || j         z
  }| j        |z   }| j         |z   ||z  z   | _         | j        | j        z  }||z  }||z   t          j        |d          | j        z  |z  |z  z   }||z  | _        || _        dS )zUpdate moving mean, std and count.

        Args:
            batch_mean: Input batch mean.
            batch_var: Input batch variance.
            batch_count: Number of cases in the batch.
           N)r    r#   r"   r   power)	r$   r1   r2   r3   delta	tot_countm_am_bM2s	            r%   r.   z_MovingMeanStd.update_params3   s     TY&J,	 I%i(??	h#+%3Y%++dj8;FRR	>


r'   c                 4    t          j        | j                  S )zhGet moving standard deviation.

        Returns:
            Returns moving standard deviation.
        )r   sqrtr"   )r$   s    r%   r0   z_MovingMeanStd.stdH   s     wtx   r'   )r   N)__name__
__module____qualname____doc__floatr   r   intr&   r   ndarrayr4   r.   propertyr0    r'   r%   r   r      s        ++  Xd3i5H    -rz -bj - - - -,1@E	   * !U ! ! ! X! ! !r'   r   beta_schedulebetarhostepr)   c                 (    | dk    r|d|z
  |z  z  S |S )a  Update beta based on schedule and training step.

    Args:
        beta_schedule: Schedule for beta update.
        beta: Initial beta.
        rho: Schedule decay parameter.
        step: Current training iteration.

    Returns:
        Updated beta as per input schedule.
    linear_decay      ?rG   )rH   rI   rJ   rK   s       r%   update_betarO   R   s(     &&c	d*++Kr'   
obs_embeds	embed_dimk_nnc                 >   t          j        | d|g          }t           j                            |dddddf         |dddddf         z
  d          }|                    d          ddd|f         dddf                             t           j                  S )a<  Compute states entropy using K nearest neighbour method.

    Args:
        obs_embeds: Observation latent representation using
            encoder model.
        embed_dim: Embedding vector dimension.
        k_nn: Number of nearest neighbour for K-NN estimation.

    Returns:
        Computed states entropy.
    Nr+   )r   reshapelinalgnormargsortastyper   )rP   rQ   rR   obs_embeds_dists        r%   compute_states_entropyr\   d   s     *Z"i99K9>>+aaaqqqj1Kaaa
4KKRT>UUD<<R<  ETE*111b5188DDDr'   c                        e Zd ZdZddddddddd	d
ededededee	         de
dede
dededee         f fdZ ee          dddedeeef         defd            Z ee          dd            Zd Z xZS )RE3a  Random Encoder for Efficient Exploration.

    Implementation of:
    [1] State entropy maximization with random encoders for efficient
    exploration. Seo, Chen, Shin, Lee, Abbeel, & Lee, (2021).
    arXiv preprint arXiv:2102.09430.

    Estimates state entropy using a particle-based k-nearest neighbors (k-NN)
    estimator in the latent space. The state's latent representation is
    calculated using an encoder with randomly initialized parameters.

    The entropy of a state is considered as intrinsic reward and added to the
    environment's extrinsic reward for policy optimization.
    Entropy is calculated per batch, it does not take the distribution of
    the entire replay buffer into consideration.
       Ng?constantg?2   i'  )
embeds_dimencoder_net_configrI   rH   rJ   rR   random_timestepssub_explorationaction_space	frameworkmodelrb   rc   rI   rH   rJ   rR   rd   re   c       
   
         |dk    rt          d           t                      j        |f||d| || _        || _        |	| _        || _        || j        d                                         }|| _	        |Xt          | j        t                    rddd|
d	z   d
f|
dz   dfgddd}n't          | j        t                    rd|
d}nt          || _        t!          j        | j        j        | j        | j        | j	        | j        d          | _        | j        dk    rit-          | j        j        d          | _        t0                              |                     t4          j        | j        i          d                   | _        t;          t<          | j        | j        | j        | j        | j        | j        | j                   | _!        dS )a  Initialize RE3.

        Args:
            action_space: The action space in which to explore.
            framework: Supports "tf", this implementation does not
                support torch.
            model: The policy's model.
            embeds_dim: The dimensionality of the observation embedding
                vectors in latent space.
            encoder_net_config: Optional model
                configuration for the encoder network, producing embedding
                vectors from observations. This can be used to configure
                fcnet- or conv_net setups to properly process any
                observation space.
            beta: Hyperparameter to choose between exploration and
                exploitation.
            beta_schedule: Schedule to use for beta decay, one of
                "constant" or "linear_decay".
            rho: Beta decay factor, used for on-policy algorithm.
            k_nn: Number of neighbours to set for K-NN entropy
                estimation.
            random_timesteps: The number of timesteps to act completely
                randomly (see [1]).
            sub_exploration: The config dict for the underlying Exploration
                to use (e.g. epsilon-greedy for DQN). If None, uses the
                FromSpecDict provided in the Policy's default config.

        Raises:
            ValueError: If the input framework is Torch.
        torchz/This RE3 implementation does not support Torch.)rh   rg   Nrh   EpsilonGreedyPiecewiseSchedule)r   rN   r-   rN   r6   g{Gz?)type	endpointsoutside_value)rm   epsilon_scheduleOrnsteinUhlenbeckNoise)rm   rd   encoder_net)model_configrg   nametf_encoder_obs)spacert   r   )clsconfigrf   rg   policy_configrh   num_workersworker_index)"
ValueErrorsuperr&   rI   rJ   rR   rb   rz   copyrc   
isinstancerf   r   r   NotImplementedErrorre   r
   get_model_v2rh   	obs_spacerg   _encoder_netr   _obs_phru   stop_gradientr   OBS_obs_embedsr   r   r{   r|   exploration_submodule)r$   rf   rg   rh   rb   rc   rI   rH   rJ   rR   rd   re   kwargs	__class__s                r%   r&   zRE3.__init__   s   ^ NOOORUiRR6RRR		$%!%!3G!<!A!A!C!C"4 " $+X66 *+ 3 %-137-148&
 *.	) 	)# # D-s33 *4(8# #
 *). )5J O0n
 
 
 >T!!*j*  DL  "//!!;?DL"ABB1E   D
 &1'*n,*(*	&
 	&
 	&
"""r'   T)exploreaction_distributiontimestepr   c                <    | j                             |||          S )N)r   r   r   )r   get_exploration_action)r$   r   r   r   s       r%   r   zRE3.get_exploration_action   s-     )@@ 3hPW A 
 
 	
r'   c                 j    | j         dk    r|                     |||          }nt          d          |S )zCalculate states' latent representations/embeddings.

        Embeddings are added to the SampleBatch object such that it doesn't
        need to be calculated during each training step.
        rj   zNot implemented for Torch.)rg   _postprocess_tfr}   )r$   policysample_batchtf_sesss       r%   postprocess_trajectoryzRE3.postprocess_trajectory  s?     >W$$//gNNLL9:::r'   c                 d   | j         dk    r4|                    | j        | j        |t          j                 i          }nat                              |                     t          j        |t          j                 i          d                   	                                }||t          j
        <   |S )z7Calculate states' embeddings and add it to SampleBatch.ru   )	feed_dictr   )rg   runr   r   r   r   ru   r   r   numpy
OBS_EMBEDS)r$   r   r   r   rP   s        r%   r   zRE3._postprocess_tf  s    >T!!  <ko)FG %  JJ
 ))!!;?L4Q"RSSTUV egg  0:[+,r'   )N)r?   r@   rA   rB   r   strr   rD   r   r   rC   r   r&   r   r   r	   r   r   boolr   r   r   __classcell__)r   s   @r%   r^   r^   x   s        . 8<' %48r
 r
 r
r
 	r

 r
 r
 %_5r
 r
 r
 r
 r
 r
 ".1r
 r
 r
 r
 r
 r
h Xk 

 

 

 0

 Z(	


 

 

 

 

 Xk
 
 
 
      r'   r^   ),typingr   r   r   r   r   gymnasium.spacesr   r   r   ray.rllib.models.action_distr	   ray.rllib.models.catalogr
   ray.rllib.models.modelv2r   ray.rllib.policy.sample_batchr   ray.rllib.utils.annotationsr   r   'ray.rllib.utils.exploration.explorationr   ray.rllib.utils.frameworkr   ray.rllib.utils.from_configr   ray.rllib.utils.tf_utilsr   ray.rllib.utils.typingr   r   r   tf1ru   tfvr   r   rC   rD   rO   rE   r\   r^   rG   r'   r%   <module>r      s&   ( ( ( ( ( ( ( ( ( (     1 1 1 1 1 1 1 1 1 1 ; ; ; ; ; ; 1 1 1 1 1 1 , , , , , , 5 5 5 5 5 5 = = = = = = = = ? ? ? ? ? ? 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 N N N N N N N N N N}R;! ;! ;! ;! ;! ;! ;! ;!| s % e 3 5    " E
E'*E25EZE E E E& l l l l l+ l l l l lr'   