
    &`iH                        d dl mZmZmZ d dlZd dlmZmZm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZ d dlmZm Z  d dl!m"Z" d dl#m$Z$m%Z% d dl&m'Z' d dl(m)Z)m*Z+ d dl,m*Z* d dl-m.Z.m/Z/m0Z0  e$            \  Z1Z2Z3 e%            \  Z4Z5dZ6e5e5j7        Z6e G d de"                      Z8dS )    )OptionalTupleUnionN)DiscreteMultiDiscreteSpace)ActionDistribution)ModelCatalog)ModelV2)CategoricalMultiCategorical)SlimFC)TorchCategoricalTorchMultiCategorical)get_activation_fn)SampleBatch)NullContextManager)OldAPIStackoverride)Exploration)try_import_tftry_import_torch)from_config)get_placeholderone_hot)r   )FromConfigSpecModelConfigDict
TensorTypec                   X    e Zd ZdZddddddddddd	
d
ededededee	         de
e         dede
e         dededededee         f fdZ ee          dddedeeef         defd            Z ee          d             Z ee          d$d            Zd  Zd! Zd" Zd$d#Z xZS )%	Curiositya  Implementation of:
    [1] Curiosity-driven Exploration by Self-supervised Prediction
    Pathak, Agrawal, Efros, and Darrell - UC Berkeley - ICML 2017.
    https://arxiv.org/pdf/1705.05363.pdf

    Learns a simplified model of the environment based on three networks:
    1) Embedding observations into latent space ("feature" network).
    2) Predicting the action, given two consecutive embedded observations
    ("inverse" network).
    3) Predicting the next embedded obs, given an obs and action
    ("forward" network).

    The less the agent is able to predict the actually observed next feature
    vector, given obs and action (through the forwards network), the larger the
    "intrinsic reward", which will be added to the extrinsic reward.
    Therefore, if a state transition was unexpected, the agent becomes
    "curious" and will further explore this transition leading to better
    exploration in sparse rewards environments.
    i   N)   relug?      ?gMbP?)
feature_dimfeature_net_configinverse_net_hiddensinverse_net_activationforward_net_hiddensforward_net_activationbetaetalrsub_explorationaction_space	frameworkmodelr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   c          
      z   t          |t          t          f          st          d           t	                      j        |f||d| | j        d         dk    rt          d          || _        || j        d                                         }|| _	        || _
        || _        || _        |	| _        t          | j        t                    r| j        j        nt!          j        | j        j                  | _        |
| _        || _        || _        |t.          || _        t3          j        | j        j        | j        | j        | j	        | j        d	          | _        |                     d
| j        z  gtA          | j
                  z   | j        gz   | j        d          | _!        |                     | j        | j        z   gtA          | j                  z   | j        gz   | j        d          | _"        tG          tH          | j        | j        | j        | j        | j        | j%        | j&                  | _'        dS )aP  Initializes a Curiosity object.

        Uses as defaults the hyperparameters described in [1].

        Args:
             feature_dim: The dimensionality of the feature (phi)
                vectors.
             feature_net_config: Optional model
                configuration for the feature network, producing feature
                vectors (phi) from observations. This can be used to configure
                fcnet- or conv_net setups to properly process any observation
                space.
             inverse_net_hiddens: Tuple of the layer sizes of the
                inverse (action predicting) NN head (on top of the feature
                outputs for phi and phi').
             inverse_net_activation: Activation specifier for the inverse
                net.
             forward_net_hiddens: Tuple of the layer sizes of the
                forward (phi' predicting) NN head.
             forward_net_activation: Activation specifier for the forward
                net.
             beta: Weight for the forward loss (over the inverse loss,
                which gets weight=1.0-beta) in the common loss term.
             eta: Weight for intrinsic rewards before being added to
                extrinsic ones.
             lr: The learning rate for the curiosity-specific
                optimizer, optimizing feature-, inverse-, and forward nets.
             sub_exploration: The config dict for
                the underlying Exploration to use (e.g. epsilon-greedy for
                DQN). If None, uses the FromSpecDict provided in the Policy's
                default config.
        zBOnly (Multi)Discrete action spaces supported for Curiosity so far!)r0   r/   num_env_runnersr   zVCuriosity exploration currently does not support parallelism. `num_workers` must be 0!Nr0   feature_net)model_configr/   name   inverse_net)r5   forward_net)clsconfigr.   r/   policy_configr0   num_workersworker_index)(
isinstancer   r   
ValueErrorsuper__init__r;   r$   copyr%   r&   r'   r(   r)   r.   nnpsumnvec
action_dimr*   r+   r,   NotImplementedErrorr-   r
   get_model_v2r0   	obs_spacer/   _curiosity_feature_net_create_fc_netlist_curiosity_inverse_fcnet_curiosity_forward_fcnetr   r   r<   r=   exploration_submodule)selfr.   r/   r0   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   kwargs	__class__s                  y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/utils/exploration/curiosity.pyrA   zCuriosity.__init__7   ss   d ,=(ABB 	T   	RUiRR6RRR/0A55,  
 '%!%!3G!<!A!A!C!C"4#6 &<##6 &<# $+X660D).// 	 	 "%%. '3&?J 0n'
 '
 '
# )-(;(;!!"T$*B%C%CCtFWW' )< )
 )
% )-(;(;/04+,,- ! ' )< )
 )
% &1'*n,*(*	&
 	&
 	&
"""    T)exploreaction_distributiontimesteprV   c                <    | j                             |||          S )N)rW   rX   rV   )rP   get_exploration_action)rQ   rW   rX   rV   s       rT   rZ   z Curiosity.get_exploration_action   s-     )@@ 3hPW A 
 
 	
rU   c                    | j         dk    r t          | j                                                  }t          | j                                                  }t          | j                                                  }| j                            | j                  | j        _        | j                            | j                  | j        _        | j                            | j                  | j        _        t          j
                            ||z   |z   | j                  | _        n+| j        | j        _        | j        | j        _        | j        | j        _        | j        j        j        | j        j        z   | j        j        z   | _        t"          j                            | j                  | _        | j         dk    rt)          | j        j        d          | _        t)          | j        j        d          | _        t)          | j        j        d          | _        |                     | j        | j        | j                  \  | _        | _        |S )	Ntorch)r,   )learning_ratetf_curiosity_obs)spacer5   _curiosity_next_obs_curiosity_action)r/   rM   rK   
parametersrN   rO   todevicer0   r\   optimAdamr,   
_optimizer
base_model	variables_optimizer_var_listtf1trainAdamOptimizerr   rJ   _obs_ph_next_obs_phr.   
_action_ph_postprocess_helper_tf_forward_l2_norm_sqared
_update_op)rQ   
optimizersfeature_paramsinverse_paramsforward_paramss        rT   get_exploration_optimizerz#Curiosity.get_exploration_optimizer   s-    >W$$!$"="H"H"J"JKKN!$"?"J"J"L"LMMN!$"?"J"J"L"LMMN
 150K0N0N1 1DJ- 372O2R2R3 3DJ/ 372O2R2R3 3DJ/ $k../.@TW /  DOO 150KDJ-262ODJ/262ODJ/ +6@/9:/9: $
 "i55DG5LLDO~%%.*.5E      %4*.5J% % %! #2*18K# # # //L$"3T_ 0O
 rU   c                 z    | j         dk    r|                     |||           dS |                     ||           dS )zCalculates phi values (obs, obs', and predicted obs') and ri.

        Also calculates forward and inverse losses and updates the curiosity
        module on the provided batch using our optimizer.
        r\   N)r/   _postprocess_tf_postprocess_torch)rQ   policysample_batchtf_sesss       rT   postprocess_trajectoryz Curiosity.postprocess_trajectory   sK     >W$$  w?????##FL99999rU   c           
         | j         dk    rl|                    | j        | j        g| j        |t
          j                 | j        |t
          j                 | j	        |t
          j
                 i          \  }}nJ|                     |t
          j                 |t
          j                 |t
          j
                           \  }}|t
          j                 | j        |z  z   |t
          j        <   |S )Nr^   )	feed_dict)r/   runrs   rt   ro   r   OBSrp   NEXT_OBSrq   ACTIONSrr   REWARDSr+   )rQ   r}   r~   r   forward_l2_norm_sqared_s         rT   r{   zCuriosity._postprocess_tf  s    >T!!(/-t?L,{"?%|K4H'IO\+2E%F )4 ) )%"AA )-(C(C[_-[12[01) )%"A ,-;Q0QQ 	[() rU   c           
      n   | j         dk    rt                                          nt                      5 }| j                            t          j        t                              ||gd          i          \  }}t          	                    |d          \  }}| j        
                    t                              |t          || j                  gd                    }	dt                              t                              |	|z
            d          z  }
t                              |
          }t                              ||gd          }| j                            |          }t#          | j        t$                    rt'          || j                  nt)          || j        | j        j                  }|                    t                              |                     }t                              |          }d| j        z
  |z  | j        |z  z   }d d d            n# 1 swxY w Y   | j         dk    rU|                    || j                  }d t7          || j                  D             }| j                            |          }n!| j                            || j        	          }|
|fS )
Nr^   r   )axisr6         ?r#   c                      g | ]\  }}|||fS N ).0gvs      rT   
<listcomp>z4Curiosity._postprocess_helper_tf.<locals>.<listcomp>H  s(       1a1=A===rU   )var_list)r/   r^   GradientTaper   r0   rK   r   r   concatsplitrO   
tf_one_hotr.   
reduce_sumsquarereduce_meanrN   r>   r   r   r   rF   logpconvert_to_tensorr*   gradientrk   ziprh   apply_gradientsminimize)rQ   obsnext_obsactionstapephisr   phinext_phipredicted_next_phir   forward_lossphi_cat_next_phidist_inputsaction_distinverse_losslossgradsgrads_and_vars	update_ops                       rT   rr   z Curiosity._postprocess_helper_tf  s   !%4!7!7BOO=O=Q=Q$	Oj77"))S(O!)"D"DE GD! HHT1--MC "&!D!D		3
7D4E F FGb	QQ" " &)2==		,x788r ,9 , , &" >>*@AAL  "yy#xryBB*==>NOOK d/::WK444%k4:t?P?UVV  (,,R-A-A'-J-JKKKL>>,77L $)O|3di,6NNDI$	O $	O $	O $	O $	O $	O $	O $	O $	O $	O $	O $	O $	O $	O $	ON >T!!MM$(@AAE #&ud.F#G#G  N 77GGII00t7 1  I
 &y00s   G(H''H+.H+c           
         | j                             t          j        t                              t                              |t          j                                               |j                  t                              |t          j	                                               |j                  g          i          \  }}t          
                    |d          \  }}t                              |t          j                                                                               |j                  }| j                             t                              |t          || j                                                  gd                    }dt                              t                              ||z
  d          d          z  }	t                              |	          }
|t          j                 | j        |	                                                                                                z  z   |t          j        <   t                              ||gd          }| j                             |          }t5          | j        t6                    rt9          || j                   nt;          || j         | j        j                  }|                    |           }t                              |          }d| j         z
  |z  | j         |
z  z   }| j!        "                                 |#                                 | j!        $                                 |S )Nr6   r   )dimr   g       @r#   )%r0   rK   r   r   r\   cat
from_numpyrd   re   r   chunkr   longrO   r   r.   floatrE   powmeanr   r+   detachcpunumpyrN   r>   r   r   r   rF   r   r*   rh   	zero_gradbackwardstep)rQ   r}   r~   r   r   r   r   actions_tensorr   r   r   r   r   r   r   r   s                   rT   r|   zCuriosity._postprocess_torchT  s   *33((ko)FGGJJ"M  ((k6J)KLLOO"M 		" 	"
 
a D!,,X\+*=>??DDFFII&-XX 	
 "Z@@IIsGND4EFFLLNNOUWIXX
 
 "%uyyII(83S99r (1 (
 (
 "
 zz"899 ,-h/6688<<>>DDFFFG 	[() !99c8_"9==j99:JKK $+X66X[$*555&{DJ@Q@VWW 	 $((888zz,// di</$)l2JJ!!### rU   c                    | j         dk    rAt          j        j                            |d         fd                    |                    gng }t          t          |          dz
            D ]}|t          |          dz
  k     r|nd}| j         dk    rJ|                    t          ||         ||dz            t          j        j        j        |                     q|                    t          j        j                            ||dz            t          |          d	                    ||          
                     | j         dk    rt          j        | S t          j                            |          S )a  Given a list of layer dimensions (incl. input-dim), creates FC-net.

        Args:
            layer_dims (Tuple[int]): Tuple of layer dims, including the input
                dimension.
            activation: An activation specifier string (e.g. "relu").

        Examples:
            If layer_dims is [4,8,6] we'll have a two layer net: 4->8 (8 nodes)
            and 8->6 (6 nodes), where the second layer (6 nodes) does not have
            an activation anymore. 4 is the input dimension.
        r\   r   z{}_in)shaper5      r6   N)in_sizeout_sizeinitializeractivation_fnz{}_{})units
activationr5   )r/   r^   keraslayersInputformatrangelenappendr   r\   nninitxavier_uniform_Denser   
Sequential)rQ   
layer_dimsr   r5   r   iacts          rT   rL   zCuriosity._create_fc_net  s}    ~(( X_""*Q-)9t@T@T"UUVV 	 s:*++ 	 	A !C
OOa$7 7 7**TC~(( *1!+AE!2$)HM$A&)	      HO))(Q/#4S#9#9$^^D!44 *      >W$$=&))8&&v...rU   r   )__name__
__module____qualname____doc__r   strr   intr   r   r   r   r   rA   r   r   r	   r   r   boolrZ   ry   r   r{   rr   r|   rL   __classcell__)rS   s   @rT   r    r    !   s        4 8<*0&,*0&,48y
 y
 y
y
 	y

 y
 y
 %_5y
 #3Zy
 !$y
 #3Zy
 !$y
 y
 y
 y
 ".1y
 y
 y
 y
 y
 y
v Xk 

 

 

 0

 Z(	


 

 

 

 

 Xk6 6 6p Xk	: 	: 	: 	:  441 41 41l= = =~*/ */ */ */ */ */ */ */rU   r    )9typingr   r   r   r   rD   gymnasium.spacesr   r   r   ray.rllib.models.action_distr	   ray.rllib.models.catalogr
   ray.rllib.models.modelv2r   "ray.rllib.models.tf.tf_action_distr   r   ray.rllib.models.torch.miscr   (ray.rllib.models.torch.torch_action_distr   r   ray.rllib.models.utilsr   ray.rllib.policy.sample_batchr   ray.rllib.utilsr   ray.rllib.utils.annotationsr   r   'ray.rllib.utils.exploration.explorationr   ray.rllib.utils.frameworkr   r   ray.rllib.utils.from_configr   ray.rllib.utils.tf_utilsr   r   r   ray.rllib.utils.torch_utilsray.rllib.utils.typingr   r   r   rl   r^   tfvr\   r   F
functionalr    r   rU   rT   <module>r      s'   ) ) ) ) ) ) ) ) ) )     ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; 1 1 1 1 1 1 , , , , , , L L L L L L L L . . . . . .        5 4 4 4 4 4 5 5 5 5 5 5 . . . . . . = = = = = = = = ? ? ? ? ? ? E E E E E E E E 3 3 3 3 3 3 K K K K K K K K / / / / / / N N N N N N N N N N}R	r>
A [/ [/ [/ [/ [/ [/ [/ [/ [/ [/rU   