
    &`iJC                     L   d dl mZmZmZ d dlZd dlmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZmZmZ d dl m!Z! d dl"m#Z#m$Z$ d dl%m&Z& erd dl'm(Z(  e            \  Z)Z*Z+ e            \  Z,Z-e G d de                      Z.dS )    )TYPE_CHECKINGOptionalUnionN)BoxDiscrete)BaseEnv)ActionDistribution)ModelV2)CategoricalDeterministic)TorchCategoricalTorchDeterministic)SampleBatch)OldAPIStackoverride)Exploration)get_variabletry_import_tftry_import_torch)from_config)SMALL_NUMBERsoftmax)
TensorType)Policyc                       e Zd ZdZdddddededed	ed
ede	e         f fdZ
 ee          ddddde	e         de	e         de	d         fd            Z ee          dedeeef         deeef         fd            Z ee          dddddddedede	d         fd            Zd Z ee          ddddd            Z ee          	 d.dddede	d         fd            Zddd Zd! Zdd"d#d$Zddd%Zd& Zddd'Zd( Z ee          d.d)            Z  ee          d.d*ed+e	d         d,dfd-            Z! xZ"S )/ParameterNoisea  An exploration that changes a Model's parameters.

    Implemented based on:
    [1] https://openai.com/research/better-exploration-with-parameter-noise
    [2] https://arxiv.org/pdf/1706.01905.pdf

    At the beginning of an episode, Gaussian noise is added to all weights
    of the model. At the end of the episode, the noise is undone and an action
    diff (pi-delta) is calculated, from which we determine the changes in the
    noise's stddev for the next episode.
          ?i'  N)initial_stddevrandom_timestepssub_exploration	frameworkpolicy_configmodelr   r   r    c          
         |J  t                      j        |f|||d| t          || j        d          | _        || _        d | j                            d                                          D             | _	        g | _
        | j	        D ]}	|	j        r#|	j                            d          d	         d
z   nd}
| j
                            t          t          j        |	j        t          j                  | j        |
d| j                             | j        dk    rt&                                          s|                                 | _        |                                 | _        |                                 | _        t6                              | j        g          5  |                                 }ddd           n# 1 swxY w Y   t6                              |g          5  t&                                          | _        ddd           n# 1 swxY w Y   d| _        |XtA          | j!        tD                    rddd|dz   df|dz   dfgddd}n'tA          | j!        tF                    rd|d}ntH          tK          tL          |f| j        | j!        | j'        | j        d|| _(        d| _)        dS )af  Initializes a ParameterNoise Exploration object.

        Args:
            initial_stddev: The initial stddev to use for the noise.
            random_timesteps: The number of timesteps to act completely
                randomly (see [1]).
            sub_exploration: Optional sub-exploration config.
                None for auto-detection/setup.
        N)r"   r#   r!   stddev)r!   tf_namec                      g | ]\  }}d |v	|S )	LayerNorm ).0kvs      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/utils/exploration/parameter_noise.py
<listcomp>z+ParameterNoise.__init__.<locals>.<listcomp>Q   s1      
  
  
1!## ###    T)as_dict:r   _noisy )dtype)r!   r&   torch_tensordevicetfFEpsilonGreedyPiecewiseSchedule)r   r      r      g{Gz?)type	endpointsoutside_value)r<   epsilon_scheduleOrnsteinUhlenbeckNoise)r<   r   )r!   action_spacer"   r#   )*super__init__r   r!   r%   
stddev_valr#   trainable_variablesitemsmodel_variablesnoisenamesplitappendnpzerosshapefloat32r6   r7   executing_eagerly_tf_sample_new_noise_optf_sample_new_noise_op_tf_add_stored_noise_optf_add_stored_noise_op_tf_remove_noise_optf_remove_noise_optf1control_dependenciesno_optf_sample_new_noise_and_add_opweights_are_currently_noisy
isinstancerA   r   r   NotImplementedErrorr   r   r"   r    episode_started)selfrA   r!   r"   r#   r   r   r    kwargsvarname_add_op	__class__s               r-   rC   zParameterNoise.__init__+   s   * $$$	
'		
 	

 	
 	
 	
 #dnh
 
 
 )
 
  

66t6DDJJLL 
  
  
 
' 
	 
	C9<ICHNN3''*X55rEJHSYbj999"n!!%;      >T!!"*>*>*@*@!*.*F*F*H*HD'*.*F*F*H*HD'&*&>&>&@&@D#))4+F*GHH 8 855778 8 8 8 8 8 8 8 8 8 8 8 8 8 8))6(33 A A68hhjj3A A A A A A A A A A A A A A A ,1( " $+X66 *+ 3 %-137-148&
 *.	) 	)# # D-s33 *4(8# # *)* 
 n*,* 
  
  
  
  %s$   )G

GG0HH"H)timestepexploretf_sessre   rf   rg   z
tf.Sessionc                    ||n| j         d         }| j        r|                     ||           |r| j        s|                     |           d S |s| j        r|                     |           d S d S d S )Nrf   rg   )r"   r^   _delayed_on_episode_startr[   _add_stored_noise_remove_noise)r_   re   rf   rg   s       r-   before_compute_actionsz%ParameterNoise.before_compute_actions   s     %0''d6H6S  	=**7G<<<  	04; 	0""7"33333 	0T= 	0w/////	0 	0 	0 	0r/   action_distributionc                <    | j                             |||          S )N)rn   re   rf   )r    get_exploration_action)r_   rn   re   rf   s       r-   rp   z%ParameterNoise.get_exploration_action   s,     #:: 3hPW ; 
 
 	
r/   )environmentepisoderg   policyr   rq   rr   c                    d| _         d S )NT)r^   r_   rs   rq   rr   rg   s        r-   on_episode_startzParameterNoise.on_episode_start   s      $r/   c                 t    |r|                      |d           n|                     |           d| _        d S )NTrg   r   ri   F)_sample_new_noise_and_add_sample_new_noiser^   )r_   rf   rg   s      r-   rj   z(ParameterNoise._delayed_on_episode_start   sM     	4**7T*JJJJ ""7"333$r/   c                D    | j         r|                     |           d S d S )Nri   )r[   rl   ru   s        r-   on_episode_endzParameterNoise.on_episode_end   s6     + 	0w/////	0 	0r/   sample_batchc           
         d x}}|                     || j                  \  }}}t          |j        t          t
          f          r t          |t          j                           }n;t          |j        t          t          f          r|t          j                 }nt          | j        r|}n|}|                     || j                   \  }}}t          |j        t          t
          f          r t          |t          j                           }n3t          |j        t          t          f          r|t          j                 }||}n|}d x}	}
t          |j        t          t
          f          rt          j        t          j        |t          j        ||t           z   z            z  d                    }
| j                            |          d         }t          j        d|z
  || j        j        z  z              }	nt          |j        t          t          f          rut          j        t          j        t          j        ||z
                                }
| j                            |          d         }t1          | j        dd          |z  }	|
|	k    r| xj        dz  c_        n| xj        dz  c_        |                     |                                 |           |S )	N)
input_dictrf   r:   )sesscur_epsilon	cur_scaleou_sigmag?g)\(?)compute_actions_from_input_dictr[   
issubclass
dist_classr   r   r   r   ACTION_DIST_INPUTSr   r   r]   rL   nanmeansumlogr   r    	get_staterA   nsqrtmeansquaregetattrrD   	set_state)r_   rs   r}   rg   noisy_action_distnoise_free_action_dist_fetchesaction_distdeltadistancecurrent_epsiloncurrent_scales                r-   postprocess_trajectoryz%ParameterNoise.postprocess_trajectory   s    6:92
 >>#T-M ? 
 
1g
 f'+7G)HII 	&!'+*H"IJJKK)M;M+NOO 	&!+"@AKK%%+ 	1 +%0">>#1Q-Q ? 
 
1g
 f'+7G)HII 	B!'+*H"IJJKK)M;M+NOO 	B!+"@AK$ +%0"f'+7G)HII 	S z*f.2Cl2RS    H #2<<'<JJO VA//DDUDW2WWXXXEE)M;M+NOO 	Sw	"8;L"LMMNN H !0:::HHUMD0*cBB]RE uOOt#OOOOOt#OO 	t~~''g666r/   ri   c                   | j         dk    r|                    | j                   dS | j         dk    r|                                  dS t	          t          | j                            D ]x}t                              t          	                    | j        |         
                                          | j                                      | j                  | j        |<   ydS )z0Samples new noise and stores it in `self.noise`.r7   tf2)r   stdN)r!   runrR   rQ   rangelenrH   torchnormalrM   sizer%   tor6   )r_   rg   is      r-   rz   z ParameterNoise._sample_new_noise3  s    >T!!KK344444^u$$((*****3tz??++ " " %TZ]%7%7%9%9:: !- ! !"T[// 
1" "r/   c                     g }| j         D ]d}|                    t                              |t          j                            |j        | j        t          j	                                       et	          j
        | S )N)rN   r%   r4   )rH   rK   rW   assignr7   randomr   rN   r%   rO   group)r_   added_noisesrH   s      r-   rQ   z&ParameterNoise._tf_sample_new_noise_op?  s    Z 	 	E

I$$#k$+RZ %       x&&r/   Frx   c                0   | j         dk    r>|r!| j        r|                    | j                   |                    | j                   nE|r| j        r|                                  |                                  |                                  d| _        d S )Nr7   T)r!   r[   r   rV   rZ   rl   rz   rk   )r_   rg   r   s      r-   ry   z(ParameterNoise._sample_new_noise_and_addL  s    >T!! 5D< 5D3444KK;<<<< %D< %""$$$""$$$""$$$+/(((r/   c                <   | j         du sJ | j        dk    r|                    | j                   nc| j        dk    r|                                  nCt          | j        | j                  D ](\  }}d|_        |	                    |           d|_        )d| _         dS )a  Adds the stored `self.noise` to the model's parameters.

        Note: No new sampling of noise here.

        Args:
            tf_sess (Optional[tf.Session]): The tf-session to use to add the
                stored noise to the (currently noise-free) weights.
            override: If True, undo any currently applied noise first,
                then add the currently stored noise.
        Fr7   r   TN)
r[   r!   r   rT   rS   ziprG   rH   requires_gradadd_r_   rg   ra   rH   s       r-   rk   z ParameterNoise._add_stored_noiseY  s     /58888 >T!!KK34444^u$$((****!$"6
CC ) )
U$)!$(!!+/(((r/   c                    t                      }t          | j        | j                  D ]3\  }}|                    t
                              ||                     4t          j        t          |           }t
          
                    |g          5  t                                          cddd           S # 1 swxY w Y   dS )zGenerates tf-op that assigns the stored noise to weights.

        Also used by tf-eager.

        Returns:
            tf.op: The tf op to apply the already stored noise to the NN.
        Nlistr   rG   rH   rK   rW   
assign_addr7   r   tuplerX   rY   )r_   add_noise_opsra   rH   rets        r-   rS   z&ParameterNoise._tf_add_stored_noise_opu  s     d2DJ?? 	= 	=JC  U!;!;<<<<hm,,-%%se,, 	 	88::	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   B99B= B=c                >   | j         du sJ | j        dk    r|                    | j                   nd| j        dk    r|                                  nDt          | j        | j                  D ])\  }}d|_        |	                    |            d|_        *d| _         dS )z
        Removes the current action noise from the model parameters.

        Args:
            tf_sess (Optional[tf.Session]): The tf-session to use to remove
                the noise from the (currently noisy) weights.
        Tr7   r   FN)
r[   r!   r   rV   rU   r   rG   rH   r   r   r   s       r-   rl   zParameterNoise._remove_noise  s     /47777 >T!!KK/0000^u$$$$&&&&!$"6
CC ) )
U$)!%   $(!!+0(((r/   c                    t                      }t          | j        | j                  D ]4\  }}|                    t
                              ||                      5t          j        t          |           }t
          
                    |g          5  t                                          cddd           S # 1 swxY w Y   dS )zGenerates a tf-op for removing noise from the model's weights.

        Also used by tf-eager.

        Returns:
            tf.op: The tf op to remve the currently stored noise from the NN.
        Nr   )r_   remove_noise_opsra   rH   r   s        r-   rU   z"ParameterNoise._tf_remove_noise_op  s      66d2DJ?? 	A 	AJC##CNN3$?$?@@@@h.//0%%se,, 	 	88::	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   B::B>B>c                     d| j         iS )N
cur_stddev)rD   )r_   r   s     r-   r   zParameterNoise.get_state  s    do..r/   stater   returnc                 
   |d         | _         | j        dk    r#| j                            | j         |           d S t	          | j        t
                    r| j         | _        d S | j                            | j                    d S )Nr   r7   )session)rD   r!   r%   loadr\   floatr   )r_   r   r   s      r-   r   zParameterNoise.set_state  s}    ->T!!KT_d;;;;;U++ 	0/DKKKKt/////r/   )N)#__name__
__module____qualname____doc__strdictr
   r   intr   rC   r   r   boolrm   r	   r   r   rp   r   rv   rj   r|   r   r   rz   rQ   ry   rk   rS   rl   rU   r   r   __classcell__)rd   s   @r-   r   r      s       
 
& !$ %*.p% p% p% 	p%
 p% p% p% p% "$p% p% p% p% p% p%d Xk #'"&*.0 0 0 3-0 $	0
 ,'0 0 0 0* Xk
 0
 
C(	

 z4'(
 
 
 
 Xk
  $*.$ $ $$ 	$
 $ ,'$ $ $ $% % % Xk48$PT 0 0 0 0 0
 Xk
 +/	Q QQ "Q ,'	Q Q Q Qf ,0 
" 
" 
" 
" 
"' ' ' 48% 0 0 0 0 0 ,0 0 0 0 0 08   (, 1 1 1 1 12   Xk/ / / / Xk0 0t 08L+A 0T 0 0 0 0 0 0 0 0r/   r   )/typingr   r   r   numpyrL   gymnasium.spacesr   r   ray.rllib.env.base_envr   ray.rllib.models.action_distr	   ray.rllib.models.modelv2r
   "ray.rllib.models.tf.tf_action_distr   r   (ray.rllib.models.torch.torch_action_distr   r   ray.rllib.policy.sample_batchr   ray.rllib.utils.annotationsr   r   'ray.rllib.utils.exploration.explorationr   ray.rllib.utils.frameworkr   r   r   ray.rllib.utils.from_configr   ray.rllib.utils.numpyr   r   ray.rllib.utils.typingr   ray.rllib.policy.policyr   rW   r7   tfvr   r   r   r)   r/   r-   <module>r      s   1 1 1 1 1 1 1 1 1 1     * * * * * * * * * * * * * * ; ; ; ; ; ; , , , , , , I I I I I I I I        6 5 5 5 5 5 = = = = = = = = ? ? ? ? ? ? S S S S S S S S S S 3 3 3 3 3 3 7 7 7 7 7 7 7 7 - - - - - - /......}Rq [0 [0 [0 [0 [0[ [0 [0 [0 [0 [0r/   