
    &`i;                        d dl Z d dlmZmZ d dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZ d dlmZmZmZmZ  e j         e!          Z" e            \  Z#Z$Z%e G d d                      Z&e G d d                      Z'e G d d                      Z(e G d d                      Z)e G d d                      Z*e G d d                      Z+dededefdZ,dS )    N)DictList)ModelV2)EagerTFPolicy)EagerTFPolicyV2)PolicyState)SampleBatch)TFPolicy)OldAPIStack)get_variabletry_import_tf)PiecewiseSchedule)make_tf_callable)AlgorithmConfigDictLocalOptimizerModelGradients
TensorTypec                   .     e Zd ZdZd Z fdZd Z xZS )LearningRateSchedulez6Mixin for TFPolicy that adds a learning rate schedule.c                    d | _         |$t                              d|d          | _        d S t	          ||d         d         d           | _         t                              d| j                             d          d          | _        | j        dk    rSt                              t          j	        d          | _
        | j                            | j
        d	          | _        d S d S )
NlrF)initializer	trainableoutside_value	frameworkr   tfdtypename
read_value)_lr_scheduletf1r   cur_lrr   valuer   placeholderr   float32_lr_placeholderassign
_lr_update)selfr   lr_schedules      n/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/policy/tf_mixins.py__init__zLearningRateSchedule.__init__   s     **4R5*QQDKKK 1;r?2+>$! ! !D **$"3"9"9!"<"< +  DK ~%%'*RZd'S'S$"&+"4"4(U #5 # # &%    c                    t                                          |           | j        | j                            |d                   }| j        dk    r7|                                                     | j        | j        |i           d S | j	        
                    |d           | j        j        
                    | j	                   d S d S Ntimestepr   	feed_dictFr"   )superon_global_var_updater$   r'   r   get_sessionrunr,   r*   r&   r+   
_optimizerlearning_rater-   global_varsnew_val	__class__s      r/   r8   z)LearningRateSchedule.on_global_var_update0   s    $$[111('--k*.EFFG~%%  ""&&O0Dg/N '      ""7u"=== -44T[AAAAA )(r1   c                     | j         dk    r%t          j                            | j                  S t
          j        j                            | j                  S )Nr   )r<   )	r   r%   trainAdamOptimizerr&   r   keras
optimizersAdamr-   s    r/   	optimizerzLearningRateSchedule.optimizer>   sE    >T!!9***EEE8&++DK888r1   )__name__
__module____qualname____doc__r0   r8   rH   __classcell__r@   s   @r/   r   r      sa        @@  "B B B B B9 9 9 9 9 9 9r1   r   c                   (     e Zd ZdZd Z fdZ xZS )EntropyCoeffSchedulez1Mixin for TFPolicy that adds entropy coeff decay.c                    d | _         |t          |ddd          | _        d S t          |t                    r$t          ||d         d         d           | _         nt          d|g|dggdd           | _         t          | j                             d          ddd          | _        | j        dk    rSt          	                    t          j        d	          | _        | j                            | j        d
          | _        d S d S )Nr   entropy_coeffF)r   tf_namer   r   r   r           r   r"   )_entropy_coeff_scheduler   rR   
isinstancelistr   r'   r   r%   r(   r   r)   _entropy_coeff_placeholderr+   _entropy_coeff_update)r-   rR   entropy_coeff_schedules      r/   r0   zEntropyCoeffSchedule.__init__I   sM   '+$!)!-RW" " "D
 0$77 /@*"8"<R"@"0 0 0,, 0A'*@#)FG"%"0 0 0, ".,22155'	" " "D ~%%25//*? 3B 3 3/ .2-?-F-F3 .G . .***	 &%r1   c                 X   t                                          |           | j        | j                            |d                   }| j        dk    r7|                                                     | j        | j        |i           d S | j	        
                    |d           d S d S r3   )r7   r8   rU   r'   r   r9   r:   rY   rX   rR   r+   r=   s      r/   r8   z)EntropyCoeffSchedule.on_global_var_updatem   s    $$[111'3288Z9PQQG~%%  ""&&.#>H '     
 "))'e)DDDDD 43r1   )rI   rJ   rK   rL   r0   r8   rM   rN   s   @r/   rP   rP   E   sW        ;;" " "H
E 
E 
E 
E 
E 
E 
E 
E 
Er1   rP   c                   T     e Zd ZdZdefdZd Zd Zdef fdZ	dedd	f fd
Z
 xZS )KLCoeffMixinzAssigns the `update_kl()` and other KL-related methods to a TFPolicy.

    This is used in Algorithms to update the KL coefficient after each
    learning step based on `config.kl_target` and the measured KL value
    (from the train_batch).
    configc                 V   |d         | _         t          t          | j                   dd|d                   | _        |d         | _        | j        dk    rSt                              t          j	        d          | _
        | j                            | j
        d          | _        d S d S )	Nkl_coeffFr   )rS   r   r   	kl_targetr   r   r"   )kl_coeff_valr   floatr`   ra   r   r%   r(   r   r)   _kl_coeff_placeholderr+   _kl_coeff_update)r-   r^   s     r/   r0   zKLCoeffMixin.__init__   s    ":.$$#$$[)	
 
 
  ,>T!!),jz *9 * *D& %)M$8$8*u %9 % %D!!!	 "!r1   c                     |d| j         z  k    r| xj        dz  c_        n&|d| j         z  k     r| xj        dz  c_        n| j        S |                     | j                   | j        S )Ng       @g      ?g      ?)ra   rb   _set_kl_coeff)r-   
sampled_kls     r/   	update_klzKLCoeffMixin.update_kl   s     dn,,,$#...$ $$ 	4,---   r1   c                     || _         | j        dk    r<|                                                     | j        | j        | j         i           d S | j                            | j         d           d S )Nr   r5   Fr"   )rb   r   r9   r:   re   rd   r`   r+   )r-   new_kl_coeffs     r/   rg   zKLCoeffMixin._set_kl_coeff   s    ( >T!!""%5t7HI #     
 M  !2u EEEEEr1   returnc                 \    t                                                      }| j        |d<   |S )Ncurrent_kl_coeff)r7   	get_staterb   r-   stater@   s     r/   ro   zKLCoeffMixin.get_state   s+    !!##$($5 !r1   rq   Nc                     |                      |                    d| j        d                              t                                          |           d S )Nrn   r`   )rg   popr^   r7   	set_staterp   s     r/   rt   zKLCoeffMixin.set_state   sK    599%7Z9PQQRRR%     r1   )rI   rJ   rK   rL   r   r0   ri   rg   r   ro   rt   rM   rN   s   @r/   r]   r]   z   s         2    (! ! !$F F F;      !{ !t ! ! ! ! ! ! ! ! ! !r1   r]   c                   t    e Zd ZdZd Zed             Zed             ZddeddfdZ	de
e         fd	Zd
 ZdS )TargetNetworkMixinzAssign the `update_target` method to the policy.

    The function is called every `target_network_update_freq` steps by the
    master learner.
    c                    | j                                         | j                                        t          |                                           fd            }|| _        |                     d           d S )Nc                    t                               | t           j                  } g }t                    t                    k    sJ f            t	                    D ]f\  }}|                    |                    | |z  d| z
  |z  z                        t                              d	                    |                     gt          j
        | S )N)r          ?zUpdate target op {})r   convert_to_tensorr)   lenzipappendr+   loggerdebugformatgroup)tauupdate_target_exprvar
var_target
model_varstarget_model_varss       r/   update_target_fnz5TargetNetworkMixin.__init__.<locals>.update_target_fn   s    &&s"*&==C!#z??c*;&<&<<<<!?<<< $'z3D#E#E G GZ"))%%cCi39
2J&JKK   299*EEFFFF8/00r1   ry   )r   )modeltrainable_variablestarget_modelr   r9   
_do_updateupdate_target)r-   r   r   r   s     @@r/   r0   zTargetNetworkMixin.__init__   s    Z3355
 -AACC	$**,,	-	-	1 	1 	1 	1 	1 
.	-	1 +
 	s#####r1   c                 l    t          | d          s| j                                        | _        | j        S )N_q_func_vars)hasattrr   	variablesr   rG   s    r/   q_func_varszTargetNetworkMixin.q_func_vars   s3    t^,, 	7 $
 4 4 6 6D  r1   c                 l    t          | d          s| j                                        | _        | j        S )N_target_q_func_vars)r   r   r   r   rG   s    r/   target_q_func_varsz%TargetNetworkMixin.target_q_func_vars   s6    t233 	E'+'8'B'B'D'DD$''r1   Nr   rl   c                     |                      t          j        |p| j                            dd                               d S Nr   ry   )r   npr)   r^   get)r-   r   s     r/   r   z TargetNetworkMixin.update_target   s9    
3#E$+//%*E*EFFGGGGGr1   c                 4    | j                                         S N)r   r   rG   s    r/   r   zTargetNetworkMixin.variables   s    z##%%%r1   c                 b   t          | t                    rt          j        | |           nUt          | t                    rt          j        | |           n*t          | t                    rt	          j        | |           |                     | j                            dd                     d S r   )rV   r
   set_weightsr   r   r   r^   r   )r-   weightss     r/   r   zTargetNetworkMixin.set_weights   s    dH%% 	5 w////o.. 	5'g6666m,, 	5%dG4444;??5#6677777r1   r   )rI   rJ   rK   rL   r0   propertyr   r   intr   r   r   r   r    r1   r/   rv   rv      s         $ $ $6 ! ! X!
 ( ( X(H H H H H H H&4
+ & & & &8 8 8 8 8r1   rv   c                   Z     e Zd ZdZd Zdeeef         f fdZdeeef         fdZ	 xZ
S )ValueNetworkMixina  Assigns the `_value()` method to a TFPolicy.

    This way, Policy can call `_value()` to get the current VF estimate on a
    single(!) observation (as done in `postprocess_trajectory_fn`).
    Note: When doing this, an actual forward pass is being performed.
    This is different from only calling `model.value_function()`, where
    the result of the most recent forward pass is being used to return an
    already calculated tensor.
    c                 H    |                     d          s|                     d          r-t                                                      fd            }n*t                                                     d             }| _        |d         dk     _        d  _        d S )Nuse_gaevtracec                  :   t          |           } t          j        t          j        j                  r1                    |           \  }}}|t           j                 d         S                     |           \  }}j                                        d         S )Nr   )r	   rV   r   r   rD   ModelVF_PREDSvalue_function)
input_dict_
extra_outs	model_outr-   s       r/   r'   z)ValueNetworkMixin.__init__.<locals>.value  s    (44
dj"(.99 :'+zz*'='=$Aq*%k&:;A>>#'::j#9#9LIq:4466q99r1   c                  6    t                               d          S )NrT   )r   constant)argskwargss     r/   r'   z)ValueNetworkMixin.__init__.<locals>.value$  s    {{3'''r1   r   r   )r   r   r9   _value_should_cache_extra_action_cached_extra_action_fetches)r-   r^   r'   s   `  r/   r0   zValueNetworkMixin.__init__  s     ::i   	(FJJx$8$8 	( d..0011: : : : 21: : d..0011( ( 21( *0*=*E',0)))r1   rl   c                    t                                                      }t          | j        t          j        j                  r|S |                    t          j	        | j        
                                i           |S r   )r7   extra_action_out_fnrV   r   r   rD   r   updater	   r   r   )r-   extra_action_outr@   s     r/   _extra_action_out_implz(ValueNetworkMixin._extra_action_out_impl,  sv     776688 dj"(.11 	$## 	$dj&?&?&A&A	
 	
 	

  r1   c                     | j         s|                                 S | j        | j        S |                                 | _        | j        S r   )r   r   r   rG   s    r/   r   z%ValueNetworkMixin.extra_action_out_fn<  sP    . 	1..000 ,844,0,G,G,I,I)00r1   )rI   rJ   rK   rL   r0   r   strr   r   r   rM   rN   s   @r/   r   r     s         1 1 1: S*_(=             1T#z/%: 1 1 1 1 1 1 1 1r1   r   c                   8    e Zd Zd Zdededeeef         fdZ	dS )GradStatsMixinc                     d S r   r   rG   s    r/   r0   zGradStatsMixin.__init__V  s    r1   train_batchgradsrl   c                     | j                             d          rd |D             }nt          j                            |          }d|iS )N%_tf_policy_handles_more_than_one_lossc                 L    g | ]!}t           j                            |          "S r   )r   linalgglobal_norm).0gs     r/   
<listcomp>z0GradStatsMixin.grad_stats_fn.<locals>.<listcomp>^  s(    BBBq")//22BBBr1   
grad_gnorm)r^   r   r   r   r   )r-   r   r   r   s       r/   grad_stats_fnzGradStatsMixin.grad_stats_fnY  sX     ;??BCC 	6BBEBBBJJ ..u55J *
 	
r1   N)
rI   rJ   rK   r0   r	   r   r   r   r   r   r   r1   r/   r   r   T  sX          
&
/=
	c:o	
 
 
 
 
 
r1   r   rH   lossrl   c           	         | j         j        }t          | j         t                    r
 |            }|                    ||          }| j                            d          d |D             }t                              || j        d                   \  }}g | _	        |D ]}|j| j	        
                    t                              t          j                            |          t                              |          |                     n| j	        
                    d            t          t!          | j	        |                    }|S |S )N	grad_clipc                     g | ]\  }}|S r   r   )r   r   vs      r/   r   z%compute_gradients.<locals>.<listcomp>u  s    000v1000r1   )r   r   rV   r   compute_gradientsr^   r   r   clip_by_global_normr   r}   wheremathis_nan
zeros_likerW   r|   )	policyrH   r   r   grads_and_varsr   r   r   clipped_grads_and_varss	            r/   r   r   i  s-    0I&,((  IKK	00yAAN }%%100000))%{1KLLq  	* 	*A}##BHHRW^^A->->a@P@PRS$T$TUUUU##D))))!%c&,	&B&B!C!C%%r1   )-loggingtypingr   r   numpyr   ray.rllib.models.modelv2r    ray.rllib.policy.eager_tf_policyr   #ray.rllib.policy.eager_tf_policy_v2r   ray.rllib.policy.policyr   ray.rllib.policy.sample_batchr	   ray.rllib.policy.tf_policyr
   ray.rllib.utils.annotationsr   ray.rllib.utils.frameworkr   r   ray.rllib.utils.schedulesr   ray.rllib.utils.tf_utilsr   ray.rllib.utils.typingr   r   r   r   	getLoggerrI   r~   r%   r   tfvr   rP   r]   rv   r   r   r   r   r1   r/   <module>r      s                , , , , , , : : : : : : ? ? ? ? ? ? / / / / / / 5 5 5 5 5 5 / / / / / / 3 3 3 3 3 3 A A A A A A A A 7 7 7 7 7 7 5 5 5 5 5 5            
	8	$	$}R &9 &9 &9 &9 &9 &9 &9 &9R 1E 1E 1E 1E 1E 1E 1E 1Eh E! E! E! E! E! E! E! E!P <8 <8 <8 <8 <8 <8 <8 <8~ M1 M1 M1 M1 M1 M1 M1 M1` 
 
 
 
 
 
 
 
(%-7     r1   