
    &`i!                     :   d dl mZ d dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZ  e	            \  ZZe G d d                      Ze G d	 d
                      Ze G d d                      Ze G d d                      Ze G d d                      ZdS )    )PolicyState)SampleBatch)TorchPolicy)OldAPIStack)try_import_torch)PiecewiseSchedulec                   *     e Zd ZdZddZ fdZ xZS )LearningRateSchedulez9Mixin for TorchPolicy that adds a learning rate schedule.Nc                 T   d | _         d | _        ||| _        nBt          ||d         d         d           | _         | j                             d          | _        |	|| _        d S t          ||d         d         d           | _        | j                            d          | _        d S )Noutside_value	frameworkr   )_lr_schedule_lr2_schedulecur_lrr   valuecur_lr2)selflrlr_schedulelr2lr2_schedules        q/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/policy/torch_mixins.py__init__zLearningRateSchedule.__init__   s     ! DKK 1;r?2+>$! ! !D +11!44DKDLLL!2L,<R,@D" " "D  -33A66DLLL    c                    t                                          |           | j        rC| j                            |d                   | _        | j        D ]}|j        D ]}| j        |d<   | j        r`t          | j                  dk    sJ | j                            |d                   | _	        | j        d         }|j        D ]}| j	        |d<   d S d S )Ntimestepr         )
superon_global_var_updater   r   r   _optimizersparam_groupsr   lenr   )r   global_varsoptp	__class__s       r   r"   z)LearningRateSchedule.on_global_var_update#   s    $$[111 	*+11+j2IJJDK' * *) * *A"kAdGG* 	't'((A-----33K
4KLLDL"1%C% ' ',$	' 	'' 'r   )NN__name__
__module____qualname____doc__r   r"   __classcell__r)   s   @r   r
   r
      sR        CC7 7 7 7(' ' ' ' ' ' ' ' 'r   r
   c                   (     e Zd ZdZd Z fdZ xZS )EntropyCoeffSchedulez4Mixin for TorchPolicy that adds entropy coeff decay.c                    d | _         |	|| _        d S t          |t                    r$t	          ||d         d         d           | _         nt	          d|g|dggdd           | _         | j                             d          | _        d S )Nr   r   r           )_entropy_coeff_scheduleentropy_coeff
isinstancelistr   r   )r   r6   entropy_coeff_schedules      r   r   zEntropyCoeffSchedule.__init__6   s    '+$ ")!.D 0$77 /@*"8"<R"@"0 0 0,, 0A'*@#)FG"%"0 0 0,
 "&!=!C!CA!F!FDr   c                     t          t          |                               |           | j        '| j                            |d                   | _        d S d S )Nr   )r!   r2   r"   r5   r   r6   )r   r&   r)   s     r   r"   z)EntropyCoeffSchedule.on_global_var_updateM   s\    "D))>>{KKK'3!%!=!C!CJ'" "D 43r   r*   r0   s   @r   r2   r2   2   sP        >>G G G.        r   r2   c                   H     e Zd ZdZd Zd Zdef fdZdeddf fdZ xZ	S )	KLCoeffMixinzAssigns the `update_kl()` method to a TorchPolicy.

    This is used by Algorithms to update the KL coefficient
    after each learning step based on `config.kl_target` and
    the measured KL value (from the train_batch).
    c                 :    |d         | _         |d         | _        d S )Nkl_coeff	kl_target)r>   r?   )r   configs     r   r   zKLCoeffMixin.__init__^   s    z*,r   c                     |d| j         z  k    r| xj        dz  c_        n|d| j         z  k     r| xj        dz  c_        | j        S )Ng       @g      ?g      ?)r?   r>   )r   
sampled_kls     r   	update_klzKLCoeffMixin.update_kld   sP    dn,,,MMS MMM#...MMS MM}r   returnc                 \    t                                                      }| j        |d<   |S )Ncurrent_kl_coeff)r!   	get_stater>   r   stater)   s     r   rG   zKLCoeffMixin.get_statem   s*    !!##$(M !r   rI   Nc                     |                     d| j        d                   | _        t                                          |           d S )NrF   r>   )popr@   r>   r!   	set_staterH   s     r   rL   zKLCoeffMixin.set_states   s>    		"4dk*6MNN%     r   )
r+   r,   r-   r.   r   rC   r   rG   rL   r/   r0   s   @r   r<   r<   U   s         - - -  ;      !{ !t ! ! ! ! ! ! ! ! ! !r   r<   c                       e Zd ZdZd Zd ZdS )ValueNetworkMixina  Assigns the `_value()` method to a TorchPolicy.

    This way, Policy can call `_value()` to get the current VF estimate on a
    single(!) observation (as done in `postprocess_trajectory_fn`).
    Note: When doing this, an actual forward pass is being performed.
    This is different from only calling `model.value_function()`, where
    the result of the most recent forward pass is being used to return an
    already calculated tensor.
    c                 |     |                     d          s|                     d          r fd}nd }| _        d S )Nuse_gaevtracec                      t          |           }                     |           }                     |           \  }}j                                        d                                         S )Nr   )r   _lazy_tensor_dictmodelvalue_functionitem)
input_dict	model_out_r   s      r   r   z)ValueNetworkMixin.__init__.<locals>.value   s]    (44
!33J??
#zz*55	1z002215::<<<r   c                      dS )Nr4    )argskwargss     r   r   z)ValueNetworkMixin.__init__.<locals>.value   s    sr   )get_value)r   r@   r   s   `  r   r   zValueNetworkMixin.__init__   sf     ::i   	FJJx$8$8 	
= = = = = =   r   c                 B    t           j        |                                iS )a  Defines extra fetches per action computation.

        Args:
            input_dict (Dict[str, TensorType]): The input dict used for the action
                computing forward pass.
            state_batches (List[TensorType]): List of state tensors (empty for
                non-RNNs).
            model (ModelV2): The Model object of the Policy.
            action_dist: The instantiated distribution
                object, resulting from the model's outputs and the given
                distribution class.

        Returns:
            Dict[str, TensorType]: Dict with extra tf fetches to perform per
                action computation.
        )r   VF_PREDSrU   )r   rW   state_batchesrT   action_dists        r   extra_action_outz"ValueNetworkMixin.extra_action_out   s"    *  %"6"6"8"8
 	
r   N)r+   r,   r-   r.   r   rd   r[   r   r   rN   rN   z   s<           .
 
 
 
 
r   rN   c                   &    e Zd ZdZd ZddZd ZdS )TargetNetworkMixina  Mixin class adding a method for (soft) target net(s) synchronizations.

    - Adds the `update_target` method to the policy.
      Calling `update_target` updates all target Q-networks' weights from their
      respective "main" Q-networks, based on tau (smooth, partial updating).
    c                 h    | j                             dd          }|                     |           d S )Ntau      ?)rh   )r@   r^   update_target)r   rh   s     r   r   zTargetNetworkMixin.__init__   s4    kooeS))s#####r   Nc                    p| j                             dd          | j                                        t	          t          | j                                                                                            }fd|                                D             | j                                        D ]}|	                               d S )Nrh   ri   c                 @    i | ]\  }}||         z  d z
  |z  z   S )r    r[   ).0kvmodel_state_dictrh   s      r   
<dictcomp>z4TargetNetworkMixin.update_target.<locals>.<dictcomp>   sG     
 
 
1 s%a((AGq=8
 
 
r   )
r@   r^   rT   
state_dictnextitertarget_modelsvaluesitemsload_state_dict)r   rh   target_state_dicttargetrp   s    `  @r   rj   z TargetNetworkMixin.update_target   s     0T[__UC00:0022 !d&8&?&?&A&A!B!BCCNNPP
 
 
 
 
)//11
 
 

 (//11 	5 	5F""#34444	5 	5r   c                 X    t          j        | |           |                                  d S N)r   set_weightsrj   )r   weightss     r   r}   zTargetNetworkMixin.set_weights   s/     	g...r   r|   )r+   r,   r-   r.   r   rj   r}   r[   r   r   rf   rf      sP         $ $ $
5 5 5 5(    r   rf   N)ray.rllib.policy.policyr   ray.rllib.policy.sample_batchr   ray.rllib.policy.torch_policyr   ray.rllib.utils.annotationsr   ray.rllib.utils.frameworkr   ray.rllib.utils.schedulesr   torchnnr
   r2   r<   rN   rf   r[   r   r   <module>r      s   / / / / / / 5 5 5 5 5 5 5 5 5 5 5 5 3 3 3 3 3 3 6 6 6 6 6 6 7 7 7 7 7 7	r #' #' #' #' #' #' #' #'L        D !! !! !! !! !! !! !! !!H 8
 8
 8
 8
 8
 8
 8
 8
v & & & & & & & & & &r   