
    &`i	                         d dl Z d dlmZmZ d dlmZmZmZ d dlm	Z	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZmZ  G d de          ZdS )    N)AnyDict)&LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEYLEARNER_RESULTS_KL_KEY	PPOConfig)AddOneTsToEpisodesAndTruncateGeneralAdvantageEstimation)LearnerValueFunctionAPI)5OverrideToImplementCustomLogic_CallToSuperRecommendedoverride)LambdaDefaultDict)NUM_ENV_STEPS_SAMPLED_LIFETIME)convert_to_numpy)	Scheduler)ModuleID
TensorTypec            	       F    e Zd Z ee          d fd            Z ee          def fd            Ze ee          de	e
ef         ddf fd                        Ze ee          dee         fd                        Zej        ded	ed
eddfd            Z xZS )
PPOLearnerreturnNc                     t                                                       t           fd           _        t           fd           _         j        q j        j        rg j                            t                                  j        
                    t           j        j         j        j                             d S d S d S )Nc                 v    t          j                            |           j        j        j                  S )N)fixed_value_or_schedule	frameworkdevice)r   configget_config_for_moduleentropy_coeffr   _device	module_idselfs    x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/algorithms/ppo/ppo_learner.py<lambda>z"PPOLearner.build.<locals>.<lambda>%   s6    iK55i@@N.|       c                 h                         j                            |           j                  S N)_get_tensor_variabler   r   kl_coeffr!   s    r$   r%   z"PPOLearner.build.<locals>.<lambda>2   s-    d7711)<<E  r&   )gammalambda_)superbuildr   #entropy_coeff_schedulers_per_modulecurr_kl_coeffs_per_module_learner_connectorr   *add_default_connectors_to_learner_pipelineprependr   appendr	   r+   r,   )r#   	__class__s   `r$   r.   zPPOLearner.build   s    
    
 
 	0 FW   F
 F
& #/F 0 #++,I,K,KLLL #***++T[5H       0///r&   r"   c                      t                      j        |fi |}| j                            |d            | j                            |d            |S r(   )r-   remove_moduler/   popr0   )r#   r"   kwargs	marl_specr5   s       r$   r7   zPPOLearner.remove_moduleM   sY    )EGG))>>v>>	044YEEE&**9d;;;r&   	timestepsc                >   t                                          |           | j        j                                        D ]\  }}| j                            |          }| j        |                             |	                    t          d                    }| j                            |t          f|d           |j        rV|t          f| j        v rFt!          | j                            |t          f                    }|                     |||           d S )N)r;   r   )timestep   )window)r"   r   kl_loss)r-   after_gradient_based_updatemodule_rl_modulesitemsr   r   r/   updategetr   metrics	log_valuer   use_kl_lossr   r   peek_update_module_kl_coeff)r#   r;   r"   rB   r   new_entropy_coeffr@   r5   s          r$   rA   z&PPOLearner.after_gradient_based_updateV   s8    	++i+@@@!%!8!>!>!@!@ 	 	Iv[66yAAF !% H!fimm,JANNfOO  L""BC! #    " 674<GG*L%%y2H&IJJ  ,,'!# -   '	 	r&   c                     t           gS r(   r   )clss    r$   rl_module_required_apisz"PPOLearner.rl_module_required_apisx   s    
 !!!r&   r   r@   c                    dS )aO  Dynamically update the KL loss coefficients of each module.

        The update is completed using the mean KL divergence between the action
        distributions current policy and old policy of each module. That action
        distribution is computed during the most recent update/call to `compute_loss`.

        Args:
            module_id: The module whose KL loss coefficient to update.
            config: The AlgorithmConfig specific to the given `module_id`.
            kl_loss: The mean KL loss of the module, computed inside
                `compute_loss_for_module()`.
        N )r#   r"   r   r@   s       r$   rK   z"PPOLearner._update_module_kl_coeff   s      r&   )r   N)__name__
__module____qualname__r   r
   r.   r   r7   r   r   strr   rA   classmethodlisttyperO   abcabstractmethodr   floatrK   __classcell__)r5   s   @r$   r   r      sx       Xg- - - - - -^ Xgx       ;Xg S> 
	      ;:@ Xg"T
 " " "  ["
 	  	
  
       r&   r   )rY   typingr   r   ray.rllib.algorithms.ppo.ppor   r   r   ray.rllib.connectors.learnerr   r	   ray.rllib.core.learner.learnerr
   0ray.rllib.core.rl_module.apis.value_function_apir   ray.rllib.utils.annotationsr   r   "ray.rllib.utils.lambda_defaultdictr   ray.rllib.utils.metricsr   ray.rllib.utils.numpyr   #ray.rllib.utils.schedules.schedulerr   ray.rllib.utils.typingr   r   r   rQ   r&   r$   <module>rh      s}   



                 
        3 2 2 2 2 2 M M M M M M        A @ @ @ @ @      3 2 2 2 2 2 9 9 9 9 9 9 7 7 7 7 7 7 7 7v v v v v v v v v vr&   