
    &`i]                        d Z ddlZddlmZmZmZmZmZmZm	Z	 ddl
mZ ddlmZ ddlmZ ddlmZmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4 erddl5m6Z6  ej7        e8          Z9dZ:dZ;dZ<dZ=dZ> G d de          Z? G d de          Z@dS )a3  
Proximal Policy Optimization (PPO)
==================================

This file defines the distributed Algorithm class for proximal policy
optimization.
See `ppo_[tf|torch]_policy.py` for the definition of the policy loss.

Detailed documentation: https://docs.ray.io/en/master/rllib-algorithms.html#ppo
    N)TYPE_CHECKINGAnyDictListOptionalTypeUnion)Self)DEPRECATED_VALUE)	Algorithm)AlgorithmConfigNotProvided)RLModuleSpec)standardize_fieldssynchronous_parallel_sample)multi_gpu_train_one_steptrain_one_step)Policy)OldAPIStackoverride)ALL_MODULESENV_RUNNER_RESULTSENV_RUNNER_SAMPLING_TIMERLEARNER_RESULTSLEARNER_UPDATE_TIMERNUM_AGENT_STEPS_SAMPLEDNUM_ENV_STEPS_SAMPLEDNUM_ENV_STEPS_SAMPLED_LIFETIME!NUM_MODULE_STEPS_TRAINED_LIFETIMESAMPLE_TIMERSYNCH_WORKER_WEIGHTS_TIMERTIMERS)LEARNER_STATS_KEY)	Scheduler)
ResultDict)log_once)Learnervf_loss_unclippedvf_explained_varmean_kl_losscurr_kl_coeffcurr_entropy_coeffc            !           e Zd ZdZd fd	Z ee          defd            Z ee          de	e
d         ef         fd            Z ee          eeeeeeeeeeeeeedd	ee         d
ee         dee         dee         dee         dee         dee         dee         deeee	eef                                    dee         dee         dee         deeee	eef                                    def fd            Z ee          d fd            Ze ee          deeef         f fd                        Z xZS )	PPOConfigaT  Defines a configuration class from which a PPO Algorithm can be built.

    .. testcode::

        from ray.rllib.algorithms.ppo import PPOConfig

        config = PPOConfig()
        config.environment("CartPole-v1")
        config.env_runners(num_env_runners=1)
        config.training(
            gamma=0.9, lr=0.01, kl_coeff=0.3, train_batch_size_per_learner=256
        )

        # Build a Algorithm object from the config and run 1 training iteration.
        algo = config.build()
        algo.train()

    .. testcode::

        from ray.rllib.algorithms.ppo import PPOConfig
        from ray import tune

        config = (
            PPOConfig()
            # Set the config object's env.
            .environment(env="CartPole-v1")
            # Update the config object's training parameters.
            .training(
                lr=0.001, clip_param=0.2
            )
        )

        tune.Tuner(
            "PPO",
            run_config=tune.RunConfig(stop={"training_iteration": 1}),
            param_space=config,
        ).fit()

    .. testoutput::
        :hide:

        ...
    Nc                    ddi| _         t                                          |pt                     d| _        d| _        d| _        d| _        d| _        d| _	        d	| _
        d| _        d
| _        d| _        d| _        d| _        d
| _        d| _        d| _        d| _        d| _        d| _        d| j        d<   d| _        d| _        t2          | _        t2          | _        dS )z!Initializes a PPOConfig instance.typeStochasticSampling)
algo_classg-C6
?autoi  T      g      ?g?g{Gz?        g333333?g      $@N   Fvf_share_layers)exploration_configsuper__init__PPOlrrollout_fragment_lengthtrain_batch_size
use_criticuse_gae
num_epochsminibatch_sizeshuffle_batch_per_epochlambda_use_kl_losskl_coeff	kl_targetvf_loss_coeffentropy_coeff
clip_paramvf_clip_param	grad_clipnum_env_runnersmodelentropy_coeff_schedulelr_scheduler   sgd_minibatch_sizer8   )selfr2   	__class__s     p/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/algorithms/ppo/ppo.pyr;   zPPOConfig.__init__l   s     (#
 	J$5#666 '-$ $ !'+$  !  ! ).
$%&*# #3/    returnc                 t    | j         dk    rddlm} t          |          S t	          d| j          d          )Ntorchr   )DefaultPPOTorchRLModule)module_classThe framework z/ is not supported. Use either 'torch' or 'tf2'.)framework_str:ray.rllib.algorithms.ppo.torch.default_ppo_torch_rl_modulerZ   r   
ValueError)rS   rZ   s     rU   get_default_rl_module_specz$PPOConfig.get_default_rl_module_spec   sm    ((       -DEEEE/!3 / / /  rV   r'   c                     | j         dk    rddlm} |S | j         dv rt          d          t          d| j          d          )NrY   r   )PPOTorchLearner)tf2tfzPTensorFlow is no longer supported on the new API stack! Use `framework='torch'`.r\   z+ is not supported. Use `framework='torch'`.)r]   0ray.rllib.algorithms.ppo.torch.ppo_torch_learnerrb   r_   )rS   rb   s     rU   get_default_learner_classz#PPOConfig.get_default_learner_class   s    ((      #"=00+  
 +!3 + + +  rV   )r@   rA   rE   rF   rG   rH   rI   rJ   rP   rK   rL   rM   rQ   r8   r@   rA   rE   rF   rG   rH   rI   rJ   rP   rK   rL   rM   rQ   c                    t                      j        di | |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _	        |t          ur|| _
        |
t          ur|
| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |	t          ur|	| _        | S )a	  Sets the training related configuration.

        Args:
            use_critic: Should use a critic as a baseline (otherwise don't use value
                baseline; required for using GAE).
            use_gae: If true, use the Generalized Advantage Estimator (GAE)
                with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
            lambda_: The lambda parameter for General Advantage Estimation (GAE).
                Defines the exponential weight used between actually measured rewards
                vs value function estimates over multiple time steps. Specifically,
                `lambda_` balances short-term, low-variance estimates against long-term,
                high-variance returns. A `lambda_` of 0.0 makes the GAE rely only on
                immediate rewards (and vf predictions from there on, reducing variance,
                but increasing bias), while a `lambda_` of 1.0 only incorporates vf
                predictions at the truncation points of the given episodes or episode
                chunks (reducing bias but increasing variance).
            use_kl_loss: Whether to use the KL-term in the loss function.
            kl_coeff: Initial coefficient for KL divergence.
            kl_target: Target value for KL divergence.
            vf_loss_coeff: Coefficient of the value function loss. IMPORTANT: you must
                tune this if you set vf_share_layers=True inside your model's config.
            entropy_coeff: The entropy coefficient (float) or entropy coefficient
                schedule in the format of
                [[timestep, coeff-value], [timestep, coeff-value], ...]
                In case of a schedule, intermediary timesteps will be assigned to
                linearly interpolated coefficient values. A schedule config's first
                entry must start with timestep 0, i.e.: [[0, initial_value], [...]].
            clip_param: The PPO clip parameter.
            vf_clip_param: Clip param for the value function. Note that this is
                sensitive to the scale of the rewards. If your expected V is large,
                increase this.
            grad_clip: If specified, clip the global norm of gradients by this amount.

        Returns:
            This updated AlgorithmConfig object.
         )r:   trainingr   r@   rA   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rQ   rP   )rS   r@   rA   rE   rF   rG   rH   rI   rJ   rP   rK   rL   rM   rQ   r8   kwargsrT   s                   rU   ri   zPPOConfig.training   s   v 	""6"""[(((DO +%%"DL+%%"DLk))*D;&&$DMK''&DN++!.D++!.D[(((DO++!.DK''&DN k))*D!44*@D'rV   c                    t                                                       |                                  | j        sG| j        | j        k    r7|                     d| j         d| j         d| j         d| j         d	           ns| j        rl| j        }| j        p| j        }t          |t                    rBt          |t                    r-||k    r'|                     d| d| d| d| j         d	           | j        s'| j        dk    r| j        s|                     d           | j        rT| j        |                     d	           | j        |                     d
           t!          j        | j        dd           t          | j        t$                    r"| j        dk     r|                     d           d S d S d S )Nz`minibatch_size` (z!) must be <= `train_batch_size` (z.). In PPO, the train batch will be split into zG chunks, each of which is iterated over (used for updating the policy) z times.z-) must be <= `train_batch_size_per_learner` (truncate_episodeszEpisode truncation is not supported without a value function (to estimate the return at the end of the truncated trajectory). Consider setting batch_mode=complete_episodes.zW`lr_schedule` is deprecated and must be None! Use the `lr` setting to setup a schedule.zm`entropy_coeff_schedule` is deprecated and must be None! Use the `entropy_coeff` setting to setup a schedule.rJ   zentropy coefficient)fixed_value_or_schedulesetting_namedescriptionr6   z`entropy_coeff` must be >= 0.0)r:   validate4validate_train_batch_size_vs_rollout_fragment_lengthenable_rl_module_and_learnerrC   r?   _value_errorrB   train_batch_size_per_learner
isinstanceintin_evaluation
batch_moderA   rQ   rP   r$   rJ   float)rS   mbstbsrT   s      rU   rp   zPPOConfig.validate  s    	 	AACCC 1	#d&;;;T%8  '+'< '+':  DH?      . 		%C3Lt7LC#s## 
3(<(< s!!O O O7:O O+.O O 7;oO O O   "
	#666L 7 0   , 	+!!8   *6!!C   (,(:,1   
 d(%00 	@T5G#5M5M>?????	@ 	@5M5MrV   c                 4    t                      j        ddiz  S )Nr8   F)r:   _model_config_auto_includes)rS   rT   s    rU   r}   z%PPOConfig._model_config_auto_includesg  s     ww26G5OOOrV   NrW   N)__name__
__module____qualname____doc__r;   r   r   r   r`   r	   r   strrf   r   r   r   boolry   r   rv   r
   ri   rp   propertyr   r   r}   __classcell__)rT   s   @rU   r.   r.   ?   s       * *X/0 /0 /0 /0 /0 /0b XoL     Xo5i#1E+F    $ Xo &1"-#.&1$/%0)4)4JU&1)4%0?J(%[ [ [ TN[ $	[
 %[ d^[ 5/[ E?[  [  [ !)d5e3D.E)F G[ UO[  [ E?[  d4c5j(9#:;<![( 
)[ [ [ [ [ [z XoH@ H@ H@ H@ H@ H@T XoPT#s(^ P P P P P  XP P P P PrV   r.   c                       e Zd Ze ee          defd                        Ze ee          dede	e
e                  fd                        Z ee          dd            Zedefd            ZdS )	r<   rW   c                     t                      S r~   )r.   )clss    rU   get_default_configzPPO.get_default_confign  s     {{rV   configc                 b    |d         dk    rddl m} |S |d         dk    rddlm} |S ddlm} |S )N	frameworkrY   r   )PPOTorchPolicyrd   )PPOTF1Policy)PPOTF2Policy))ray.rllib.algorithms.ppo.ppo_torch_policyr   &ray.rllib.algorithms.ppo.ppo_tf_policyr   r   )r   r   r   r   r   s        rU   get_default_policy_classzPPO.get_default_policy_classs  sl    
 +'))PPPPPP!!K D((KKKKKKKKKKKKrV   Nc                 D   | j         j        s|                                 S | j                            t
          t          f          5  | j         j        dk    r;t          | j	        | j         j
        | j         j        | j         j        d          \  }}n:t          | j	        | j         j
        | j         j        | j         j        d          \  }}|s	 d d d            d S | j                            |t                     d d d            n# 1 swxY w Y   | j                            t
          t          f          5  | j                            |t"          | j                            t          t"          f          t&          | j                            t(          t*          t&          fd          i| j         j        | j         j        | j         j                  }| j                            |t(                     d d d            n# 1 swxY w Y   | j                            t
          t2          f          5  t5          |d                                                   t*          hz
  }| j	                            | j        |d	           d d d            d S # 1 swxY w Y   d S )
Nagent_stepsT)
worker_setmax_agent_stepssample_timeout_s_uses_new_env_runners_return_metrics)r   max_env_stepsr   r   r   )keyr   )default)episodes	timestepsrB   rC   rD   )from_worker_or_learner_grouppoliciesinference_only)r   "enable_env_runner_and_connector_v2_training_step_old_api_stackmetricslog_timer"   r   count_steps_byr   env_runner_grouptotal_train_batch_sizer   	aggregater   r   learner_groupupdater   peekr   r   r   rB   rC   rD   r!   setkeyssync_weights)rS   r   env_runner_resultslearner_resultsmodules_to_updates        rU   training_stepzPPO.training_step  s    {= 	744666 \""F,E#FGG 	O 	O{)]::/J#4$(K$F%)[%AF$(0 0 0,,, 0K#4"&+"D%)[%AF$(0 0 0,,  1	O 	O 	O 	O 	O 	O 	O 	O6 L""#5;M"NNN7	O 	O 	O 	O 	O 	O 	O 	O 	O 	O 	O 	O 	O 	O 	O< \""F,@#ABB 	I 	I"077!2))/1OP  6)) / + A
 %& *  "  ;1#{9(,(K+ 8  O. L""?"HHH1	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I8 \""F,F#GHH 	 	 !$OA$6$;$;$=$= > >+ N!..-1-?*#	 /   	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s?   B	D
!D

DD;B=HHH5AJJJc           
      h     j         t                   5   j        j        dk    r,t	           j         j        j         j        j                  }n+t	           j         j        j         j        j                  }|si cd d d            S |                                } j	        t          xx         |                                z  cc<    j	        t          xx         |                                z  cc<   t          |dg          }d d d            n# 1 swxY w Y    j        j        rt!           |          }nt#           |          }t%          |                                          } j	        t                    fd|D             d} j         t(                   5   j                                        dk    rd } j                            |||           d d d            n# 1 swxY w Y   |                                D ]\  }}|t0                                       d	          }                     |                              |            j        j        |t0                   d
         z  }	|t0                   d         }
t;          d          rc j                            di                               d          r5|	dk    r/t<                              d                     ||	|
                     |j!        |         "                    d            |j!        |         d         #                                }t;          d          rF| j        j$        k    r6d _%        t<                              d| d j        d          d| d            j&        '                    |           |S )Nr   )r   r   r   )r   r   r   
advantagesc                 @    i | ]}|j         j        |         j        S rh   )
env_runner
policy_mapnum_grad_updates).0pidrS   s     rU   
<dictcomp>z4PPO._training_step_old_api_stack.<locals>.<dictcomp>  s8     , , , T_/4E, , ,rV   )timestepnum_grad_updates_per_policyr   )r   r   global_varsklvf_losspolicy_lossppo_warned_lr_ratiorO   r8   d   zThe magnitude of your value function loss for policy: {} is extremely large ({}) compared to the policy loss ({}). This can prevent the policy from learning. Consider scaling down the VF loss by reducing vf_loss_coeff, or disabling vf_share_layers.rewardsppo_warned_vf_clipTz1The mean reward returned from the environment is z! but the vf_clip_param is set to rL   z%. Consider increasing it for policy: z' to improve value function convergence.)(_timersr    r   r   r   r   r   r   as_multi_agent	_countersr   r   r   	env_stepsr   simple_optimizerr   r   listr   r!   num_remote_workersr   itemsr#   get
get_policy	update_klrI   r&   loggerwarningformatpolicy_batchesset_get_interceptormeanrL   warned_vf_clipr   set_global_vars)rS   train_batchtrain_resultspolicies_to_updater   r   	policy_idpolicy_infokl_divergencescaled_vf_lossr   mean_rewards   `           rU   r   z PPO._training_step_old_api_stack  s    \,' 	J 	J{)]::9#4$(K$F%)[%A   :#4"&+"D%)[%A    	J 	J 	J 	J 	J 	J 	J 	J  &4466KN2333{7N7N7P7PP333N0111[5J5J5L5LL111,[<.IIK)	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J, ;' 	H*4==MM4T;GGM!-"4"4"6"677 '>?, , , ,-, , ,	
 
 \45 	 	$7799A==/3,%221M/ + 3   	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 '4&9&9&;&; $	 $	"I{ ((9:>>tDDMOOI&&00??? )K8I,J9,UU  &&78GK.//KOOGR00445FGG #S((' (.vi'U'U   &y1EEdKKK%4Y?	JOOQQK-..
$+";;;&*#3 3 38<O8T3 3;D3 3 3   	''444s+   A+DA=DDD!=G**G.1G.r   )r   r   r   classmethodr   r   r.   r   r   r   r   r   r   r   r   r%   r   rh   rV   rU   r<   r<   m  s        Xi9     [ Xi $ 	$v,	       [ " XiO O O Ob aj a a a [a a arV   r<   )Ar   loggingtypingr   r   r   r   r   r   r	   typing_extensionsr
   ray._common.deprecationr   ray.rllib.algorithms.algorithmr   %ray.rllib.algorithms.algorithm_configr   r   "ray.rllib.core.rl_module.rl_moduler   ray.rllib.execution.rollout_opsr   r   ray.rllib.execution.train_opsr   r   ray.rllib.policy.policyr   ray.rllib.utils.annotationsr   r   ray.rllib.utils.metricsr   r   r   r   r   r   r   r   r   r    r!   r"   $ray.rllib.utils.metrics.learner_infor#   #ray.rllib.utils.schedules.schedulerr$   ray.rllib.utils.typingr%   ray.util.debugr&   ray.rllib.core.learner.learnerr'   	getLoggerr   r   %LEARNER_RESULTS_VF_LOSS_UNCLIPPED_KEY$LEARNER_RESULTS_VF_EXPLAINED_VAR_KEYLEARNER_RESULTS_KL_KEY!LEARNER_RESULTS_CURR_KL_COEFF_KEY&LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEYr.   r<   rh   rV   rU   <module>r      s  	 	  H H H H H H H H H H H H H H H H H H " " " " " " 4 4 4 4 4 4 4 4 4 4 4 4 N N N N N N N N ; ; ; ; ; ;               + * * * * * = = = = = = = =                            C B B B B B 9 9 9 9 9 9 - - - - - - # # # # # # 7666666 
	8	$	$(; %'9 $' $3 !)= &kP kP kP kP kP kP kP kP\	M M M M M) M M M M MrV   