
    &`iF                        d Z ddlZddlmZmZ ddlmZ ddlmZm	Z	 ddl
mZmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZ  ej        e          ZdZdZ dZ! G d de          Z" G d de          Z#dS )aO  Asynchronous Proximal Policy Optimization (APPO)

The algorithm is described in [1] (under the name of "IMPACT"):

Detailed documentation:
https://docs.ray.io/en/master/rllib-algorithms.html#appo

[1] IMPACT: Importance Weighted Asynchronous Architectures with Clipped Target Networks.
Luo et al. 2020
https://arxiv.org/pdf/1912.00167
    N)OptionalType)Self)DEPRECATED_VALUEdeprecation_warning)AlgorithmConfigNotProvided)IMPALAIMPALAConfig)RLModuleSpec)Policy)override)LAST_TARGET_UPDATE_TSLEARNER_STATS_KEYNUM_AGENT_STEPS_SAMPLEDNUM_ENV_STEPS_SAMPLEDNUM_TARGET_UPDATESmean_kl_losscurr_kl_coeffold_action_distc                       e Zd ZdZd fd	Z ee          eeeeeeeeeeeeeedde	e
         de	e
         de	e         de	e         d	e	e
         d
e	e         de	e         de	e         de	e         de	e         de	e         de	e         def fd            Z ee          d fd            Z ee          d             Z ee          defd            Ze ee           fd                        Z xZS )
APPOConfiga
  Defines a configuration class from which an APPO Algorithm can be built.

    .. testcode::

        from ray.rllib.algorithms.appo import APPOConfig
        config = (
            APPOConfig()
            .training(lr=0.01, grad_clip=30.0, train_batch_size_per_learner=50)
        )
        config = config.learners(num_learners=1)
        config = config.env_runners(num_env_runners=1)
        config = config.environment("CartPole-v1")

        # Build an Algorithm object from the config and run 1 training iteration.
        algo = config.build()
        algo.train()
        del algo

    .. testcode::

        from ray.rllib.algorithms.appo import APPOConfig
        from ray import tune

        config = APPOConfig()
        # Update the config object.
        config = config.training(lr=tune.grid_search([0.001,]))
        # Set the config object's env.
        config = config.environment(env="CartPole-v1")
        # Use to_dict() to get the old-style python config dict when running with tune.
        tune.Tuner(
            "APPO",
            run_config=tune.RunConfig(
                stop={"training_iteration": 1},
                verbose=0,
            ),
            param_space=config.to_dict(),

        ).fit()

    .. testoutput::
        :hide:

        ...
    Nc                 \   ddi| _         t                                          |pt                     d| _        d| _        d| _        d| _        d| _        d| _	        d| _
        d	| _        d
| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _         d| _!        d| _"        d| _#        tH          | _%        tH          | _&        dS )z"Initializes a APPOConfig instance.typeStochasticSampling)
algo_classTg      ?g?Fg{Gz?g       @         g      D@global_normadamgMb@?gGz?        g?g      ?Nr   d      i,  )'exploration_configsuper__init__APPOvtraceuse_gaelambda_
clip_paramuse_kl_losskl_coeff	kl_targettarget_worker_clippingcircular_buffer_num_batches$circular_buffer_iterations_per_batchnum_env_runnerstarget_network_update_freqbroadcast_interval	grad_clipgrad_clip_byopt_typelrdecaymomentumepsilonvf_loss_coeffentropy_coefftaulr_scheduleentropy_coeff_schedulenum_gpusnum_multi_gpu_tower_stacksminibatch_buffer_sizereplay_proportionreplay_buffer_num_slotslearner_queue_sizelearner_queue_timeoutr   target_update_frequency
use_critic)selfr   	__class__s     r/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo.pyr'   zAPPOConfig.__init__V   sH    (#
 	J$6$777
  &)# ,-(451  !*+'"# *
 !  &*#*+'%&"!$'*$"$%(" (8$*    )r)   r*   r+   r,   r-   r.   r/   r4   r?   r0   r1   r2   rI   rJ   r)   r*   r+   r,   r-   r.   r/   r4   r?   r0   r1   r2   returnc                .   |t           k    rt          ddd           |t           k    rt          ddd            t                      j        di | |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _	        |t          ur|| _
        |t          ur|| _        |t          ur|| _        |	t          ur|	| _        |
t          ur|
| _        |t          ur|| _        |t          ur|| _        | S )	u  Sets the training related configuration.

        Args:
            vtrace: Whether to use V-trace weighted advantages. If false, PPO GAE
                advantages will be used instead.
            use_gae: If true, use the Generalized Advantage Estimator (GAE)
                with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
                Only applies if vtrace=False.
            lambda_: GAE (lambda) parameter.
            clip_param: PPO surrogate slipping parameter.
            use_kl_loss: Whether to use the KL-term in the loss function.
            kl_coeff: Coefficient for weighting the KL-loss term.
            kl_target: Target term for the KL-term to reach (via adjusting the
                `kl_coeff` automatically).
            target_network_update_freq: NOTE: This parameter is only applicable on
                the new API stack. The frequency with which to update the target
                policy network from the main trained policy network. The metric
                used is `NUM_ENV_STEPS_TRAINED_LIFETIME` and the unit is `n` (see [1]
                4.1.1), where: `n = [circular_buffer_num_batches (N)] *
                [circular_buffer_iterations_per_batch (K)] * [train batch size]`
                For example, if you set `target_network_update_freq=2`, and N=4, K=2,
                and `train_batch_size_per_learner=500`, then the target net is updated
                every 2*4*2*500=8000 trained env steps (every 16 batch updates on each
                learner).
                The authors in [1] suggests that this setting is robust to a range of
                choices (try values between 0.125 and 4).
            target_network_update_freq: The frequency to update the target policy and
                tune the kl loss coefficients that are used during training. After
                setting this parameter, the algorithm waits for at least
                `target_network_update_freq` number of environment samples to be trained
                on before updating the target networks and tune the kl loss
                coefficients. NOTE: This parameter is only applicable when using the
                Learner API (enable_rl_module_and_learner=True).
            tau: The factor by which to update the target policy network towards
                the current policy network. Can range between 0 and 1.
                e.g. updated_param = tau * current_param + (1 - tau) * target_param
            target_worker_clipping: The maximum value for the target-worker-clipping
                used for computing the IS ratio, described in [1]
                IS = min(π(i) / π(target), ρ) * (π / π(i))
            circular_buffer_num_batches: The number of train batches that fit
                into the circular buffer. Each such train batch can be sampled for
                training max. `circular_buffer_iterations_per_batch` times.
            circular_buffer_iterations_per_batch: The number of times any train
                batch in the circular buffer can be sampled for training. A batch gets
                evicted from the buffer either if it's the oldest batch in the buffer
                and a new batch is added OR if the batch reaches this max. number of
                being sampled.

        Returns:
            This updated AlgorithmConfig object.
        rI   r4   T)oldnewerrorrJ   zM`use_critic` no longer supported! APPO always uses a value function (critic).)rQ   helprS    )r   r   r&   trainingr	   r)   r*   r+   r,   r-   r.   r/   r4   r?   r0   r1   r2   )rK   r)   r*   r+   r,   r-   r.   r/   r4   r?   r0   r1   r2   rI   rJ   kwargsrL   s                   rM   rV   zAPPOConfig.training   sh   P #&666-0   
 ))) %	    	""6"""$$ DK+%%"DL+%%"DL[(((DOk))*D;&&$DMK''&DN%[88.HD+k!!DH!44*@D'&k99/JD,/{BB4 5 rN   c                 4   t                                                       | j        rm| j        dk    s| j        dk    r|                     d           | j        dk    r|                     d           | j        dk    r|                     d           d S d S d S )Nr   r"   a  `minibatch_buffer_size/replay_proportion` not valid on new API stack with APPO! Use `circular_buffer_num_batches` for the number of train batches in the circular buffer. To change the maximum number of times any batch may be sampled, set `circular_buffer_iterations_per_batch`.aO  `num_multi_gpu_tower_stacks` not supported on new API stack with APPO! In order to train on multi-GPU, use `config.learners(num_learners=[number of GPUs], num_gpus_per_learner=1)`. To scale the throughput of batch-to-GPU-pre-loading on each of your `Learners`, set `num_gpu_loader_threads` to a higher number (recommended values: 1-8).r$   aE  `learner_queue_size` not supported on new API stack with APPO! In order set the size of the circular buffer (which acts as a 'learner queue'), use `config.training(circular_buffer_num_batches=..)`. To change the maximum number of times any batch may be sampled, set `config.training(circular_buffer_iterations_per_batch=..)`.)r&   validateenable_rl_module_and_learnerrD   rE   _value_errorrC   rG   rK   rL   s    rM   rY   zAPPOConfig.validate  s     , 	)Q..$2HC2O2O!!>   .!33!!   &",,!!R    +	 	( -,rN   c                     | j         dk    rddlm} |S | j         dv rt          d          t          d| j          d          )Ntorchr   )APPOTorchLearner)tf2tfzPTensorFlow is no longer supported on the new API stack! Use `framework='torch'`.The framework z+ is not supported. Use `framework='torch'`.)framework_str2ray.rllib.algorithms.appo.torch.appo_torch_learnerr_   
ValueError)rK   r_   s     rM   get_default_learner_classz$APPOConfig.get_default_learner_class4  s    ((      $#=00+  
 +!3 + + +  rN   c                 v    | j         dk    rddlm} nt          d| j          d          t	          |          S )Nr^   r   )APPOTorchRLModulerb   z/ is not supported. Use either 'torch' or 'tf2'.)module_class)rc   4ray.rllib.algorithms.appo.torch.appo_torch_rl_modulerh   re   r   )rK   RLModules     rM   get_default_rl_module_specz%APPOConfig.get_default_rl_module_specG  ss    ((       /!3 / / /  
 2222rN   c                 4    t                      j        ddiz  S )Nvf_share_layersF)r&   _model_config_auto_includesr\   s    rM   ro   z&APPOConfig._model_config_auto_includesU  s     ww26G5OOOrN   NrO   N)__name__
__module____qualname____doc__r'   r   r   r	   r   r   boolfloatintr   rV   rY   rf   r   rl   propertyr   ro   __classcell__rL   s   @rM   r   r   (   sS       + +ZC+ C+ C+ C+ C+ C+J Xl "-"-#.&1&1$/%04?*2=5@>I 0##s s s s $	s
 %s UOs d^s 5/s E?s %-SMs e_s !)s &.c]s /7sms& 
's s s s s sj Xl           D Xl  $ Xl3L 3 3 3 3 XoP P P P  XP P P P PrN   r   c                        e Zd Z fdZ ee          d fd            Ze ee          defd                        Z	e ee          de
deee                  fd                        Z xZS )	r(   c                      t                      j        |g|R i | | j        j        s| j                            d            dS dS )zInitializes an APPO instance.c                 *    |                                  S rp   update_targetp_s     rM   <lambda>zAPPO.__init__.<locals>.<lambda>e      ARAR rN   N)r&   r'   configrZ   
env_runnerforeach_policy_to_train)rK   r   argsrW   rL   s       rM   r'   zAPPO.__init__\  sf    1$111&111 {7 	TO334R4RSSSSS	T 	TrN   rO   Nc                 :   | j         j        r t                                                      S t                                                      | j        t
                   }| j        | j         j        dk    rt          nt                   }| j         j	        | j         j
        z  }||z
  |k    ro| j        t          xx         dz  cc<   || j        t
          <   | j                            d            | j         j        rfd}| j                            |           S )Nagent_stepsr   c                 *    |                                  S rp   r   r   s     rM   r   z$APPO.training_step.<locals>.<lambda>~  r   rN   c                 Z   t           vs$J d                    t                     f            |v rK|         t                                        d          }|J |f            |                     |           d S t                              d                    |                     d S )Nz'{} should be nested under policy id keyklzNo data for {}, not updating kl)r   formatget	update_klloggerwarning)pipi_idr   train_resultss      rM   updatez"APPO.training_step.<locals>.update  s    ,MAAAAHH-  &	DAAA --*512CDHHNN!~~u/E~~~R((((('H'O'OPU'V'VWWWWWrN   )r   rZ   r&   training_step	_countersr   count_steps_byr   r   
num_epochsrD   r   r   r   r-   )rK   last_updatecur_tstarget_update_freqr   r   rL   s        @rM   r   zAPPO.training_stepg  s0   ;3 	+77((***--// n%:; ;->> ('*	
 "[3dk6WWK"444N-...!3...4:DN01 O334R4RSSS {& @X X X X X" 77???rN   c                     t                      S rp   )r   )clss    rM   get_default_configzAPPO.get_default_config  s     ||rN   r   c                     |d         dk    rddl m} |S |d         dk    r|j        rt          d          ddlm} |S ddlm} |S )	N	frameworkr^   r   )APPOTorchPolicyra   zWRLlib's RLModule and Learner API is not supported for tf1. Use framework='tf2' instead.)APPOTF1Policy)APPOTF2Policy)+ray.rllib.algorithms.appo.appo_torch_policyr   rZ   re   (ray.rllib.algorithms.appo.appo_tf_policyr   r   )r   r   r   r   r   s        rM   get_default_policy_classzAPPO.get_default_policy_class  s    
 +'))SSSSSS""K D((2  /  
 ONNNNN  NNNNNN  rN   rq   )rr   rs   rt   r'   r   r
   r   classmethodr   r   r   r   r   r   r   rz   r{   s   @rM   r(   r(   [  s        	T 	T 	T 	T 	T Xf. . . . . .` Xf:     [ Xf!$!	$v,	! ! !  [! ! ! ! !rN   r(   )$ru   loggingtypingr   r   typing_extensionsr   ray._common.deprecationr   r   %ray.rllib.algorithms.algorithm_configr   r	   "ray.rllib.algorithms.impala.impalar
   r   "ray.rllib.core.rl_module.rl_moduler   ray.rllib.policy.policyr   ray.rllib.utils.annotationsr   ray.rllib.utils.metricsr   r   r   r   r   	getLoggerrr   r   LEARNER_RESULTS_KL_KEY!LEARNER_RESULTS_CURR_KL_COEFF_KEYOLD_ACTION_DIST_KEYr   r(   rU   rN   rM   <module>r      s  
 
  ! ! ! ! ! ! ! ! " " " " " " I I I I I I I I N N N N N N N N C C C C C C C C ; ; ; ; ; ; * * * * * * 0 0 0 0 0 0              
	8	$	$ ( $3 !' pP pP pP pP pP pP pP pPf	X! X! X! X! X!6 X! X! X! X! X!rN   