
    &`il                     z   d dl Z d dlmZmZmZmZmZmZ d dlm	Z	 d dl
mZmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z#m$Z$ d dl%m&Z& d dl'm(Z(m)Z)  e#            \  Z*Z+Z, e$            Z- e j.        e/          Z0 G d de          Z1 G d de          Z2dS )    N)AnyDictOptionalTupleTypeUnion)Self)DEPRECATED_VALUEdeprecation_warning)AlgorithmConfigNotProvided)DQN)SACTFPolicy)"AddObservationsFromEpisodesToBatch)+AddNextObservationsFromEpisodesToTrainBatch)Learner)RLModuleSpec)Policy)deep_update)override)try_import_tftry_import_tfp)EpisodeReplayBuffer)LearningRateOrScheduleRLModuleSpecTypec            ,       V    e Zd ZdZd$ fd	Z ee          eeeeeeeeeeeeeeeeeeeeddee	         dee
eef                  dee
eef                  dee         d	ee         d
eeeef                  deeeeeef         f                  dee	         dee
eef                  dee         dee	         dee         dee
eef                  dee         dee         dee         dee         dee	         dee	         dee         def* fd            Z ee          d% fd            Z ee          d&dedefd            Z ee          defd            Z ee          deed          ef         fd!            Z ee          	 d$ fd"	            Ze fd#            Z xZS )'	SACConfiga   Defines a configuration class from which an SAC Algorithm can be built.

    .. testcode::

        config = (
            SACConfig()
            .environment("Pendulum-v1")
            .env_runners(num_env_runners=1)
            .training(
                gamma=0.9,
                actor_lr=0.001,
                critic_lr=0.002,
                train_batch_size_per_learner=32,
            )
        )
        # Build the SAC algo object from the config and run 1 training iteration.
        algo = config.build()
        algo.train()
    Nc                 X   ddi| _         t                                          |pt                     d| _        ddgdg d d i d| _        ddgdg d d i d| _        d| _        d	| _        d
| _	        d| _
        d| _        dt          d          ddd| _        d| _        d | _        dddd| _        d| _        d| _        d| _        d | _        d | _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        t>          | _         t>          | _!        d S )NtypeStochasticSampling)
algo_classT   relu)fcnet_hiddensfcnet_activationpost_fcnet_hiddenspost_fcnet_activationcustom_modelcustom_model_configFg{Gzt?g      ?auto   PrioritizedEpisodeReplayBufferg    .Ag333333?g?)r   capacityalphabetaga2U0*3?)actor_learning_ratecritic_learning_rateentropy_learning_rategiUMu>r   i  d   )"exploration_configsuper__init__SACtwin_qq_model_configpolicy_model_configclip_actionstauinitial_alphatarget_entropyn_stepintreplay_buffer_configstore_buffer_in_checkpointstraining_intensityoptimizationactor_lr	critic_lralpha_lrlr	grad_cliptarget_network_update_freqrollout_fragment_lengthtrain_batch_size_per_learnertrain_batch_size(num_steps_sampled_before_learning_startsmin_time_s_per_iteration"min_sample_timesteps_per_iteration_deterministic_loss_use_beta_distributionr
   use_state_preprocessorworker_side_prioritization)selfr!   	__class__s     p/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/algorithms/sac/sac.pyr6   zSACConfig.__init__4   s    (#
 	J$5#666 !3Z &"$%) #%
 
 "3Z &"$%) #%$
 $
  " $ 5 C%
 %
! ,1("&#'$(%)
 

 *+' (.$ -0) #8<5 )*%25/ $) &+#&6#*:'''    )r8   r9   r:   r<   r=   r>   r?   rB   rA   rC   r;   rI   optimization_configrE   rF   rG   rJ   rQ   rR   rN   r8   r9   r:   r<   r=   r>   r?   rB   rA   rC   r;   rI   rY   rE   rF   rG   rJ   rQ   rR   rN   returnc                N    t                      j        di | |t          ur|| _        |t          ur| j                            |           |t          ur| j                            |           |t          ur|| _        |t          ur|| _        |t          ur|| _	        |t          ur|| _
        |t          ur|| _        |	t          ur+t          d| j        id|	iddgdg          }|d         | _        |
t          ur|
| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        | S )u *  Sets the training related configuration.

        Args:
            twin_q: Use two Q-networks (instead of one) for action-value estimation.
                Note: Each Q-network will have its own target network.
            q_model_config: Model configs for the Q network(s). These will override
                MODEL_DEFAULTS. This is treated just as the top-level `model` dict in
                setting up the Q-network(s) (2 if twin_q=True).
                That means, you can do for different observation spaces:
                `obs=Box(1D)` -> `Tuple(Box(1D) + Action)` -> `concat` -> `post_fcnet`
                obs=Box(3D) -> Tuple(Box(3D) + Action) -> vision-net -> concat w/ action
                -> post_fcnet
                obs=Tuple(Box(1D), Box(3D)) -> Tuple(Box(1D), Box(3D), Action)
                -> vision-net -> concat w/ Box(1D) and action -> post_fcnet
                You can also have SAC use your custom_model as Q-model(s), by simply
                specifying the `custom_model` sub-key in below dict (just like you would
                do in the top-level `model` dict.
            policy_model_config: Model options for the policy function (see
                `q_model_config` above for details). The difference to `q_model_config`
                above is that no action concat'ing is performed before the post_fcnet
                stack.
            tau: Update the target by 	au * policy + (1-	au) * target_policy.
            initial_alpha: Initial value to use for the entropy weight alpha.
            target_entropy: Target entropy lower bound. If "auto", will be set
                to `-|A|` (e.g. -2.0 for Discrete(2), -3.0 for Box(shape=(3,))).
                This is the inverse of reward scale, and will be optimized
                automatically.
            n_step: N-step target updates. If >1, sars' tuples in trajectories will be
                postprocessed to become sa[discounted sum of R][s t+n] tuples. An
                integer will be interpreted as a fixed n-step value. If a tuple of 2
                ints is provided here, the n-step value will be drawn for each sample(!)
                in the train batch from a uniform distribution over the closed interval
                defined by `[n_step[0], n_step[1]]`.
            store_buffer_in_checkpoints: Set this to True, if you want the contents of
                your buffer(s) to be stored in any saved checkpoints as well.
                Warnings will be created if:
                - This is True AND restoring from a checkpoint that contains no buffer
                data.
                - This is False AND restoring from a checkpoint that does contain
                buffer data.
            replay_buffer_config: Replay buffer config.
                Examples:
                {
                "_enable_replay_buffer_api": True,
                "type": "MultiAgentReplayBuffer",
                "capacity": 50000,
                "replay_batch_size": 32,
                "replay_sequence_length": 1,
                }
                - OR -
                {
                "_enable_replay_buffer_api": True,
                "type": "MultiAgentPrioritizedReplayBuffer",
                "capacity": 50000,
                "prioritized_replay_alpha": 0.6,
                "prioritized_replay_beta": 0.4,
                "prioritized_replay_eps": 1e-6,
                "replay_sequence_length": 1,
                }
                - Where -
                prioritized_replay_alpha: Alpha parameter controls the degree of
                prioritization in the buffer. In other words, when a buffer sample has
                a higher temporal-difference error, with how much more probability
                should it drawn to use to update the parametrized Q-network. 0.0
                corresponds to uniform probability. Setting much above 1.0 may quickly
                result as the sampling distribution could become heavily “pointy” with
                low entropy.
                prioritized_replay_beta: Beta parameter controls the degree of
                importance sampling which suppresses the influence of gradient updates
                from samples that have higher probability of being sampled via alpha
                parameter and the temporal-difference error.
                prioritized_replay_eps: Epsilon parameter sets the baseline probability
                for sampling so that when the temporal-difference error of a sample is
                zero, there is still a chance of drawing the sample.
            training_intensity: The intensity with which to update the model (vs
                collecting samples from the env).
                If None, uses "natural" values of:
                `train_batch_size` / (`rollout_fragment_length` x `num_env_runners` x
                `num_envs_per_env_runner`).
                If not None, will make sure that the ratio between timesteps inserted
                into and sampled from th buffer matches the given values.
                Example:
                training_intensity=1000.0
                train_batch_size=250
                rollout_fragment_length=1
                num_env_runners=1 (or 0)
                num_envs_per_env_runner=1
                -> natural value = 250 / 1 = 250.0
                -> will make sure that replay+train op will be executed 4x asoften as
                rollout+insert op (4 * 250 = 1000).
                See: rllib/algorithms/dqn/dqn.py::calculate_rr_weights for further
                details.
            clip_actions: Whether to clip actions. If actions are already normalized,
                this should be set to False.
            grad_clip: If not None, clip gradients during optimization at this value.
            optimization_config: Config dict for optimization. Set the supported keys
                `actor_learning_rate`, `critic_learning_rate`, and
                `entropy_learning_rate` in here.
            actor_lr: The learning rate (float) or learning rate schedule for the
                policy in the format of
                [[timestep, lr-value], [timestep, lr-value], ...] In case of a
                schedule, intermediary timesteps will be assigned to linearly
                interpolated learning rate values. A schedule config's first entry
                must start with timestep 0, i.e.: [[0, initial_value], [...]].
                Note: It is common practice (two-timescale approach) to use a smaller
                learning rate for the policy than for the critic to ensure that the
                critic gives adequate values for improving the policy.
                Note: If you require a) more than one optimizer (per RLModule),
                b) optimizer types that are not Adam, c) a learning rate schedule that
                is not a linearly interpolated, piecewise schedule as described above,
                or d) specifying c'tor arguments of the optimizer that are not the
                learning rate (e.g. Adam's epsilon), then you must override your
                Learner's `configure_optimizer_for_module()` method and handle
                lr-scheduling yourself.
                The default value is 3e-5, one decimal less than the respective
                learning rate of the critic (see `critic_lr`).
            critic_lr: The learning rate (float) or learning rate schedule for the
                critic in the format of
                [[timestep, lr-value], [timestep, lr-value], ...] In case of a
                schedule, intermediary timesteps will be assigned to linearly
                interpolated learning rate values. A schedule config's first entry
                must start with timestep 0, i.e.: [[0, initial_value], [...]].
                Note: It is common practice (two-timescale approach) to use a smaller
                learning rate for the policy than for the critic to ensure that the
                critic gives adequate values for improving the policy.
                Note: If you require a) more than one optimizer (per RLModule),
                b) optimizer types that are not Adam, c) a learning rate schedule that
                is not a linearly interpolated, piecewise schedule as described above,
                or d) specifying c'tor arguments of the optimizer that are not the
                learning rate (e.g. Adam's epsilon), then you must override your
                Learner's `configure_optimizer_for_module()` method and handle
                lr-scheduling yourself.
                The default value is 3e-4, one decimal higher than the respective
                learning rate of the actor (policy) (see `actor_lr`).
            alpha_lr: The learning rate (float) or learning rate schedule for the
                hyperparameter alpha in the format of
                [[timestep, lr-value], [timestep, lr-value], ...] In case of a
                schedule, intermediary timesteps will be assigned to linearly
                interpolated learning rate values. A schedule config's first entry
                must start with timestep 0, i.e.: [[0, initial_value], [...]].
                Note: If you require a) more than one optimizer (per RLModule),
                b) optimizer types that are not Adam, c) a learning rate schedule that
                is not a linearly interpolated, piecewise schedule as described above,
                or d) specifying c'tor arguments of the optimizer that are not the
                learning rate (e.g. Adam's epsilon), then you must override your
                Learner's `configure_optimizer_for_module()` method and handle
                lr-scheduling yourself.
                The default value is 3e-4, identical to the critic learning rate (`lr`).
            target_network_update_freq: Update the target network every
                `target_network_update_freq` steps.
            num_steps_sampled_before_learning_starts: Number of timesteps (int)
                that we collect from the runners before we start sampling the
                replay buffers for learning. Whether we count this in agent steps
                or environment steps depends on the value of
                `config.multi_agent(count_steps_by=...)`.
            _deterministic_loss: Whether the loss should be calculated deterministically
                (w/o the stochastic action sampling step). True only useful for
                continuous actions and for debugging.
            _use_beta_distribution: Use a Beta-distribution instead of a
                `SquashedGaussian` for bounded, continuous action spaces (not
                recommended; for debugging only).

        Returns:
            This updated AlgorithmConfig object.
        rA   F )r5   trainingr   r8   r9   updater:   r<   r=   r>   r?   rB   r   rA   rC   r;   rI   rD   rE   rF   rG   rJ   rQ   rR   rN   )rU   r8   r9   r:   r<   r=   r>   r?   rB   rA   rC   r;   rI   rY   rE   rF   rG   rJ   rQ   rR   rN   kwargsnew_replay_buffer_configrV   s                          rW   r]   zSACConfig.training   s   @ 	""6"""$$ DK,,&&~666k11$++,?@@@k!!DH++!.D,,"0D$$ DK&k99/JD,{22 (3')BC')=>'('(( ($ )AAW(XD%[00&8D#{** ,DK''&DNk11 3D;&&$DMK''&DN;&&$DM%[88.HD+k11':D$!44*@D'3;FF8 9 rX   c                 *   t                                                       t          | j        t                    r| j        d         }n| j        }| j        sL| j        dk    rA| j        |k     r6t          d| j         d| j         d| j         d| j        d          d	          | j        t          k    rt          dd	
           t          | _        | j        | j        dk    rt          d          | j        dv rHt          At                              dt           rt           j        nd  d           t%          d           | j        r| j        d         dvr|| j        rft          | j        t,                    s:t          | j        t.                    r2t          | j        d         t,                    r| j        dk    r| j        st          d          | j        s~t          | j        d         t,                    rd| j        d         v s@t          | j        d         t2                    r/t5          | j        d         t6                    rt          d          | j        r2| j        t          d          t                              d           d S d S )Nr+   r*   z Your `rollout_fragment_length` (z') is smaller than needed for `n_step` (zB)! If `n_step` is an integer try setting `rollout_fragment_length=z@`. If `n_step` is a tuple, try setting `rollout_fragment_length=z`.z config['use_state_preprocessor']F)olderrorg        z `grad_clip` value must be > 0.0!)tftf2zYou need `tensorflow_probability` in order to run SAC! Install it via `pip install tensorflow_probability`. Your tf.__version__=z5.Trying to import tfp results in the following error:T)rc   r   )r   r,   MultiAgentEpisodeReplayBuffer(MultiAgentPrioritizedEpisodeReplayBufferr   samplerz[When using the new `EnvRunner API` the replay buffer must be of type `EpisodeReplayBuffer`.Episodead  When using the old API stack the replay buffer must not be of type `EpisodeReplayBuffer`! We suggest you use the following config to run SAC on the old API stack: `config.training(replay_buffer_config={'type': 'MultiAgentPrioritizedReplayBuffer', 'prioritized_replay_alpha': [alpha], 'prioritized_replay_beta': [beta], 'prioritized_replay_eps': [eps], })`.zBasic learning rate parameter `lr` is not `None`. For SAC use the specific learning rate parameters `actor_lr`, `critic_lr` and `alpha_lr`, for the actor, critic, and the hyperparameter `alpha`, respectively and set `config.lr` to None.aa  You are running SAC on the new API stack! This is the new default behavior for this algorithm. If you don't want to use the new API stack, set `config.api_stack(enable_rl_module_and_learner=False, enable_env_runner_and_connector_v2=False)`. For a detailed migration guide, see here: https://docs.ray.io/en/master/rllib/new-api-stack-migration-guide.html)r5   validate
isinstancer?   tuplein_evaluationrK   
ValueErrorrS   r
   r   rI   	frameworktfploggerwarningrd   __version__r   "enable_env_runner_and_connector_v2rA   input_strlistenable_rl_module_and_learnerr   
issubclassr   rH   )rU   min_rollout_fragment_lengthrV   s     rW   rj   zSACConfig.validate  s,    	 dk5)) 	6*.+a.''*.+' "	,66,)* * ?43O ? ?59[? ?CG;? ? -1KN	? ? ?   &*:::6    +;D'>%$.C*?*??@@@>]**s{NNG46"@"..DG G G   &&&&
 3/	)&1   t{C00 #4;55 't{1~s;;$ K9,,5 - )   8 	44V<cBB	 !:6!BBB 44V<dCC C t8@BUVV	 C 	 	 	 , 	w" I   NNt    	 	rX   r   worker_indexc                     | j         dk    r5t          | j        t          t          f          r| j        d         n| j        S | j         S )Nr*   r+   )rK   rk   r?   rl   rw   )rU   r{   s     rW   get_rollout_fragment_lengthz%SACConfig.get_rollout_fragment_length  sI    '611 dkE4=99!A[ //rX   c                 t    | j         dk    rddlm} t          |          S t	          d| j          d          )Ntorchr   )DefaultSACTorchRLModule)module_classThe framework  is not supported. Use `torch`.)framework_str:ray.rllib.algorithms.sac.torch.default_sac_torch_rl_moduler   r   rn   )rU   r   s     rW   get_default_rl_module_specz$SACConfig.get_default_rl_module_spec  se    ((       -DEEEET!3TTT  rX   r   c                 X    | j         dk    rddlm} |S t          d| j          d          )Nr   r   )SACTorchLearnerr   r   )r   0ray.rllib.algorithms.sac.torch.sac_torch_learnerr   rn   )rU   r   s     rW   get_default_learner_classz#SACConfig.get_default_learner_class  sK    ((XXXXXX""T!3TTT  rX   c                     t                                          |||          }|                    t          t	                                 |S )N)input_observation_spaceinput_action_spacedevice)r5   build_learner_connectorinsert_afterr   r   )rU   r   r   r   pipelinerV   s        rW   r   z!SACConfig.build_learner_connector  sY     7722$;1 3 
 
 	.799	
 	
 	

 rX   c                 >    t                      j        d| j        iz  S )Nr8   )r5   _model_config_auto_includesr8   )rU   rV   s    rW   r   z%SACConfig._model_config_auto_includes,  s    ww2h5LLLrX   N)rZ   N)r   )__name__
__module____qualname____doc__r6   r   r   r   r   boolr   rv   r   floatr   r@   r   r   r	   r]   rj   r}   r   r   r   r   r   propertyr   __classcell__rV   s   @rW   r   r      s        (U; U; U; U; U; U;n Xo "-3>8C*)46A8C6A9D.9'2%08C5@6A5@4?.91<BM-u u u u !c3h0	u
 &d38n5u e_u  u !sEz!23u sE#s(O345u &.d^u 'tCH~6u %UOu tnu E?u &d38n5u  12!u" 23#u$ 12%u& %-SM'u( &d^)u* !)+u, 3;3--u0 
1u u u u u un Xom m m m m m^ Xo0 0 0C 0 0 0 0 Xo
,< 
 
 
 
 Xo5i#1E+F     Xo
 	     * M M M M XM M M M MrX   r   c                        e Zd ZdZ fdZe ee          defd                        Z	e ee          de
deee                  fd                        Z xZS )r7   a0  Soft Actor Critic (SAC) Algorithm class.

    This file defines the distributed Algorithm class for the soft actor critic
    algorithm.
    See `sac_[tf|torch]_policy.py` for the definition of the policy loss.

    Detailed documentation:
    https://docs.ray.io/en/master/rllib-algorithms.html#sac
    c                 ^    | xj         ddgz  c_          t                      j        |i | d S )Nr:   r9   )_allow_unknown_subkeysr5   r6   )rU   argsr_   rV   s      rW   r6   zSAC.__init__<  s@    ##(=?O'PP##$)&)))))rX   rZ   c                     t                      S r   )r   )clss    rW   get_default_configzSAC.get_default_config@  s     {{rX   configc                 8    |d         dk    rddl m} |S t          S )Nro   r   r   )SACTorchPolicy))ray.rllib.algorithms.sac.sac_torch_policyr   r   )r   r   r   s      rW   get_default_policy_classzSAC.get_default_policy_classE  s2    
 +'))PPPPPP!!rX   )r   r   r   r   r6   classmethodr   r   r   r   r   r   r   r   r   r   r   s   @rW   r7   r7   1  s         * * * * * Xc]]9    ] [ Xc]]$	$v,	   ] [    rX   r7   )3loggingtypingr   r   r   r   r   r   typing_extensionsr	   ray._common.deprecationr
   r   %ray.rllib.algorithms.algorithm_configr   r   ray.rllib.algorithms.dqn.dqnr   &ray.rllib.algorithms.sac.sac_tf_policyr   Cray.rllib.connectors.common.add_observations_from_episodes_to_batchr   Oray.rllib.connectors.learner.add_next_observations_from_episodes_to_train_batchr   ray.rllib.core.learnerr   "ray.rllib.core.rl_module.rl_moduler   ray.rllib.policy.policyr   ray.rllib.utilsr   ray.rllib.utils.annotationsr   ray.rllib.utils.frameworkr   r   4ray.rllib.utils.replay_buffers.episode_replay_bufferr   ray.rllib.utils.typingr   r   tf1rd   tfvrp   	getLoggerr   rq   r   r7   r\   rX   rW   <module>r      s    : : : : : : : : : : : : : : : : " " " " " " I I I I I I I I N N N N N N N N , , , , , , > > > > > >           + * * * * * ; ; ; ; ; ; * * * * * * ' ' ' ' ' ' 0 0 0 0 0 0 C C C C C C C C T T T T T T K K K K K K K K}Rn		8	$	$OM OM OM OM OM OM OM OMd    #     rX   