
    &`i
!                         d dl mZmZmZ d dlmZmZ d dlmZm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ  G d
 de	          Z G d de          ZdS )    )OptionalTypeUnion)AlgorithmConfigNotProvided)MARWILMARWILConfig)"AddObservationsFromEpisodesToBatch)+AddNextObservationsFromEpisodesToTrainBatch)Learner)RLModuleSpec)override)LearningRateOrScheduleRLModuleSpecTypec                       e Zd ZdZd fd	Z ee          eeeeeeeddee	         dee
         dee         dee         d	ee         d
ee         dee
         dd f fd            Z ee          deed         ef         fd            Z ee          defd            Z ee          	 d fd	            Z ee          d fd            Ze fd            Z xZS )	IQLConfiga  Defines a configuration class from which a new IQL Algorithm can be built

    .. testcode::
        :skipif: True

        from ray.rllib.algorithms.iql import IQLConfig
        # Run this from the ray directory root.
        config = IQLConfig().training(actor_lr=0.00001, gamma=0.99)
        config = config.offline_data(
            input_="./rllib/offline/tests/data/pendulum/pendulum-v1_enormous")

        # Build an Algorithm object from the config and run 1 training iteration.
        algo = config.build()
        algo.train()

    .. testcode::
        :skipif: True

        from ray.rllib.algorithms.iql import IQLConfig
        from ray import tune
        config = IQLConfig()
        # Print out some default values.
        print(config.beta)
        # Update the config object.
        config.training(
            lr=tune.grid_search([0.001, 0.0001]), beta=0.75
        )
        # Set the config object's data path.
        # Run this from the ray directory root.
        config.offline_data(
            input_="./rllib/offline/tests/data/pendulum/pendulum-v1_enormous"
        )
        # Set the config object's env, used for evaluation.
        config.environment(env="Pendulum-v1")
        # Use to_dict() to get the old-style python config dict
        # when running with tune.
        tune.Tuner(
            "IQL",
            param_space=config.to_dict(),
        ).fit()
    Nc                     t                                          |pt                     d| _        d| _        d| _        d| _        d| _        d | _        d| _	        d| _
        d| _        d S )N)
algo_classg?g?ga2U0*3?Tr         ?)super__init__IQLbeta	expectileactor_lr	critic_lrvalue_lrlrtwin_qtarget_network_update_freqtau)selfr   	__class__s     p/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/algorithms/iql/iql.pyr   zIQLConfig.__init__<   sq    J$5#666
 	    +,'    )r   r   r   r   r   r    r!   r   r   r   r   r   r    r!   returnc                    t                      j        di | |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _	        | S )a  Sets the training related configuration.

        Args:
            beta: The temperature to scaling advantages in exponential terms.
                Must be >> 0.0. The higher this parameter the less greedy
                (exploitative) the policy becomes. It also means that the policy
                is fitting less to the best actions in the dataset.
            twin_q: If a twin-Q architecture should be used (advisable).
            expectile: The expectile to use in expectile regression for the value
                function. For high expectiles the value function tries to match
                the upper tail of the Q-value distribution.
            actor_lr: The learning rate for the actor network. Actor learning rates
                greater than critic learning rates work well in experiments.
            critic_lr: The learning rate for the Q-network. Critic learning rates
                greater than value function learning rates work well in experiments.
            value_lr: The learning rate for the value function network.
            target_network_update_freq: The number of timesteps in between the target
                Q-network is fixed. Note, too high values here could harm convergence.
                The target network is updated via Polyak-averaging.
            tau: The update parameter for Polyak-averaging of the target Q-network.
                The higher this value the faster the weights move towards the actual
                Q-network.

        Return:
            This updated `AlgorithmConfig` object.
         )
r   trainingr   r   r   r   r   r   r    r!   )
r"   r   r   r   r   r   r    r!   kwargsr#   s
            r$   r)   zIQLConfig.trainingY   s    N 	""6"""$$ DKK''&DN;&&$DMK''&DN;&&$DM%[88.HD+k!!DHr%   r   c                 X    | j         dk    rddlm} |S t          d| j          d          )Ntorchr   )IQLTorchLearnerThe framework z) is not supported. Use `'torch'` instead.)framework_str0ray.rllib.algorithms.iql.torch.iql_torch_learnerr-   
ValueError)r"   r-   s     r$   get_default_learner_classz#IQLConfig.get_default_learner_class   sS    ((XXXXXX"")!3 ) ) )  r%   c                 t    | j         dk    rddlm} t          |          S t	          d| j          d          )Nr,   r   )DefaultIQLTorchRLModule)module_classr.   z' is not supported. Use `torch` instead.)r/   :ray.rllib.algorithms.iql.torch.default_iql_torch_rl_moduler4   r   r1   )r"   r4   s     r$   get_default_rl_module_specz$IQLConfig.get_default_rl_module_spec   sm    ((       -DEEEE'!3 ' ' '  r%   c                     t                                          |||          }|                    d           |                    d           |                    t          t                                 |S )N)input_observation_spaceinput_action_spacedeviceAddOneTsToEpisodesAndTruncateGeneralAdvantageEstimation)r   build_learner_connectorremoveinsert_afterr
   r   )r"   r9   r:   r;   pipeliner#   s        r$   r>   z!IQLConfig.build_learner_connector   s     7722$;1 3 
 
 	78884555 	.799	
 	
 	

 r%   c                     t                                                       | j        dk    r|                     d           d| j        cxk     rdk     sn |                     d           d S d S )Ng        zFFor meaningful results, `beta` (temperature) parameter must be >> 0.0!r   z@For meaningful results, `expectile` parameter must be in (0, 1).)r   validater   _value_errorr   r"   r#   s    r$   rC   zIQLConfig.validate   s     	 9X   T^))))c))))R     *)r%   c                 >    t                      j        d| j        iz  S )Nr   )r   _model_config_auto_includesr   rE   s    r$   rG   z%IQLConfig._model_config_auto_includes   s    ww2h5LLLr%   N)r&   N)__name__
__module____qualname____doc__r   r   r	   r   r   boolfloatr   intr)   r   r   strr2   r   r7   r>   rC   propertyrG   __classcell__)r#   s   @r$   r   r      s!       ( (T     : Xl "-%05@6A5@4?*7 7 7 7 E?	7
 127 237 127 %-SM7 e_7 
7 7 7 7 7 7r Xl	5i#1E+F 	 	 	 	 Xl,<     Xl
 	     2 Xl      M M M M XM M M M Mr%   r   c                   P    e Zd ZdZe ee          defd                        ZdS )r   zOImplicit Q-learning (derived from MARWIL).

    Uses MARWIL training step.
    r&   c                     t                      S rH   )r   )clss    r$   get_default_configzIQL.get_default_config   s     {{r%   N)	rI   rJ   rK   rL   classmethodr   r   r   rV   r(   r%   r$   r   r      sZ         
 Xf?     [  r%   r   N)typingr   r   r   %ray.rllib.algorithms.algorithm_configr   r   "ray.rllib.algorithms.marwil.marwilr   r	   Cray.rllib.connectors.common.add_observations_from_episodes_to_batchr
   Oray.rllib.connectors.learner.add_next_observations_from_episodes_to_train_batchr   ray.rllib.core.learner.learnerr   "ray.rllib.core.rl_module.rl_moduler   ray.rllib.utils.annotationsr   ray.rllib.utils.typingr   r   r   r   r(   r%   r$   <module>ra      sN   ( ( ( ( ( ( ( ( ( ( N N N N N N N N C C C C C C C C           3 2 2 2 2 2 ; ; ; ; ; ; 0 0 0 0 0 0 K K K K K K K KGM GM GM GM GM GM GM GMT	 	 	 	 	& 	 	 	 	 	r%   