
from typing import Callable, Optional, Type, Union

from typing_extensions import Self

from ray._common.deprecation import deprecation_warning
from ray.rllib.algorithms.algorithm import Algorithm
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided
from ray.rllib.connectors.learner import (
    AddNextObservationsFromEpisodesToTrainBatch,
    AddObservationsFromEpisodesToBatch,
    AddOneTsToEpisodesAndTruncate,
    GeneralAdvantageEstimation,
)
from ray.rllib.core.learner.learner import Learner
from ray.rllib.core.learner.training_data import TrainingData
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
from ray.rllib.execution.rollout_ops import synchronous_parallel_sample
from ray.rllib.execution.train_ops import (
    multi_gpu_train_one_step,
    train_one_step,
)
from ray.rllib.policy.policy import Policy
from ray.rllib.utils.annotations import OldAPIStack, override
from ray.rllib.utils.metrics import (
    LEARNER_RESULTS,
    LEARNER_UPDATE_TIMER,
    NUM_AGENT_STEPS_SAMPLED,
    NUM_ENV_STEPS_SAMPLED,
    OFFLINE_SAMPLING_TIMER,
    SAMPLE_TIMER,
    SYNCH_WORKER_WEIGHTS_TIMER,
    TIMERS,
)
from ray.rllib.utils.typing import EnvType, ResultDict, RLModuleSpecType
from ray.tune.logger import Logger


class MARWILConfig(AlgorithmConfig):
    """Defines a configuration class from which a MARWIL Algorithm can be built.

    .. testcode::

        import gymnasium as gym
        import numpy as np

        from pathlib import Path
        from ray.rllib.algorithms.marwil import MARWILConfig

        # Get the base path (to ray/rllib)
        base_path = Path(__file__).parents[2]
        # Get the path to the data within the rllib folder.
        data_path = base_path / "offline/tests/data/cartpole/cartpole-v1_large"

        config = MARWILConfig()
        # Enable the new API stack.
        config.api_stack(
            enable_rl_module_and_learner=True,
            enable_env_runner_and_connector_v2=True,
        )
        # Define the environment for which to learn a policy
        # from offline data.
        config.environment(
            observation_space=gym.spaces.Box(
                np.array([-4.8, -np.inf, -0.41887903, -np.inf]),
                np.array([4.8, np.inf, 0.41887903, np.inf]),
                shape=(4,),
                dtype=np.float32,
            ),
            action_space=gym.spaces.Discrete(2),
        )
        # Set the training parameters.
        config.training(
            beta=1.0,
            lr=1e-5,
            gamma=0.99,
            # We must define a train batch size for each
            # learner (here 1 local learner).
            train_batch_size_per_learner=2000,
        )
        # Define the data source for offline data.
        config.offline_data(
            input_=[data_path.as_posix()],
            # Run exactly one update per training iteration.
            dataset_num_iters_per_learner=1,
        )

        # Build an `Algorithm` object from the config and run 1 training
        # iteration.
        algo = config.build()
        algo.train()
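
        # Optional teardown (`Algorithm.stop()` is generic RLlib API, not
        # MARWIL-specific): release the algo's resources, e.g. remote
        # EnvRunner actors.
        algo.stop()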

    .. testcode::

        import gymnasium as gym
        import numpy as np

        from pathlib import Path
        from ray.rllib.algorithms.marwil import MARWILConfig
        from ray import tune

        # Get the base path (to ray/rllib)
        base_path = Path(__file__).parents[2]
        # Get the path to the data within the rllib folder.
        data_path = base_path / "offline/tests/data/cartpole/cartpole-v1_large"

        config = MARWILConfig()
        # Enable the new API stack.
        config.api_stack(
            enable_rl_module_and_learner=True,
            enable_env_runner_and_connector_v2=True,
        )
        # Print out some default values.
        print(f"beta: {config.beta}")
        # Update the config object.
        config.training(
            lr=tune.grid_search([1e-3, 1e-4]),
            beta=0.75,
            # We must define a train batch size for each
            # learner (here 1 local learner).
            train_batch_size_per_learner=2000,
        )
        # Set the config's data path.
        config.offline_data(
            input_=[data_path.as_posix()],
            # Set the number of updates to be run per learner
            # per training step.
            dataset_num_iters_per_learner=1,
        )
        # Set the config's environment for evaluation.
        config.environment(
            observation_space=gym.spaces.Box(
                np.array([-4.8, -np.inf, -0.41887903, -np.inf]),
                np.array([4.8, np.inf, 0.41887903, np.inf]),
                shape=(4,),
                dtype=np.float32,
            ),
            action_space=gym.spaces.Discrete(2),
        )
        # Set up a tuner to run the experiment.
        tuner = tune.Tuner(
            "MARWIL",
            param_space=config,
            run_config=tune.RunConfig(
                stop={"training_iteration": 1},
            ),
        )
        # Run the experiment.
        tuner.fit()
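
        # Note: `Tuner.fit()` blocks until the `stop` criterion above is met
        # and returns a `ResultGrid`; the grid search over two learning rates
        # above produces one trial per value.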
    """

    def __init__(self, algo_class=None):
        """Initializes a MARWILConfig instance."""
        # Use `StochasticSampling` exploration by default.
        self.exploration_config = {
            "type": "StochasticSampling",
        }

        super().__init__(algo_class=algo_class or MARWIL)

        # MARWIL specific settings.
        self._is_online = False
        self.beta = 1.0
        self.bc_logstd_coeff = 0.0
        self.moving_average_sqd_adv_norm_update_rate = 1e-8
        self.moving_average_sqd_adv_norm_start = 100.0
        self.vf_coeff = 1.0
        self.model["vf_share_layers"] = False
        self.grad_clip = None

        # Override some of `AlgorithmConfig`'s default values with
        # MARWIL-specific values.
        self.input_ = "sampler"
        self.postprocess_inputs = True
        self.lr = 1e-4
        self.lambda_ = 1.0
        self.train_batch_size = 2000
        # Materialize only the raw data, not the mapped (postprocessed) data.
        self.materialize_data = True
        self.materialize_mapped_data = False

        # Flipped to True once the user explicitly configures OPE methods;
        # otherwise `build()` emits a deprecation warning.
        self._set_off_policy_estimation_methods = False

    @override(AlgorithmConfig)
    def training(
        self,
        *,
        beta: Optional[float] = NotProvided,
        bc_logstd_coeff: Optional[float] = NotProvided,
        moving_average_sqd_adv_norm_update_rate: Optional[float] = NotProvided,
        moving_average_sqd_adv_norm_start: Optional[float] = NotProvided,
        vf_coeff: Optional[float] = NotProvided,
        grad_clip: Optional[float] = NotProvided,
        **kwargs,
    ) -> Self:
        """Sets the training related configuration.

        Args:
            beta: Scaling of advantages in exponential terms. When beta is 0.0,
                MARWIL is reduced to behavior cloning (imitation learning);
                see the example below and the BC algorithm in `bc.py` in this
                same directory.
            bc_logstd_coeff: A coefficient to encourage higher action distribution
                entropy for exploration.
            moving_average_sqd_adv_norm_update_rate: The rate for updating the
                squared moving average advantage norm (c^2). A higher rate leads
                to faster updates of this moving average.
            moving_average_sqd_adv_norm_start: Starting value for the
                squared moving average advantage norm (c^2).
            vf_coeff: Balancing value estimation loss and policy optimization loss.
            grad_clip: If specified, clip the global norm of gradients by this amount.
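
        For example, since ``beta=0.0`` removes the advantage weighting, a
        minimal sketch for configuring MARWIL as plain behavior cloning is:

        .. testcode::

            from ray.rllib.algorithms.marwil import MARWILConfig

            bc_config = MARWILConfig().training(beta=0.0)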

        Returns:
            This updated AlgorithmConfig object.
         )	r0   trainingr   r4   r5   r6   r7   r8   r:   )	rC   r4   r5   r6   r7   r8   r:   kwargsrD   s	           rE   rJ   zMARWILConfig.training   s    @ 	""6"""{""DI+--#2D 2+EE7 8 -K??5VD2;&&$DMK''&DNrF   c                 t    | j         dk    rddlm} t          |          S t	          d| j          d          )Ntorchr   )DefaultPPOTorchRLModule)module_classThe framework ' is not supported. Use 'torch' instead.)framework_str:ray.rllib.algorithms.ppo.torch.default_ppo_torch_rl_modulerN   r   
ValueError)rC   rN   s     rE   get_default_rl_module_specz'MARWILConfig.get_default_rl_module_spec  sm    ((       -DEEEE'!3 ' ' '  rF   r   c                 X    | j         dk    rddlm} |S t          d| j          d          )NrM   r   )MARWILTorchLearnerrP   rQ   )rR   6ray.rllib.algorithms.marwil.torch.marwil_torch_learnerrW   rT   )rC   rW   s     rE   get_default_learner_classz&MARWILConfig.get_default_learner_class  sa    ((      &%'!3 ' ' '  rF   c                 P     t                      j        di | d|v rd| _        | S )zuSets the evaluation related configuration.
        Returns:
            This updated AlgorithmConfig object.
        off_policy_estimation_methodsTrI   )r0   
evaluationrB   )rC   rK   rD   s     rE   r\   zMARWILConfig.evaluation"  s:     	$$V$$$*f446:D3rF   c                      t                      j        di | d|v rOddlm} t	          |                    d          |          s&t          d|                    d           d          | S )Nprelearner_classr   )OfflinePreLearnerz`prelearner_class` z is not a subclass of `OfflinePreLearner`. Any class passed to `prelearner_class` needs to implement the interface given by `OfflinePreLearner`.rI   )r0   offline_dataray.rllib.offline.offline_datar_   
issubclassgetrT   )rC   rK   r_   rD   s      rE   r`   zMARWILConfig.offline_data4  s     	&&v&&& ''HHHHHHfjj);<<>OPP  +&**5G*H*H + + +   rF   envlogger_creatorr	   c                 x    | j         st          dd           t                                          ||          S )NzMARWIL used to have off_policy_estimation_methods is and wis by default. This haschanged to off_policy_estimation_methods: \{\}.If you want to use an off-policy estimator, specify it in.evaluation(off_policy_estimation_methods=...)F)olderror)rB   r   r0   build)rC   rd   re   rD   s      rE   ri   zMARWILConfig.buildH  sL     6 	A
     ww}}S.111rF   c                 <   t                                          |||          }|                    t                                 |                    t
          t                                 |                    t          | j	        | j
                             |S )N)input_observation_spaceinput_action_spacedevice)gammar>   )r0   build_learner_connectorprependr   insert_afterr   r   appendr   rn   r>   )rC   rk   rl   rm   pipelinerD   s        rE   ro   z$MARWILConfig.build_learner_connectorY  s     7722$;1 3 
 
 	688999 	.799	
 	
 	
 	&TZNNN	
 	
 	
 rF   c                 X   t                                                       | j        dk     s| j        dk    r|                     d           | j        du r | j        dk    r|                     d           | j        dk    r%| j        s | j        r|                     d           d S d S d S d S )Nr,   r+   z"`beta` must be within 0.0 and 1.0!Fz`postprocess_inputs` must be True for MARWIL (to calculate accum., discounted returns)! Try setting `config.offline_data(postprocess_inputs=True)`.r   zWhen using a local Learner (`config.num_learners=0`), the number of iterations per learner (`dataset_num_iters_per_learner`) has to be defined! Set this hyperparameter through `config.offline_data(dataset_num_iters_per_learner=...)`.)r0   validater4   _value_errorr<   num_learnersdataset_num_iters_per_learnerenable_rl_module_and_learnerrC   rD   s    rE   ru   zMARWILConfig.validate}  s     	9s??di#ooBCCC"e++	CB   ""6 #1 # 7    	 #"""""rF   c                 @    t                      j        | j        ddz  S )NF)r4   r-   )r0   _model_auto_keysr4   rz   s    rE   r|   zMARWILConfig._model_auto_keys  s     ww'49QV*W*WWWrF   N)NNrG   N)__name__
__module____qualname____doc__r1   r   r
   r   r   floatr   rJ   r#   rU   r   r   strrY   r\   r`   r!   r   r$   ri   ro   ru   propertyr|   __classcell__)rD   s   @rE   r&   r&   ,   s       n n`68 68 68 68 68 68p Xo !,+6CN=H$/%0. . . uo. "%	.
 2:%. ,4E?. 5/. E?. 
. . . . . .` Xo,<     Xo5i#1E+F     Xo 
     " Xo      & Xo .29=2 2eCL)*2 !"f*!562 
	2 2 2 2 2 2  Xo
 	! ! ! ! ! !F Xo     : X X X X XX X X X XrF   r&   c                       e Zd Ze ee          defd                        Ze ee          dede	e
e                  fd                        Z ee          dd            Zedefd            ZdS )	r2   rG   c                     t                      S r}   )r&   )clss    rE   get_default_configzMARWIL.get_default_config  s     ~~rF   configc                 b    |d         dk    rddl m} |S |d         dk    rddlm} |S ddlm} |S )N	frameworkrM   r   )MARWILTorchPolicytf)MARWILTF1Policy)MARWILTF2Policy)/ray.rllib.algorithms.marwil.marwil_torch_policyr   ,ray.rllib.algorithms.marwil.marwil_tf_policyr   r   )r   r   r   r   r   s        rE   get_default_policy_classzMARWIL.get_default_policy_class  s    
 +'))      %$K D((      #"TTTTTT""rF   Nc                    | j         j        s|                                 S | j                            t
          t          f          5  | j         j        dk    p| j         j        dk    }| j	        
                    | j         j        | j         j        |          }|rt          |          }nt          |          }ddd           n# 1 swxY w Y   | j                            t
          t          f          5   | j        j        d	|| j         j        | j         j        d| j	        j        }| j                            |t$                     ddd           dS # 1 swxY w Y   dS )
a-  Implements training logic for the new stack

        Note, this includes so far training with the `OfflineData`
        class (multi-/single-learner setup) and evaluation on
        `EnvRunner`s. Note further, evaluation on the dataset itself
        using estimators is not implemented, yet.
        r      )num_samples
num_shardsreturn_iterator)data_iterators)batchN)training_dataminibatch_size	num_iters)keyrI   )r   "enable_env_runner_and_connector_v2_training_step_old_api_stackmetricslog_timer    r   rw   rx   r`   sampletrain_batch_size_per_learnerr   r   learner_groupupdateiter_batches_kwargs	aggregater   )rC   r   batch_or_iteratorr   learner_resultss        rE   training_stepzMARWIL.training_step  s)    {= 	744666 \""F,B#CDD 	F 	F (1, B;<A  !% 1 8 8 KD;3 !0 !9 ! !  F ,<M N N N ,3D E E E'	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F* \""F,@#ABB 
	I 
	I7d07 +#{G+C  #7	 O L""?"HHH
	I 
	I 
	I 
	I 
	I 
	I 
	I 
	I 
	I 
	I 
	I 
	I 
	I 
	I 
	I 
	I 
	I 
	Is&   A5CCC9AEE #E c                 T   | j         t                   5  t          | j                  }ddd           n# 1 swxY w Y   |                    t          | j        j                  d                   }| j        t          xx         |
                                z  cc<   | j        t          xx         |                                z  cc<   | j        j        rt          | |          }nt          | |          }d| j        t                   i}| j                                        dk    re| j         t"                   5  | j                            t          |                                          |           ddd           n# 1 swxY w Y   | j                            |           |S )zImplements training step for the old stack.

        Note, there is no hybrid stack anymore. If you need to use `RLModule`s,
        use the new api stack.
        )
worker_setNr   )	module_idtimestep)policiesglobal_vars)_timersr   r   env_runner_groupas_multi_agentlistr   r   	_countersr   agent_stepsr   	env_stepssimple_optimizerr   r   num_remote_env_runnersr   sync_weightskeys
env_runnerset_global_vars)rC   train_batchtrain_resultsr   s       rE   r   z#MARWIL._training_step_old_api_stack  s-    \,' 	X 	X5AVWWWK	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	X!004;/003 1 
 
 	.///;3J3J3L3LL///,---1F1F1H1HH--- ;' 	H*4==MM4T;GGM '>?
  7799A==89  %22!-"4"4"6"677[ 3                  	''444s   599;<FF
Fr~   )r   r   r   classmethodr   r	   r&   r   r
   r   r   r   r   r   r   r"   r   rI   rF   rE   r2   r2     s        Xi<     [ Xi#$#	$v,	# # #  [#( Xi/I /I /I /Ib )j ) ) ) [) ) )rF   r2   N)4typingr   r   r   r   typing_extensionsr   ray._common.deprecationr   ray.rllib.algorithms.algorithmr	   %ray.rllib.algorithms.algorithm_configr
   r   ray.rllib.connectors.learnerr   r   r   r   ray.rllib.core.learner.learnerr   $ray.rllib.core.learner.training_datar   "ray.rllib.core.rl_module.rl_moduler   ray.rllib.execution.rollout_opsr   ray.rllib.execution.train_opsr   r   ray.rllib.policy.policyr   ray.rllib.utils.annotationsr   r   ray.rllib.utils.metricsr   r   r   r   r   r   r   r    ray.rllib.utils.typingr!   r"   r#   ray.tune.loggerr$   r&   r2   rI   rF   rE   <module>r      s   2 2 2 2 2 2 2 2 2 2 2 2 " " " " " " 7 7 7 7 7 7 4 4 4 4 4 4 N N N N N N N N            3 2 2 2 2 2 = = = = = = ; ; ; ; ; ;             + * * * * * = = = = = = = =	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	         
 # " " " " "qX qX qX qX qX? qX qX qXhx x x x xY x x x x xrF   