
    &`i                         d Z ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ  e
            \  ZZdefd	Zed
k    r e                                d                              d          Ze                                Zded<   ded<    ej        eej                            e                    Z ej        ee          Ze                                Ze                                Z e dej!        d                     dS dS )u  Example of a custom Ray Tune experiment wrapping an RLlib Algorithm.

You should only use such a customized workflow if the following conditions apply:
- You know exactly what you are doing :)
- Configuring an existing RLlib Algorithm (e.g. PPO) via its AlgorithmConfig
is not sufficient and doesn't allow you to shape the Algorithm into behaving the way
you'd like. Note that for complex, custom evaluation procedures there are many
AlgorithmConfig options one can use (for more details, see:
https://github.com/ray-project/ray/blob/master/rllib/examples/evaluation/custom_evaluation.py).  # noqa
- Subclassing an RLlib Algorithm class and overriding the new class' `training_step`
method is not sufficient and doesn't allow you to define the algorithm's execution
logic the way you'd like. See an example here on how to customize the algorithm's
`training_step()` method:
https://github.com/ray-project/ray/blob/master/rllib/examples/algorithm/custom_training_step_on_and_off_policy_combined.py  # noqa


How to run this script
----------------------
`python [script file name].py`


Results to expect
-----------------
You should see the following output (at the end of the experiment) in your console:

╭───────────────────────────────────────────────────────────────────────────────────────
│ Trial name                              status         iter     total time (s)      ts
├───────────────────────────────────────────────────────────────────────────────────────
│ my_experiment_CartPole-v1_77083_00000   TERMINATED       10            36.7799   60000
╰───────────────────────────────────────────────────────────────────────────────────────
╭───────────────────────────────────────────────────────╮
│     reward    episode_len_mean     episodes_this_iter │
├───────────────────────────────────────────────────────┤
│    254.821             254.821                     12 │
╰───────────────────────────────────────────────────────╯
evaluation episode returns=[500.0, 500.0, 500.0]

Note that evaluation results (on the CartPole-v1 env) should be close to perfect
(episode return of ~500.0) as we are acting greedily inside the evaluation procedure.
    )DictN)tune)	PPOConfig)try_import_torch)NUM_ENV_STEPS_SAMPLED_LIFETIMEconfigc           	         |                      dd          }|                      dd          }t                                          |                               d                              d          } |                     d	           |                                 }t          |          D ]<}|                                }d|d
<   t          j
        |           |t                   }=|                                }|                                 |                     d	           |                                 }|                    |           t          |          D ]D}|                                }d|d
<   |t          xx         |z  cc<   t          j
        |           E|                                }	|                                 |                     d           |                                 }
|
                    |	           |
j        }|j        j        j        d         }|j        }|                                \  }}g }g }dx}}d}||k     r|                    dt.                              t3          j        |d                    i          }|d         d         }t3          j        |                                                                                                          }|                    |          \  }}}}}||z  }|dz  }|s|rJ|dz  }|                     |           |                     |           dx}}|                                \  }}||k     ||d}i ||}t          j
        |           d S )Ntrain-iterations   eval-episodes   T)enable_rl_module_and_learnerCartPole-v1gMbP?)lrphasegh㈵>r   num_env_runnersobsaction_dist_inputs)eval_returnseval_episode_lengths)!popr   update_from_dict	api_stackenvironmenttrainingbuildrangetrainr   reportr   savestoprestoreenv_runners
env_runnerenv	unwrappedenvsmoduleresetforward_inferencetorch
from_numpynpexpand_dimsargmaxdetachcpunumpystepappend)r   train_iterationseval_episodes_to_doalgo_high_lr_train_resultsphase_high_lr_timecheckpoint_training_high_lralgo_low_lrcheckpoint_training_low_lr	eval_algolocal_env_runnerr&   	rl_moduler   infosepisode_returnsepisode_lengthssum_rewardslengthnum_episodesrl_module_outaction_logitsactionreward
terminated	truncatedinfoeval_resultsresultss                                 /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/custom_experiment.pymy_experimentrR   5   s    zz"4a88 **_a88 			&	!	!		5	5	]	#	#	  OOuO<<>>L#$$ K K$**,,!"gM"""*+IJ"."3"3"5"5 OOwO,,..K3444#$$ # ##))++!"g45559KK555M""""!,!1!1!3!3 q)))I0111 !+ 

(
-a
0C !'I JCOOK&L
,
,
, "33u''sA(>(>??
 

 &&:;A>=//115577==??@@ 4788F3C3C0VZD 	v! 	% 	%AL"";///""6***#$$K&JC- ,
,
,4 ( / L 0/,/GK    __main__r   r      r
      r   )	resources)param_spacezevaluation episode returns=r   )"__doc__typingr   r3   r.   rayr   ray.rllib.algorithms.ppor   ray.rllib.utils.frameworkr   ray.rllib.utils.metricsr   r,   r9   rR   __name__r   r$   base_configto_dictconfig_dictwith_resources
algo_classdefault_resource_requesttraining_functionTunertunerfitrP   get_best_resultbest_resultsprintmetrics rS   rQ   <module>ro      s  ' 'P                 . . . . . . 6 6 6 6 6 6 B B B B B Bqd$ d d d dN z)++))-88DDUVDWWK %%''K '(K"##$K ++(AA+NN  
 DJ  E
 iikkG**,,L	E
N(<^(L
N
NOOOOO9 rS   