
    &`ip                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZmZm	Z	m
Z
mZmZmZ d dlZd dlZd dlmZ d dlmZmZ d dlmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	l m!Z! erd d
l"m#Z#  ej$        e%          Z&	 	 	 	 d"dee j'                 de(de)de)de j'        f
dZ*	 d#de	e+ef         dede,de,fdZ-	 d$ddddddddddddee j.                 dee	         dee	         dee         dee
         d e,de,deeej/        j0        f         fd!Z1dS )%    N)TYPE_CHECKINGAnyDictListOptionalTypeUnion)tune)WANDB_ENV_VARWandbLoggerCallback)ENV_RUNNER_RESULTSEPISODE_RETURN_MEANEVALUATION_RESULTSNUM_ENV_STEPS_SAMPLED_LIFETIME)"convert_numpy_to_python_primitives)
ResultDict)CLIReporter)TRAINING_ITERATION)AlgorithmConfig      Y@   順 parserdefault_rewarddefault_itersdefault_timestepsreturnc                 :   | t          j                    } |                     dt          dd           |                     dg ddd	
           |                     dt          dd           |                     dt          dd           |                     dt          dd           |                     dt          dd           |                     dt          dd           |                     dt          dd           |                     dd dd           |                     dt          dddgd            |                     d!d"d#$           |                     d%t          dd&           |                     d't          dg d(d)            |                     d*d"d+$           |                     d,t          d-d.           |                     d/t          dd0           |                     d1t          d2d3           |                     d4t          dd5           |                     d6d"d7$           |                     d8t          dd9           |                     d:t          dd;           |                     d<t          dd=           |                     d>t
          |d?           |                     d@t          |dA           |                     dBt          |dC           |                     dDd"dE$           |                     dFd"dG$           |                     dHt          ddI           |                     dJt
          ddK           |                     dLt
          ddM           |                     dNt          ddO           |                     dPt          dQ           |                     dRd"dS$           |                     dTt          ddU           |                     dVd"dW$           |                     dXd"Y           | S )Zac  Adds RLlib-typical (and common) examples scripts command line args to a parser.

    TODO (sven): This function should be used by most of our examples scripts, which
     already mostly have this logic in them (but written out).

    Args:
        parser: The parser to add the arguments to. If None, create a new one.
        default_reward: The default value for the --stop-reward option.
        default_iters: The default value for the --stop-iters option.
        default_timesteps: The default value for the --stop-timesteps option.

    Returns:
        The altered (or newly created) parser object.
    Nz--algoPPOz&The RLlib-registered algorithm to use.)typedefaulthelpz--framework)tftf2torchr%   zThe DL framework specifier.)choicesr!   r"   z--envz2The gym.Env identifier to run the experiment with.z--num-env-runnersz<The number of (remote) EnvRunners to use for the experiment.z--num-envs-per-env-runnerzThe number of (vectorized) environments per EnvRunner. Note that this is identical to the batch size for (inference) action computations.z--num-agentsr   a  If 0 (default), will run as single-agent. If > 0, will run as multi-agent with the environment simply cloned n times and each agent acting independently at every single timestep. The overall reward for this experiment is then the sum over all individual agents' rewards.z--evaluation-num-env-runnerszGThe number of evaluation (remote) EnvRunners to use for the experiment.z--evaluation-intervalz`Every how many iterations to run one round of evaluation. Use 0 (default) to disable evaluation.z--evaluation-durationc                 0    | dk    r| nt          |           S )Nauto)int)vs    l/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/examples/utils.py<lambda>z/add_rllib_example_script_args.<locals>.<lambda>u   s    AKKqqSVV     
   zThe number of evaluation units to run each evaluation round. Use `--evaluation-duration-unit` to count either in 'episodes' or 'timesteps'. If 'auto', will run as many as possible during train pass (`--evaluation-parallel-to-training` must be set then).z--evaluation-duration-unitepisodes	timestepszThe evaluation duration unit to count by. One of 'episodes' or 'timesteps'. This unit will be run `--evaluation-duration` times in each evaluation round. If `--evaluation-duration=auto`, this setting does not matter.)r    r!   r&   r"   z!--evaluation-parallel-to-training
store_truezWhether to run evaluation parallel to training. This might help speed up your overall iteration time. Be aware that when using this option, your reported evaluation results are referring to one iteration before the current one.)actionr"   z--outputz\The output directory to write trajectories to, which are collected by the algo's EnvRunners.z--log-level)INFODEBUGWARNERRORz-The log-level to be used by the RLlib logger.z	--no-tunezWhether to NOT use tune.Tuner(), but rather a simple for-loop calling `algo.train()` repeatedly until one of the stop criteria is met.z--num-samples   zMHow many (tune.Tuner.fit()) experiments to execute - if possible in parallel.z--max-concurrent-trialsz1How many (tune.Tuner) trials to run concurrently.z	--verbose   zBThe verbosity level for the `tune.Tuner()` running the experiment.z--checkpoint-freqzThe frequency (in training iterations) with which to create checkpoints. Note that if --wandb-key is provided, all checkpoints will automatically be uploaded to WandB.z--checkpoint-at-endzWhether to create a checkpoint at the very end of the experiment. Note that if --wandb-key is provided, all checkpoints will automatically be uploaded to WandB.z--wandb-keyz/The WandB API key to use for uploading results.z--wandb-projectzThe WandB project name to use.z--wandb-run-namezThe WandB run name to use.z--stop-rewardz0Reward at which the script should stop training.z--stop-itersz"The number of iterations to train.z--stop-timestepsz8The number of (environment sampling) timesteps to train.z	--as-testzWhether this script should be run as a test. If set, --stop-reward must be achieved within --stop-timesteps AND --stop-iters, otherwise this script will throw an exception at the end.z--as-release-testzWhether this script should be run as a release test. If set, all that applies to the --as-test option is true, plus, a short JSON summary will be written into a results file whose location is given by the ENV variable `TEST_OUTPUT_JSON`.z--num-learnerszLThe number of Learners to use. If `None`, use the algorithm's default value.z--num-cpus-per-learnerzTThe number of CPUs per Learner to use. If `None`, use the algorithm's default value.z--num-gpus-per-learnerzThe number of GPUs per Learner to use. If `None` and there are enough GPUs for all required Learners (--num-learners), use a value of 1, otherwise 0.z#--num-aggregator-actors-per-learnerzaThe number of Aggregator actors to use per Learner. If `None`, use the algorithm's default value.z
--num-cpus)r    r!   z--local-modez,Init Ray in local mode for easier debugging.z
--num-gpusz6The number of GPUs to use (only on the old API stack).z--old-api-stackz.Run this script on the old API stack of RLlib.z--enable-new-api-stack)r2   )argparseArgumentParseradd_argumentstrr)   float)r   r   r   r   s       r+   add_rllib_example_script_argsr>   '   s=   ( ~(** sE0X     &&&*	     A	     K	     #S	     J	     &V	     1	     33A	     $[)  	 	 	 +     !	     222<     K     	     !@	     Q	     2  	 	 	 2	     >	     -	     )	     ?	     1	     !G	     5     '     	      	      	     -%	     3:::
;     E	     =          
 Mr-   Fstopresultskeep_ray_upc                    |                                  D ]\  }}|}|                    d          D ]:}|                                }t          |t                    r||v r	||         }8d} |Z	 t          j        |          s4||k    r.t          d| d| d           |st          j	                      dS # t          $ r Y w xY wdS )a  Checks stopping criteria on `ResultDict`

    Args:
        stop: Dictionary of stopping criteria. Each criterium is a mapping of
            a metric in the `ResultDict` of the algorithm to a certain criterium.
        results: An RLlib `ResultDict` containing all results from a training step.
        keep_ray_up: Optionally shutting down the runnin Ray instance.

    Returns: True, if any stopping criterium is fulfilled. Otherwise, False.
    /NzStop criterion (=) fulfilled!TF)itemssplitstrip
isinstancedictnpisnanprintrayshutdown	TypeError)r?   r@   rA   key	thresholdvalks          r+   should_stoprU   D  s    **,,  Y3 	 	A		A #t$$ c!f ;		8C== SI%5%5FFFyFFFGGG" #LNNNtt 	 	 	H	
 5s   0AB99
CC)r?   success_metric	trainabletune_callbackskeep_configrA   	schedulerprogress_reporterbase_configr   argsrV   rW   rX   rY   c                  ' |"t                      }
|
                                }|j        rt          d          |j        rd|_        t          j        |j        pd|j	        d           |:t           dt           |j        t           dt           |j        t          |j        i}| }|s|                    |j                   |j        !|j        |                    |j                   |j        r|                    dd           |j        |                    |j                   |j        |                    |j        	           |j        rz|j        |j        d
k    rt          d          t          j                                        dd
          }|j        |j        n|j        pd}|j        pd
|z  }|j        |j        nd|z  }|                    d
           |j        |                     |j                   |j!        |                     |j!                   |j        4||k    r|                     d           n[|                     d
           nD||k     r#t          d|j         d|j         d| d          |                     |j                   |j"        |                     |j"                   n"|j        |                    |j                   |j#        d
k    r3|$                    |j%        |j#        |j&        |j'        |j(                   |j)        |*                    |j)                   |j+        |,                    |j+                   |j-        r|j        s|j        rJ |.                                }t_          |                    t          |j                            D ]f}|0                                }t          |v rG|t                                       t          tb          j2                  }tg          d| d| d           th          |v rKt          |th                   v r7|th                   t                   t                   }tg          d| d           tg                       |5                                D ]\  }}|}|6                    d          D ]}	 ||         }# tn          $ r d}Y  nw xY w|Ktc          j8        |          s7||k    r1tg          d| d | d!           |st          j9                     |c c S h|st          j9                     |S |pg }tu          |d"          r|j;        tx          tz          j>        v r|j;        ptz          j>        tx                   }|j?        pV|j@        A                                d#z   t          jC        d$d#t          |j                  A                                          z   }|E                    t          d@||dd%|jG        r	d&|jG        ini            |	O|jH        d
k    rDt          i t          d'd(d)t          d*t           dt           d+id, |jJ        D             -          }	d.tz          j>        d/<   t          jK                    }t          jM        |p|jN        |t          jO        ||jP        |t          jQ        |jR        |jS        0          |	1          t          jT        |jU        |jV        |2          3          W                                }t          jK                    |z
  }|st          j9                     |jX        r#d4 |jX        D             }t          d5|           d}|j        r|Wth           dt           dt           t           dt           fD ]}||v r|||         i} n|t           dt           |j        i}t          t          |5                                                    \  '} t          'fd6|]                                ^                                D                       }!|!| k    rd}tg          d7' d8|  d9           |j        r|j_        j`        d
         }"|"ja        }#|#b                    d:d           t          |          |"jd        gt          t          jK                              t          |#          |g| g|st          |"          dini d;}$tz          j>                            d<d=          }%t          |%d>          5 }&t          jh        |$|&           ddd           n# 1 swxY w Y   |st          d7' d8|  d?          |S )Aav  Given an algorithm config and some command line args, runs an experiment.

    There are some constraints on what properties must be defined in `args`.
    It should ideally be generated via calling
    `args = add_rllib_example_script_args()`, which can be found in this very module
    here.

    The function sets up an Algorithm object from the given config (altered by the
    contents of `args`), then runs the Algorithm via Tune (or manually, if
    `args.no_tune` is set to True) using the stopping criteria in `stop`.

    At the end of the experiment, if `args.as_test` is True, checks, whether the
    Algorithm reached the `success_metric` (if None, use `env_runners/
    episode_return_mean` with a minimum value of `args.stop_reward`).

    See https://github.com/ray-project/ray/tree/master/rllib/examples for an overview
    of all supported command line options.

    Args:
        base_config: The AlgorithmConfig object to use for this experiment. This base
            config will be automatically "extended" based on some of the provided
            `args`. For example, `args.num_env_runners` is used to set
            `config.num_env_runners`, etc..
        args: A argparse.Namespace object, ideally returned by calling
            `args = add_rllib_example_script_args()`. It must have the following
            properties defined: `stop_iters`, `stop_reward`, `stop_timesteps`,
            `no_tune`, `verbose`, `checkpoint_freq`, `as_test`. Optionally, for WandB
            logging: `wandb_key`, `wandb_project`, `wandb_run_name`.
        stop: An optional dict mapping ResultDict key strings (using "/" in case of
            nesting, e.g. "env_runners/episode_return_mean" for referring to
            `result_dict['env_runners']['episode_return_mean']` to minimum
            values, reaching of which will stop the experiment). Default is:
            {
            "env_runners/episode_return_mean": args.stop_reward,
            "training_iteration": args.stop_iters,
            "num_env_steps_sampled_lifetime": args.stop_timesteps,
            }
        success_metric: Only relevant if `args.as_test` is True.
            A dict mapping a single(!) ResultDict key string (using "/" in
            case of nesting, e.g. "env_runners/episode_return_mean" for referring
            to `result_dict['env_runners']['episode_return_mean']` to a single(!)
            minimum value to be reached in order for the experiment to count as
            successful. If `args.as_test` is True AND this `success_metric` is not
            reached with the bounds defined by `stop`, will raise an Exception.
        trainable: The Trainable sub-class to run in the tune.Tuner. If None (default),
            use the registered RLlib Algorithm class specified by args.algo.
        tune_callbacks: A list of Tune callbacks to configure with the tune.Tuner.
            In case `args.wandb_key` is provided, appends a WandB logger to this
            list.
        keep_config: Set this to True, if you don't want this utility to change the
            given `base_config` in any way and leave it as-is. This is helpful
            for those example scripts which demonstrate how to set config settings
            that are otherwise taken care of automatically in this function (e.g.
            `num_env_runners`).

    Returns:
        The last ResultDict from a --no-tune run OR the tune.Tuner.fit()
        results.
    Nz`--enable-new-api-stack` flag no longer supported (it's the default behavior now)! To switch back to the old API stack on your scripts, use the `--old-api-stack` flag.T)num_cpus
local_modeignore_reinit_errorrC   F)enable_rl_module_and_learner"enable_env_runner_and_connector_v2)num_env_runners)num_envs_per_env_runnerr   z--num-gpus is not supported on the new API stack! To train on GPUs, use the command line options `--num-gpus-per-learner=1` and `--num-learners=[your number of available GPUs]`, instead.GPUr7   )num_gpus)num_learners)!num_aggregator_actors_per_learner)num_gpus_per_learnerz0You are running your script with --num-learners=z and --num-gpus-per-learner=z, but your cluster only has z GPUs!)num_cpus_per_learner)evaluation_num_env_runnersevaluation_intervalevaluation_durationevaluation_duration_unitevaluation_parallel_to_training)	log_level)outputziter=z R= )endz	 R(eval)=zStop criterium (rD   rE   	wandb_key-z\W+)api_keyprojectupload_checkpointsnameitertime_total_sztotal time (s)tszcombined returnc                 .    i | ]}t            d | d| S )z/module_episode_returns_mean/zreturn )r   ).0pids     r+   
<dictcomp>z7run_rllib_example_script_experiment.<locals>.<dictcomp>  sB         .UUPSUU&  r-   )metric_columns0RAY_AIR_NEW_OUTPUT)checkpoint_frequencycheckpoint_at_end)r?   verbose	callbackscheckpoint_configr[   )num_samplesmax_concurrent_trialsrZ   )param_space
run_configtune_configc                     g | ]n}|j         rVt          |j         d          d          r;t          |j         d          j                   dk    r|j         d          j         d         nt          |          oS )r   r]   r8   )r]   hasattrlenrepr)r   es     r+   
<listcomp>z7run_rllib_example_script_experiment.<locals>.<listcomp>  s     
 
 
  v!!&)V449<QVAY^9L9Lq9P9P F1IN1a
 
 
r-   z;Running the example script resulted in one or more errors! c              3   .   K   | ]\  }}|         V  d S N )r   _rowsuccess_metric_keys      r+   	<genexpr>z6run_rllib_example_script_experiment.<locals>.<genexpr>  s>       
 
(.3C"#
 
 
 
 
 
r-   `z` of z reached! okconfig)
time_takentrial_stateslast_updatestatspassed
not_passedfailuresTEST_OUTPUT_JSONz/tmp/learning_test.jsonwtz not reached!r   )ir>   
parse_argsenable_new_api_stack
ValueErroras_release_testas_testrN   initr_   r`   r   r   stop_rewardr   stop_timestepsr   
stop_iters	frameworkenvenvironmentold_api_stack	api_stackrd   env_runnersre   rb   rg   cluster_resourcesgetrh   rj   	resourceslearnersri   rk   rm   
evaluationrl   rn   ro   rp   rq   	debuggingrr   offline_datano_tunebuildrangetrainrK   nanrM   r   rF   rG   KeyErrorrL   rO   r   ru   r   osenvironwandb_projectalgolowerresubr<   appendr   wandb_run_name
num_agentsr   policiestimer
   Tuner
algo_class	RunConfigr   CheckpointConfigcheckpoint_freqr   
TuneConfigr   r   fiterrorsRuntimeErrornextr{   maxget_dataframeiterrows_experiment_analysistrialslast_resultpopr=   statusr   openjsondump)(r\   r]   r?   rV   rW   rX   rY   rA   rZ   r[   r   r   num_gpus_availablenum_actual_learnersnum_gpus_requestednum_gpus_needed_if_availabler   ir@   mean_returnRevalrQ   rR   rS   rT   ru   rx   
start_timer   r   test_passedtry_itsuccess_metric_value
best_valuetrialr   json_summaryfilenamefr   s(                                          @r+   #run_rllib_example_script_experimentr   u  s   P |.00  ""   
*
 
 	
   H&$?     |!99$7994;K!DD$BDD#
 F  j4((( 8FJ$6tx(((  	-238     +t/CDDD'3t7STTT
 . @	5}(T]Q->-> Q   "%!6!8!8!<!<UA!F!F
 $0 !!(# 	   #'";"@qDW!W ,8 ))#	,$( a((( ,T->??? 5A>      (0%)EEEOOO;;;;OOO;;;;#&888 2(2 202 2 *2 2 2   T5NOOO (4T5NOOO ]&dm444 #a''+/+J$($<$($<)-)F040T     >%t~666 ;"t{333 | #<<(<<<<||~~txx 2DODDEE 	# 	#AjjllG!W,,%&89=='  1a11K11r::::"g--&'2D*EEE 234FG' )%))r2222GGG"&**,, # #Y3  A!!f#   " ?28C==?SI=M=MJSJJ9JJJKKK& '"NNNNN#  	LNNN
 $)rNt[!! 
"mrz&A&AN?bj&?	$ 
IOO#bfVS#fj//:O:O:Q:Q&R&RR 	 	 !#'  594GOFD/00R	 	
 	
 	
  T_q%8%8'&"$42D)AA,?AACT	   &	  
 
 
& (+BJ#$ Jj&V&>L$"3%)%9"&"8   0	
 	
 	
 O("&"<
 
 
  $ 
cee% & z)J 
 ~ 


 
 ^	
 
 
 R&RR
 
 	

 K| -!%RR(:RR=PRR%==(;==   T>>&,d6l%;NE " %)AA,?AA4CS" 48^=Q=Q=S=S8T8T3U3U00 
 
 
 
292G2G2I2I2R2R2T2T
 
 
 
 

 ---KQ(QQ/CQQQRRR 	+07:E%EIIh%%%#J//!&$TY[[11;EBB&-#./3>FSZZOOB L z~~&8:STTHh%% +	,***+ + + + + + + + + + + + + + +  	P&PP-APPP   Ns$   SS&	%S&	4eee)Nr   r   r   )Fr   )2r9   r   loggingr   r   r   typingr   r   r   r   r   r   r	   numpyrK   rN   r
   ray.air.integrations.wandbr   r   ray.rllib.utils.metricsr   r   r   r   ray.rllib.utils.serializationr   ray.rllib.utils.typingr   ray.tuner   ray.tune.resultr   ray.rllib.algorithmsr   	getLogger__name__loggerr:   r=   r)   r>   r<   boolrU   	Namespaceresult_grid
ResultGridr   r   r-   r+   <module>r     s      				 				                       



       I I I I I I I I            M L L L L L - - - - - -             . . . . . . 5444444		8	$	$ 15!#	X XX,-XX X 	X
 X X X X| DI) )
sCx.)#-)<@)	) ) ) )f *.C  %) $%)C C C"C
8%
&C 4.	C
 TNC ~C TNC C C :t'223C C C C C Cr-   