
    &`i                       d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZm	Z	m
Z
mZmZmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZ d dlZd dlmZmZmZ d dlmZ d dl m!Z! d d	l"m#Z# d d
l$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z> d dl?m@Z@ d dlAmBZBmCZC d dlDmEZE d dlFmGZGmHZHmIZI d dlJmKZKmLZL d dlMmNZNmOZO d dlPmQZQmRZR d dlSmTZT d dlUmVZV d d lWmXZX d d!lYmZZZm[Z[m\Z\ d d"l]m^Z^ d d#l_m`Z` d d$lambZbmcZcmdZdmeZemfZfmgZgmhZhmiZimjZjmkZkmlZl d d%lmmnZn d d&lompZp d d'lqmrZr d d(lsmtZt d d)lumvZv d d*lwmxZx er*d d+lymzZz d d,l{m|Z| d d-l}m~Z~ d d.lmZ d d/lmZ d d0l.mZ d d1lamZ  ej        e          Zd2ekd3dfd4Z G d5 d6et          Z G d7 d8e          Z G d9 d:ee          ZdS );    N)Enum)
TYPE_CHECKINGAnyCallable
CollectionDictListOptionalTupleTypeUnion)version)Self)DEPRECATED_VALUE
Deprecateddeprecation_warning)RLlibCallback)ConnectorV2DEFAULT_MODULE_ID)Columns)DifferentiableLearnerConfig)validate_module_id)DefaultModelConfig)MultiRLModuleSpec)RLModuleSpec)INPUT_ENV_SINGLE_SPACESINPUT_ENV_SPACES)MultiAgentEnv)is_atari)SampleCollector)SimpleListCollector)MODEL_DEFAULTS)InputReader)	IOContext)Policy
PolicySpecDEFAULT_POLICY_ID)deep_update
force_listmerge_dicts)OldAPIStack5OverrideToImplementCustomLogic_CallToSuperRecommended)try_import_tftry_import_torch)NotProvidedfrom_config)DEFAULT_STATS_CLS_LOOKUP)	StatsBase)	Scheduler)NOT_SERIALIZABLEdeserialize_typeserialize_type)check)TORCH_COMPILE_REQUIRED_VERSION)AgentIDAlgorithmConfigDictEnvConfigDictEnvTypeLearningRateOrScheduleModuleIDMultiAgentPolicyConfigDictPartialAlgorithmConfigDictPolicyIDRLModuleSpecTypeSampleBatchType)Logger)get_trainable_cls)
TRIAL_INFO)_Config)log_once)PlacementGroup	Algorithm)Learner)DifferentiableLearnerLearnerGroupTorchMetaLearner)RLModule)EpisodeTypemodule_specreturnc                 ~    t          | t          t          f          s t          dt	          |            d          d S )NzLrl_module_spec must be an instance of RLModuleSpec or MultiRLModuleSpec.Got z	 instead.)
isinstancer   r   
ValueErrortype)rV   s    y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/algorithms/algorithm_config.py_check_rl_module_specr]   e   sU    kL2C#DEE 
0$$0 0 0
 
 	

 
    c            M       '    e Zd ZdZed             Zed             Zedede	fd            Z
ed             Zdd	ee         fd
ZdefdZdede	fdZdeeef         fdZedeeef         dee	ef         fd            Zdeeef         fdZddee         de	fdZddZedd            Z	 	 	 ddeeeef                  dee g e!f                  deddfdZ"	 	 	 dde#fdZ$dde#fdZ%	 dde#fdZ&ddddddee         deee'e(e)j*        e)j*        f         f                  d ee+         d!ed"         dd#f
d$Z,ddd%dee         deee-e(e)j*        e)j*        f         f                  dd&fd'Z.d(e'de	fd)Z/e0e0d*d+ee         d,ee         de	fd-Z1e0e0e0e0e2e2e2e2e2e2e2e2d.d/ee3         d0eee4e3f                  d1ee         d2ee         de	f
d3Z5e0fe0e0e0e0e0e0e0e0e0e0e0e0e0d4d5ee         d6ee         d7ee3         d8eeeef                  d9eeeef                  d:ee         d;ee         d<ee         d=ee         d>ee         d?ee         d@ee         dAeeeef                  dBee         de	fdCZ6e0e0fdDee         dEee         de	fdFZ7e0fe0e0e0e0e0e0e0e0e0e0e2dGdeeeef                  dHee8         dIee)j*                 dJee)j*                 dKee         dLeeee4f                  dMee         dNee         dOee         dPee         dQee         de	fdRZ9e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e2e2e2e2e2e2e2e2e2e2e2dS-dTee         dUee3         dVee         dWee         dXee3         dYeeee)j:        f                  dZee3         d[eee4e3f                  d\ee         d]ee         d^ee4         d_ee3         d`ee egedae;da         f         f                  dbee edcgedae;da         f         f                  ddee         deee         dfee3         dgeeeef                  dhee         diee         djeee3ef                  dkee         dlee         dmee         dnee         doee         dpee         dqee<e=                  dree         dsee4         dtee         duee         dvee         dwee4         de	fFdxZ>e0e0e0e0e0e0e0e0e0e0e0dydzee3         d{eeee4e3f                  d|eee4e3f                  d}ee3         d~ee4         dee3         dee3         dee<d&                  dee e)j?        j*        e)j?        j*        gedae;da         f         f                  dee         deeeef                  de	fdZ@e0e0e0e0e0e0e0e0e0e0e0e2e2e2e2e0e0e0e0ddee4         deeA         dee4         dee         dee3         dee3         dee3         dee3         dee         dee         dee         dee<d&                  dee e)j?        j*        e)j?        j*        gedae;da         f         f                  dee         deeeef                  de	f dZBe0fe0e0e0e0e0e0e0e0e0e0e0e0e0e0e0ddeee<eC         e;e<eC                  f                  deee e;e          f                  deee e;e          f                  deee e;e          f                  deee e;e          f                  deee e;e          f                  deee e;e          f                  deee e;e          f                  deee e;e          f                  deee e;e          f                  deee e;e          f                  deee e;e          f                  deee e;e          f                  deee e;e          f                  deee e;e          f                  deee e;e          f                  de	f"dZDe0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e2e2d#dee3         deee3ef                  dee         dee3         dee3         dee4         dee         dee         deed ef                  dee         dee         dee3         dee          dee3         dee3         dee          dee          dee          dee3         dee3         dee         dee3         dee3         deeeef                  dee4         dee3         dee         dee         dee         dee         dee3         dee4         dee4         de	fDdǄZEe0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0e0dȜ%deeee eFgeGf         f                  dee<         deeee f                  dee         deeeef                  dee         dee         dee3         dee         dee         dee;e                  dee         dee         dee         dee         dee         dee<         dee<         dee         dee3         dee3         dee         dee         dee         dee3         dee         dee         dee;e                  dee4         dee3         dee         dee         dee         dee         dee         dee         dee         de	fLdZHe0e0e0e0e0e0e0e2e2e2d
deeeIeJe-         f                  dee3         dee eKdge-f                  deeeJe-         e e-eLgef         f                  dee         dee          dee         de	fdZMe0e0e0e0e0e0e0e0ddee         dee4         dee3         dee4         dee3         dee3         d ee         deeee<eN         f                  de	fdZOe0e0fdee         dee         de	fdZPe0e0e0e0e0e0ddee g e!f                  dee         dee         d	ee         d
ee         dee3         de	fdZQe0e0e0e0e0e0e0e0e2e2e2e2e2e2e2e2ddee         dee         dee3         dee4         dee         dee3         dee4         dee4         de	fdZRe0e0e0e2e2ddeeeeef         eSf                  d ee+         deee'ef                  de	fdZTde0e0e0e0e0e0e0ddee         dee         dee<         deee;e<         ee'e;e<         f         f                  d ee         d!ee         d"ee         d#ee         de	fd$ZUeVdefd%            ZWeVdefd&            ZXeVde<d&         fd'            ZYeVd(             ZZeVd)             Z[eVde3fd*            Z\e\j]        d+e3ddfd,            Z\eVde3fd-            Z^dd/e3de3fd0Z_ded          fd1Z`dd2Zad3 Zbde+fd4Zcdee<d&         ef         fd5Zd	 	 	 ddee         deeee(e)j*        e)j*        f         f                  d6ee         deefd7Zfddd8ddd9dee         deee-e(e)j*        e)j*        f         f                  d6ed:eeeegf                  d;eee         dehfd<Zi fd=Zjd> Zk fd?Zldefd@ZmddAZnddBZodC ZpdD ZqdE ZreVdeeef         fdF            ZsddGZtddHZuddIZvddJZwdK ZxdL ZydM ZzdN Z{dO Z|dP Z}dQ Z~eVdefdR            ZeVdefdS            ZedT             ZeddUedVedefdW            ZdX ZdY ZedddddZdeeI         dee         deee-e(e)j*        e)j*        f         f                  d[ee<e                  de(eIe e-eLgef         f         f
d\            Z ed]d8^          d_             Z ed`d^          da             Z edbd^          dc             Z edbd^          dd             ZeV eded^          df                         Zej]        dg             ZeV edhd^          di                         Zej]        dj             ZeV edkd^          dl                         ZeV edmd^          dn                         Zej]        do             ZeV edpd^          dq                         Zej]        dr             ZeV edsd^          dt                         Zej]        du             ZeV edvd^          dw                         Zej]        dx             ZeV edyd^          dz                         Zej]        d{             ZeV ed|d^          d}                         Zej]        d~             ZeV edd^          d                         Zej]        d             ZeV edd^          d                         Zej]        d             ZeV edd^          d                         Zej]        d             ZeV edd^          d                         Zej]        d             ZeV edd^          d                         Zej]        d             ZeV edd^          d                         Zej]        d             ZeV edd^          d                         Zej]        d             ZeV edd^          d                         Zej]        d             ZeV edd^          d                         Zej]        d             ZeV edd^          d                         Zej]        d             ZeV edd^          d                         Zej]        d             ZeV edd^          d                         Zej]        d             Z xZS (  AlgorithmConfiga`  A RLlib AlgorithmConfig builds an RLlib Algorithm from a given configuration.

    .. testcode::

        from ray.rllib.algorithms.ppo import PPOConfig
        from ray.rllib.callbacks.callbacks import MemoryTrackingCallbacks
        # Construct a generic config object, specifying values within different
        # sub-categories, e.g. "training".
        config = (
            PPOConfig()
            .training(gamma=0.9, lr=0.01)
            .environment(env="CartPole-v1")
            .env_runners(num_env_runners=0)
            .callbacks(MemoryTrackingCallbacks)
        )
        # A config object can be used to construct the respective Algorithm.
        rllib_algo = config.build()

    .. testcode::

        from ray.rllib.algorithms.ppo import PPOConfig
        from ray import tune
        # In combination with a tune.grid_search:
        config = PPOConfig()
        config.training(lr=tune.grid_search([0.01, 0.001]))
        # Use `to_dict()` method to get the legacy plain python config dict
        # for usage with `tune.Tuner().fit()`.
        tune.Tuner("PPO", param_space=config.to_dict())
    c                     t           S Nr   )agent_idepisodes     r\   "DEFAULT_AGENT_TO_MODULE_MAPPING_FNz2AlgorithmConfig.DEFAULT_AGENT_TO_MODULE_MAPPING_FN   s
    
 ! r^   c                     t           S rb   r(   )aidrd   workerkwargss       r\   DEFAULT_POLICY_MAPPING_FNz)AlgorithmConfig.DEFAULT_POLICY_MAPPING_FN   s
     ! r^   config_dictrW   c                 p     |             }|                     dd           |                    |           |S )a  Creates an AlgorithmConfig from a legacy python config dict.

        .. testcode::

            from ray.rllib.algorithms.ppo.ppo import PPOConfig
            # pass a RLlib config dict
            ppo_config = PPOConfig.from_dict({})
            ppo = ppo_config.build(env="Pendulum-v1")

        Args:
            config_dict: The legacy formatted python config dict for some algorithm.

        Returns:
            A new AlgorithmConfig object that matches the given python config dict.
        
_is_frozenN)popupdate_from_dict)clsrk   
config_objs      r\   	from_dictzAlgorithmConfig.from_dict   s?    $ SUU
 	d+++##K000r^   c                      |             }i }|                                 D ]L\  }}t          ||          st          d| d| j         d          |                     |d          }|||<   M|S )a  Generates and validates a set of config key/value pairs (passed via kwargs).

        Validation whether given config keys are valid is done immediately upon
        construction (by comparing against the properties of a default AlgorithmConfig
        object of this class).
        Allows combination with a full AlgorithmConfig object to yield a new
        AlgorithmConfig object.

        Used anywhere, we would like to enable the user to only define a few config
        settings that would change with respect to some main config, e.g. in multi-agent
        setups and evaluation configs.

        .. testcode::

            from ray.rllib.algorithms.ppo import PPOConfig
            from ray.rllib.policy.policy import PolicySpec
            config = (
                PPOConfig()
                .multi_agent(
                    policies={
                        "pol0": PolicySpec(config=PPOConfig.overrides(lambda_=0.95))
                    },
                )
            )


        .. testcode::

            from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
            from ray.rllib.algorithms.ppo import PPOConfig
            config = (
                PPOConfig()
                .evaluation(
                    evaluation_num_env_runners=1,
                    evaluation_interval=1,
                    evaluation_config=AlgorithmConfig.overrides(explore=False),
                )
            )

        Returns:
            A dict mapping valid config property-names to values.

        Raises:
            KeyError: In case a non-existing property name (kwargs key) is being
                passed in. Valid property names are taken from a default
                AlgorithmConfig object of `cls`.
        zInvalid property name z for config class !Twarn_deprecated)itemshasattrKeyError__name___translate_special_keys)rp   ri   default_configconfig_overrideskeyvalues         r\   	overrideszAlgorithmConfig.overrides   s    b  ,,.. 	* 	*JC>3// SSSSCLSSS   --c4-HHC$)S!!r^   N
algo_classc                    || _         i | _        i | _        d| _        d| _        d| _        d| _        d| _        d| _        d| _	        ddd	didd
didd| _
        ddd| _        d| _        t          j        | _        t           j        dk    rdnd| _        d| _        d| _        t           j        dk    rdnd| _        d| _        i | _        d| _        d| _        i | _        d| _        d| _        d| _        d| _        d| _        d| _         d| _!        d| _"        d| _#        d| _$        d| _%        d| _&        d| _'        d| _(        d| _)        d| _*        i | _+        d| _,        d| _-        d| _.        d| _/        d| _0        d| _1        d| _2        d| _3        d| _4        d| _5        d| _6        d| _7        d| _8        d| _9        d| _:        d| _;        d| _<        d| _=        t|          | _?        d| _@        d| _A        d| _B        d| _C        d| _D        d| _E        d| _F        d| _G        d| _H        d| _I        d| _J        d| _K        d| _L        d| _M        d| _N        d| _O        d| _P        d| _Q        d | _R        d| _S        d| _T        d| _U        	 t          jW        t                    | _Y        n# t          $ r Y nw xY wd| _[        d| _\        i | _]        i | _^        d| __        t          | _a        d| _b        d| _c        d| _d        d| _e        d| _f        d| _g        d| _h        d| _i        d| _j        d| _k        d| _l        d| _m        d| _n        d| _o        d| _p        d| _q        t          | d!          sd| _s        i | _t        d| _u        d| _v        | w                    dd"           d#| _x        t          t                      i| _{        d$| _|        | j}        | _~        d| _        d| _        d| _        d%| _        d| _        d| _        d&| _        i | _        i | _        d| _        d| _        d| _        d| _        i | _        t          j        t          j        g| _        d| _        d| _        d| _        i | _        i | _        d| _        d| _        d| _        i | _        d'| _        d| _        i | _        d| _        d| _        d| _        d| _        i | _        t          j        t          j        g| _        d(| _        d| _        d| _        d)| _        i | _        d| _        i | _        d| _        d| _        d| _        d'| _        d*| _        d+| _        d$| _        d,| _        d| _        d| _        d| _        i | _        d| _        d| _        d| _        d| _        d-| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d+| _        d| _        d| _        i | _        d| _        d| _        d.| _        d/| _        d| _        d| _        d0| _        d| _        d| _        d1| _        d| _        d| _        d| _        d$| _        d| _        d| _        d| _        d| _        t          | _        d| _        d| _        d| _        d| _        d2| _        d| _        d| _        d| _        d| _        d| _        d.| _        d| _        d| _        d$| _        d0| _        d/| _        i | _        d| _        i | _        i | _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        t          | _        t          | _        t          | _        t          | _        t          | _        t          | _        t          | _        t          | _        t          | _         t          | _        t          | _        t          | _        t          | _        t          | _        t          | _        t          | _        t          | _        t          | _	        t          | _
        t          | _        t          | _        t          | _        t          | _        t          | _        t          | _        d| _        t          | _        t          | _        t          | _        t          | _        t          | _        t          | _        t          | _        t          | _        dS )3a  Initializes an AlgorithmConfig instance.

        Args:
            algo_class: An optional Algorithm class that this config class belongs to.
                Used (if provided) to build a respective Algorithm instance from this
                config.
        PACKr   F   torchT      allow_growthCPU)intra_op_parallelism_threadsinter_op_parallelism_threadsgpu_optionslog_device_placementdevice_countallow_soft_placement   )r   r   darwin	aot_eagerinductorNonnxrtaction_masksyncg      N@training_only   truncate_episodesdeepmindNoFilterauto   gGz?gMbP?global_norm    exploration_configenable_rl_module_and_learner"enable_env_runner_and_connector_v2	env_stepsd   samplerread_parquet
   i   write_parquetepisodesg      ^@i  g      $@i  g      @g      >@   WARN(  r    extra_python_environs_for_driver extra_python_environs_for_workerplacement_strategynum_gpus
_fake_gpusnum_cpus_for_main_processframework_streager_tracingeager_max_retracestf_session_argslocal_tf_session_argstorch_compile_learnerTorchCompileWhatToCompileFORWARD_TRAIN%torch_compile_learner_what_to_compilesysplatform$torch_compile_learner_dynamo_backend!torch_compile_learner_dynamo_modetorch_compile_worker#torch_compile_worker_dynamo_backend torch_compile_worker_dynamo_modetorch_ddp_kwargstorch_skip_nan_gradientsenv
env_configobservation_spaceaction_spaceclip_rewardsnormalize_actionsclip_actions	_is_ataridisable_env_checking
render_envaction_mask_keyenv_runner_clsnum_env_runnerscreate_local_env_runnernum_envs_per_env_runnergym_env_vectorize_modenum_cpus_per_env_runnernum_gpus_per_env_runnercustom_resources_per_env_runner'validate_env_runners_after_constructionepisodes_to_numpy%max_requests_in_flight_per_env_runnersample_timeout_screate_env_on_local_worker_env_to_module_connector0add_default_connectors_to_env_to_module_pipeline_module_to_env_connector0add_default_connectors_to_module_to_env_pipelinemerge_env_runner_statesbroadcast_env_runner_statesepisode_lookback_horizonrollout_fragment_length
batch_modecompress_observationsremote_worker_envsremote_env_batch_wait_msenable_tf1_exec_eagerlyr"   sample_collectorpreprocessor_prefobservation_filterupdate_worker_filter_statsuse_worker_filter_statssampler_perf_stats_ema_coef
_is_onlinenum_learnersnum_gpus_per_learnernum_cpus_per_learner!num_aggregator_actors_per_learner+max_requests_in_flight_per_aggregator_actorlocal_gpu_idx"max_requests_in_flight_per_learnergammalr	grad_clipgrad_clip_by_train_batch_size_per_learnertrain_batch_size
num_epochsminibatch_sizeshuffle_batch_per_epochcopydeepcopyr#   modelAttributeError_learner_connector*add_default_connectors_to_learner_pipelinelearner_config_dict	optimizer_learner_classr   callbacks_classcallbacks_on_algorithm_init"callbacks_on_env_runners_recreated+callbacks_on_offline_eval_runners_recreatedcallbacks_on_checkpoint_loaded callbacks_on_environment_createdcallbacks_on_episode_createdcallbacks_on_episode_startcallbacks_on_episode_stepcallbacks_on_episode_endcallbacks_on_evaluate_startcallbacks_on_evaluate_end#callbacks_on_evaluate_offline_start!callbacks_on_evaluate_offline_endcallbacks_on_sample_endcallbacks_on_train_resultexplorerx   _prior_exploration_configr   r   r   	api_stackcount_steps_byr)   r'   policiespolicy_map_capacityrj   policy_mapping_fnpolicies_to_trainpolicy_states_are_swappableobservation_fninput_offline_data_classinput_read_methodinput_read_method_kwargsinput_read_schemainput_read_episodesinput_read_sample_batchesinput_read_batch_sizeinput_filesysteminput_filesystem_kwargsr   OBSNEXT_OBSinput_compress_columnsinput_spaces_jsonablematerialize_datamaterialize_mapped_datamap_batches_kwargsiter_batches_kwargsignore_final_observationprelearner_classprelearner_buffer_classprelearner_buffer_kwargsprelearner_module_synch_perioddataset_num_iters_per_learnerinput_configactions_in_input_normalizedpostprocess_inputsshuffle_buffer_sizeoutputoutput_configoutput_compress_columnsoutput_max_file_sizeoutput_max_rows_per_fileoutput_write_remaining_dataoutput_write_methodoutput_write_method_kwargsoutput_filesystemoutput_filesystem_kwargsoutput_write_episodesoffline_samplingevaluation_intervalevaluation_durationevaluation_duration_unitevaluation_sample_timeout_s1evaluation_auto_duration_min_env_steps_per_sample1evaluation_auto_duration_max_env_steps_per_sampleevaluation_parallel_to_training,evaluation_force_reset_envs_before_iterationevaluation_configoff_policy_estimation_methodsope_split_batch_by_episodeevaluation_num_env_runnerscustom_evaluation_functionin_evaluation)sync_filters_on_rollout_workers_timeout_soffline_evaluation_intervalnum_offline_eval_runnersoffline_evaluation_typeoffline_eval_runner_classoffline_loss_for_module_fnoffline_evaluation_duration'offline_evaluation_parallel_to_trainingoffline_evaluation_timeout_s num_cpus_per_offline_eval_runner num_gpus_per_offline_eval_runner(custom_resources_per_offline_eval_runner#restart_failed_offline_eval_runners#ignore_offline_eval_runner_failures$max_num_offline_eval_runner_restarts%offline_eval_runner_restore_timeout_s.max_requests_in_flight_per_offline_eval_runner0validate_offline_eval_runners_after_construction*offline_eval_runner_health_probe_timeout_s%offline_eval_rl_module_inference_only$broadcast_offline_eval_runner_states"offline_eval_batch_size_per_runner!dataset_num_iters_per_eval_runnerkeep_per_episode_custom_metrics$metrics_episode_collection_timeout_s"metrics_num_episodes_for_smoothingmin_time_s_per_iteration!min_train_timesteps_per_iteration"min_sample_timesteps_per_iterationlog_gradientsr3   stats_cls_lookupexport_native_model_files"checkpoint_trainable_policies_onlylogger_creatorlogger_config	log_levellog_sys_usagefake_samplerseedrestart_failed_env_runnersignore_env_runner_failuresmax_num_env_runner_restarts#delay_between_env_runner_restarts_srestart_failed_sub_environments-num_consecutive_env_runner_failures_tolerance!env_runner_health_probe_timeout_senv_runner_restore_timeout_s_model_config_rl_module_spec%algorithm_config_overrides_per_module_per_module_overrides_validate_config_use_msgpack_checkpoints_torch_grad_scaler_class_torch_lr_scheduler_classes%_tf_policy_handles_more_than_one_loss_disable_preprocessor_api_disable_action_flattening)_disable_initialize_loss_from_dummy_batch!_dont_auto_sync_env_runner_statesrm   r   env_task_fnenable_connectorssimple_optimizermonitorevaluation_num_episodesmetrics_smoothing_episodestimesteps_per_iterationmin_iter_time_scollect_metrics_timeoutmin_time_s_per_reporting!min_train_timesteps_per_reporting"min_sample_timesteps_per_reportinginput_evaluationpolicy_map_cache
worker_clssynchronize_filtersenable_async_evaluation custom_async_evaluation_function_enable_rl_module_apiauto_wrap_old_gym_envs always_attach_evaluation_resultsbuffer_sizeprioritized_replaylearning_startsreplay_batch_sizereplay_sequence_lengthreplay_modeprioritized_replay_alphaprioritized_replay_betaprioritized_replay_eps_disable_execution_plan_api)selfr   s     r\   __init__zAlgorithmConfig.__init__   sh	    % 13-02- #))*& %!"$ -.,- %*"AJ$( 
  
 -.,-	&
 &
" &+"%3 	2
 <833KK 	1 26.$)! <833KK 	0 15- "(-% !%  !%!$)!, # '+$'($ '-#'($'($/1,7;4!%562 $*/'(,%@D=(,%@D='6$+/(()% (+$-%*""'()%',$ 3!+",*.''+$+/( $%!$*!12.;<8
 34/ 
)-1* " "',$	~66DJJ 	 	 	D	 #':>7#% "
  -+/(26/;?8.2+04-,0)*.')-&(,%+/()-&37015.'+$)-&  t122 	) .2D*&(D# -1)26/)-/3 	 	
 	
 	
 * +JLL9#& !%!?!%+0("  "&"&!/(*%!##( ).&%)" $')$'.{G4D&E#%)" %'+$"$#%  ).% $'+$(*%.0+-1*+0("'#$ (/W5E'F$$4!(,%+0(#2 *,'!%(*%%)" % $( #% (2%+0(AD>AE>/4,<@9!%-/**.'*+'*.' # :>6+/(()%,0$)-&
 +/'+,(7<4,1)01-01-8:53703804815;2>?;@D=:>75:249125/12. 05,48125/(,%12.23/" 8 */&27/ #!!!	 +/'*/'
 ,0(
 480/4,=@:15.,2)  #572 IK" !%(-%(,%+/(5:2).&*/'9>616.  
 ,!1 0''7$*:''7$/'7$(8%1A.2B/ 0 0*#3 '7$0@-%5"&6#0@- ,"2/!1&*#+(8%'7$&6#(8%1A.2B/+;(((s   :J 
J&%J&c                    t          j        t          |                     }|                    d           |                    d           d|v r1t	          | d          sJ | j        |d<   |                    d           d|v r1t	          | d          sJ | j        |d<   |                    d           d|v rt          |d         t                    rfi }|                    d          	                                D ]7\  }}t          |t                    r|                                ||<   2|||<   8||d<   |                    dd	          |d
<   |                    dd          |d<   |                    dd	          |d<   |                    dd	          |d<   dD ]6}|                    |          t          k    r|                    |d	           7|S )zConverts all settings into a legacy config dict for backward compatibility.

        Returns:
            A complete AlgorithmConfigDict, usable in backward-compatible Tune/RLlib
            use cases.
        r   rm   lambda_lambdar$  inputr  r
  N	callbacksr   r   create_env_on_driverrX  custom_eval_functionr   	framework)r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  _enable_new_api_stack)r  r  varsrn   rx   r  r$  rY   dictrw   r'   	get_stategetr   )r  configpolicies_dict	policy_idpolicy_specdep_ks         r\   to_dictzAlgorithmConfig.to_dict  s    tDzz**

<   

<    4+++++#|F8JJy!!!v4*****"kF7OJJx   
 Jvj/A4$H$HM*0**Z*@*@*F*F*H*H ; ;&	;k:66 ;/:/D/D/F/FM),,/:M),,!.F: %jj):DAA{)/4PRS)T)T%&)/4PRV)W)W%&$jj$??{
 	( 	(E* zz%  $444

5$'''r^   c                     i }|                     d|                     d                    }||                     ||           |                                D ]%\  }|                     |d          }|t          k    r)|dv r.|dk    rfd	d
D             } | j        di | P|dk    rHt          k    r=t          t                    rt          d          | 
                               |dk    r|                                |                    d          r||<   |dk    rS|r| _        t          t                    rdv rt          d                   d<   |                                /|dk    rSt          t                    r-                     d          rt          d                   d<    | j        di |i |dk    r | j        di |i |dk    rBt          t                    rdv rt          d                   d<    | j        di |i |dk    r't                    |                                t#          | |           ' | j        di | | S )a  Modifies this AlgorithmConfig via the provided python config dict.

        Warns if `config_dict` contains deprecated keys.
        Silently sets even properties of `self` that do NOT exist. This way, this method
        may be used to configure custom Policies which do not have their own specific
        AlgorithmConfig classes, e.g.
        `ray.rllib.examples.policy.random_policy::RandomPolicy`.

        Args:
            config_dict: The old-style python config dict (PartialAlgorithmConfigDict)
                to use for overriding some properties defined in there.

        Returns:
            This updated AlgorithmConfig object.
        r   r   Nr   Fru   )r  
multiagentc                 *    i | ]}|v ||         S  r  ).0kr   s     r\   
<dictcomp>z4AlgorithmConfig.update_from_dict.<locals>.<dictcomp>  s4        Ezz uQx "zzr^   )r  r  r   r!  r"  r#  r  r
  Terror)r
  r   )r   evaluation_r   r[   )r   r  custom_modelr  replay_buffer_configr   )r   r  )r  r  rw   r{   rH   multi_agentr6   rY   strr7   r  environment
startswithr   r  env_runnerstrainingsetattr
evaluation)r  rk   	eval_callenable_new_api_stackr~   ri   r   s         @r\   ro   z AlgorithmConfig.update_from_dict  s_   & 	  +*OO@AA 
  
  +NN-A3G     &++-- @	* @	*JC..sE.JJC j  ///$$      ! **6**** )))e7G.G.G eS)) @,U$???Eu5555$$  E 2222.. *!&	#,,,' .3D+eT** Dv$4U6]$C$CE&M  E ::::eT** Tuyy/H/H T,<U>=R,S,SE.)--e----##--e----...eT** Dv$4U6]$C$CE&M--e----***(//  % 8888 c5))))$$)$$$r^   c                    | j                                         }t          |           |d<   |                    d           |                    d           d |                                D             }d|v rt          |d         t                    rfi }|                    d                                          D ]7\  }}t          |t                    r|                                ||<   2|||<   8||d<   |S )zReturns a dict state that can be pickled.

        Returns:
            A dictionary containing all attributes of the instance.
        classr   rm   c                 0    i | ]\  }}|t           k    ||S r  )r   r  r  vs      r\   r  z-AlgorithmConfig.get_state.<locals>.<dictcomp>U  s)    III$!Q18H3H3HA3H3H3Hr^   r  )	__dict__r  r[   rn   rw   rY   r  r'   r  )r  stater  r  r  s        r\   r  zAlgorithmConfig.get_stateJ  s     ""$$dg		,		,II%++--III :eJ.?#F#FM*/))J*?*?*E*E*G*G ; ;&	;k:66 ;/:/D/D/F/FM),,/:M),, -E* r^   r  c                 ^    |d         } |            }|j                             |           |S )a  Returns an instance constructed from the state.

        Args:
            state: A dictionary containing the state of an `AlgorithmConfig`.
                See `AlgorithmConfig.get_state` for creating a state.
                The constructed class will be of  ``state["class"]``.

        Returns:
            An `AlgorithmConfig` instance with attributes from the `state`.
        r  )r  update)rp   r  ctorr  s       r\   
from_statezAlgorithmConfig.from_stateh  s3     W~u%%%r^   c                 T    |                                  }|                     |          S )a~  Returns a mapping from str to JSON'able values representing this config.

        The resulting values don't have any code in them.
        Classes (such as `callbacks_class`) are converted to their full
        classpath, e.g. `ray.rllib.callbacks.callbacks.RLlibCallback`.
        Actual code such as lambda functions ware written as their source
        code (str) plus any closure information for properly restoring the
        code inside the AlgorithmConfig object made from the returned dict data.
        Dataclass objects get converted to dicts.

        Returns:
            A dict mapping from str to JSON'able values.
        )r  _serialize_dict)r  r  s     r\   	serializezAlgorithmConfig.serialize  s%     ##F+++r^   copy_frozenc                     t          j        |           }|du r|                                 n1|du r-d|_        t	          |j        t                    rd|j        _        |S )a9  Creates a deep copy of this config and (un)freezes if necessary.

        Args:
            copy_frozen: Whether the created deep copy is frozen or not. If None,
                keep the same frozen status that `self` currently has.

        Returns:
            A deep copy of `self` that is (un)frozen.
        TF)r  r  freezerm   rY   rT  r`   )r  r  cps      r\   r  zAlgorithmConfig.copy  sd     ]4  $IIKKKKE!!!BM".@@ 827$/	r^   c                     | j         rdS d| _         t          | j        t                    r| j                                         dS dS )zFreezes this config object, such that no attributes can be set anymore.

        Algorithms should use this method to make sure that their config objects
        remain read-only after this.
        NT)rm   rY   rT  r`   r  r  s    r\   r  zAlgorithmConfig.freeze  sV     ? 	F d,o>> 	,"))+++++	, 	,r^   c                    | j         sdS |                                  |                                  |                                  |                                  |                                  |                                  |                                  |                                  | 	                                 | 
                                 dS )$Validates all values in this config.N)r  _validate_env_runner_settings_validate_callbacks_settings_validate_framework_settings_validate_resources_settings_validate_multi_agent_settings_validate_input_settings_validate_evaluation_settings_validate_offline_settings _validate_new_api_stack_settings#_validate_to_be_deprecated_settingsr  s    r\   validatezAlgorithmConfig.validate  s    
 $ 	F**,,,))+++))+++))+++++---%%'''**,,,'')))--///0022222r^   Tr   r{  use_copyrM   c                    ||| _         | j        
|| j        d<   ||| _        | j        }t	          | j        t
                    rt          | j                  } ||s| nt          j        |           | j                  S )a  Builds an Algorithm from this AlgorithmConfig (or a copy thereof).

        Args:
            env: Name of the environment to use (e.g. a gym-registered str),
                a full class path (e.g.
                "ray.rllib.examples.envs.classes.random_env.RandomEnv"), or an Env
                class directly. Note that this arg can also be specified via
                the "env" key in `config`.
            logger_creator: Callable that creates a ray.tune.Logger
                object. If unspecified, a default logger is created.
            use_copy: Whether to deepcopy `self` and pass the copy to the Algorithm
                (instead of `self`) as config. This is useful in case you would like to
                recycle the same AlgorithmConfig over and over, e.g. in a test case, in
                which we loop over different DL-frameworks.

        Returns:
            A ray.rllib.algorithms.algorithm.Algorithm object.
        Nr   )r  r{  )	r   rT  r{  r   rY   r  rG   r  r  )r  r   r{  r  r   s        r\   
build_algozAlgorithmConfig.build_algo  s    0 ?DH%103&u-%"0D_
dos++ 	<*4?;;Jz'@44T]4-@-@.
 
 
 	
r^   c                 4   ddl m}m}m}m}m}m}	m}
 g }| j        	 |                     ||          }ng# t          $ rZ}d|j
        d         v r?t          d          rt                              d           |                               }n|Y d }~nd }~ww xY wt          |t                    r|g}n?t          |t           t"          f          rt!          |          }nt%          d| d          t'          dj                  }n&|t*          |v r|t*                   d         }n| j        }|G| j        r@t.          j                            fd	j        d         j        j        D                       }t'          d
j                  }n&|t*          |v r|t*                   d         }n| j        }|G| j        r@t.          j                            fdj        d         j        j        D                       } |	|||          }| j        r|                     |                       |                     |                       |                     |                       | j        r]|                     |t          | j         tB                    r| j         j"        ntG          | j$                  | j%                             |                     || j                             |                     |
|                     |S )Nr   )"AddObservationsFromEpisodesToBatchAddStatesFromEpisodesToBatchAddTimeDimToBatchAndZeroPadAgentToModuleMappingBatchIndividualItemsEnvToModulePipelineNumpyToTensorpositional argumentzenv-to-module-wrong-signaturea  Your `config.env_to_module_connector` function seems to have a wrong or outdated signature! It should be: `def myfunc(env, spaces, device): ...`, where any of these arguments are optional and may be None.
`env` is the (vectorized) gym env.
`spaces` is a dict of structure `{'__env__': ([vectorized env obs. space, vectorized env act. space]),'__env_single__': ([env obs. space, env act. space])}`.
`device` is a (torch) device.
z`AlgorithmConfig.env_runners(env_to_module_connector=..)` must return a ConnectorV2 object or a list thereof to be added to a connector pipeline! Your function returned .single_observation_spacec                 \    i | ](}|j         d          j                            |          )S r   envs	unwrappedget_observation_spacer  rg   r   s     r\   r  zAAlgorithmConfig.build_env_to_module_connector.<locals>.<dictcomp>2  B        !.DDSII  r^   single_action_spacer   c                 \    i | ](}|j         d          j                            |          )S r  r  r  get_action_spacer  s     r\   r  zAAlgorithmConfig.build_env_to_module_connector.<locals>.<dictcomp>?  B        !.??DD  r^   input_observation_spaceinput_action_space
connectorsrl_module_specsagent_to_module_mapping_fnr  )device)&"ray.rllib.connectors.env_to_moduler  r  r  r  r	  r
  r  r   	TypeErrorargsrJ   loggerr  rY   r   listtuplerZ   getattrr   r   is_multi_agentgymspacesr   r  r  possible_agentsr   r   appendrl_module_specr   r!  setr  r   )r  r   r.  r$  r  r  r  r  r	  r
  r  custom_connectorsval_e	obs_space	act_spacepipelines    `               r\   build_env_to_module_connectorz-AlgorithmConfig.build_env_to_module_connector  s   	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
  (444S&&II   (AF1I55 ?@@ >
 
 
  88==DDG DDDD& $,, %)F!!D4-00 $(JJ!! !JBFJ J J   ?%?AVWWII$;v$E$E67:II.I!4
   "x{4D   I ?%:C<LMMII$;v$E$E67:II)I!4
   "x{4D   I '&$-((
 
 
 @ 	:OO>>@@AAAOO7799:::OO88::;;;" 
((  *$*=?PQQ4D/??!$T]!3!3373I  	 	 	 OO00T=PQQQRRROOMM888999s   6 
B ABBc                    ddl m}m}m}m}m}m}m}	m}
 g }| j	        	 | 	                    |          }ng# t          $ rZ}d|j        d         v r?t          d          rt                              d           | 	                              }n|Y d }~nd }~ww xY wt          |t                     r|g}n?t          |t"          t$          f          rt#          |          }nt'          d| d          t)          dj                  }n&|t,          |v r|t,                   d         }n| j        }|G| j        r@t0          j                            fd	j        d         j        j        D                       }t)          d
j                  }n&|t,          |v r|t,                   d         }n| j        }|G| j        r@t0          j                            fdj        d         j        j        D                       } ||||          }| j        r|                      |                       | j        r|                      |                       |                      |
                       |                      |	                       |                      |                       |!                     || j"        | j#                             |!                     |                       |S )Nr   )
GetActionsListifyDataForVectorEnvModuleToAgentUnmappingModuleToEnvPipelineNormalizeAndClipActionsRemoveSingleTsTimeRankFromBatchTensorToNumpyUnBatchToIndividualItemsr  zmodule-to-env-wrong-signaturea  Your `config.module_to_env_connector` function seems to have a wrong or outdated signature! It should be: `def myfunc(env, spaces): ...`, where any of these arguments are optional and may be None.
`env` is the (vectorized) gym env.
`spaces` is a dict of structure `{'__env__': ([vectorized env obs. space, vectorized env act. space]),'__env_single__': ([env obs. space, env act. space])}`.
z`AlgorithmConfig.env_runners(module_to_env_connector=..)` must return a ConnectorV2 object or a list thereof to be added to a connector pipeline! Your function returned r  r  c                 \    i | ](}|j         d          j                            |          )S r  r  r  s     r\   r  zAAlgorithmConfig.build_module_to_env_connector.<locals>.<dictcomp>  r  r^   r  r   c                 \    i | ](}|j         d          j                            |          )S r  r  r  s     r\   r  zAAlgorithmConfig.build_module_to_env_connector.<locals>.<dictcomp>  r  r^   r  )r   r   )$"ray.rllib.connectors.module_to_envr;  r<  r=  r>  r?  r@  rA  rB  r   r&  r'  rJ   r(  r  rY   r   r)  r*  rZ   r+  r   r   r,  r-  r.  r   r  r  r/  r   r   prependr0  r   r   )r  r   r.  r;  r<  r=  r>  r?  r@  rA  rB  r3  r4  r5  r6  r7  r8  s    `               r\   build_module_to_env_connectorz-AlgorithmConfig.build_module_to_env_connectord  s   		
 		
 		
 		
 		
 		
 		
 		
 		
 		
 		
 		
 		
 		
 		
 		
 		
 		
 		
 		
  (444S&AA   (AF1I55 ?@@ 
X	 	 	  88==DDG DDDD$ $,, %)F!!D4-00 $(JJ!! !JBFJ J J   ?%?AVWWII$;v$E$E67:II.I!4
   "x{4D   I ?%:C<LMMII$;v$E$E67:II)I!4
   "x{4D   I '&$-((
 
 
 @ 	7
 <<>>??? " ;  !7!7!9!9::: 5577888 ]]__--- ZZ\\*** OO''&*&<!%!2     OO3355666s   7 
BABBc                    ddl m}m}m}m}m}m}	m}
m} g }| j	        n| 	                    ||          }t          |t                    r|g}n?t          |t          t          f          rt          |          }nt          d| d           |
|||          }| j        r"|                     |d                     |                     |                       |                     |d                     |                     |d                     | j        r]|                     |t          | j        t&                    r| j        j        nt+          | j                  | j                             |                     |	| j        	                     |                     |d|
                     |S )Nr   )"AddColumnsFromEpisodesToTrainBatchr  r  r  r  r	  LearnerConnectorPipeliner  z`AlgorithmConfig.learners(learner_connector=..)` must return a ConnectorV2 object or a list thereof to be added to a connector pipeline! Your function returned r  )r  r  r  T)as_learner_connectorr   r#  )rK  r$  )ray.rllib.connectors.learnerrI  r  r  r  r  r	  rJ  r  r  rY   r   r)  r*  rZ   r  r0  r,  r1  r   r!  r2  r  r   )r  r  r  r$  rI  r  r  r  r  r	  rJ  r  r3  r4  r8  s                  r\   build_learner_connectorz'AlgorithmConfig.build_learner_connector  sv   		
 		
 		
 		
 		
 		
 		
 		
 		
 		
 		
 		
 		
 		
 		
 		
 		
 		
 		
 		
  ".**'" D $,, %)F!!D4-00 $(JJ!! !@8<@ @ @   ,+($;1
 
 

 : 	UOO22MMM   OO>>@@AAAOO77TRRRSSSOO88dSSSTTT" 
((  *$*=?PQQ4D/??!$T]!3!3373I  	 	 	 OO00T=PQQQRRROOMMtFSSSTTTr^   )r   r.  r1  placement_groupr.  r1  rN  rK   rQ   c                    ddl m} ||                     ||          } ||                                 ||          }|S )a  Builds and returns a new LearnerGroup object based on settings in `self`.

        Args:
            env: An optional EnvType object (e.g. a gym.Env) useful for extracting space
                information for the to-be-constructed RLModule inside the LearnerGroup's
                Learner workers. Note that if RLlib cannot infer any space information
                either from this `env` arg, from the optional `spaces` arg or from
                `self`, the LearnerGroup cannot be created.
            spaces: An optional dict mapping ModuleIDs to
                (observation-space, action-space)-tuples for the to-be-constructed
                RLModule inside the LearnerGroup's Learner workers. Note that if RLlib
                cannot infer any space information either from this `spces` arg,
                from the optional `env` arg or from `self`, the LearnerGroup cannot
                be created.
            rl_module_spec: An optional (single-agent or multi-agent) RLModuleSpec to
                use for the constructed LearnerGroup. If None, RLlib tries to infer
                the RLModuleSpec using the other information given and stored in this
                `AlgorithmConfig` object.

        Returns:
            The newly created `LearnerGroup` object.
        r   rP   Nr   r.  )r  rV   rN  )$ray.rllib.core.learner.learner_grouprQ   get_multi_rl_module_specr  )r  r   r.  r1  rN  rQ   learner_groups          r\   build_learner_groupz#AlgorithmConfig.build_learner_group$  sh    < 	FEEEEE !!::s6:RRN %99;;&+
 
 
 r^   rP  rN   c                    d}|||                      ||          }|                     | |          }|                                 |S )a!  Builds and returns a new Learner object based on settings in `self`.

        This Learner object already has its `build()` method called, meaning
        its RLModule is already constructed.

        Args:
            env: An optional EnvType object (e.g. a gym.Env) useful for extracting space
                information for the to-be-constructed RLModule inside the Learner.
                Note that if RLlib cannot infer any space information
                either from this `env` arg, from the optional `spaces` arg or from
                `self`, the Learner cannot be created.
            spaces: An optional dict mapping ModuleIDs to
                (observation-space, action-space)-tuples for the to-be-constructed
                RLModule inside the Learner. Note that if RLlib cannot infer any
                space information either from this `spaces` arg, from the optional
                `env` arg or from `self`, the Learner cannot be created.

        Returns:
            The newly created (and already built) Learner object.
        NrP  )r  rV   )rR  learner_classbuild)r  r   r.  r1  learners        r\   build_learnerzAlgorithmConfig.build_learnerR  sU    8 ?f0!::s6:RRN$$Dn$MMr^   	module_idc                     || j         vrC|| j        v r:|                                                     | j        |                   | j         |<   || j         v r| j         |         S | S )a  Returns an AlgorithmConfig object, specific to the given module ID.

        In a multi-agent setup, individual modules might override one or more
        AlgorithmConfig properties (e.g. `train_batch_size`, `lr`) using the
        `overrides()` method.

        In order to retrieve a full AlgorithmConfig instance (with all these overrides
        already translated and built-in), users can call this method with the respective
        module ID.

        Args:
            module_id: The module ID for which to get the final AlgorithmConfig object.

        Returns:
            A new AlgorithmConfig object for the specific module ID.
        )r  r  r  ro   )r  rZ  s     r\   get_config_for_modulez%AlgorithmConfig.get_config_for_modulex  su    ( T777TGGG48IIKK4P4P:9E5 5D&y1
 222-i88 Kr^   )r   r   r   r   c                F    |t           ur|| _        |t           ur|| _        | S )a  Sets the config's python environment settings.

        Args:
            extra_python_environs_for_driver: Any extra python env vars to set in the
                algorithm's process, e.g., {"OMP_NUM_THREADS": "16"}.
            extra_python_environs_for_worker: The extra python environments need to set
                for worker processes.

        Returns:
            This updated AlgorithmConfig object.
        )r1   r   r   )r  r   r   s      r\   python_environmentz"AlgorithmConfig.python_environment  s/    " ,;>>4TD1+;>>4TD1r^   )r   r   r   r   num_cpus_per_workernum_gpus_per_workercustom_resources_per_workernum_learner_workersnum_cpus_per_learner_workernum_gpus_per_learner_workerr   num_cpus_for_local_workerr   r   r   r   c                   |t           k    rt          ddd           || _        |t           k    rt          ddd           || _        |t           k    rt          ddd           || _        |t           k    rt          d	d
d           || _        |	t           k    rt          ddd           |	| _        |
t           k    rt          ddd           |
| _        |t           k    rt          ddd           || _        |t           k    rt          ddd           || _	        |t          ur|| _	        |t          ur|| _        |t          ur|| _        |t          ur|| _        | S )a  Specifies resources allocated for an Algorithm and its ray actors/workers.

        Args:
            num_cpus_for_main_process: Number of CPUs to allocate for the main algorithm
                process that runs `Algorithm.training_step()`.
                Note: This is only relevant when running RLlib through Tune. Otherwise,
                `Algorithm.training_step()` runs in the main program (driver).
            num_gpus: Number of GPUs to allocate to the algorithm process.
                Note that not all algorithms can take advantage of GPUs.
                Support for multi-GPU is currently only available for
                tf-[PPO/IMPALA/DQN/PG]. This can be fractional (e.g., 0.3 GPUs).
            _fake_gpus: Set to True for debugging (multi-)?GPU funcitonality on a
                CPU machine. GPU towers are simulated by graphs located on
                CPUs in this case. Use `num_gpus` to test for different numbers of
                fake GPUs.
            placement_strategy: The strategy for the placement group factory returned by
                `Algorithm.default_resource_request()`. A PlacementGroup defines, which
                devices (resources) should always be co-located on the same node.
                For example, an Algorithm with 2 EnvRunners and 1 Learner (with
                1 GPU) requests a placement group with the bundles:
                [{"cpu": 1}, {"gpu": 1, "cpu": 1}, {"cpu": 1}, {"cpu": 1}], where the
                first bundle is for the local (main Algorithm) process, the second one
                for the 1 Learner worker and the last 2 bundles are for the two
                EnvRunners. These bundles can now be "placed" on the same or different
                nodes depending on the value of `placement_strategy`:
                "PACK": Packs bundles into as few nodes as possible.
                "SPREAD": Places bundles across distinct nodes as even as possible.
                "STRICT_PACK": Packs bundles into one node. The group is not allowed
                to span multiple nodes.
                "STRICT_SPREAD": Packs bundles across distinct nodes.

        Returns:
            This updated AlgorithmConfig object.
        z.AlgorithmConfig.resources(num_cpus_per_worker)z4AlgorithmConfig.env_runners(num_cpus_per_env_runner)Foldnewr  z.AlgorithmConfig.resources(num_gpus_per_worker)z4AlgorithmConfig.env_runners(num_gpus_per_env_runner)z6AlgorithmConfig.resources(custom_resources_per_worker)z<AlgorithmConfig.env_runners(custom_resources_per_env_runner)z.AlgorithmConfig.resources(num_learner_workers)z%AlgorithmConfig.learners(num_learner)z6AlgorithmConfig.resources(num_cpus_per_learner_worker)z.AlgorithmConfig.learners(num_cpus_per_learner)z6AlgorithmConfig.resources(num_gpus_per_learner_worker)z.AlgorithmConfig.learners(num_gpus_per_learner)z(AlgorithmConfig.resources(local_gpu_idx)z'AlgorithmConfig.learners(local_gpu_idx)z4AlgorithmConfig.resources(num_cpus_for_local_worker)z4AlgorithmConfig.resources(num_cpus_for_main_process))r   r   r   r   r   r   r   r   r   r   r1   r   r   r   )r  r   r   r   r   r_  r`  ra  rb  rc  rd  r   re  s                r\   	resourceszAlgorithmConfig.resources  s   f "222DJ   
 ,?D("222DJ   
 ,?D(&*:::LR   
 4OD0"222D;   
 !4D&*:::LD   
 )DD%&*:::LD   
 )DD%,,,>=   
 "/D$(888JJ   
 .GD*$K77-FD*;&&$DM[(((DO[00&8D#r^   )r   r   r   r   r   r   r   r   r   r   r   r   r   r  r   r   r   r   r   r   r   r   r   r   r   r   r   c                   |t           ur|dk    rt          ddd           || _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |	t           ur|	| _        |t           ur|| _	        |t           ur|| _
        |
t           ur|
| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        | S )a  Sets the config's DL framework settings.

        Args:
            framework: torch: PyTorch; tf2: TensorFlow 2.x (eager execution or traced
                if eager_tracing=True); tf: TensorFlow (static-graph);
            eager_tracing: Enable tracing in eager mode. This greatly improves
                performance (speedup ~2x), but makes it slightly harder to debug
                since Python code won't be evaluated after the initial eager pass.
                Only possible if framework=tf2.
            eager_max_retraces: Maximum number of tf.function re-traces before a
                runtime error is raised. This is to prevent unnoticed retraces of
                methods inside the `..._eager_traced` Policy, which could slow down
                execution by a factor of 4, without the user noticing what the root
                cause for this slowdown could be.
                Only necessary for framework=tf2.
                Set to None to ignore the re-trace count and never throw an error.
            tf_session_args: Configures TF for single-process operation by default.
            local_tf_session_args: Override the following tf session args on the local
                worker
            torch_compile_learner: If True, forward_train methods on TorchRLModules
                on the learner are compiled. If not specified, the default is to compile
                forward train on the learner.
            torch_compile_learner_what_to_compile: A TorchCompileWhatToCompile
                mode specifying what to compile on the learner side if
                torch_compile_learner is True. See TorchCompileWhatToCompile for
                details and advice on its usage.
            torch_compile_learner_dynamo_backend: The torch dynamo backend to use on
                the learner.
            torch_compile_learner_dynamo_mode: The torch dynamo mode to use on the
                learner.
            torch_compile_worker: If True, forward exploration and inference methods on
                TorchRLModules on the workers are compiled. If not specified,
                the default is to not compile forward methods on the workers because
                retracing can be expensive.
            torch_compile_worker_dynamo_backend: The torch dynamo backend to use on
                the workers.
            torch_compile_worker_dynamo_mode: The torch dynamo mode to use on the
                workers.
            torch_ddp_kwargs: The kwargs to pass into
                `torch.nn.parallel.DistributedDataParallel` when using `num_learners
                > 1`. This is specifically helpful when searching for unused parameters
                that are not used in the backward pass. This can give hints for errors
                in custom models where some parameters do not get touched in the
                backward pass although they should.
            torch_skip_nan_gradients: If updates with `nan` gradients should be entirely
                skipped. This skips updates in the optimizer entirely if they contain
                any `nan` gradient. This can help to avoid biasing moving-average based
                optimizers - like Adam. This can help in training phases where policy
                updates can be highly unstable such as during the early stages of
                training or with highly exploratory policies. In such phases many
                gradients might turn `nan` and setting them to zero could corrupt the
                optimizer's internal state. The default is `False` and turns `nan`
                gradients to zero. If many `nan` gradients are encountered consider (a)
                monitoring gradients by setting `log_gradients` in `AlgorithmConfig` to
                `True`, (b) use proper weight initialization (e.g. Xavier, Kaiming) via
                the `model_config_dict` in `AlgorithmConfig.rl_module` and/or (c)
                gradient clipping via `grad_clip` in `AlgorithmConfig.training`.

        Returns:
            This updated AlgorithmConfig object.
        tfez AlgorithmConfig.framework('tfe')z AlgorithmConfig.framework('tf2')Trg  )r1   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r  r  r   r   r   r   r   r   r   r   r   r   r   r   r   s                  r\   r  zAlgorithmConfig.framework/  sP   ^ K''E!!#::   
 "+D++!.D[00&8D#+--#2D  33)>D& 33)>D&/{BB4 5 -K??5VD20CC5 6  {22(<D%.kAA3 4 ,;>>4TD1;..$4D!#;66,DD)r^   r   r   c                    |t           urf|| _        |du r| j        r| j        | _        i | _        n@|du r<| j        s5| j        | j        | _        d| _        nt                              d           |t           ur|| _        | S )a7  Sets the config's API stack settings.

        Args:
            enable_rl_module_and_learner: Enables the usage of `RLModule` (instead of
                `ModelV2`) and Learner (instead of the training-related parts of
                `Policy`). Must be used with `enable_env_runner_and_connector_v2=True`.
                Together, these two settings activate the "new API stack" of RLlib.
            enable_env_runner_and_connector_v2: Enables the usage of EnvRunners
                (SingleAgentEnvRunner and MultiAgentEnvRunner) and ConnectorV2.
                When setting this to True, `enable_rl_module_and_learner` must be True
                as well. Together, these two settings activate the "new API stack" of
                RLlib.

        Returns:
            This updated AlgorithmConfig object.
        TFNzoconfig.enable_rl_module_and_learner was set to False, but no prior exploration config was found to be restored.)r1   r   r   r  r(  warningr   )r  r   r   s      r\   r  zAlgorithmConfig.api_stack  s    * ({::0LD-+t338O3151H.*,''-66t?V61=.2.LD+59D22NNM  
 .[@@6XD3r^   )r   r   r   r   r   r   r   r   r    r   r  r   r   r   r   r   r   r   r   r    r   c                   |t           k    rt          dd           |t          ur|| _        |t          urt	          | j        |d           |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _	        |t          ur|| _
        |t          ur|| _        |	t          ur|	| _        |
t          ur|
| _        |t          ur|| _        | S )a  Sets the config's RL-environment settings.

        Args:
            env: The environment specifier. This can either be a tune-registered env,
                via `tune.register_env([name], lambda env_ctx: [env object])`,
                or a string specifier of an RLlib supported type. In the latter case,
                RLlib tries to interpret the specifier as either an Farama-Foundation
                gymnasium env, a PyBullet env, or a fully qualified classpath to an Env
                class, e.g. "ray.rllib.examples.envs.classes.random_env.RandomEnv".
            env_config: Arguments dict passed to the env creator as an EnvContext
                object (which is a dict plus the properties: `num_env_runners`,
                `worker_index`, `vector_index`, and `remote`).
            observation_space: The observation space for the Policies of this Algorithm.
            action_space: The action space for the Policies of this Algorithm.
            render_env: If True, try to render the environment on the local worker or on
                worker 1 (if num_env_runners > 0). For vectorized envs, this usually
                means that only the first sub-environment is rendered.
                In order for this to work, your env has to implement the
                `render()` method which either:
                a) handles window generation and rendering itself (returning True) or
                b) returns a numpy uint8 image of shape [height x width x 3 (RGB)].
            clip_rewards: Whether to clip rewards during Policy's postprocessing.
                None (default): Clip for Atari only (r=sign(r)).
                True: r=sign(r): Fixed rewards -1.0, 1.0, or 0.0.
                False: Never clip.
                [float value]: Clip at -value and + value.
                Tuple[value1, value2]: Clip at value1 and value2.
            normalize_actions: If True, RLlib learns entirely inside a normalized
                action space (0.0 centered with small stddev; only affecting Box
                components). RLlib unsquashes actions (and clip, just in case) to the
                bounds of the env's action space before sending actions back to the env.
            clip_actions: If True, the RLlib default ModuleToEnv connector clips
                actions according to the env's bounds (before sending them into the
                `env.step()` call).
            disable_env_checking: Disable RLlib's env checks after a gymnasium.Env
                instance has been constructed in an EnvRunner. Note that the checks
                include an `env.reset()` and `env.step()` (with a random action), which
                might tinker with your env's logic and behavior and thus negatively
                influence sample collection- and/or learning behavior.
            is_atari: This config can be used to explicitly specify whether the env is
                an Atari env or not. If not specified, RLlib tries to auto-detect
                this.
            action_mask_key: If observation is a dictionary, expect the value by
                the key `action_mask_key` to contain a valid actions mask (`numpy.int8`
                array of zeros and ones). Defaults to "action_mask".

        Returns:
            This updated AlgorithmConfig object.
        z+AlgorithmConfig.environment(env_task_fn=..)Trh  r  )r   r   r1   r   r*   r   r   r   r   r   r   r   r   r   r   )r  r   r   r   r   r   r   r   r   r   r    r   r  s                r\   r  zAlgorithmConfig.environment  s   D ***A    k!!DH[((T:::K//%6D"{** ,D[(((DO{** ,DK//%6D"{** ,D{22(<D%;&&%DN+--#2D r^   )-r   r   r   r   r   r   r   r   r   r   r   r   env_to_module_connectormodule_to_env_connectorr   r   r   r   r   r   r   r   r  r   r   r   r   r   r   r   r   r   r   r   num_rollout_workersnum_envs_per_worker#validate_workers_after_constructionignore_worker_failuresrecreate_failed_workersr  )num_consecutive_worker_failures_toleranceworker_health_probe_timeout_sworker_restore_timeout_ssynchronize_filterr  r   r   r   r   r   r   r   r   r   r   r   r   rq  r   rr  rT   r   r   r   r   r   r   r   r   r  r   r   r   r   r   r   r   r   r   r   r   c       -            |-t           k    rt          dd           |#t           k    rt          ddd           |$t           k    rt          dd	d           |%t           k    rt          d
dd           |t          ur|| _        |t          ur|| _        |t          ur |dk    rt          d| d          || _        |t          ur|| _        |t          ur|| _        |t          ur|| _	        |	t          ur|	| _
        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur7t3          |t4                    r|dk    s|dk    st          d          || _        |t          ur|dvrt          d| d          || _        |t          ur|| _        |t          ur|| _        |t          ur+t?          d| j         id|iddgdg          }.|.d         | _         |t          ur|| _!        |t          ur|| _"        |
t          ur|
| _#        |t          ur|| _$        | t          ur| | _%        |,t          ur|,| _&        |t          ur|| _'        |!t          ur|!| _(        |"t          ur|"| _)        |,t           k    rt          ddd           |&t           k    rt          dd           |'t           k    rt          ddd           |(t           k    rt          ddd           |)t           k    rt          ddd           |*t           k    rt          ddd           |+t           k    rt          d d!d           | S )"a`3  Sets the rollout worker configuration.

        Args:
            env_runner_cls: The EnvRunner class to use for environment rollouts (data
                collection).
            num_env_runners: Number of EnvRunner actors to create for parallel sampling.
                Setting this to 0 forces sampling to be done in the local
                EnvRunner (main process or the Algorithm's actor when using Tune).
            num_envs_per_env_runner: Number of environments to step through
                (vector-wise) per EnvRunner. This enables batching when computing
                actions through RLModule inference, which can improve performance
                for inference-bottlenecked workloads.
            gym_env_vectorize_mode: The gymnasium vectorization mode for vector envs.
                Must be a `gymnasium.VectorizeMode` (enum) value.
                Default is SYNC. Set this to ASYNC to parallelize the individual sub
                environments within the vector. This can speed up your EnvRunners
                significantly when using heavier environments. Set this to
                VECTOR_ENTRY_POINT in case your env creator, also known as
                "gym entry point", already returns a gym.vector.VectorEnv and you
                don't need RLlib to vectorize the environments for the runners.
            num_cpus_per_env_runner: Number of CPUs to allocate per EnvRunner.
            num_gpus_per_env_runner: Number of GPUs to allocate per EnvRunner. This can
                be fractional. This is usually needed only if your env itself requires a
                GPU (i.e., it is a GPU-intensive video game), or model inference is
                unusually expensive.
            custom_resources_per_env_runner: Any custom Ray resources to allocate per
                EnvRunner.
            sample_timeout_s: The timeout in seconds for calling `sample()` on remote
                EnvRunner workers. Results (episode list) from workers that take longer
                than this time are discarded. Only used by algorithms that sample
                synchronously in turn with their update step (e.g., PPO or DQN). Not
                relevant for any algos that sample asynchronously, such as APPO or
                IMPALA.
            max_requests_in_flight_per_env_runner: Max number of in-flight requests
                to each EnvRunner (actor)). See the
                `ray.rllib.utils.actor_manager.FaultTolerantActorManager` class for more
                details.
                Tuning these values is important when running experiments with
                large sample batches, where there is the risk that the object store may
                fill up, causing spilling of objects to disk. This can cause any
                asynchronous requests to become very slow, making your experiment run
                slowly as well. You can inspect the object store during your experiment
                through a call to `ray memory` on your head node, and by using the Ray
                dashboard. If you're seeing that the object store is filling up,
                turn down the number of remote requests in flight or enable compression
                or increase the object store memory through, for example:
                `ray.init(object_store_memory=10 * 1024 * 1024 * 1024)  # =10 GB`
            sample_collector: For the old API stack only. The SampleCollector class to
                be used to collect and retrieve environment-, model-, and sampler data.
                Override the SampleCollector base class to implement your own
                collection/buffering/retrieval logic.
            create_local_env_runner: If True, create a local EnvRunner instance, besides
                the `num_env_runners` remote EnvRunner actors. If `num_env_runners` is
                0, this setting is ignored and one local EnvRunner is created
                regardless.
            create_env_on_local_worker: When `num_env_runners` > 0, the driver
                (local_worker; worker-idx=0) does not need an environment. This is
                because it doesn't have to sample (done by remote_workers;
                worker_indices > 0) nor evaluate (done by evaluation workers;
                see below).
            env_to_module_connector: A callable taking an Env as input arg and returning
                an env-to-module ConnectorV2 (might be a pipeline) object.
            module_to_env_connector: A callable taking an Env and an RLModule as input
                args and returning a module-to-env ConnectorV2 (might be a pipeline)
                object.
            add_default_connectors_to_env_to_module_pipeline: If True (default), RLlib's
                EnvRunners automatically add the default env-to-module ConnectorV2
                pieces to the EnvToModulePipeline. These automatically perform adding
                observations and states (in case of stateful Module(s)), agent-to-module
                mapping, batching, and conversion to tensor data. Only if you know
                exactly what you are doing, you should set this setting to False.
                Note that this setting is only relevant if the new API stack is used
                (including the new EnvRunner classes).
            add_default_connectors_to_module_to_env_pipeline: If True (default), RLlib's
                EnvRunners automatically add the default module-to-env ConnectorV2
                pieces to the ModuleToEnvPipeline. These automatically perform removing
                the additional time-rank (if applicable, in case of stateful
                Module(s)), module-to-agent unmapping, un-batching (to lists), and
                conversion from tensor data to numpy. Only if you know exactly what you
                are doing, you should set this setting to False.
                Note that this setting is only relevant if the new API stack is used
                (including the new EnvRunner classes).
            episode_lookback_horizon: The amount of data (in timesteps) to keep from the
                preceeding episode chunk when a new chunk (for the same episode) is
                generated to continue sampling at a later time. The larger this value,
                the more an env-to-module connector can look back in time
                and compile RLModule input data from this information. For example, if
                your custom env-to-module connector (and your custom RLModule) requires
                the previous 10 rewards as inputs, you must set this to at least 10.
            merge_env_runner_states: True, if remote EnvRunner actor states should be
                merged into central connector pipelines. Use "training_only" (default)
                for only doing this for the training EnvRunners, NOT for the evaluation
                EnvRunners.
            broadcast_env_runner_states: True, if merged EnvRunner states (from the
                central connector pipelines) should be broadcast back to all remote
                EnvRunner actors.
            use_worker_filter_stats: Whether to use the workers in the EnvRunnerGroup to
                update the central filters (held by the local worker). If False, stats
                from the workers aren't used and are discarded.
            update_worker_filter_stats: Whether to push filter updates from the central
                filters (held by the local worker) to the remote workers' filters.
                Setting this to True might be useful within the evaluation config in
                order to disable the usage of evaluation trajectories for synching
                the central filter (used for training).
            rollout_fragment_length: Divide episodes into fragments of this many steps
                each during sampling. Trajectories of this size are collected from
                EnvRunners and combined into a larger batch of `train_batch_size`
                for learning.
                For example, given rollout_fragment_length=100 and
                train_batch_size=1000:
                1. RLlib collects 10 fragments of 100 steps each from rollout workers.
                2. These fragments are concatenated and we perform an epoch of SGD.
                When using multiple envs per worker, the fragment size is multiplied by
                `num_envs_per_env_runner`. This is since we are collecting steps from
                multiple envs in parallel. For example, if num_envs_per_env_runner=5,
                then EnvRunners return experiences in chunks of 5*100 = 500 steps.
                The dataflow here can vary per algorithm. For example, PPO further
                divides the train batch into minibatches for multi-epoch SGD.
                Set `rollout_fragment_length` to "auto" to have RLlib compute an exact
                value to match the given batch size.
            batch_mode: How to build individual batches with the EnvRunner(s). Batches
                coming from distributed EnvRunners are usually concat'd to form the
                train batch. Note that "steps" below can mean different things (either
                env- or agent-steps) and depends on the `count_steps_by` setting,
                adjustable via `AlgorithmConfig.multi_agent(count_steps_by=..)`:
                1) "truncate_episodes": Each call to `EnvRunner.sample()` returns a
                batch of at most `rollout_fragment_length * num_envs_per_env_runner` in
                size. The batch is exactly `rollout_fragment_length * num_envs`
                in size if postprocessing does not change batch sizes. Episodes
                may be truncated in order to meet this size requirement.
                This mode guarantees evenly sized batches, but increases
                variance as the future return must now be estimated at truncation
                boundaries.
                2) "complete_episodes": Each call to `EnvRunner.sample()` returns a
                batch of at least `rollout_fragment_length * num_envs_per_env_runner` in
                size. Episodes aren't truncated, but multiple episodes
                may be packed within one batch to meet the (minimum) batch size.
                Note that when `num_envs_per_env_runner > 1`, episode steps are
                buffered until the episode completes, and hence batches may contain
                significant amounts of off-policy data.
            explore: Default exploration behavior, iff `explore=None` is passed into
                compute_action(s). Set to False for no exploration behavior (e.g.,
                for evaluation).
            episodes_to_numpy: Whether to numpy'ize episodes before
                returning them from an EnvRunner. False by default. If True, EnvRunners
                call `to_numpy()` on those episode (chunks) to be returned by
                `EnvRunners.sample()`.
            exploration_config: A dict specifying the Exploration object's config.
            remote_worker_envs: If using num_envs_per_env_runner > 1, whether to create
                those new envs in remote processes instead of in the same worker.
                This adds overheads, but can make sense if your envs can take much
                time to step / reset (e.g., for StarCraft). Use this cautiously;
                overheads are significant.
            remote_env_batch_wait_ms: Timeout that remote workers are waiting when
                polling environments. 0 (continue when at least one env is ready) is
                a reasonable default, but optimal value could be obtained by measuring
                your environment step / reset and model inference perf.
            validate_env_runners_after_construction: Whether to validate that each
                created remote EnvRunner is healthy after its construction process.
            preprocessor_pref: Whether to use "rllib" or "deepmind" preprocessors by
                default. Set to None for using no preprocessor. In this case, the
                model has to handle possibly complex observations from the
                environment.
            observation_filter: Element-wise observation filter, either "NoFilter"
                or "MeanStdFilter".
            compress_observations: Whether to LZ4 compress individual observations
                in the SampleBatches collected during rollouts.
            enable_tf1_exec_eagerly: Explicitly tells the rollout worker to enable
                TF eager execution. This is useful for example when framework is
                "torch", but a TF2 policy needs to be restored for evaluation or
                league-based purposes.
            sampler_perf_stats_ema_coef: If specified, perf stats are in EMAs. This
                is the coeff of how much new data points contribute to the averages.
                Default is None, which uses simple global average instead.
                The EMA update rule is: updated = (1 - ema_coef) * old + ema_coef * new

        Returns:
            This updated AlgorithmConfig object.
        z2AlgorithmConfig.env_runners(enable_connectors=...)Frp  z0AlgorithmConfig.env_runners(num_rollout_workers)z,AlgorithmConfig.env_runners(num_env_runners)Trg  z0AlgorithmConfig.env_runners(num_envs_per_worker)z4AlgorithmConfig.env_runners(num_envs_per_env_runner)z@AlgorithmConfig.env_runners(validate_workers_after_construction)zDAlgorithmConfig.env_runners(validate_env_runners_after_construction)r   z`num_envs_per_env_runner` (z) must be larger 0!r   z3`rollout_fragment_length` must be int >0 or 'auto'!)r   complete_episodesz`batch_mode` (z7) must be one of [truncate_episodes|complete_episodes]!r   z2AlgorithmConfig.env_runners(synchronize_filter=..)z:AlgorithmConfig.env_runners(update_worker_filter_stats=..)z>ignore_worker_failures is deprecated, and will soon be a no-opz7AlgorithmConfig.env_runners(recreate_failed_workers=..)>AlgorithmConfig.fault_tolerance(restart_failed_env_runners=..)z?AlgorithmConfig.env_runners(restart_failed_sub_environments=..)zCAlgorithmConfig.fault_tolerance(restart_failed_sub_environments=..)zIAlgorithmConfig.env_runners(num_consecutive_worker_failures_tolerance=..)zQAlgorithmConfig.fault_tolerance(num_consecutive_env_runner_failures_tolerance=..)z=AlgorithmConfig.env_runners(worker_health_probe_timeout_s=..)zEAlgorithmConfig.fault_tolerance(env_runner_health_probe_timeout_s=..)z8AlgorithmConfig.env_runners(worker_restore_timeout_s=..)z@AlgorithmConfig.fault_tolerance(env_runner_restore_timeout_s=..))*r   r   r1   r   r   rZ   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rY   intr   r   r  r   r*   r   r   r   r   r   r   r  r   r   r   )/r  r   r   r   r   r   r   r   r   r   r   r   r   rq  rr  r   r   r   r   r   r   r   r   r  r   r   r   r   r   r   r   r   r   r   r   rs  rt  ru  rv  rw  r  rx  ry  rz  r{  r  new_exploration_configs/                                                  r\   r  zAlgorithmConfig.env_runners5  s   T  000H    "222FB   
 "222FJ   
 /2BBBV 	    ,,"0D+--#2D "+55&!++  2I         ,CD(!44*@D'"+55+BD("+55+BD(*+==3RD0;..$4D!0CC5 6 ;..$4D!"+55+BD(%[88.HD+"+55,CD)"+55,CD);;NN@ A <;NN@ A $;66,DD)"+55+BD(&k99/JD,"+55+BD(%[88.HD+"+55 6<<X 0!33*f44 !VWWW+BD([((!KKK *Z * * *   )DO+%%"DLK//%6D" [00 &1%t'>?%'9:%&%&& &" '==Q&RD#[00&8D##;66,DD)2+EE7 8 K//%6D"[00&8D#[00'9D$ 33)>D&"+55+BD(&k99/JD, !111HP   
 "%555T    #&666MT   
 +.>>>UY    58HHH_ h    ),<<<S[   
 $'777NV    r^   )r   r   r   r   r   r   r   rV  learner_connectorr  r  r   r   r   r   r   r   r   rV  r  r  r  c                   |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |	t           ur|	| _	        |
t           ur|
| _
        |t           ur| j                            |           | S )ag  Sets LearnerGroup and Learner worker related configurations.

        Args:
            num_learners: Number of Learner workers used for updating the RLModule.
                A value of 0 means training takes place on a local Learner on main
                process CPUs or 1 GPU (determined by `num_gpus_per_learner`).
                For multi-gpu training, you have to set `num_learners` to > 1 and set
                `num_gpus_per_learner` accordingly (e.g., 4 GPUs total and model fits on
                1 GPU: `num_learners=4; num_gpus_per_learner=1` OR 4 GPUs total and
                model requires 2 GPUs: `num_learners=2; num_gpus_per_learner=2`).
            num_cpus_per_learner: Number of CPUs allocated per Learner worker.
                If "auto" (default), use 1 if `num_gpus_per_learner=0`, otherwise 0.
                Only necessary for custom processing pipeline inside each Learner
                requiring multiple CPU cores.
                If `num_learners=0`, RLlib creates only one local Learner instance and
                the number of CPUs on the main process is
                `max(num_cpus_per_learner, num_cpus_for_main_process)`.
            num_gpus_per_learner: Number of GPUs allocated per Learner worker. If
                `num_learners=0`, any value greater than 0 runs the
                training on a single GPU on the main process, while a value of 0 runs
                the training on main process CPUs.
            num_aggregator_actors_per_learner: The number of aggregator actors per
                Learner (if num_learners=0, one local learner is created). Must be at
                least 1. Aggregator actors perform the task of a) converting episodes
                into a train batch and b) move that train batch to the same GPU that
                the corresponding learner is located on. Good values are 1 or 2, but
                this strongly depends on your setup and `EnvRunner` throughput.
            max_requests_in_flight_per_aggregator_actor: How many in-flight requests
                are allowed per aggregator actor before new requests are dropped?
            local_gpu_idx: If `num_gpus_per_learner` > 0, and
                `num_learners` < 2, then RLlib uses this GPU index for training. This is
                an index into the available
                CUDA devices. For example if `os.environ["CUDA_VISIBLE_DEVICES"] = "1"`
                and `local_gpu_idx=0`, RLlib uses the GPU with ID=1 on the node.
            max_requests_in_flight_per_learner: Max number of in-flight requests
                to each Learner (actor). You normally do not have to tune this setting
                (default is 3), however, for asynchronous algorithms, this determines
                the "queue" size for incoming batches (or lists of episodes) into each
                Learner worker, thus also determining, how much off-policy'ness would be
                acceptable. The off-policy'ness is the difference between the numbers of
                updates a policy has undergone on the Learner vs the EnvRunners.
                See the `ray.rllib.utils.actor_manager.FaultTolerantActorManager` class
                for more details.
            learner_class: The `Learner` class to use for (distributed) updating of the
                RLModule.
            learner_connector: A callable taking an env observation space and an env
                action space as inputs and returning a learner ConnectorV2 or
                list of ConnectorV2's as part of pipeline object.
            add_default_connectors_to_learner_pipeline: If True (default), RLlib's
                Learners automatically add the default Learner ConnectorV2
                pieces to the LearnerPipeline. These automatically perform:
                a) adding observations from episodes to the train batch, if this has not
                already been done by a user-provided connector piece
                b) if RLModule is stateful, add a time rank to the train batch, zero-pad
                the data, and add the correct state inputs, if this has not already been
                done by a user-provided connector piece.
                c) add all other information (actions, rewards, terminateds, etc..) to
                the train batch, if this has not already been done by a user-provided
                connector piece.
                Only if you know exactly what you are doing, you
                should set this setting to False.
            learner_config_dict: A dict to insert any settings accessible from within
                the Learner instance. This should only be used in connection with custom
                Learner subclasses and in case the user doesn't want to write an extra
                `AlgorithmConfig` subclass just to add a few settings to the base Algo's
                own config class.

        Returns:
            This updated AlgorithmConfig object.
        )r1   r   r   r   r   r   r   r   r	  r  r  r  r  )r  r   r   r   r   r   r   r   rV  r  r  r  s               r\   learnerszAlgorithmConfig.learners  s    t {** ,D{22(<D%{22(<D%,K??5VD26kII; < ++!.D-[@@6XD3++"/DK//&7D#5[HH: ; k11$++,?@@@r^   )r   r   r   r   r   train_batch_size_per_learnerr   r   r   r  r  r   r   num_sgd_iter)max_requests_in_flight_per_sampler_workerrV  r  r  r  r   r   r   r   r   r  r   r   r   r  r  c                   |t           urt          ddd           || _        |t           urt          ddd           || _        |t           urt          ddd           || _        |t           ur,t          d	d
d           | j                            |           |t          k    rt          ddd           || _        |t          k    rt          ddd           || _	        |t          k    rt          ddd           |}|t          k    r(t          ddd           | 
                    |           |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|dvrt          d| d          || _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |	t           ur|	| _        |
t           urV| j                            |
           |
                    dt                    t          k    rt          dd|
d                    |t           urt/          | j        |          | _        | S )a  Sets the training related configuration.

        Args:
            gamma: Float specifying the discount factor of the Markov Decision process.
            lr: The learning rate (float) or learning rate schedule in the format of
                [[timestep, lr-value], [timestep, lr-value], ...]
                In case of a schedule, intermediary timesteps are assigned to
                linearly interpolated learning rate values. A schedule config's first
                entry must start with timestep 0, i.e.: [[0, initial_value], [...]].
                Note: If you require a) more than one optimizer (per RLModule),
                b) optimizer types that are not Adam, c) a learning rate schedule that
                is not a linearly interpolated, piecewise schedule as described above,
                or d) specifying c'tor arguments of the optimizer that are not the
                learning rate (e.g. Adam's epsilon), then you must override your
                Learner's `configure_optimizer_for_module()` method and handle
                lr-scheduling yourself.
            grad_clip: If None, no gradient clipping is applied. Otherwise,
                depending on the setting of `grad_clip_by`, the (float) value of
                `grad_clip` has the following effect:
                If `grad_clip_by=value`: Clips all computed gradients individually
                inside the interval [-`grad_clip`, +`grad_clip`].
                If `grad_clip_by=norm`, computes the L2-norm of each weight/bias
                gradient tensor individually and then clip all gradients such that these
                L2-norms do not exceed `grad_clip`. The L2-norm of a tensor is computed
                via: `sqrt(SUM(w0^2, w1^2, ..., wn^2))` where w[i] are the elements of
                the tensor (no matter what the shape of this tensor is).
                If `grad_clip_by=global_norm`, computes the square of the L2-norm of
                each weight/bias gradient tensor individually, sum up all these squared
                L2-norms across all given gradient tensors (e.g. the entire module to
                be updated), square root that overall sum, and then clip all gradients
                such that this global L2-norm does not exceed the given value.
                The global L2-norm over a list of tensors (e.g. W and V) is computed
                via:
                `sqrt[SUM(w0^2, w1^2, ..., wn^2) + SUM(v0^2, v1^2, ..., vm^2)]`, where
                w[i] and v[j] are the elements of the tensors W and V (no matter what
                the shapes of these tensors are).
            grad_clip_by: See `grad_clip` for the effect of this setting on gradient
                clipping. Allowed values are `value`, `norm`, and `global_norm`.
            train_batch_size_per_learner: Train batch size per individual Learner
                worker. This setting only applies to the new API stack. The number
                of Learner workers can be set via `config.resources(
                num_learners=...)`. The total effective batch size is then
                `num_learners` x `train_batch_size_per_learner` and you can
                access it with the property `AlgorithmConfig.total_train_batch_size`.
            train_batch_size: Training batch size, if applicable. When on the new API
                stack, this setting should no longer be used. Instead, use
                `train_batch_size_per_learner` (in combination with
                `num_learners`).
            num_epochs: The number of complete passes over the entire train batch (per
                Learner). Each pass might be further split into n minibatches (if
                `minibatch_size` provided).
            minibatch_size: The size of minibatches to use to further split the train
                batch into.
            shuffle_batch_per_epoch: Whether to shuffle the train batch once per epoch.
                If the train batch has a time rank (axis=1), shuffling only takes
                place along the batch axis to not disturb any intact (episode)
                trajectories.
            model: Arguments passed into the policy model. See models/catalog.py for a
                full list of the available model options.
                TODO: Provide ModelConfig objects instead of dicts.
            optimizer: Arguments to pass to the policy optimizer. This setting is not
                used when `enable_rl_module_and_learner=True`.

        Returns:
            This updated AlgorithmConfig object.
        z!config.training(learner_class=..)z!config.learners(learner_class=..)Frg  z%config.training(learner_connector=..)z%config.learners(learner_connector=..)z>config.training(add_default_connectors_to_learner_pipeline=..)z>config.learners(add_default_connectors_to_learner_pipeline=..)z'config.training(learner_config_dict=..)z'config.learners(learner_config_dict=..)z5config.training(num_aggregator_actors_per_learner=..)z5config.learners(num_aggregator_actors_per_learner=..)z?config.training(max_requests_in_flight_per_aggregator_actor=..)z?config.learners(max_requests_in_flight_per_aggregator_actor=..)z config.training(num_sgd_iter=..)zconfig.training(num_epochs=..)zGAlgorithmConfig.training(max_requests_in_flight_per_sampler_worker=...)zFAlgorithmConfig.env_runners(max_requests_in_flight_per_env_runner=...))r   )r   normr   z`grad_clip_by` (z4) must be one of: 'value', 'norm', or 'global_norm'!_use_default_native_modelsz9AlgorithmConfig.training(_use_default_native_models=True)z_use_default_native_models is not supported anymore. To get rid of this error, set `config.api_stack(enable_rl_module_and_learner=True)`. Native models will be better supported by the upcoming RLModule API.rh  helpr  )r1   r   r	  r  r  r  r  r   r   r   r  r   r   r   rZ   r   r   r   r   r   r   r  r  r,   r  )r  r   r   r   r   r   r  r   r   r   r  r  r   r   r  r  rV  r  r  r  s                       r\   r  zAlgorithmConfig.trainingO	  sX   @ ++77   
 #0DK//;;   
 '8D#5[HHTT    ; ; k11==   
 $++,?@@@,0@@@KK   
 6WD26:JJJUU    < < +++64   
 &J48HHHA=    =     ##DJ[  DGK''&DN{**#CCC (| ( ( (   !-D'{::1MD.;..$4D![(((DO,,"0D"+55+BD(##Je$$$		68HII#$ $ $SH
   <=    K''(CCDNr^   )on_algorithm_initon_train_resulton_evaluate_starton_evaluate_endon_evaluate_offline_starton_evaluate_offline_endon_env_runners_recreated!on_offline_eval_runners_recreatedon_checkpoint_loadedon_environment_createdon_episode_createdon_episode_starton_episode_stepon_episode_endon_sample_endr
  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  c                   |t           }|t          urBt          |          }t          d |D                       st	          d| d          || _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _	        |t          ur|| _
        |t          ur|| _        |t          ur|| _        |	t          ur|	| _        |
t          ur|
| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        | S )a  Sets the callbacks configuration.

        Args:
            callbacks_class: RLlibCallback class, whose methods are called during
                various phases of training and RL environment sample collection.
                TODO (sven): Change the link to new rst callbacks page.
                See the `RLlibCallback` class and
                `examples/metrics/custom_metrics_and_callbacks.py` for more information.
            on_algorithm_init: A callable or a list of callables. If a list, RLlib calls
                the items in the same sequence. `on_algorithm_init` methods overridden
                in `callbacks_class` take precedence and are called first.
                See
                :py:meth:`~ray.rllib.callbacks.callbacks.RLlibCallback.on_algorithm_init`  # noqa
                for more information.
            on_evaluate_start: A callable or a list of callables. If a list, RLlib calls
                the items in the same sequence. `on_evaluate_start` methods overridden
                in `callbacks_class` take precedence and are called first.
                See :py:meth:`~ray.rllib.callbacks.callbacks.RLlibCallback.on_evaluate_start`  # noqa
                for more information.
            on_evaluate_end: A callable or a list of callables. If a list, RLlib calls
                the items in the same sequence. `on_evaluate_end` methods overridden
                in `callbacks_class` take precedence and are called first.
                See :py:meth:`~ray.rllib.callbacks.callbacks.RLlibCallback.on_evaluate_end`  # noqa
                for more information.
            on_env_runners_recreated: A callable or a list of callables. If a list,
                RLlib calls the items in the same sequence. `on_env_runners_recreated`
                methods overridden in `callbacks_class` take precedence and are called
                first.
                See :py:meth:`~ray.rllib.callbacks.callbacks.RLlibCallback.on_env_runners_recreated`  # noqa
                for more information.
            on_checkpoint_loaded: A callable or a list of callables. If a list,
                RLlib calls the items in the same sequence. `on_checkpoint_loaded`
                methods overridden in `callbacks_class` take precedence and are called
                first.
                See :py:meth:`~ray.rllib.callbacks.callbacks.RLlibCallback.on_checkpoint_loaded`  # noqa
                for more information.
            on_environment_created: A callable or a list of callables. If a list,
                RLlib calls the items in the same sequence. `on_environment_created`
                methods overridden in `callbacks_class` take precedence and are called
                first.
                See :py:meth:`~ray.rllib.callbacks.callbacks.RLlibCallback.on_environment_created`  # noqa
                for more information.
            on_episode_created: A callable or a list of callables. If a list,
                RLlib calls the items in the same sequence. `on_episode_created` methods
                overridden in `callbacks_class` take precedence and are called first.
                See :py:meth:`~ray.rllib.callbacks.callbacks.RLlibCallback.on_episode_created`  # noqa
                for more information.
            on_episode_start: A callable or a list of callables. If a list,
                RLlib calls the items in the same sequence. `on_episode_start` methods
                overridden in `callbacks_class` take precedence and are called first.
                See :py:meth:`~ray.rllib.callbacks.callbacks.RLlibCallback.on_episode_start`  # noqa
                for more information.
            on_episode_step: A callable or a list of callables. If a list,
                RLlib calls the items in the same sequence. `on_episode_step` methods
                overridden in `callbacks_class` take precedence and are called first.
                See :py:meth:`~ray.rllib.callbacks.callbacks.RLlibCallback.on_episode_step`  # noqa
                for more information.
            on_episode_end: A callable or a list of callables. If a list,
                RLlib calls the items in the same sequence. `on_episode_end` methods
                overridden in `callbacks_class` take precedence and are called first.
                See :py:meth:`~ray.rllib.callbacks.callbacks.RLlibCallback.on_episode_end`  # noqa
                for more information.
            on_sample_end: A callable or a list of callables. If a list,
                RLlib calls the items in the same sequence. `on_sample_end` methods
                overridden in `callbacks_class` take precedence and are called first.
                See :py:meth:`~ray.rllib.callbacks.callbacks.RLlibCallback.on_sample_end`  # noqa
                for more information.

        Returns:
            This updated AlgorithmConfig object.
        Nc              3   4   K   | ]}t          |          V  d S rb   )callable)r  cs     r\   	<genexpr>z,AlgorithmConfig.callbacks.<locals>.<genexpr>
  s(      55qx{{555555r^   zr`config.callbacks_class` must be a callable or list of callables that returns a subclass of DefaultCallbacks, got rt   )r   r1   r+   allrZ   r
  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )r  r
  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  to_checks                     r\   r  zAlgorithmConfig.callbacks 
  s   J "+O+-- "/22H55H55555  *&* * *  
 $3D K///@D,+---<D*K///@D,+---<D*$K777PD4"+555LD2#;666ND3,K??1 <  {222FD/!444JD1[000BD-;...>D++---<D*,,,:D)+++8D(r^   )#rL  rM  rN  rP  rQ  rO  rR  rS  rT  rU  rV  rW  rX  r[  r\  r]  r^  r_  ro  )dataset_num_iters_per_offline_eval_runnerrm  rc  rd  re  rb  rj  rn  rk  rf  rg  rh  rl  ri  r  evaluation_num_workersrL  rM  rN  rP  rQ  rO  rR  rS  rT  rU  rV  rW  rX  r[  r\  r]  r^  r_  ro  r  rm  rc  rd  re  rb  rj  rn  rk  rf  rg  rh  rl  ri  c       #            |"t           k    rt          ddd           |#t           k    rt          ddd           |#| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _	        |t          ur|| _
        |t          ur|| _        |	t          ur?|	d	| _        n5d
dlm}$ t          | j        pi |	d|$j        |$j        |$j                  | _        |
t          ur|
| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _         |t          ur|| _!        |t          ur|| _"        |t          ur|| _#        |t          ur|| _$        |t          ur|| _%        |t          ur|| _&        |t          ur|| _'        | t          ur| | _(        |!t          ur|!| _)        | S )a 9  Sets the config's evaluation settings.

        Args:
            evaluation_interval: Evaluate with every `evaluation_interval` training
                iterations. The evaluation stats are reported under the "evaluation"
                metric key. Set to None (or 0) for no evaluation.
            evaluation_duration: Duration for which to run evaluation each
                `evaluation_interval`. The unit for the duration can be set via
                `evaluation_duration_unit` to either "episodes" (default) or
                "timesteps". If using multiple evaluation workers (EnvRunners) in the
                `evaluation_num_env_runners > 1` setting, the amount of
                episodes/timesteps to run are split amongst these.
                A special value of "auto" can be used in case
                `evaluation_parallel_to_training=True`. This is the recommended way when
                trying to save as much time on evaluation as possible. The Algorithm
                then runs as many timesteps via the evaluation workers as possible,
                while not taking longer than the parallely running training step and
                thus, never wasting any idle time on either training- or evaluation
                workers. When using this setting (`evaluation_duration="auto"`), it is
                strongly advised to set `evaluation_interval=1` and
                `evaluation_force_reset_envs_before_iteration=True` at the same time.
            evaluation_duration_unit: The unit, with which to count the evaluation
                duration. Either "episodes" (default) or "timesteps". Note that this
                setting is ignored if `evaluation_duration="auto"`.
            evaluation_auto_duration_min_env_steps_per_sample: If `evaluation_duration`
                is "auto" (in which case `evaluation_duration_unit` is always
                "timesteps"), at least how many timesteps should be done per remote
                `sample()` call.
            evaluation_auto_duration_max_env_steps_per_sample: If `evaluation_duration`
                is "auto" (in which case `evaluation_duration_unit` is always
                "timesteps"), at most how many timesteps should be done per remote
                `sample()` call.
            evaluation_sample_timeout_s: The timeout (in seconds) for evaluation workers
                to sample a complete episode in the case your config settings are:
                `evaluation_duration != auto` and `evaluation_duration_unit=episode`.
                After this time, the user receives a warning and instructions on how
                to fix the issue.
            evaluation_parallel_to_training: Whether to run evaluation in parallel to
                the `Algorithm.training_step()` call, using threading. Default=False.
                E.g. for evaluation_interval=1 -> In every call to `Algorithm.train()`,
                the `Algorithm.training_step()` and `Algorithm.evaluate()` calls
                run in parallel. Note that this setting - albeit extremely efficient b/c
                it wastes no extra time for evaluation - causes the evaluation results
                to lag one iteration behind the rest of the training results. This is
                important when picking a good checkpoint. For example, if iteration 42
                reports a good evaluation `episode_return_mean`, be aware that these
                results were achieved on the weights trained in iteration 41, so you
                should probably pick the iteration 41 checkpoint instead.
            evaluation_force_reset_envs_before_iteration: Whether all environments
                should be force-reset (even if they are not done yet) right before
                the evaluation step of the iteration begins. Setting this to True
                (default) makes sure that the evaluation results aren't polluted with
                episode statistics that were actually (at least partially) achieved with
                an earlier set of weights. Note that this setting is only
                supported on the new API stack w/ EnvRunners and ConnectorV2
                (`config.enable_rl_module_and_learner=True` AND
                `config.enable_env_runner_and_connector_v2=True`).
            evaluation_config: Typical usage is to pass extra args to evaluation env
                creator and to disable exploration by computing deterministic actions.
                IMPORTANT NOTE: Policy gradient algorithms are able to find the optimal
                policy, even if this is a stochastic one. Setting "explore=False" here
                results in the evaluation workers not using this optimal policy!
            off_policy_estimation_methods: Specify how to evaluate the current policy,
                along with any optional config parameters. This only has an effect when
                reading offline experiences ("input" is not "sampler").
                Available keys:
                {ope_method_name: {"type": ope_type, ...}} where `ope_method_name`
                is a user-defined string to save the OPE results under, and
                `ope_type` can be any subclass of OffPolicyEstimator, e.g.
                ray.rllib.offline.estimators.is::ImportanceSampling
                or your own custom subclass, or the full class path to the subclass.
                You can also add additional config arguments to be passed to the
                OffPolicyEstimator in the dict, e.g.
                {"qreg_dr": {"type": DoublyRobust, "q_model_type": "qreg", "k": 5}}
            ope_split_batch_by_episode: Whether to use SampleBatch.split_by_episode() to
                split the input batch to episodes before estimating the ope metrics. In
                case of bandits you should make this False to see improvements in ope
                evaluation speed. In case of bandits, it is ok to not split by episode,
                since each record is one timestep already. The default is True.
            evaluation_num_env_runners: Number of parallel EnvRunners to use for
                evaluation. Note that this is set to zero by default, which means
                evaluation is run in the algorithm process (only if
                `evaluation_interval` is not 0 or None). If you increase this, also
                increases the Ray resource usage of the algorithm since evaluation
                workers are created separately from those EnvRunners used to sample data
                for training.
            custom_evaluation_function: Customize the evaluation method. This must be a
                function of signature (algo: Algorithm, eval_workers: EnvRunnerGroup) ->
                (metrics: dict, env_steps: int, agent_steps: int) (metrics: dict if
                `enable_env_runner_and_connector_v2=True`), where `env_steps` and
                `agent_steps` define the number of sampled steps during the evaluation
                iteration. See the Algorithm.evaluate() method to see the default
                implementation. The Algorithm guarantees all eval workers have the
                latest policy state before this function is called.
            offline_evaluation_interval: Evaluate offline with every
                `offline_evaluation_interval` training iterations. The offline evaluation
                stats are reported under the "evaluation/offline_evaluation" metric key. Set
                to None (or 0) for no offline evaluation.
            num_offline_eval_runners: Number of OfflineEvaluationRunner actors to create
                for parallel evaluation. Setting this to 0 forces sampling to be done in the
                local OfflineEvaluationRunner (main process or the Algorithm's actor when
                using Tune).
            offline_evaluation_type: Type of offline evaluation to run. Either `"eval_loss"`
                for evaluating the validation loss of the policy, `"is"` for importance
                sampling, or `"pdis"` for per-decision importance sampling. If you want to
                implement your own offline evaluation method write an `OfflineEvaluationRunner`
                and use the `AlgorithmConfig.offline_eval_runner_class`.
            offline_eval_runner_class: An `OfflineEvaluationRunner` class that implements
                custom offline evaluation logic.
            offline_loss_for_module_fn: A callable to compute the loss per `RLModule` in
                offline evaluation. If not provided the training loss function (
                `Learner.compute_loss_for_module`) is used. The signature must be (
                runner: OfflineEvaluationRunner, module_id: ModuleID, config: AlgorithmConfig,
                batch: Dict[str, Any], fwd_out: Dict[str, TensorType]).
            offline_eval_batch_size_per_runner: Evaluation batch size per individual
                OfflineEvaluationRunner worker. This setting only applies to the new API
                stack. The number of OfflineEvaluationRunner workers can be set via
                `config.evaluation(num_offline_eval_runners=...)`. The total effective batch
                size is then `num_offline_eval_runners` x
                `offline_eval_batch_size_per_runner`.
            dataset_num_iters_per_offline_eval_runner: Number of batches to evaluate in each
                OfflineEvaluationRunner during a single evaluation. If None, each learner runs a
                complete epoch over its data block (the dataset is partitioned into
                at least as many blocks as there are runners). The default is `1`.
            offline_eval_rl_module_inference_only: If `True`, the module spec is used in an
                inference-only setting (no-loss) and the RLModule can thus be built in
                its light version (if available). For example, the `inference_only`
                version of an RLModule might only contain the networks required for
                computing actions, but misses additional target- or critic networks.
                Also, if `True`, the module does NOT contain those (sub) RLModules that have
                their `learner_only` flag set to True.
            num_cpus_per_offline_eval_runner: Number of CPUs to allocate per
                OfflineEvaluationRunner.
            num_gpus_per_offline_eval_runner: Number of GPUs to allocate per
                OfflineEvaluationRunner. This can be fractional. This is usually needed only if
                your (custom) loss function itself requires a GPU (i.e., it contains GPU-
                intensive computations), or model inference is unusually expensive.
            custom_resources_per_eval_runner: Any custom Ray resources to allocate per
                OfflineEvaluationRunner.
            offline_evaluation_timeout_s: The timeout in seconds for calling `run()` on remote
                OfflineEvaluationRunner workers. Results (episode list) from workers that take
                longer than this time are discarded.
            max_requests_in_flight_per_offline_eval_runner: Max number of in-flight requests
                to each OfflineEvaluationRunner (actor)). See the
                `ray.rllib.utils.actor_manager.FaultTolerantActorManager` class for more
                details.
                Tuning these values is important when running experiments with
                large evaluation batches, where there is the risk that the object store may
                fill up, causing spilling of objects to disk. This can cause any
                asynchronous requests to become very slow, making your experiment run
                slowly as well. You can inspect the object store during your experiment
                through a call to `ray memory` on your head node, and by using the Ray
                dashboard. If you're seeing that the object store is filling up,
                turn down the number of remote requests in flight or enable compression
                or increase the object store memory through, for example:
                `ray.init(object_store_memory=10 * 1024 * 1024 * 1024)  # =10 GB`.
            broadcast_offline_eval_runner_states: True, if merged OfflineEvaluationRunner
                states (from the central connector pipelines) should be broadcast back to
                all remote OfflineEvaluationRunner actors.
            validate_offline_eval_runners_after_construction: Whether to validate that each
                created remote OfflineEvaluationRunner is healthy after its construction process.
            restart_failed_offline_eval_runners: Whether - upon an OfflineEvaluationRunner
                failure - RLlib tries to restart the lost OfflineEvaluationRunner(s) as an
                identical copy of the failed one(s). You should set this to True when training
                on SPOT instances that may preempt any time and/or if you need to evaluate always a
                complete dataset b/c OfflineEvaluationRunner(s) evaluate through streaming split
                iterators on disjoint batches. The new, recreated OfflineEvaluationRunner(s) only
                differ from the failed one in their `self.recreated_worker=True` property value
                and have the same `worker_index` as the original(s). If this setting is True, the
                value of the `ignore_offline_eval_runner_failures` setting is ignored.
            ignore_offline_eval_runner_failures: Whether to ignore any OfflineEvalautionRunner
                failures and continue running with the remaining OfflineEvaluationRunners. This
                setting is ignored, if `restart_failed_offline_eval_runners=True`.
            max_num_offline_eval_runner_restarts: The maximum number of times any
                OfflineEvaluationRunner is allowed to be restarted (if
                `restart_failed_offline_eval_runners` is True).
            offline_eval_runner_health_probe_timeout_s: Max amount of time in seconds, we should
                spend waiting for OfflineEvaluationRunner health probe calls
                (`OfflineEvaluationRunner.ping.remote()`) to respond. Health pings are very cheap,
                however, we perform the health check via a blocking `ray.get()`, so the
                default value should not be too large.
            offline_eval_runner_restore_timeout_s: Max amount of time we should wait to restore
                states on recovered OfflineEvaluationRunner actors. Default is 30 mins.

        Returns:
            This updated AlgorithmConfig object.
        z?AlgorithmConfig.evaluation(always_attach_evaluation_results=..)zThis setting is no longer needed, b/c Tune does not error anymore (only warns) when a metrics key can't be found in the results.Tr  z5AlgorithmConfig.evaluation(evaluation_num_workers=..)z9AlgorithmConfig.evaluation(evaluation_num_env_runners=..)Frg  Nr   rL   )*r   r   rW  r1   rL  rM  rN  rP  rQ  rO  rR  rS  rT  ray.rllib.algorithms.algorithmrM   r*   _allow_unknown_subkeys%_override_all_subkeys_if_type_changes_override_all_key_listrU  rX  rV  r[  r\  r]  r^  r_  ro  rp  rm  rc  rd  re  rb  rj  rn  rk  rf  rg  rh  rl  ri  )%r  rL  rM  rN  rP  rQ  rO  rR  rS  rT  rU  rV  rW  rX  r[  r\  r]  r^  r_  ro  r  rm  rc  rd  re  rb  rj  rn  rk  rf  rg  rh  rl  ri  r  r  rM   s%                                        r\   r  zAlgorithmConfig.evaluation
  s   P ,/???U     "%555KO   
 /ED+k11':D$k11':D$#;66,DD)<KOOA B =KOOA B 'k99/JD,*+==3RD07{JJ< = K// !()-&& EDDDDD)4*0b%4C4* *& );;1ND.%[88.HD+%[88.HD+%[88.HD+&k99/JD,#;66,DD)"+55+BD($K77-FD*%[88.HD+-[@@6XD34KGG9 2 1CC5 6 ,;>>4TD1+;>>4TD13;FF8 9 ({::0LD-9LL> ? 0{BB4 5 <;NN@ A /kAA3 4 /kAA3 4 0{BB4 5 6[HH: ; 1CC5 6 r^   )%r$  r%  r&  r'  r(  r)  r*  r+  r,  r-  r0  r2  r3  r4  r5  r6  r7  r8  r9  r:  r;  r<  r=  r>  r?  r@  rA  rB  rC  rD  rE  rF  rG  rH  rI  rJ  rK  r$  r%  r&  r'  r(  r)  r*  r+  r,  r-  r0  r2  r3  r4  r5  r6  r7  r8  r9  r:  r;  r<  r=  r>  r?  r@  rA  rB  rC  rD  rE  rF  rG  rH  rI  rJ  rK  c       %         H   |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |	t           ur|	| _	        |
t           ur|
| _
        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           urt-          |t.                    s t1          dt3          |           d          d}&|                    d          #t1          |&                    dd                    |                    d          M| j        r#t1          |&                    dd                    t1          |&                    dd	                    || _        |t           ur|| _        |t           ur|| _        |t           ur|| _         |t           ur|| _!        |t           ur|| _"        |t           ur|| _#        |t           ur|| _$        |t           ur|| _%        |t           ur|| _&        | t           ur| | _'        |!t           ur|!| _(        |"t           ur|"| _)        |#t           ur|#| _*        |$t           ur|$| _+        |%t           ur|%| _,        | S )
aC  Sets the config's offline data settings.

        Args:
            input_: Specify how to generate experiences:
                - "sampler": Generate experiences via online (env) simulation (default).
                - A local directory or file glob expression (e.g., "/tmp/*.json").
                - A list of individual file paths/URIs (e.g., ["/tmp/1.json",
                "s3://bucket/2.json"]).
                - A dict with string keys and sampling probabilities as values (e.g.,
                {"sampler": 0.4, "/tmp/*.json": 0.4, "s3://bucket/expert.json": 0.2}).
                - A callable that takes an `IOContext` object as only arg and returns a
                `ray.rllib.offline.InputReader`.
                - A string key that indexes a callable with
                `tune.registry.register_input`
            offline_data_class: An optional `OfflineData` class that is used to define
                the offline data pipeline, including the dataset and the sampling
                methodology. Override the `OfflineData` class and pass your derived
                class here, if you need some primer transformations specific to your
                data or your loss. Usually overriding the `OfflinePreLearner` and using
                the resulting customization via `prelearner_class` suffices for most
                cases. The default is `None` which uses the base `OfflineData` defined
                in `ray.rllib.offline.offline_data.OfflineData`.
            input_read_method: Read method for the `ray.data.Dataset` to read in the
                offline data from `input_`. The default is `read_parquet` for Parquet
                files. See https://docs.ray.io/en/latest/data/api/input_output.html for
                more info about available read methods in `ray.data`.
            input_read_method_kwargs: Keyword args for `input_read_method`. These
                are passed by RLlib into the read method without checking. Use these
                keyword args together with `map_batches_kwargs` and
                `iter_batches_kwargs` to tune the performance of the data pipeline.
                It is strongly recommended to rely on Ray Data's automatic read
                performance tuning.
            input_read_schema: Table schema for converting offline data to episodes.
                This schema maps the offline data columns to
                ray.rllib.core.columns.Columns:
                `{Columns.OBS: 'o_t', Columns.ACTIONS: 'a_t', ...}`. Columns in
                the data set that are not mapped via this schema are sorted into
                episodes' `extra_model_outputs`. If no schema is passed in the default
                schema used is `ray.rllib.offline.offline_data.SCHEMA`. If your data set
                contains already the names in this schema, no `input_read_schema` is
                needed. The same applies if the data is in RLlib's `EpisodeType` or its
                old `SampleBatch` format.
            input_read_episodes: Whether offline data is already stored in RLlib's
                `EpisodeType` format, i.e. `ray.rllib.env.SingleAgentEpisode` (multi
                -agent is planned but not supported, yet). Reading episodes directly
                avoids additional transform steps and is usually faster and
                therefore the recommended format when your application remains fully
                inside of RLlib's schema. The other format is a columnar format and is
                agnostic to the RL framework used. Use the latter format, if you are
                unsure when to use the data or in which RL framework. The default is
                to read column data, for example, `False`. `input_read_episodes`, and
                `input_read_sample_batches` can't be `True` at the same time. See
                also `output_write_episodes` to define the output data format when
                recording.
            input_read_sample_batches: Whether offline data is stored in RLlib's old
                stack `SampleBatch` type. This is usually the case for older data
                recorded with RLlib in JSON line format. Reading in `SampleBatch`
                data needs extra transforms and might not concatenate episode chunks
                contained in different `SampleBatch`es in the data. If possible avoid
                to read `SampleBatch`es and convert them in a controlled form into
                RLlib's `EpisodeType` (i.e. `SingleAgentEpisode`). The default is
                `False`. `input_read_episodes`, and `input_read_sample_batches` can't
                be `True` at the same time.
            input_read_batch_size: Batch size to pull from the data set. This could
                differ from the `train_batch_size_per_learner`, if a dataset holds
                `EpisodeType` (i.e., `SingleAgentEpisode`) or `SampleBatch`, or any
                other data type that contains multiple timesteps in a single row of
                the dataset. In such cases a single batch of size
                `train_batch_size_per_learner` will potentially pull a multiple of
                `train_batch_size_per_learner` timesteps from the offline dataset. The
                default is `None` in which the `train_batch_size_per_learner` is pulled.
            input_filesystem: A cloud filesystem to handle access to cloud storage when
                reading experiences. Can be either "gcs" for Google Cloud Storage,
                "s3" for AWS S3 buckets, "abs" for Azure Blob Storage, or any
                filesystem supported by PyArrow. In general the file path is sufficient
                for accessing data from public or local storage systems. See
                https://arrow.apache.org/docs/python/filesystems.html for details.
            input_filesystem_kwargs: A dictionary holding the kwargs for the filesystem
                given by `input_filesystem`. See `gcsfs.GCSFilesystem` for GCS,
                `pyarrow.fs.S3FileSystem`, for S3, and `ablfs.AzureBlobFilesystem` for
                ABS filesystem arguments.
            input_compress_columns: What input columns are compressed with LZ4 in the
                input data. If data is stored in RLlib's `SingleAgentEpisode` (
                `MultiAgentEpisode` not supported, yet). Note the providing
                `rllib.core.columns.Columns.OBS` also tries to decompress
                `rllib.core.columns.Columns.NEXT_OBS`.
            materialize_data: Whether the raw data should be materialized in memory.
                This boosts performance, but requires enough memory to avoid an OOM, so
                make sure that your cluster has the resources available. For very large
                data you might want to switch to streaming mode by setting this to
                `False` (default). If your algorithm does not need the RLModule in the
                Learner connector pipeline or all (learner) connectors are stateless
                you should consider setting `materialize_mapped_data` to `True`
                instead (and set `materialize_data` to `False`). If your data does not
                fit into memory and your Learner connector pipeline requires an RLModule
                or is stateful, set both `materialize_data` and
                `materialize_mapped_data` to `False`.
            materialize_mapped_data: Whether the data should be materialized after
                running it through the Learner connector pipeline (i.e. after running
                the `OfflinePreLearner`). This improves performance, but should only be
                used in case the (learner) connector pipeline does not require an
                RLModule and the (learner) connector pipeline is stateless. For example,
                MARWIL's Learner connector pipeline requires the RLModule for value
                function predictions and training batches would become stale after some
                iterations causing learning degradation or divergence. Also ensure that
                your cluster has enough memory available to avoid an OOM. If set to
                `True` (True), make sure that `materialize_data` is set to `False` to
                avoid materialization of two datasets. If your data does not fit into
                memory and your Learner connector pipeline requires an RLModule or is
                stateful, set both `materialize_data` and `materialize_mapped_data` to
                `False`.
            map_batches_kwargs: Keyword args for the `map_batches` method. These are
                passed into the `ray.data.Dataset.map_batches` method when sampling
                without checking. If no arguments passed in the default arguments
                `{'concurrency': max(2, num_learners), 'zero_copy_batch': True}` is
                used. Use these keyword args together with `input_read_method_kwargs`
                and `iter_batches_kwargs` to tune the performance of the data pipeline.
            iter_batches_kwargs: Keyword args for the `iter_batches` method. These are
                passed into the `ray.data.Dataset.iter_batches` method when sampling
                without checking. If no arguments are passed in, the default argument
                `{'prefetch_batches': 2}` is used. Use these keyword args
                together with `input_read_method_kwargs` and `map_batches_kwargs` to
                tune the performance of the data pipeline.
            ignore_final_observation: If the final observation in an episode chunk should
                be ignored. This concerns mainly column-based data and instead of using a
                user-provided `NEXT_OBS` sets final observations to zero. This should be
                used with BC only, as in true Offline RL algorithms the final observation
                is important.
            prelearner_class: An optional `OfflinePreLearner` class that is used to
                transform data batches in `ray.data.map_batches` used in the
                `OfflineData` class to transform data from columns to batches that can
                be used in the `Learner.update...()` methods. Override the
                `OfflinePreLearner` class and pass your derived class in here, if you
                need to make some further transformations specific for your data or
                loss. The default is None which uses the base `OfflinePreLearner`
                defined in `ray.rllib.offline.offline_prelearner`.
            prelearner_buffer_class: An optional `EpisodeReplayBuffer` class that RLlib
                uses to buffer experiences when data is in `EpisodeType` or
                RLlib's previous `SampleBatch` type format. In this case, a single
                data row may contain multiple timesteps and the buffer serves two
                purposes: (a) to store intermediate data in memory, and (b) to ensure
                that RLlib samples exactly `train_batch_size_per_learner` experiences
                per batch. The default is RLlib's `EpisodeReplayBuffer`.
            prelearner_buffer_kwargs: Optional keyword arguments for initializing the
                `EpisodeReplayBuffer`. In most cases this value is simply the `capacity`
                for the default buffer that RLlib uses (`EpisodeReplayBuffer`), but it
                may differ if the `prelearner_buffer_class` uses a custom buffer.
            prelearner_module_synch_period: The period (number of batches converted)
                after which the `RLModule` held by the `PreLearner` should sync weights.
                The `PreLearner` is used to preprocess batches for the learners. The
                higher this value, the more off-policy the `PreLearner`'s module is.
                Values too small force the `PreLearner` to sync more frequently
                and thus might slow down the data pipeline. The default value chosen
                by the `OfflinePreLearner` is 10.
            dataset_num_iters_per_learner: Number of updates to run in each learner
                during a single training iteration. If None, each learner runs a
                complete epoch over its data block (the dataset is partitioned into
                at least as many blocks as there are learners). The default is `None`.
                This value must be set to `1`, if RLlib uses a single (local) learner.
            input_config: Arguments that describe the settings for reading the input.
                If input is "sample", this is the environment configuration, e.g.
                `env_name` and `env_config`, etc. See `EnvContext` for more info.
                If the input is "dataset", this contains e.g. `format`, `path`.
            actions_in_input_normalized: True, if the actions in a given offline "input"
                are already normalized (between -1.0 and 1.0). This is usually the case
                when the offline file has been generated by another RLlib algorithm
                (e.g. PPO or SAC), while "normalize_actions" was set to True.
            postprocess_inputs: Whether to run postprocess_trajectory() on the
                trajectory fragments from offline inputs. Note that postprocessing is
                done using the *current* policy, not the *behavior* policy, which
                is typically undesirable for on-policy algorithms.
            shuffle_buffer_size: If positive, input batches are shuffled via a
                sliding window buffer of this number of batches. Use this if the input
                data is not in random enough order. Input is delayed until the shuffle
                buffer is filled.
            output: Specify where experiences should be saved:
                 - None: don't save any experiences
                 - "logdir" to save to the agent log dir
                 - a path/URI to save to a custom output directory (e.g., "s3://bckt/")
                 - a function that returns a rllib.offline.OutputWriter
            output_config: Arguments accessible from the IOContext for configuring
                custom output.
            output_compress_columns: What sample batch columns to LZ4 compress in the
                output data. Note that providing `rllib.core.columns.Columns.OBS` also
                compresses `rllib.core.columns.Columns.NEXT_OBS`.
            output_max_file_size: Max output file size (in bytes) before rolling over
                to a new file.
            output_max_rows_per_file: Max output row numbers before rolling over to a
                new file.
            output_write_remaining_data: Determines whether any remaining data in the
                recording buffers should be stored to disk. It is only applicable if
                `output_max_rows_per_file` is defined. When sampling data, it is
                buffered until the threshold specified by `output_max_rows_per_file`
                is reached. Only complete multiples of `output_max_rows_per_file` are
                written to disk, while any leftover data remains in the buffers. If a
                recording session is stopped, residual data may still reside in these
                buffers. Setting `output_write_remaining_data` to `True` ensures this
                data is flushed to disk. By default, this attribute is set to `False`.
            output_write_method: Write method for the `ray.data.Dataset` to write the
                offline data to `output`. The default is `read_parquet` for Parquet
                files. See https://docs.ray.io/en/latest/data/api/input_output.html for
                more info about available read methods in `ray.data`.
            output_write_method_kwargs: `kwargs` for the `output_write_method`. These
                are passed into the write method without checking.
            output_filesystem: A cloud filesystem to handle access to cloud storage when
                writing experiences. Should be either "gcs" for Google Cloud Storage,
                "s3" for AWS S3 buckets, or "abs" for Azure Blob Storage.
            output_filesystem_kwargs: A dictionary holding the kwargs for the filesystem
                given by `output_filesystem`. See `gcsfs.GCSFilesystem` for GCS,
                `pyarrow.fs.S3FileSystem`, for S3, and `ablfs.AzureBlobFilesystem` for
                ABS filesystem arguments.
            output_write_episodes: If RLlib should record data in its RLlib's
                `EpisodeType` format (that is, `SingleAgentEpisode` objects). Use this
                format, if you need RLlib to order data in time and directly group by
                episodes for example to train stateful modules or if you plan to use
                recordings exclusively in RLlib. Otherwise RLlib records data in tabular
                (columnar) format. Default is `True`.
            offline_sampling: Whether sampling for the Algorithm happens via
                reading from offline data. If True, EnvRunners don't limit the number
                of collected batches within the same `sample()` call based on
                the number of sub-environments within the worker (no sub-environments
                present).

        Returns:
            This updated AlgorithmConfig object.
        z!input_config must be a dict, got r  z@{} should not be set in the input_config. RLlib uses {} instead.num_cpus_per_read_taskNz.config.env_runners(num_cpus_per_env_runner=..)parallelismz0config.evaluation(evaluation_num_env_runners=..)z&config.env_runners(num_env_runners=..))-r1   r$  r%  r&  r'  r(  r)  r*  r+  r,  r-  r0  r2  r3  r4  r5  r6  r7  r8  r9  r:  r;  rY   r  rZ   r[   r  formatrY  r<  r=  r>  r?  r@  rA  rB  rC  rD  rE  rF  rG  rH  rI  rJ  rK  )'r  r$  r%  r&  r'  r(  r)  r*  r+  r,  r-  r0  r2  r3  r4  r5  r6  r7  r8  r9  r:  r;  r<  r=  r>  r?  r@  rA  rB  rC  rD  rE  rF  rG  rH  rI  rJ  rK  msgs'                                          r\   offline_datazAlgorithmConfig.offline_data   s   V $$ DK[00&8D#K//%6D"#;66,DD)K//%6D"k11':D$$K77-FD* 33)>D&;..$4D!"+55+BD(!44*@D';..$4D!"+55+BD([00&8D#k11':D$#;66,DD);..$4D!"+55+BD(#;66,DD))<<2PD/(;;1ND.{**lD11  M\8J8JMMM   UC 899E JJ0H    ..:% $

)N    %

)+S   
 !-D&k99/JD,[00&8D#k11':D$$$ DK++!.D"+55+BD({22(<D%#;66,DD)&k99/JD,k11':D$%[88.HD+K//%6D"#;66,DD) 33)>D&;..$4D!r^   )
r  r  r   r!  r"  r#  r  r  r  r  r  r  r   rU   r!  r"  r#  r  c       
   	         |t           ur4|D ]}t          |d           t          |t          t          t
          f          rd |D             }t          |t                    r|                                D ]\  }}t          |t                    sEt          |t
          t          f          rt          |          dk    rt          d| d|           _t          |j        t          t          f          s/|j        (t          d| d	t          |j                   d
          || _        nt          d          |t          k    r't!          dd           |                     |           |t           ur|| _        |t           ur+t          |t                    rt'          |          }|| _        |t           ur|| _        |
t          k    rt!          dd           |	t          k    rt!          ddd           |t           ur|dvrt          d| d
          || _        |t           urt          |t
          t          t          f          st/          |          s|
J d            t          |t
          t          t          f          r-t          |          dk    rt0                              d           || _        |t           ur|| _        | S )a  Sets the config's multi-agent settings.

        Validates the new multi-agent settings and translates everything into
        a unified multi-agent setup format. For example a `policies` list or set
        of IDs is properly converted into a dict mapping these IDs to PolicySpecs.

        Args:
            policies: Map of type MultiAgentPolicyConfigDict from policy ids to either
                4-tuples of (policy_cls, obs_space, act_space, config) or PolicySpecs.
                These tuples or PolicySpecs define the class of the policy, the
                observation- and action spaces of the policies, and any extra config.
            policy_map_capacity: Keep this many policies in the "policy_map" (before
                writing least-recently used ones to disk/S3).
            policy_mapping_fn: Function mapping agent ids to policy ids. The signature
                is: `(agent_id, episode, worker, **kwargs) -> PolicyID`.
            policies_to_train: Determines those policies that should be updated.
                Options are:
                - None, for training all policies.
                - An iterable of PolicyIDs that should be trained.
                - A callable, taking a PolicyID and a SampleBatch or MultiAgentBatch
                and returning a bool (indicating whether the given policy is trainable
                or not, given the particular batch). This allows you to have a policy
                trained only on certain data (e.g. when playing against a certain
                opponent).
            policy_states_are_swappable: Whether all Policy objects in this map can be
                "swapped out" via a simple `state = A.get_state(); B.set_state(state)`,
                where `A` and `B` are policy instances in this map. You should set
                this to True for significantly speeding up the PolicyMap's cache lookup
                times, iff your policies all share the same neural network
                architecture and optimizer types. If True, the PolicyMap doesn't
                have to garbage collect old, least recently used policies, but instead
                keeps them in memory and simply override their state with the state of
                the most recently accessed one.
                For example, in a league-based training setup, you might have 100s of
                the same policies in your map (playing against each other in various
                combinations), but all of them share the same state structure
                (are "swappable").
            observation_fn: Optional function that can be used to enhance the local
                agent observations to include more state. See
                rllib/evaluation/observation_function.py for more info.
            count_steps_by: Which metric to use as the "batch size" when building a
                MultiAgentBatch. The two supported values are:
                "env_steps": Count each time the env is "stepped" (no matter how many
                multi-agent actions are passed/how many multi-agent observations
                have been returned in the previous step).
                "agent_steps": Count each individual agent step as one step.

        Returns:
            This updated AlgorithmConfig object.
        Tr  c                 ,    i | ]}|t                      S r  r'   )r  ps     r\   r  z/AlgorithmConfig.multi_agent.<locals>.<dictcomp>  s    >>>Az||>>>r^      zYPolicy specs must be tuples/lists of (cls or None, obs_space, action_space, config), got z for PolicyID=NzMulti-agent policy config for z3 must be a dict or AlgorithmConfig object, but got rt   zX`policies` must be dict mapping PolicyID to PolicySpec OR a set/tuple/list of PolicyIDs! Frp  )r  z0AlgorithmConfig.multi_agent(policy_map_cache=..)z+AlgorithmConfig.multi_agent(replay_mode=..)zBAlgorithmConfig.training(replay_buffer_config={'replay_mode': ..})rg  )r   agent_stepszRconfig.multi_agent(count_steps_by=..) must be one of [env_steps|agent_steps], not zERROR: `policies_to_train` must be a [list|set|tuple] or a callable taking PolicyID and SampleBatch and returning True|False (trainable or not?) or None (for always training all policies).r   z`config.multi_agent(policies_to_train=..)` is empty! Make sure - if you would like to learn at least one policy - to add its ID to that list.)r1   r   rY   r2  r*  r)  r  rw   r'   lenrZ   r  r`   r[   r  r   r   	rl_moduler  r2   r   r#  r  r  r(  rn  r!  r"  )r  r  r  r   r!  r"  r#  r  r  r  r  pidspecs                r\   r  zAlgorithmConfig.multi_agent  s   R ;&&   4 4"3d33333 (S%$677 ?>>X>>>(D)) !)!1!1  IC%dJ77 )$u>> #d))q..",!A'+!A !A;>!A !A# #  CQ 't{_d4KLL K3(TS T T?CDK?P?PT T T   !) 3  
 14DDDBe4444NN9     k11':D$K// +T22 C$/0A$B$B!%6D",,"0D///F   
 ***A<	    ,,%AAA F4BF F F   #1DK//,tS%.@AA	-..	 %,, -,- +dC-?@@ ())Q..NN6  
 &7D"&k99/JD,r^   )rq  rr  rs  rt  ru  rv  rw  custom_stats_cls_lookuprq  rr  rs  rt  ru  rv  rw  r  c                   |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        | S )a  Sets the config's reporting settings.

        Args:
            keep_per_episode_custom_metrics: Store raw custom metrics without
                calculating max, min, mean
            metrics_episode_collection_timeout_s: Wait for metric batches for at most
                this many seconds. Those that have not returned in time are collected
                in the next train iteration.
            metrics_num_episodes_for_smoothing: Smooth rollout metrics over this many
                episodes, if possible.
                In case rollouts (sample collection) just started, there may be fewer
                than this many episodes in the buffer and we'll compute metrics
                over this smaller number of available episodes.
                In case there are more than this many episodes collected in a single
                training iteration, use all of these episodes for metrics computation,
                meaning don't ever cut any "excess" episodes.
                Set this to 1 to disable smoothing and to always report only the most
                recently collected episode's return.
            min_time_s_per_iteration: Minimum time (in sec) to accumulate within a
                single `Algorithm.train()` call. This value does not affect learning,
                only the number of times `Algorithm.training_step()` is called by
                `Algorithm.train()`. If - after one such step attempt, the time taken
                has not reached `min_time_s_per_iteration`, performs n more
                `Algorithm.training_step()` calls until the minimum time has been
                consumed. Set to 0 or None for no minimum time.
            min_train_timesteps_per_iteration: Minimum training timesteps to accumulate
                within a single `train()` call. This value does not affect learning,
                only the number of times `Algorithm.training_step()` is called by
                `Algorithm.train()`. If - after one such step attempt, the training
                timestep count has not been reached, performs n more
                `training_step()` calls until the minimum timesteps have been
                executed. Set to 0 or None for no minimum timesteps.
            min_sample_timesteps_per_iteration: Minimum env sampling timesteps to
                accumulate within a single `train()` call. This value does not affect
                learning, only the number of times `Algorithm.training_step()` is
                called by `Algorithm.train()`. If - after one such step attempt, the env
                sampling timestep count has not been reached, performs n more
                `training_step()` calls until the minimum timesteps have been
                executed. Set to 0 or None for no minimum timesteps.
            log_gradients: Log gradients to results. If this is `True` the global norm
                of the gradients dictionary for each optimizer is logged to results.
                The default is `False`.
            custom_stats_cls_lookup: A dictionary mapping stat names to their corresponding Stats classes.
                The Stats classes should be subclasses of :py:class:`~ray.rllib.utils.metrics.stats.StatsBase`.
                The keys of the dictionary are the stat names, and the values are the corresponding Stats classes.
                This allows you to use your own Stats classes for logging metrics.
                You can replace existing values to override some behaviour of RLlib.
                You can add key-value-pairs to the dictionary to add new stats classes that will be available
                when logging values with the MetricsLogger throughout RLlib.

        Returns:
            This updated AlgorithmConfig object.
        )	r1   rq  rr  rs  rt  ru  rv  rw  rx  )	r  rq  rr  rs  rt  ru  rv  rw  r  s	            r\   	reportingzAlgorithmConfig.reportingJ  s    B ++==3RD0/{BB4 5 .[@@6XD3#;66,DD),K??5VD2-[@@6XD3++!.D"+55$;D!r^   ry  rz  c                 F    |t           ur|| _        |t           ur|| _        | S )a,  Sets the config's checkpointing settings.

        Args:
            export_native_model_files: Whether an individual Policy-
                or the Algorithm's checkpoints also contain (tf or torch) native
                model files. These could be used to restore just the NN models
                from these files w/o requiring RLlib. These files are generated
                by calling the tf- or torch- built-in saving utility methods on
                the actual models.
            checkpoint_trainable_policies_only: Whether to only add Policies to the
                Algorithm checkpoint (in sub-directory "policies/") that are trainable
                according to the `is_trainable_policy` callable of the local worker.

        Returns:
            This updated AlgorithmConfig object.
        )r1   ry  rz  )r  ry  rz  s      r\   checkpointingzAlgorithmConfig.checkpointing  s/    , %K77-FD*-[@@6XD3r^   )r{  r|  r}  r~  r  r  r|  r}  r~  r  r  c                    |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        | S )a  Sets the config's debugging settings.

        Args:
            logger_creator: Callable that creates a ray.tune.Logger
                object. If unspecified, a default logger is created.
            logger_config: Define logger-specific configuration to be used inside Logger
                Default value None allows overwriting with nested dicts.
            log_level: Set the ray.rllib.* log level for the agent process and its
                workers. Should be one of DEBUG, INFO, WARN, or ERROR. The DEBUG level
                also periodically prints out summaries of relevant internal dataflow
                (this is also printed out once at startup at the INFO level).
            log_sys_usage: Log system resource metrics to results. This requires
                `psutil` to be installed for sys stats, and `gputil` for GPU metrics.
            fake_sampler: Use fake (infinite speed) sampler. For testing only.
            seed: This argument, in conjunction with worker_index, sets the random
                seed of each worker, so that identically configured trials have
                identical results. This makes experiments reproducible.

        Returns:
            This updated AlgorithmConfig object.
        )r1   r{  r|  r}  r~  r  r  )r  r{  r|  r}  r~  r  r  s          r\   	debuggingzAlgorithmConfig.debugging  sy    > ,,"0D++!.DK''&DN++!.D{** ,D{""DIr^   )r  r  r  r  r  r  r  r  recreate_failed_env_runnersrv  rw  max_num_worker_restartsdelay_between_worker_restarts_srx  ry  rz  r  r  r  r  r  r  r  r  c                   |	t           k    rt          ddd           |
t           k    rt          ddd           |t           k    rt          ddd           |t           k    rt          dd	d           |t           k    rt          d
dd           |t           k    rt          ddd           |t           k    rt          ddd           |t           k    rt          ddd           |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _	        |t          ur|| _
        | S )a  Sets the config's fault tolerance settings.

        Args:
            restart_failed_env_runners: Whether - upon an EnvRunner failure - RLlib
                tries to restart the lost EnvRunner(s) as an identical copy of the
                failed one(s). You should set this to True when training on SPOT
                instances that may preempt any time. The new, recreated EnvRunner(s)
                only differ from the failed one in their `self.recreated_worker=True`
                property value and have the same `worker_index` as the original(s).
                If this setting is True, the value of the `ignore_env_runner_failures`
                setting is ignored.
            ignore_env_runner_failures: Whether to ignore any EnvRunner failures
                and continue running with the remaining EnvRunners. This setting is
                ignored, if `restart_failed_env_runners=True`.
            max_num_env_runner_restarts: The maximum number of times any EnvRunner
                is allowed to be restarted (if `restart_failed_env_runners` is True).
            delay_between_env_runner_restarts_s: The delay (in seconds) between two
                consecutive EnvRunner restarts (if `restart_failed_env_runners` is
                True).
            restart_failed_sub_environments: If True and any sub-environment (within
                a vectorized env) throws any error during env stepping, the
                EnvRunner tries to restart the faulty sub-environment. This is done
                without disturbing the other (still intact) sub-environment and without
                the EnvRunner crashing. You can raise
                `ray.rllib.env.env_runner.StepFailedRecreateEnvError` from your
                environment's `step` method to not log the error.
            num_consecutive_env_runner_failures_tolerance: The number of consecutive
                times an EnvRunner failure (also for evaluation) is tolerated before
                finally crashing the Algorithm. Only useful if either
                `ignore_env_runner_failures` or `restart_failed_env_runners` is True.
                Note that for `restart_failed_sub_environments` and sub-environment
                failures, the EnvRunner itself is NOT affected and won't throw any
                errors as the flawed sub-environment is silently restarted under the
                hood.
            env_runner_health_probe_timeout_s: Max amount of time in seconds, we should
                spend waiting for EnvRunner health probe calls
                (`EnvRunner.ping.remote()`) to respond. Health pings are very cheap,
                however, we perform the health check via a blocking `ray.get()`, so the
                default value should not be too large.
            env_runner_restore_timeout_s: Max amount of time we should wait to restore
                states on recovered EnvRunner actors. Default is 30 mins.

        Returns:
            This updated AlgorithmConfig object.
        z<AlgorithmConfig.fault_tolerance(recreate_failed_env_runners)z;AlgorithmConfig.fault_tolerance(restart_failed_env_runners)Trg  z7AlgorithmConfig.fault_tolerance(ignore_worker_failures)z;AlgorithmConfig.fault_tolerance(ignore_env_runner_failures)z8AlgorithmConfig.fault_tolerance(recreate_failed_workers)z8AlgorithmConfig.fault_tolerance(max_num_worker_restarts)z<AlgorithmConfig.fault_tolerance(max_num_env_runner_restarts)z@AlgorithmConfig.fault_tolerance(delay_between_worker_restarts_s)zDAlgorithmConfig.fault_tolerance(delay_between_env_runner_restarts_s)zJAlgorithmConfig.fault_tolerance(num_consecutive_worker_failures_tolerance)zNAlgorithmConfig.fault_tolerance(num_consecutive_env_runner_failures_tolerance)z>AlgorithmConfig.fault_tolerance(worker_health_probe_timeout_s)zBAlgorithmConfig.fault_tolerance(env_runner_health_probe_timeout_s)z9AlgorithmConfig.fault_tolerance(worker_restore_timeout_s)z=AlgorithmConfig.fault_tolerance(env_runner_restore_timeout_s))r   r   r1   r  r  r  r  r  r  r  r  )r  r  r  r  r  r  r  r  r  r  rv  rw  r  r  rx  ry  rz  s                    r\   fault_tolerancezAlgorithmConfig.fault_tolerance  s$   D '*:::RQ   
 "%555MQ   
 #&666NQ   
 #&666NR   
 +.>>>V	    58HHH&&    ),<<<T5	    $'777OS    &[88.HD+%[88.HD+&k99/JD,.kAA3 4 ++==3RD08KK= > -K??5VD2'{::0LD-r^   )model_configr1  r  model_config_dictr  r  r  c                T   |t           k    rt          ddd           |t           k    rt          ddd           |}|t          ur|| _        |t          ur|| _        |t          urBt          |t                    st          d| d	          | j        	                    |           | S )
a  Sets the config's RLModule settings.

        Args:
            model_config: The DefaultModelConfig object (or a config dictionary) passed
                as `model_config` arg into each RLModule's constructor. This is used
                for all RLModules, if not otherwise specified through `rl_module_spec`.
            rl_module_spec: The RLModule spec to use for this config. It can be either
                a RLModuleSpec or a MultiRLModuleSpec. If the
                observation_space, action_space, catalog_class, or the model config is
                not specified it is inferred from the env and other parts of the
                algorithm config object.
            algorithm_config_overrides_per_module: Only used if
                `enable_rl_module_and_learner=True`.
                A mapping from ModuleIDs to per-module AlgorithmConfig override dicts,
                which apply certain settings,
                e.g. the learning rate, from the main AlgorithmConfig only to this
                particular module (within a MultiRLModule).
                You can create override dicts by using the `AlgorithmConfig.overrides`
                utility. For example, to override your learning rate and (PPO) lambda
                setting just for a single RLModule with your MultiRLModule, do:
                config.multi_agent(algorithm_config_overrides_per_module={
                "module_1": PPOConfig.overrides(lr=0.0002, lambda_=0.75),
                })

        Returns:
            This updated AlgorithmConfig object.
        z3AlgorithmConfig.rl_module(_enable_rl_module_api=..)z:AlgorithmConfig.api_stack(enable_rl_module_and_learner=..)Trg  z/AlgorithmConfig.rl_module(model_config_dict=..)z*AlgorithmConfig.rl_module(model_config=..)Fzq`algorithm_config_overrides_per_module` must be a dict mapping module IDs to config override dicts! You provided r  )
r   r   r1   r  r  rY   r  rZ   r  r  )r  r  r1  r  r  r  s         r\   r  zAlgorithmConfig.rl_moduley  s    N !$444IP   
  000E@   
 -L{**!-D,,#1D 0CCCTJJ  @<@ @ @  
 6==5   r^   )r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  c                   |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        |t           ur|| _        | S )a  Sets the config's experimental settings.

        Args:
            _validate_config: Whether to run `validate()` on this config. True by
                default. If False, ignores any calls to `self.validate()`.
            _use_msgpack_checkpoints: Create state files in all checkpoints through
                msgpack rather than pickle.
            _torch_grad_scaler_class: Class to use for torch loss scaling (and gradient
                unscaling). The class must implement the following methods to be
                compatible with a `TorchLearner`. These methods/APIs match exactly those
                of torch's own `torch.amp.GradScaler` (see here for more details
                https://pytorch.org/docs/stable/amp.html#gradient-scaling):
                `scale([loss])` to scale the loss by some factor.
                `get_scale()` to get the current scale factor value.
                `step([optimizer])` to unscale the grads (divide by the scale factor)
                and step the given optimizer.
                `update()` to update the scaler after an optimizer step (for example to
                adjust the scale factor).
            _torch_lr_scheduler_classes: A list of `torch.lr_scheduler.LRScheduler`
                (see here for more details
                https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate)
                classes or a dictionary mapping module IDs to such a list of respective
                scheduler classes. Multiple scheduler classes can be applied in sequence
                and are stepped in the same sequence as defined here. Note, most
                learning rate schedulers need arguments to be configured, that is, you
                might have to partially initialize the schedulers in the list(s) using
                `functools.partial`.
            _tf_policy_handles_more_than_one_loss: Experimental flag.
                If True, TFPolicy handles more than one loss or optimizer.
                Set this to True, if you would like to return more than
                one loss term from your `loss_fn` and an equal number of optimizers
                from your `optimizer_fn`.
            _disable_preprocessor_api: Experimental flag.
                If True, no (observation) preprocessor is created and
                observations arrive in model as they are returned by the env.
            _disable_action_flattening: Experimental flag.
                If True, RLlib doesn't flatten the policy-computed actions into
                a single tensor (for storage in SampleCollectors/output files/etc..),
                but leave (possibly nested) actions as-is. Disabling flattening affects:
                - SampleCollectors: Have to store possibly nested action structs.
                - Models that have the previous action(s) as part of their input.
                - Algorithms reading from offline files (incl. action information).

        Returns:
            This updated AlgorithmConfig object.
        )	r1   r  r  r  r  r  r  r  r  )	r  r  r  r  r  r  r  r  r  s	            r\   experimentalzAlgorithmConfig.experimental  s    x ;..$4D!#;66,DD)0CC5 6 %K77-FD*%[88.HD+4KGG9 : $;66,DD)&k99/JD,r^   c                    | j         {t          | j                  t          urdS 	 t	          j        | j                  }n# t          j        j        $ r Y dS w xY wt          |          | _         |	                                 | j         S )z)True if if specified env is an Atari env.NF)
r   r[   r   r  r-  maker  Errorr    close)r  r   s     r\   r    zAlgorithmConfig.is_atari  s    
 >! DH~~S((uhtx((9?   uu &c]]DNIIKKK~s   A   AAc                 N    t          | j                  dk    pt          | j        vS )zReturns whether this config specifies a multi-agent setup.

        Returns:
            True, if a) >1 policies defined OR b) 1 policy defined, but its ID is NOT
            DEFAULT_POLICY_ID.
        r   )r  r  r)   r  s    r\   r,  zAlgorithmConfig.is_multi_agent+  s&     4=!!A%O):$-)OOr^   c                 8    | j         p|                                 S )ai  Returns the Learner sub-class to use by this Algorithm.

        Either
        a) User sets a specific learner class via calling `.training(learner_class=...)`
        b) User leaves learner class unset (None) and the AlgorithmConfig itself
        figures out the actual learner class by calling its own
        `.get_default_learner_class()` method.
        )r	  get_default_learner_classr  s    r\   rV  zAlgorithmConfig.learner_class5  s     "Fd&D&D&F&FFr^   c                     | j         t          | j        t                    r| j        nt	          j        | j                  z  S )a  Defines the model configuration used.

        This method combines the auto configuration `self _model_config_auto_includes`
        defined by an algorithm with the user-defined configuration in
        `self._model_config`.This configuration dictionary is used to
        configure the `RLModule` in the new stack and the `ModelV2` in the old
        stack.

        Returns:
            A dictionary with the model configuration.
        )_model_config_auto_includesrY   r  r  dataclassesasdictr  s    r\   r  zAlgorithmConfig.model_configA  sB     /$,d338D#D$677
 	
r^   c                    |                                  }t          |           | j        t          | j                   t          | j        t                    rWt          |t                    r|                    | j                   |S t          |t                    rt          d          d S t          j	        | j                  }|                    |           |S |S )Nz1Cannot merge MultiRLModuleSpec with RLModuleSpec!)
get_default_rl_module_specr]   r  rY   r   r  r   rZ   r  r  )r  default_rl_module_specmulti_rl_module_specs      r\   r1  zAlgorithmConfig.rl_module_specT  s    !%!@!@!B!B4555 +!$"6777 $.== ,4lCC *11$2FGGG11 68IJJ $K   
 (,}T5I'J'J$$++,BCCC++ *)r^   c                 @    | j         | j        | j        pdz  S | j         S )Nr   )r   r   r   r  s    r\   r  z,AlgorithmConfig.train_batch_size_per_learnerp  s,     -5(T->-C!DD11r^   r   c                     || _         d S rb   )r   r  r   s     r\   r  z,AlgorithmConfig.train_batch_size_per_learnerw  s    -2***r^   c                 $    | j         | j        pdz  S )a  Returns the effective total train batch size.

        New API stack: `train_batch_size_per_learner` * [effective num Learners].

        @OldAPIStack: User never touches `train_batch_size_per_learner` or
        `num_learners`) -> `train_batch_size`.
        r   )r  r   r  s    r\   total_train_batch_sizez&AlgorithmConfig.total_train_batch_size{  s     0D4E4JKKr^   r   worker_indexc                 R   | j         dk    r| j        | j        | j        pdz  z  }t	          |          |k    r[| j        t	          |          | j        z  | j        pdz  z
  }|dz
  | j        z  |k    rt	          |          S t	          |          dz   S t	          |          S | j         S )a  Automatically infers a proper rollout_fragment_length setting if "auto".

        Uses the simple formula:
        `rollout_fragment_length` = `total_train_batch_size` /
        (`num_envs_per_env_runner` * `num_env_runners`)

        If result is a fraction AND `worker_index` is provided, makes
        those workers add additional timesteps, such that the overall batch size (across
        the workers) adds up to exactly the `total_train_batch_size`.

        Returns:
            The user-provided `rollout_fragment_length` or a computed one (if user
            provided value is "auto"), making sure `total_train_batch_size` is reached
            exactly in each iteration.
        r   r   )r   r  r   r   r  )r  r  r   diffs       r\   get_rollout_fragment_lengthz+AlgorithmConfig.get_rollout_fragment_length  s      '611 '+&A,0D0IJ'# *++/FFF2S+6 6061484H4MA6O O "A%)EE$NN6777677!;;./////r^   c                    | j         r| j        J dS | j        }t          |t                    r|                    d          }n-|                     d          }|                    |pi            d|_         d|_        | j        |_        | j        dk    rd|_	        d|_
        nId|_	        | j        d	k    rd
n/t          t          j        | j        | j        pdz                      |_
        |S )a  Creates a full AlgorithmConfig object from `self.evaluation_config`.

        Returns:
            A fully valid AlgorithmConfig object that can be used for the evaluation
            EnvRunnerGroup. If `self` is already an evaluation config object, return
            None.
        NFr  Tr   r}  r   r   r   r   )rY  rT  rY   r`   r  ro   rW  r   rN  r   r   rM  r  mathceil)r  rT  eval_config_objs      r\   get_evaluation_config_objectz,AlgorithmConfig.get_evaluation_config_object  s-     	)1114 2'99 	F/444GGOO #iiEi::O,,->-D"EEE )-%,0) +/*I' (J66)<O&67O33 *=O& +v55 I0:?aA   3 r^   c                    | j         dk    r| j        st          | j        d          | j        z  | j         z  }|}|| j        k     r||z  }|| j        k     || j        z
  d| j        z  k    s||z
  | j        z
  d| j        z  k    rh| j        | j        | j        pdz  z  }|                     d| j         d| j         d| j         d| j         d| j         d	| j          d
| d           dS dS dS dS )a  Detects mismatches for `train_batch_size` vs `rollout_fragment_length`.

        Only applicable for algorithms, whose train_batch_size should be directly
        dependent on rollout_fragment_length (synchronous sampling, on-policy PG algos).

        If rollout_fragment_length != "auto", makes sure that the product of
        `rollout_fragment_length` x `num_env_runners` x `num_envs_per_env_runner`
        roughly (10%) matches the provided `train_batch_size`. Otherwise, errors with
        asking the user to set rollout_fragment_length to `auto` or to a matching
        value.

        Raises:
            ValueError: If there is a mismatch between user provided
                `rollout_fragment_length` and `total_train_batch_size`.
        r   r   g?z'Your desired `total_train_batch_size` (=z learners x zZ) or a value 10% off of that cannot be achieved with your other settings (num_env_runners=z; num_envs_per_env_runner=z; rollout_fragment_length=zD)! Try setting `rollout_fragment_length` to 'auto' OR to a value of r  N)	r   rY  maxr   r   r  _value_errorr   r  )r  min_batch_size
batch_size!suggested_rollout_fragment_lengths       r\   4validate_train_batch_size_vs_rollout_fragment_lengthzDAlgorithmConfig.validate_train_batch_size_vs_rollout_fragment_length  s     '611$:L1D(!,,././ 
 (Jt:::n,
 t:::D77d11 n,t/JJd11  594O0D4H4MAN51 !!<3< <6:6G< <"&"C< < 261E	< <
 04/K< < 04/K< < 9< < <
 
 
 
 
# 2111 r^   c                 <    ddl m}  || j        | j                  S )z1Returns the TorchCompileConfig to use on workers.r   )TorchCompileConfig)torch_dynamo_backendtorch_dynamo_mode)3ray.rllib.core.rl_module.torch.torch_compile_configr  r   r   )r  r  s     r\   get_torch_compile_worker_configz/AlgorithmConfig.get_torch_compile_worker_config*  sH    	
 	
 	
 	
 	
 	
 "!!%!I"C
 
 
 	
r^   c                     t           )a#  Returns the RLModule spec to use for this algorithm.

        Override this method in the subclass to return the RLModule spec, given
        the input framework.

        Returns:
            The RLModuleSpec (or MultiRLModuleSpec) to
            use for this algorithm's RLModule.
        NotImplementedErrorr  s    r\   r  z*AlgorithmConfig.get_default_rl_module_spec6  
     "!r^   c                     t           )af  Returns the Learner class to use for this algorithm.

        Override this method in the sub-class to return the Learner class type given
        the input framework.

        Returns:
            The Learner class to use for this algorithm either as a class type or as
            a string (e.g. "ray.rllib.algorithms.ppo.ppo_learner.PPOLearner").
        r  r  s    r\   r  z)AlgorithmConfig.get_default_learner_classB  r  r^   inference_onlyc                 b   t          j        | j                  }t          |t                    rvd}t
          |vrd}|r4|j                                        D ]\  }}|t
          k    r	|j        sd}nt          |          dk    rd}|rt          d          |t
                   }|j        _||t
                   d         |_        nD|6t          |t          j                  rt          |d|j                  |_        n| j        |_        |j        _||t
                   d         |_        nD|6t          |t          j                  rt          |d|j                  |_        n| j        |_        |j        | j        |_        n!| j        |                                z  |_        |||_        |S )	a  Returns the RLModuleSpec based on the given env/spaces and this config.

        Args:
            env: An optional environment instance, from which to infer the observation-
                and action spaces for the RLModule. If not provided, tries to infer
                from `spaces`, otherwise from `self.observation_space` and
                `self.action_space`. Raises an error, if no information on spaces can be
                inferred.
            spaces: Optional dict mapping ModuleIDs to 2-tuples of observation- and
                action space that should be used for the respective RLModule.
                These spaces are usually provided by an already instantiated remote
                EnvRunner (call `EnvRunner.get_spaces()` to receive this dict). If not
                provided, RLlib tries to infer this from `env`, if provided, otherwise
                from `self.observation_space` and `self.action_space`. Raises an error,
                if no information on spaces can be inferred.
            inference_only: If `True`, the returned module spec is used in an
                inference-only setting (sampling) and the RLModule can thus be built in
                its light version (if available). For example, the `inference_only`
                version of an RLModule might only contain the networks required for
                computing actions, but misses additional target- or critic networks.

        Returns:
            A new RLModuleSpec instance that can be used to build an RLModule.
        FTr   a  When calling `AlgorithmConfig.get_rl_module_spec()`, the configuration must contain the `DEFAULT_MODULE_ID` key and all other keys' specs must have the setting `learner_only=True`! If you are using a more complex setup, call `AlgorithmConfig.get_multi_rl_module_spec(...)` instead.Nr   r  r  )r  r  r1  rY   r   r   r!  rw   learner_onlyr  rZ   r   r-  Envr+  r   r  _get_model_configr  )r  r   r.  r  r1  r  midr  s           r\   get_rl_module_specz"AlgorithmConfig.get_rl_module_specN  s   < t':;;
 n&788 	?E 66 !/!?!E!E!G!G ) )IC///#0 )$(E) ^$$q((  O   ,,=>N+3!39:K3LQ3O00ZSW%=%=3:3S5J4 400 483I0&.!.45F.G.J++ZSW%=%=.5.0@/ /++ /3.?+ &.*.*;N''
 !N$D$D$F$FF ' %,:N)r^   F)r   r.  r  policy_dictsingle_agent_rl_module_specr  r  c                   ||                      ||          \  }}|                                 }| j        p|t          t                    r;p|_        t          fd|                                D                       }net          t                    sJ t          |t                    rt          j        t                    r1pj        |_        fd|                                D             }	n4p||_        fd|j        z                                  D             }		                    j
        |	j        j        j                  }nn5t          j        t                    rj        nt          d d	          |_        	                    j
        fd
|                                D             j        j        j                  }||j        z  D ]}
|j        |
         }|r|j        r|                    |
           /|j        t          |t                    r|j        |_        nst          |j        t                    r%|j        j        }|t          d          ||_        n4|
|j        v r|j        |
         j        |_        nt          d|
 d          |j        t          |t                    r|j        |_        nbt          |j        t                    r|j        j        }||_        n4|
|j        v r|j        |
         j        |_        nt          d|
 d          |j        |j        V|                    |
|                    t,                              }|&|j        |j        |_        |j        |j        |_        |j        | j        |_        | j        |                                z  |_        |S )at  Returns the MultiRLModuleSpec based on the given env/spaces.

        Args:
            env: An optional environment instance, from which to infer the different
                spaces for the individual RLModules. If not provided, tries to infer
                from `spaces`, otherwise from `self.observation_space` and
                `self.action_space`. Raises an error, if no information on spaces can be
                inferred.
            spaces: Optional dict mapping ModuleIDs to 2-tuples of observation- and
                action space that should be used for the respective RLModule.
                These spaces are usually provided by an already instantiated remote
                EnvRunner (call `EnvRunner.get_spaces()`). If not provided, tries
                to infer from `env`, otherwise from `self.observation_space` and
                `self.action_space`. Raises an error, if no information on spaces can be
                inferred.
            inference_only: If `True`, the returned module spec is used in an
                inference-only setting (sampling) and the RLModule can thus be built in
                its light version (if available). For example, the `inference_only`
                version of an RLModule might only contain the networks required for
                computing actions, but misses additional target- or critic networks.
                Also, if `True`, the returned spec does NOT contain those (sub)
                RLModuleSpecs that have their `learner_only` flag set to True.

        Returns:
            A new MultiRLModuleSpec instance that can be used to build a MultiRLModule.
        NrP  c                 :    i | ]}|t          j                  S r  r  r  r  r  r  s     r\   r  z<AlgorithmConfig.get_multi_rl_module_spec.<locals>.<dictcomp>  s6     ! ! ! t}%@AA! ! !r^   )r!  c                 :    i | ]}|t          j                  S r  r  )r  r  single_agent_specs     r\   r  z<AlgorithmConfig.get_multi_rl_module_spec.<locals>.<dictcomp>  s3     $ $ $@A4=):;;$ $ $r^   c           	      l    i | ]0}|t          j        j                            |                    1S r  )r  r  r!  r  )r  r  current_rl_module_specr
  s     r\   r  z<AlgorithmConfig.get_multi_rl_module_spec.<locals>.<dictcomp>  sU     	$ 	$ 	$  4=2BFF !#4  	$ 	$ 	$r^   )multi_rl_module_classr!  modules_to_loadload_state_pathr  zWe have a MultiRLModuleSpec (z), but no `RLModuleSpec`s to compile the individual RLModules' specs! Use `AlgorithmConfig.get_multi_rl_module_spec(policy_dict=.., rl_module_spec=..)`.c                 :    i | ]}|t          j                  S r  r  r  s     r\   r  z<AlgorithmConfig.get_multi_rl_module_spec.<locals>.<dictcomp>E  s6     % % % 4=)DEE% % %r^   zTThe default rl_module spec cannot have an empty module_class under its RLModuleSpec.zModule class for module z cannot be inferred. It is neither provided in the rl_module_spec that is passed in nor in the default module spec used in the algorithm.zCatalog class for module )get_multi_agent_setupr  r  rY   r   r  r   keysr!  	__class__r  r  r  r  rZ   r  remove_modulesmodule_classcatalog_classr   r   r  r   r   )r  r   r.  r  r  r  _r  r  module_specsrZ  rV   r  r  r  r  r
  s        `         @@r\   rR  z(AlgorithmConfig.get_multi_rl_module_spec  sU   N !77C7OONK "&!@!@!B!B "&!5!O9O ,l;; o	
 ,E/E ( :H'6#4! ! ! !(--//! ! !$ $ $   46GHHHHH 0,?? W 4DlSS (C ).> & 8F%4$ $ $ $EPEUEUEWEW$ $ $LL 4M7M & 8F%4	$ 	$ 	$ 	$ 	$ (*@*PP$&&	$ 	$ 	$L (>'G'G*@*V$0$:$J$:$J!7!D (H ( ($$ /: ""8"H,WW  3B 43 )C 6C C C   >L+: (>'G'G*@*V% % % %!,!1!1!3!3% % % %;$J$:$J!7!D (H 	( 	($ %';'KK I	 I	I />yIK +": $33I>>>'/4lCC /E/RK,, 6 FUU #9#I#VL $+(C   0<K,,"8"HHH/E/U!0"  ,, %)9 ) ) )   (04lCC 0F0TK-- 6 FUU $:$J$XM0=K--"8"HHH0F0V!1#  -- %)I ) ) )   -5+3)oo{/@AA  *"4<8C8U5"/73>3K0 '/+/+<((
 %(E(E(G(GG (( $#r^   c                     t          | d          r| j        r|dvrt          d| d          |dk    rd}t                                          ||           dS )z<Gatekeeper in case we are in frozen state and need to error.rm   )r  r  rm   zCannot set attribute (z') of an already frozen AlgorithmConfig!r1  r  N)rx   rm   r  super__setattr__r  r~   r   r  s      r\   r  zAlgorithmConfig.__setattr__  s     4&& 	4? 	 LLL$'S ' ' '   """#CC'''''r^   c                 L    |                      |          }t          | |          S )a  Shim method to still support accessing properties by key lookup.

        This way, an AlgorithmConfig object can still be used as if a dict, e.g.
        by Ray Tune.

        Examples:
            .. testcode::

                from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
                config = AlgorithmConfig()
                print(config["lr"])

            .. testoutput::

                0.001
        r{   r+  )r  items     r\   __getitem__zAlgorithmConfig.__getitem__  s'    4 ++D11tT"""r^   c                 v    |dk    rt          d          t                                          ||           d S )Nr  zCannot set `multiagent` key in an AlgorithmConfig!
Try setting the multi-agent components of your AlgorithmConfig object via the `multi_agent()` method and its arguments.
E.g. `config.multi_agent(policies=.., policy_mapping_fn.., policies_to_train=..)`.)r  r  r  r  s      r\   __setitem__zAlgorithmConfig.__setitem__  sJ     , L   	C'''''r^   c                 P    |                      |d          }t          | |          S *Shim method to help pretend we are a dict.Fru   )r{   rx   )r  r  props      r\   __contains__zAlgorithmConfig.__contains__  s*    ++D%+HHtT"""r^   c                 R    |                      |d          }t          | ||          S r$  r  )r  r~   defaultr&  s       r\   r  zAlgorithmConfig.get  s,    ++C+GGtT7+++r^   c                 .    |                      ||          S r%  )r  )r  r~   r)  s      r\   rn   zAlgorithmConfig.pop  s    xxW%%%r^   c                 N    |                                                                  S r+  )r  r  r  s    r\   r  zAlgorithmConfig.keys  s    ||~~""$$$r^   c                 N    |                                                                  S r+  )r  valuesr  s    r\   r.  zAlgorithmConfig.values  s    ||~~$$&&&r^   c                 N    |                                                                  S r+  )r  rw   r  s    r\   rw   zAlgorithmConfig.items  s    ||~~##%%%r^   c                     i S )a  Defines which `AlgorithmConfig` settings/properties should be
        auto-included into `self.model_config`.

        The dictionary in this property contains the default configuration of an
        algorithm. Together with the `self._model`, this method is used to
        define the configuration sent to the `RLModule`.

        Returns:
            A dictionary with the automatically included properties/settings of this
            `AlgorithmConfig` object into `self.model_config`.
        r  r  s    r\   r  z+AlgorithmConfig._model_config_auto_includes  s	     	r^   c                 p    |dz   }| j         rt          |          t                              |           d S )Nzg
To suppress all validation errors, set `config.experimental(_validate_config=False)` at your own risk.)r  rZ   r(  rn  )r  errmsgr  s      r\   r  zAlgorithmConfig._value_error  sC    N
   	#S//!NN6"""""r^   c                     t          t          t          j                  d t          j        D             z             }| j        |vr#|                     d| j         d| d           d S d S )Nc                     g | ]	}|j         
S r  )r   )r  modes     r\   
<listcomp>zAAlgorithmConfig._validate_env_runner_settings.<locals>.<listcomp>  s    &P&P&Pdtz&P&P&Pr^   z`gym_env_vectorize_mode` (zD) must be a member of `gymnasium.VectorizeMode`! Allowed values are r  )r2  r)  r-  VectorizeModer   r  )r  allowed_vectorize_modess     r\   r  z-AlgorithmConfig._validate_env_runner_settings  s    "%"##&P&Pc>O&P&P&PP#
 #
 &.EEEAT-H A A&=A A A     FEr^   c                 
   | j         sy| j        [| j        T| j        M| j        F| j        ?| j        8| j        1| j        *| j        #| j	        | j
        | j        | j        | j        |                     d           dS dS dS )zValidates callbacks settings.NzConfig settings `config.callbacks(on_....=lambda ..)` aren't supported on the old API stack! Switch to the new API stack through `config.api_stack(enable_env_runner_and_connector_v2=True, enable_rl_module_and_learner=True)`.)r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  s    r\   r  z,AlgorithmConfig._validate_callbacks_settings   s    
 6 	5A3?1=3?1=/;8D4@2>1=0<6B:FCO!!<    #	 	 POr^   c                 f   d\  }}}d}| j         dvr| j         dk    rdS | j         dv rt                      \  }}}nt                      \  }}| j         dk    r| j        r|                     d           |P| j         dk    rEt          j        |j                  t          k     r#| j	        s| j
        r|                     d           | j	        r8dd	lm} | j        |j        |j        fvr|                     d
| j                    |                     |||           |                     ||           dS )zGValidates framework settings and checks whether framework is installed.NNNN>   tftf2r   r<  zCannot use `framework=tf` with the new API stack! Either switch to tf2 via `config.framework('tf2')` OR disable the new API stack via `config.api_stack(enable_rl_module_and_learner=False)`.z0torch.compile is only supported from torch 2.0.0r   )r   z`config.torch_compile_learner_what_to_compile` must be one of [TorchCompileWhatToCompile.forward_train, TorchCompileWhatToCompile.complete_update] but is )r   r/   r0   r   r  r   parse__version__r:   r   r   *ray.rllib.core.learner.torch.torch_learnerr   r   r   COMPLETE_UPDATE(_check_if_correct_nn_framework_installed_resolve_tf_settings)r  _tf1_tf_tfv_torchr  r   s          r\   r  z,AlgorithmConfig._validate_framework_settings=  s   *c4]22t7IW7T7TF=00+ooOD#tt(**IFA %%$*K%J   "g--f0114RRR+ S/3/H S PQQQ % 	      9)7)9B   !!E BE E   	55dCHHH!!$-----r^   c                     dS )z6Checks, whether resources related settings make sense.Nr  r  s    r\   r  z,AlgorithmConfig._validate_resources_settingsm      r^   c                     t          | j        t          t          t          f          r,| j        D ]&}|| j        vr|                     d| d           %dS dS )z8Checks, whether multi-agent related settings make sense.z?`config.multi_agent(policies_to_train=..)` contains policy ID (z<) that was not defined in `config.multi_agent(policies=..)`!N)rY   r!  r)  r2  r*  r  r  )r  r  s     r\   r  z.AlgorithmConfig._validate_multi_agent_settingsq  s     d,tS%.@AA 	-  dm++%%>&)> > >  	 	 r^   c                 >   | j         du r|                     d           | j        dk    r*| j        s#t                              d| j         d           | j        dk    r'| j        dk    r| j        r|                     d           | j        dk    rG| j        s|                     d           dS | j	        d	k    rt                              d
           dS dS t          | j        t                    r| j        dk    r |                     d| j         d           dS dS )z7Checks, whether evaluation related settings make sense.Tz`enable_async_evaluation` has been deprecated (you should set this to False)! Use `config.evaluation(evaluation_parallel_to_training=True)` instead.r   zYou have specified z evaluation workers, but your `evaluation_interval` is 0 or None! Therefore, evaluation doesn't occur automatically with each call to `Algorithm.train()`. Instead, you have to call `Algorithm.evaluate()` manually in order to trigger an evaluation run.z`evaluation_parallel_to_training` can only be done if `evaluation_num_env_runners` > 0! Try setting `config.evaluation_parallel_to_training` to False.r   zU`evaluation_duration=auto` not supported for `evaluation_parallel_to_training=False`!r   zWhen using `config.evaluation_duration='auto'`, the sampling unit used is always 'timesteps'! You have set `config.evaluation_duration_unit='episodes'`, which is ignored.z`evaluation_duration` (z) must be an int and >0!N)r  r  rW  rL  r(  rn  r\  rR  rM  rN  rY   r  r  s    r\   r  z-AlgorithmConfig._validate_evaluation_settings~  s    '4//   *Q..t7O.NN"d&E " " "   +q00-224 3 E   #v--7 
!!?     .*<<V     =< 43S99	'1,,$*B        -,r^   c                     | j         dk    r| j        r|                     d           | j         dk    r<| j        | j        d<   | j        r| j        pd| j        d<   dS | j        pd| j        d<   dS dS )z2Checks, whether input related settings make sense.r   zOff-policy estimation methods can only be used if the input is a dataset. We currently do not support applying off_policy_estimation_method on a sampler input.datasetr  r   r  N)r$  rU  r  r   r<  rY  rW  r   r  s    r\   r  z(AlgorithmConfig._validate_input_settings  s     ;)##(J#-   ;)## ;?:VD67! 
M
 483R3WVW!-000
 483G3L1!-000 $#r^   c                 ,   | j         si| j        t                              d           | j        #t                              d| j         d           | j        r|                     d           dS t          | j        j	         d          r(t                              d| j        j	         d           | j        s|                     d	           	 t          j        | j        t                     t          d
 | j                                        D             d t          j                    D                        n*# t           $ r t                              d           Y nw xY wt#          j        | j        dd           | j        r|                     d           d}| j        d         )|                     |                    dd                     | j        d         i k    r+|                     |                    dd                     dS dS )zAChecks, whether settings related to the new API stack make sense.NzYou have setup a RLModuleSpec (via calling `config.rl_module(...)`), but have not enabled the new API stack. To enable it, call `config.api_stack(enable_rl_module_and_learner=True)`.zRYou specified a custom Learner class (via `AlgorithmConfig.training(learner_class=z)`, but have the new API stack disabled. You need to enable it via `AlgorithmConfig.api_stack(enable_rl_module_and_learner=True)`.zYou are using the new API stack EnvRunners (SingleAgentEnvRunner or MultiAgentEnvRunner), but have forgotten to switch on the new API stack! Try setting `config.api_stack(enable_rl_module_and_learner=True)`._on_new_api_stackzYou are running aM   on the new API stack! This is the new default behavior for this algorithm. If you don't want to use the new API stack, set `config.api_stack(enable_rl_module_and_learner=False,enable_env_runner_and_connector_v2=False)`. For a detailed migration guide, see here: https://docs.ray.io/en/master/rllib/new-api-stack-migration-guide.htmlzSetting `enable_rl_module_and_learner` to True and `enable_env_runner_and_connector_v2` to False ('hybrid API stack') is not longer supported! Set both to True (new API stack) or both to False (old API stack), instead.c                 &    i | ]\  }}|d k    ||S vf_share_layersr  r  s      r\   r  zDAlgorithmConfig._validate_new_api_stack_settings.<locals>.<dictcomp>  s)    OOO$!Q=N8N8NA8N8N8Nr^   c                 &    i | ]\  }}|d k    ||S rQ  r  r  s      r\   r  zDAlgorithmConfig._validate_new_api_stack_settings.<locals>.<dictcomp>  s)    SSS$!QAAR<R<RA<R<R<Rr^   a  You configured a custom `model` config (probably through calling config.training(model=..), whereas your config uses the new API stack! In order to switch off the new API stack, set in your config: `config.api_stack(enable_rl_module_and_learner=False, enable_env_runner_and_connector_v2=False)`. If you DO want to use the new API stack, configure your model, instead, through: `config.rl_module(model_config={..})`.r   zlearning rate)fixed_value_or_schedulesetting_namedescriptiona,  When the RLModule API is enabled, exploration_config can not be set. If you want to implement custom exploration behaviour, please modify the `forward_exploration` method of the RLModule at hand. On configs that have a default exploration config, this must be done via `config.exploration_config={}`.ai  Cannot use `{}` option with the new API stack (RLModule and Learner APIs)! `{}` is part of the ModelV2 API and Policy API, which are not compatible with the new API stack. You can either deactivate the new stack via `config.api_stack( enable_rl_module_and_learner=False)`,or use the new stack (incl. RLModule API) and implement your custom model as an RLModule.r  custom_model_config)r   r  r(  rn  r	  r   r  rJ   r   rz   treeassert_same_structurer  r#   r9   rw   	Exceptionr5   r   r   r   r  )r  not_compatible_w_rlm_msgs     r\   r  z0AlgorithmConfig._validate_new_api_stack_settings  s    0 	 #/   ".V?C?RV V V   6 !!M   F t/BBBCC 	NNj4?#; j j j   6 	5  	&tz>BBBOO$*"2"2"4"4OOOSS."6"8"8SSS     		 		 		NN9    		 	$(G'	
 	
 	
 	
 " 	2  + 	! :n%1(//OO   :+,22(//)+@      32s   A+D> >$E%$E%c                 R   | j         r| j        durt          dd           | j        dvr|                     d           | j        du r
d| j        d<   | j        du r
d| j        d	<   | j                            d
          rt          ddd           | j	        du rd S | j
        st| j        dk    ri| j        dk    r7t          |           j        dk    r|                     d| j         d           n| j	        du r|                     d           d| _	        d S | j	        t          k    r| j        dvr	d| _	        d S | j        rddlm ddlm d | j        r| j                            |           | j        }t1          |t2                    rd |                                D             nd |D             }t7          fd|D                       r	d| _	        d S d| _	        d S d| _	        d S | j	        du r+| j        dk    r"|                     d| j         d           d S d S d S )NFzAlgorithmConfig.render_envzThe `render_env` setting is not supported on the new API stack! In order to log videos to WandB (or other loggers), take a look at this example here: https://github.com/ray-project/ray/blob/master/rllib/examples/envs/env_rendering_and_recording.py)rh  r  )rllibr   NzF`config.preprocessor_pref` must be either 'rllib', 'deepmind' or None!Tr  r  custom_preprocessorz<AlgorithmConfig.training(model={'custom_preprocessor': ...})zCustom preprocessors are deprecated, since they sometimes conflict with the built-in preprocessors for handling complex observation spaces. Please use wrapper classes around your environment instead.r  r   r=  	AlphaStarz/`num_gpus` > 1 not supported yet for framework=rt   zhCannot use `simple_optimizer` if `num_gpus` > 1! Consider not setting `simple_optimizer` in your config.)r<  r   r   )DynamicTFPolicy)TorchPolicyc                 \    g | ])}t          |t          t          f          r	t          | n|*S r  )rY   r*  r)  r'   )r  r  s     r\   r6  zGAlgorithmConfig._validate_to_be_deprecated_settings.<locals>.<listcomp>  sF         .8udm-L-LV
D))RV  r^   c                 *    g | ]}t                      S r  r  )r  r  s     r\   r6  zGAlgorithmConfig._validate_to_be_deprecated_settings.<locals>.<listcomp>  s    9991*,,999r^   c              3   b   K   | ])}|j         pd u pt          |j         pf           V  *d S rb   )policy_class
issubclass)r  r  r`  ra  default_policy_clss     r\   r  zFAlgorithmConfig._validate_to_be_deprecated_settings.<locals>.<genexpr>  sr          &<*<E %)?-?(+6       r^   z<`simple_optimizer=False` not supported for config.framework(z)!)r   r   r   r   r  r  r  r  r  r  r   r   r   r[   rz   r   r,  "ray.rllib.policy.dynamic_tf_policyr`  ray.rllib.policy.torch_policyra  r   get_default_policy_classr  rY   r  r.  any)r  r  policy_specsr`  ra  rg  s      @@@r\   r  z3AlgorithmConfig._validate_to_be_deprecated_settingsP  s_   2 	te7S7S0t    !)DDDX   )T116:DJ23 *d227;DJ34:>>/00 		R
      D((D2 >	t}q7H7H !U**tDzz/Bk/Q/Q!!7!%!37 7 7    &$..!!N   %*D!!! "&666!88(,%%% $ .NNNNNNEEEEEE%)"? X)-)Q)QRV)W)W&= "(D11	:  $,OO$5$5   
 :9999         !-     
2 -1D))),1D)))(-%%% "e++!U**!!?(,(:? ? ?     ,+**r^   c                     | j         r<| j        s5| j        dk    s*| j        s#| j        | j        |                     d           | j        r*| j        j	        dk    rt                              d           ddlm} ddlm} | j        r*t#          | j        |          s|                     d           | j        r*t#          | j        |          s|                     d           dd	lm} | j        r*t#          | j        |          s|                     d
           | j        r#| j        s| j        s|                     d           | j        r'| j        r | j        dk    r|                     d           ddlm} t=          |          }| j        r5| j        dk    r*| j        |vr!|                     d| j         d| d           ddl m!} | j"        r,t#          | j"        |          s|                     d           d S d S d S )Nr   zXIf no evaluation should be run, `action_space` and `observation_space` must be provided.BCz`ignore_final_observation=True` (zeros-out truncation observations), but the algorithm isn't `BC`. It is recommended to use this setting only with `BC`, b/c other RL algorithms rely on truncation-observations due to value function estimates.)OfflineData)OfflinePreLearnerzZUnknown `offline_data_class`. OfflineData class needs to inherit from `OfflineData` class.z]Unknown `prelearner_class`. PreLearner class needs to inherit from `OfflinePreLearner` class.)EpisodeReplayBufferzUnknown `prelearner_buffer_class`. The buffer class for the prelearner needs to inherit from `EpisodeReplayBuffer`. Specifically it needs to store and sample lists of `Single-/MultiAgentEpisode`s.zSetting `input_read_batch_size` is only allowed in case of a dataset that holds either `EpisodeType` or `BatchType` data (i.e. rows that contains multiple timesteps), but neither `input_read_episodes` nor `input_read_sample_batches` is set to `True`.r}  zWhen recording episodes only complete episodes should be recorded (i.e. `batch_mode=='complete_episodes'`). Otherwise recorded episodes cannot be read in for training.)OfflinePolicyEvaluationTypes	eval_lossz!Unknown offline evaluation type: zx.Available types of offline evaluation are either `'eval_loss' to evaluate the training loss on a validation dataset or r  )OfflineEvaluationRunnerzyUnknown `offline_eval_runner_class`. OfflineEvaluationRunner class needs to inherit from `OfflineEvaluationRunner` class.)#
is_offline	is_onlinerW  rL  r   r   r  r6  r   rz   r(  rn  ray.rllib.offline.offline_dataro  $ray.rllib.offline.offline_prelearnerrp  r%  rf  r7  4ray.rllib.utils.replay_buffers.episode_replay_bufferrq  r8  r+  r)  r*  r@  rJ  r   2ray.rllib.offline.offline_policy_evaluation_runnerrr  r)  r]  +ray.rllib.offline.offline_evaluation_runnerrt  r^  )r  ro  rp  rq  rr  offline_eval_typesrt  s          r\   r  z*AlgorithmConfig._validate_offline_settings  sD    O	N	 4q88D<T8&.$2H2P 8  
 ( 	T_-E-M-MNN@   	?>>>>>JJJJJJ" 	:#[,
 ,
 	 ,     	!#4*
 *
 	 2  
	
 	
 	
 	
 	
 	
 ' 	
(*=1
 1
 	 0   % 		$		(,(F		    K		*		 #666D  	
 	
 	
 	
 	
 	
 "">??(		,;;,4PPPVD4P V V@RV V V   	XWWWWW) 	**,C3
 3
 	 8    	 	 	 	r^   c                     | j         S )zDefines if this config is for online RL.

        Note, a config can be for on- and offline training at the same time.
        )r   r  s    r\   rv  zAlgorithmConfig.is_online$  s     r^   c                     t          | j                  oet          | j        t                    p9t          | j        t                    ot          | j        d         t                    o| j        dk    o| j        S )z*Defines, if this config is for offline RL.r   r   )boolr$  rY   r  r)  r   r  s    r\   ru  zAlgorithmConfig.is_offline,  su    
  2 4;,, Wt{D11UjQQT6U6U	2 y(2 1	
r^   c                 r   d| v r|                      d          | d<   d| v rt          | d                   | d<   t          | d                   | d<   t          | d                   | d<   t          | d         t                    rt          | d                   | d<   d| v rRt          | d                             d          t                    r$t          | d         d                   | d         d<   t          | d                             d          t                    r$t          | d         d                   | d         d<   t          | d	                             d
          t                    r$t          | d	         d
                   | d	         d
<   |                     d          }|t          |                    d          t
          t          f          rt          |d                   |d<   |                    d          r
t          |d<   |                    d          r
t          |d<   t          |                     d          t
          t          f          rt          | d                   | d<   |                     d          r
t          | d<   |                     d          r
t          | d<   | S )Nr
  r  r  r   r   r  r[   r   r  r  r  r  r   r!  )	rn   r8   rY   r[   r  r2  r*  r)  r6   )r  	ma_configs     r\   r  zAlgorithmConfig._serialize_dict<  s    &&"(**->"?"?F;f,VG_==F7O,VK-@AA{%3F;M4N%O%O!"fUmT** 	:*6%=99F5M!V++v4599&AA4HH , 6D-.v66 6F)*62 f1266v>>EE 	3A+,V44 4F'(0 fWo)).994@@ 	.<w// /F7ON+
 JJ|,,	 )--
33c5\BB D(,Yz-B(C(C	*%}}011 B1A	-.}}011 B1A	-. fjj,,sEl;; 	:!%fZ&8!9!9F:::)** 	;*:F&'::)** 	;*:F&'r^   r~   rv   c                 &   | dk    rd} n>| dk    rd} n5| dk    rd} n,| dk    rd} n#| d	k    rd
} n| dk    rd} n| dk    rd} n| dk    rd} |r| dk    rt          ddd           n| dk    rt          ddd           n| dk    rt          ddd           n|| dk    rt          ddd           nc| dk    rt          ddd           nJ| d k    rt          d!d"d           n1| d#k    rt          d$d%d           n| d&k    rt          d'd(d           | S ))Nr  r
  r  r   r  rX  r  r   r  r$  r  r  num_cpus_for_driverr   num_workersr   r  rr  Trg  r  z!config.metrics_smoothing_episodesz)config.metrics_num_episodes_for_smoothingr  zconfig.min_iter_time_szconfig.min_time_s_per_iterationr  zconfig.min_time_s_per_reportingr  z)config.min_sample_timesteps_per_reportingz)config.min_sample_timesteps_per_iterationr  z(config.min_train_timesteps_per_reportingz(config.min_train_timesteps_per_iterationr  zconfig.timesteps_per_iterationzY`config.min_sample_timesteps_per_iteration` OR `config.min_train_timesteps_per_iteration`r  zconfig.evaluation_num_episodeszK`config.evaluation_duration` and `config.evaluation_duration_unit=episodes`r   )r~   rv   s     r\   r{   z'AlgorithmConfig._translate_special_keysm  s9    +#CC***.CC***.CCK!CCG^^CCH__CC)))-CCM!!#C  2	///#1>    
 444#;C    
 )))#09    
 222#99    
 <<<#CC    
 ;;;#BB    
 111#8A	     111#8A	    
r^   c                     | j         dv r|s|st          d          dS dS | j         dk    r|st          d          dS dS )z?Check if tf/torch experiment is running and tf/torch installed.>   r<  r=  zTensorFlow was specified as the framework to use (via `config.framework([tf|tf2])`)! However, no installation was found. You can install TensorFlow via `pip install tensorflow`r   zPyTorch was specified as the framework to use (via `config.framework('torch')`)! However, no installation was found. You can install PyTorch via `pip install torch`.N)r   ImportError)r  rD  rE  rG  s       r\   rB  z8AlgorithmConfig._check_if_correct_nn_framework_installed  s    .. C !Y      7** !G   +* r^   c                 `   |r| j         dk    ru| j         dk    r|dk     rt          d          |                                s|                                 t                              d| j          d| j         d           d	S |r'| j         dk    rt                              d           d	S d	S d	S )
zCheck and resolve tf settings.r=  r   zxYou configured `framework`=tf2, but your installed pip tf-version is < 2.0! Make sure your TensorFlow version is >= 2.x.zExecuting eagerly (framework='z'), with eager_tracing=z. For production workloads, make sure to set eager_tracing=True  in order to match the speed of tf-static-graph (framework='tf'). For debugging purposes, `eager_tracing=False` is the best choice.r<  zYour framework setting is 'tf', meaning you are using static-graph mode. Set framework='tf2' to enable eager execution with tf2.x. You may also then want to set eager_tracing=True in order to reach similar execution speed as with static-graph mode.N)r   rZ   executing_eagerlyenable_eager_executionr(  infor   )r  rD  rF  s      r\   rC  z$AlgorithmConfig._resolve_tf_settings  s    	D&%//!U**taxx )  
 ))++ .++---KK<1C < <'+'9< < <      	d(D00KK3    	 	00r^   )r  r   r.  default_policy_classr  c                   t          j        |p| j                  }t          |t          t
          t          f          rd |D             }d}d}t          |t          j        j	                  r/t          j
        |j                                                  \  }}n|t          |d          r't          |j        t          j                  r|j        }n6t          |d          r&t          |j        t          j                  r|j        }t          |d          r't          |j        t          j                  r|j        }n6t          |d          r&t          |j        t          j                  r|j        }|I|"|
                    t(          dg          d         }|#|
                    t(          ddg          d         }|                                                                 D ]\  }}t          |t,                    st-          | x||<   }|j        ||||         _        |j        nt          |d	          r|j        n||||v r||         d         }	n2t          t2                    rd}	| j        }
t          t          d
          rj        rj        n                                          }t;          |          dk    rj        n                    |d                   t?          fd|D                       r}	n|
rV|D ]S} |
|dd          |k    r?|	(                    |          |	k    rtA          d                              |          }	Tn'||}	n"| j        r| j        }	ntA          d| d          |	||         _        |j        nt          |d	          r|j        n||||v r||         d         }n2t          t2                    rd}| j        }
t          t          d
          rj        rj        n                                          }t;          |          dk    rj        n!                    |d                   t?          fd|D                       r}n|
rV|D ]S} |
|dd          |k    r?|(!                    |          |k    rtA          d          !                    |          }Tn'||}n"| j        r| j        }ntA          d| d          |||         _        t          ||         j"        tF                    sp||         j"        "t          ||         j"        tH                    sJ |                      d          %                    ||         j"        pi           ||         _"        | j&        /tO          | j&                  st	          | j&                  dfd	}n| j&        }||fS )an
  Compiles complete multi-agent config (dict) from the information in `self`.

        Infers the observation- and action spaces, the policy classes, and the policy's
        configs. The returned `MultiAgentPolicyConfigDict` is fully unified and strictly
        maps PolicyIDs to complete PolicySpec objects (with all their fields not-None).

        Examples:
        .. testcode::

            import gymnasium as gym
            from ray.rllib.algorithms.ppo import PPOConfig
            config = (
              PPOConfig()
              .environment("CartPole-v1")
              .framework("torch")
              .multi_agent(policies={"pol1", "pol2"}, policies_to_train=["pol1"])
            )
            policy_dict, is_policy_to_train = config.get_multi_agent_setup(
                env=gym.make("CartPole-v1"))
            is_policy_to_train("pol1")
            is_policy_to_train("pol2")

        Args:
            policies: An optional multi-agent `policies` dict, mapping policy IDs
                to PolicySpec objects. If not provided uses `self.policies`
                instead. Note that the `policy_class`, `observation_space`, and
                `action_space` properties in these PolicySpecs may be None and must
                therefore be inferred here.
            env: An optional env instance, from which to infer the different spaces for
                the different policies. If not provided, tries to infer from
                `spaces`. Otherwise from `self.observation_space` and
                `self.action_space`. Raises an error, if no information on spaces can be
                infered.
            spaces: Optional dict mapping policy IDs to tuples of 1) observation space
                and 2) action space that should be used for the respective policy.
                These spaces were usually provided by an already instantiated remote
                EnvRunner. Note that if the `env` argument is provided, tries to
                infer spaces from `env` first.
            default_policy_class: The Policy class to use should a PolicySpec have its
                policy_class property set to None.

        Returns:
            A tuple consisting of 1) a MultiAgentPolicyConfigDict and 2) a
            `is_policy_to_train(PolicyID, SampleBatchType) -> bool` callable.

        Raises:
            ValueError: In case, no spaces can be infered for the policy/ies.
            ValueError: In case, two agents in the env map to the same PolicyID
                (according to `self.policy_mapping_fn`), but have different action- or
                observation spaces according to the infered space information.
        c                 ,    i | ]}|t                      S r  r  )r  r  s     r\   r  z9AlgorithmConfig.get_multi_agent_setup.<locals>.<dictcomp>.  s    >>>cZ\\>>>r^   Nr  r   r  r   r   r   r  r/  c              3   J   K   | ]}                     |          k    V  d S rb   )r  )r  rg   env_unwrappedone_obs_spaces     r\   r  z8AlgorithmConfig.get_multi_agent_setup.<locals>.<genexpr>x  sJ         &;;C@@MQ     r^   )rh   zTwo agents in your environment map to the same policyID (as per your `policy_mapping_fn`), however, these agents also have different observation spaces!z3`observation_space` not provided in PolicySpec for z and env does not have an observation space OR no spaces received from other workers' env(s) OR no `observation_space` specified in config!c              3   J   K   | ]}                     |          k    V  d S rb   )r  )r  rg   r  one_act_spaces     r\   r  z8AlgorithmConfig.get_multi_agent_setup.<locals>.<genexpr>  sJ         &66s;;}L     r^   zTwo agents in your environment map to the same policyID (as per your `policy_mapping_fn`), however, these agents also have different action spaces!z.`action_space` not provided in PolicySpec for z and env does not have an action space OR no spaces received from other workers' env(s) OR no `action_space` specified in config!Fr  c                     | v S rb   r  )r  batchpolss     r\   is_policy_to_trainzAAlgorithmConfig.get_multi_agent_setup.<locals>.is_policy_to_train  s    d{"r^   rb   )(r  r  r  rY   r2  r)  r*  rayactorActorHandler  _get_spacesremoterx   r  r-  Spacer   r  r   r   rw   r'   re  r  r   r   r/  get_agent_idsr  r  r  rZ   r  r  r`   r  ro   r!  r  )r  r  r   r.  r  env_obs_spaceenv_act_spacer  r  r6  
mapping_fnaidsrg   r7  r  r  r  r  r  s                  @@@@r\   r  z%AlgorithmConfig.get_multi_agent_setup  s:   x =!:T];;
 hdE 233 	?>>X>>>H  c39011 	1+.73?3I3I3K3K+L+L(M== _s677 6J,ci= = 6 !$ <122 6z%sy8 8 6 !$ 5 s122 1z'8 8 1 !$ 7n-- 1* #)3 3 1 !$ 0 $ &

+;dV D DQ G$ &

+;dD\ J J1 M !) 5 5 7 7 L	 L	Ck:66 G.8+.FF '/4H4T-A* ,418k1J1J SPS%#-- &sAII}== 4 $I!%!7J"=2CDD;)9;55 +88::	 D 4yyA~~(5(G(5(K(KDQRG(T(T     #'     U %2		 $ U#' U UC)z#tDAAASHH %.$9(5(K(KC(P(P'0)1 )1 +5)H+& +& %& -:,O,OPS,T,T	". -II+  $ 6II$CC C C   3</ '/18k1J1J SPS%#-- &sAII}== 2 $I!%!7J"=2CDD;)9;55 +88::	 D 4yyA~~(5(B(5(F(FtAw(O(O     #'     P %2		 $ P#' P PC)z#tDAAASHH %.$9(5(F(Fs(K(Ky(X(X*4)C+& +& %& -:,J,J3,O,O	". -II&  $ 1II$>> > >   .7* hsm2ODD }+3zSM($8 833  (,yyUy'C'C'T'TSM(.B( ($ !-ht?U6V6V-t-..D# # # # # # # "&!7+++r^   zAlgorithmConfig.build_algo)ri  r  c                      | j         |i |S rb   )r  r  r'  ri   s      r\   rW  zAlgorithmConfig.build  s    t////r^   z*AlgorithmConfig.get_multi_rl_module_spec()c                     d S rb   r  r  s      r\   get_marl_module_specz$AlgorithmConfig.get_marl_module_spec  rI  r^   zAlgorithmConfig.env_runners(..)c                     d S rb   r  r  s      r\   rolloutszAlgorithmConfig.rollouts  rI  r^   c                     d S rb   r  r  s      r\   explorationzAlgorithmConfig.exploration  rI  r^   r~  c                     d S rb   r  r  s    r\   r  z+AlgorithmConfig.recreate_failed_env_runners  	     	r^   c                 *    t          ddd           d S )Nz+AlgorithmConfig.recreate_failed_env_runners*AlgorithmConfig.restart_failed_env_runnersTrg  r  r  s     r\   r  z+AlgorithmConfig.recreate_failed_env_runners  s,    =<	
 	
 	
 	
 	
 	
r^   %AlgorithmConfig._enable_new_api_stackc                     d S rb   r  r  s    r\   r  z%AlgorithmConfig._enable_new_api_stack  	     	r^   c                 *    t          ddd           d S )Nr  z,AlgorithmConfig.enable_rl_module_and_learnerTrg  r  r  s     r\   r  z%AlgorithmConfig._enable_new_api_stack  s,    7>	
 	
 	
 	
 	
 	
r^   z2AlgorithmConfig.enable_env_runner_and_connector_v2c                     d S rb   r  r  s    r\   uses_new_env_runnersz$AlgorithmConfig.uses_new_env_runners"  r  r^   AlgorithmConfig.num_env_runnersc                     d S rb   r  r  s    r\   rs  z#AlgorithmConfig.num_rollout_workers'  r  r^   c                 *    t          ddd           d S )Nz#AlgorithmConfig.num_rollout_workersr  Trg  r  r  s     r\   rs  z#AlgorithmConfig.num_rollout_workers,  s,    51	
 	
 	
 	
 	
 	
r^   &AlgorithmConfig.evaluation_num_workersc                     d S rb   r  r  s    r\   r  z&AlgorithmConfig.evaluation_num_workers4  r  r^   c                 *    t          ddd           d S )Nr  z*AlgorithmConfig.evaluation_num_env_runnersTrg  r  r  s     r\   r  z&AlgorithmConfig.evaluation_num_workers9  )    8<	
 	
 	
 	

 	r^   'AlgorithmConfig.num_envs_per_env_runnerc                     d S rb   r  r  s    r\   rt  z#AlgorithmConfig.num_envs_per_workerB  r  r^   c                 *    t          ddd           d S )Nz#AlgorithmConfig.num_envs_per_workerr  Trg  r  r  s     r\   rt  z#AlgorithmConfig.num_envs_per_workerG  )    59	
 	
 	
 	

 	r^   *AlgorithmConfig.ignore_env_runner_failuresc                     d S rb   r  r  s    r\   rv  z&AlgorithmConfig.ignore_worker_failuresP  r  r^   c                 *    t          ddd           d S )Nz&AlgorithmConfig.ignore_worker_failuresr  Trg  r  r  s     r\   rv  z&AlgorithmConfig.ignore_worker_failuresU  r  r^   r  c                     d S rb   r  r  s    r\   rw  z'AlgorithmConfig.recreate_failed_workers^  r  r^   c                 *    t          ddd           d S )Nz'AlgorithmConfig.recreate_failed_workersr  Trg  r  r  s     r\   rw  z'AlgorithmConfig.recreate_failed_workersc  s)    9<	
 	
 	
 	

 	r^   +AlgorithmConfig.max_num_env_runner_restartsc                     d S rb   r  r  s    r\   r  z'AlgorithmConfig.max_num_worker_restartsl  r  r^   c                 *    t          ddd           d S )Nz'AlgorithmConfig.max_num_worker_restartsr  Trg  r  r  s     r\   r  z'AlgorithmConfig.max_num_worker_restartsq  s)    9=	
 	
 	
 	

 	r^   3AlgorithmConfig.delay_between_env_runner_restarts_sc                     d S rb   r  r  s    r\   r  z/AlgorithmConfig.delay_between_worker_restarts_sz  r  r^   c                 *    t          ddd           d S )Nz/AlgorithmConfig.delay_between_worker_restarts_sr  Trg  r  r  s     r\   r  z/AlgorithmConfig.delay_between_worker_restarts_s  s)    AE	
 	
 	
 	

 	r^   =AlgorithmConfig.num_consecutive_env_runner_failures_tolerancec                     d S rb   r  r  s    r\   rx  z9AlgorithmConfig.num_consecutive_worker_failures_tolerance  s	    
 	r^   c                 *    t          ddd           d S )Nz9AlgorithmConfig.num_consecutive_worker_failures_tolerancer  Trg  r  r  s     r\   rx  z9AlgorithmConfig.num_consecutive_worker_failures_tolerance  s)    KO	
 	
 	
 	

 	r^   1AlgorithmConfig.env_runner_health_probe_timeout_sc                     d S rb   r  r  s    r\   ry  z-AlgorithmConfig.worker_health_probe_timeout_s  r  r^   c                 *    t          ddd           d S )Nz-AlgorithmConfig.worker_health_probe_timeout_sr  Trg  r  r  s     r\   ry  z-AlgorithmConfig.worker_health_probe_timeout_s  s)    ?C	
 	
 	
 	

 	r^   ,AlgorithmConfig.env_runner_restore_timeout_sc                     d S rb   r  r  s    r\   rz  z(AlgorithmConfig.worker_restore_timeout_s  r  r^   c                 *    t          ddd           d S )Nz(AlgorithmConfig.worker_restore_timeout_sr  Trg  r  r  s     r\   rz  z(AlgorithmConfig.worker_restore_timeout_s  s)    :>	
 	
 	
 	

 	r^   7AlgorithmConfig.validate_env_runners_after_constructionc                     d S rb   r  r  s    r\   ru  z3AlgorithmConfig.validate_workers_after_construction  r  r^   c                 *    t          ddd           d S )Nz3AlgorithmConfig.validate_workers_after_constructionr  Trg  r  r  s     r\   ru  z3AlgorithmConfig.validate_workers_after_construction  s)    EI	
 	
 	
 	

 	r^   'AlgorithmConfig.num_cpus_per_env_runnerc                     d S rb   r  r  s    r\   r_  z#AlgorithmConfig.num_cpus_per_worker  r  r^   c                 *    t          ddd           d S )Nz#AlgorithmConfig.num_cpus_per_workerr  Trg  r  r  s     r\   r_  z#AlgorithmConfig.num_cpus_per_worker  r  r^   'AlgorithmConfig.num_gpus_per_env_runnerc                     d S rb   r  r  s    r\   r`  z#AlgorithmConfig.num_gpus_per_worker  r  r^   c                 *    t          ddd           d S )Nz#AlgorithmConfig.num_gpus_per_workerr  Trg  r  r  s     r\   r`  z#AlgorithmConfig.num_gpus_per_worker  r  r^   /AlgorithmConfig.custom_resources_per_env_runnerc                     d S rb   r  r  s    r\   ra  z+AlgorithmConfig.custom_resources_per_worker  r  r^   c                 *    t          ddd           d S )Nz+AlgorithmConfig.custom_resources_per_workerr  Trg  r  r  s     r\   ra  z+AlgorithmConfig.custom_resources_per_worker  s)    =A	
 	
 	
 	

 	r^   AlgorithmConfig.num_learnersc                     d S rb   r  r  s    r\   rb  z#AlgorithmConfig.num_learner_workers  r  r^   c                 *    t          ddd           d S )Nz#AlgorithmConfig.num_learner_workersr  Trg  r  r  s     r\   rb  z#AlgorithmConfig.num_learner_workers  s)    5.	
 	
 	
 	

 	r^   $AlgorithmConfig.num_cpus_per_learnerc                     d S rb   r  r  s    r\   rc  z+AlgorithmConfig.num_cpus_per_learner_worker  r  r^   c                 *    t          ddd           d S )Nz+AlgorithmConfig.num_cpus_per_learner_workerr  Trg  r  r  s     r\   rc  z+AlgorithmConfig.num_cpus_per_learner_worker  )    =6	
 	
 	
 	

 	r^   $AlgorithmConfig.num_gpus_per_learnerc                     d S rb   r  r  s    r\   rd  z+AlgorithmConfig.num_gpus_per_learner_worker  r  r^   c                 *    t          ddd           d S )Nz+AlgorithmConfig.num_gpus_per_learner_workerr  Trg  r  r  s     r\   rd  z+AlgorithmConfig.num_gpus_per_learner_worker  r  r^   )AlgorithmConfig.num_cpus_for_local_workerc                     d S rb   r  r  s    r\   re  z)AlgorithmConfig.num_cpus_for_local_worker  r  r^   c                 *    t          ddd           d S )Nr  z)AlgorithmConfig.num_cpus_for_main_processTrg  r  r  s     r\   re  z)AlgorithmConfig.num_cpus_for_local_worker  s)    ;;	
 	
 	
 	

 	r^   rb   )rW   N)NNTr;  )NNr  )T)rz   
__module____qualname____doc__staticmethodre   rj   classmethodr  r   rr   r   r
   r[   r  r<   r  rB   ro   r   r  r   r  r   r  r  r  r  r  r.   r   r>   r   rF   r  r   r9  rG  rM  r@   r   r-  r  rD   rT  rC   rY  r\  r1   r^  r   r  floatrj  r  r  r=   r  r7  r	   r   r!   r  r.  r  r?   r  r   r  r  r%   r$   r  rA   r   r;   rE   r  r4   r  r  r  r  r   r  r  propertyr    r,  rV  r  r1  r  setterr  r  r  r  r  r  r  r   r  r'   r   rR  r  r   r"  r'  r  rn   r  r.  rw   r  r  r  r  r  r  r  r  r  r  r  r  rv  ru  r  r{   rB  rC  r-   r&   r  r   rW  r  r  r  r  r  r  rs  r  rt  rv  rw  r  r  rx  ry  rz  ru  r_  r`  ra  rb  rc  rd  re  __classcell__r  s   @r\   r`   r`   n   sI)        < ! ! \! ! ! \!
 D T    [2 ;  ;  [; zf< f<8D> f< f< f< f<PA, A A A AFi/i 
i i i iV4S>    < tCH~ %c	2B    [2,4S> , , , ,"  $    &, , , ,$ ;3 3 3 ;:3( .29=	&
 &
eCL)*&
 !"f*!56&
 	&

 
&
 &
 &
 &
T 	o o
 
o o o obs sk s s s sr 	I I
 
I I I I\ "&HL596:, , , g, hci.B(CCDE	,
 !!12, ""23, 
, , , ,b "&HL	$ $ $ g$ hci.B(CCDE	$
 
$ $ $ $L x  D        J <G;F	   +34. +34.	
 
   4 4?0;%0,7,,$4,$4$4&"2| | | $,C=| 5,-	|
 TN| %SM|  
!| | | |@ $/y )4,74?:E0;?J;F>I/:=H:E5@3>!y y yC=y  ~	y
 %SMy "$sCx.1y  (S#X7y  (~y 08}y ,4C=y /7smy 'tny .6c]y +33-y #4S>2y  #+4.!y" 
#y y y yz 8C=H) )&.tn) -5TN) 
	) ) ) )Z .9^ /:1<,7%05@,7'2/:#.)4$^ ^ ^eCL)*^ ]+	^
 $CI.^ sy)^ TN^ uT5[12^ $D>^ tn^ 'tn^ 4.^ "#^  
!^ ^ ^ ^F *5)42=5@1<JU1<?J:EBM,7?J  KVKV2=>I6A0;=H$/"-,72=5@-8<G-84?+6,72=7B,,,</ 0(82B&6!1+*ka a a !a "#	a
 "*$a %-TNa "*#a !)sC4E/E)F Ga "*#a "*%s
*;!<a *2$a 2:$a #5/a 08}a "*gYmT-5H&H IIJ"
a$ "*gz*E-mAT2T,UUV"
%a* ;C4.+a, ;C4.-a. #+3-/a0 "*%T	*:!;1a2 &.d^3a4  (~5a6 "*%S/!:7a8 SM9a: $;a< $D>=a@ "*$AaB %-TNCaD %TNEaF #4#89GaH %TNIaJ #+5/KaL $C=MaN %SMOaP "*$QaR &.e_Sal 
ma a a aL '2AL<G;FGR'2<G3> EP8C%u u u smu 'uS%_'=>	u
 'uUCZ'89u ,4C=u 6>e_u  }u -5SMu  Y0u $!3:#34mT-%889;
u" 5=TN#u$ &d38n5%u& 
'u u u ut "-/:%0&1*56A$/(32= +$/*:4D%2B3> EP8C9O O O O +,	O
 E?O smO #3-O '/smO SMO !O "*$O ~O D>O(  Y0)O* $!3:#34mT-%889;
+O6 5=TN7O8 &d38n59O: 
;O O O Oj 	U HSEPGREP    JULWHSFQEPDOCN9U U U!$}%tD,?'@@A
U $E(DN*B$CDU "%$x.(@"ABU $E(DN*B$CDU "%$x.(@"ABU $,(DN*+$
U "*(DN*+"
U  #+(DN*+#
!U& ,4(DN*+,
'U, 'uXtH~-E'FG-U. !)xh/G)H I/U0 %U8T(^+C%DE1U2 #54>)A#BC3U4 "%$x.(@"AB5U6 !xh'?!@A7U8  hX&> ?@9U: 
;U U U Ut .99D2=KVKV7B:EGR 8C5@4?9D5@2=6A8C9D<GCN@K:E:E 8CHS?JKV>I>I>IFQAL)9/Wg g g &c]g &eCHo6	g
 #+3-g <DC=g <DC=g &.e_g *2$g 7?tng $#%??@
g (0~g %-TNg  %-SM!g" %-X$6#g& &.c]'g( #+3-)g* "*(!3+g, $,H#5-g. %-X$6/g0 -5SM1g2 4<C=3g4 08~5g6 +33-7g8 +33-9g: 3;cN3
;g@ '/uoAgB 9ACgD /7tnEgF ;C4.GgH .6d^IgJ .6d^KgL /7smMgN 5=UOOgP 08QgX 
Yg g g gX LW-8<G3>6A.94?/:*52=6A+62=-8.93>+62=3>8C7B'26A-8-8 +(37B0;2=6A-85@+63>0;*5Ot t t sHi[+-E$FFGHt %TN	t
 $E#x-$89t #+4.t $DcN3t &d^t $,D>t  (}t #3-t "*$t !)c 3t #4.t "*$t  %TN!t" &d^#t$ #+4.%t& #4.'t( "*$)t* #+4.+t, )1-t. (0}/t0 tn1t2 &.d^3t4 %TN5t6 &c]7t8 9t:  ~;t< "*$s)!4=t> 'uo?t@ #+3-AtB &.d^CtD &c]EtF %-TNGtH $C=ItJ #+4.KtL  (~MtN #3-OtP 
Qt t t tv -8  6A-8(3.>$ *+r r r ,j.BBC
r &c]r $g}-x78
r $*X&(O1Ld1R(SST
r &.d^r !*r !r, 
-r r r rn ;F@K<G4?;F<G(3HST T T *2$T /7uo	T
 -5SMT #+5/T ,4C=T -5SMT  ~T "*$sDO/C*D!ET 
T T T Tp 5@=H #+D> -5TN 
	   @ :E(3#.(3'2), , , !"f*!56,  ~	,
 C=,  ~, tn, sm, 
, , , ,b 6A5@5@?J:EGR=H8C$4/ 0 0(82B&6!1'L L L %-TNL %-TN	L
 &.c]L .6e_L *2$L 8@}L ,4E?L '/uoL( 
)L L L Lb MX5@ *.D D D uT#s(^5G%GHID !!12	D
 085560
D 
D D D DR ,03>3> @K4?5@DOQ Q Q #4.Q #+4.	Q
 #+4.Q &.$t*d8T$Z#7889&
Q 08~Q $,D>Q %-TNQ 4<D>Q 
Q Q Q Qf $    X0 P P P P XP 	GtI 	G 	G 	G X	G 
 
 X
$ * * X*6 2c 2 2 2 X2 "(3# 3$ 3 3 3 )(3 L L L L XL'0 '0 '0C '0 '0 '0 '0VH	#	$H H H HT+ + + +Z

 

 


",< 
" 
" 
" 
"
"5i#1E+F 
" 
" 
" 
" "&CG)-	Y YgY c5CI)=#>>?@Y !	Y
 
Y Y Y Y| "&HL$7;>Br$ r$ r$ gr$ hci.B(CCDE	r$
 r$ d3
?34r$ &.l%;r$ 
r$ r$ r$ r$h( ( ( ( (&# # #:( ( ( ( ($#D # # # #
, , , ,
& & & &% % %' ' '& & & T#s(^    X"# # # #	 	 	 	   :.. .. .. ..`    > > >@M M M4t t tpf f fPj j jX 4    X 
D 
 
 
 X
 . . \.` H HS H4 H3 H H H \HT  *  >  :>!%HL7;F, F, F, 56F, g	F,
 hci.B(CCDEF, 'tF|4F, 
)8X4OQU4U+VV	WF, F, F, [F,P Z0>>>0 0 ?>0 Z@MMM  NM Z5TBBB  CB Z5TBBB  CB ZL   	  X
 !'
 
 ('
 Z;4HHH  IH X !
 
 "!
 ZHPTUUU  VU X Z5TBBB  CB X 
 
  
 Z<DIII  JI X "  #" Z=TJJJ  KJ X     Z@MMM  NM X "  #" Z@MMM  NM X #  $# ZANNN  ON X #  $# ZIQUVVV  WV X %+  ,+ ZKSW     X /5  65 ZGtTTT  UT X #)  *) ZB$OOO  PO X $  %$ ZE   	  X
 )/  0/ Z=TJJJ  KJ X     Z=TJJJ  KJ X     ZETRRR  SR X !'  (' Z2$???  @? X     Z:$GGG  HG X !'  (' Z:$GGG  HG X !'  (' Z?tLLL  ML X %  &%    r^   r`   c                   r    e Zd ZU dZee         ed<   d fd	Zeeeeedde	e
d                  de	ed	ged
ed
         f         f                  de	e         de	eeef                  dee         dd f fdZ fdZdee
d         ef         fdZdeee
d         ef                  fdZdee         fdZ xZS )DifferentiableAlgorithmConfiga]  An RLlib DifferentiableAlgorithmConfig builds a Meta algorithm from a given
    configuration

    .. testcode::

        from ray.rllib.algorithms.algorithm_config import DifferentiableAlgorithmConfig
        from ray.rllib.core.learner.differentiable_learner_config import (
            DifferentiableLearnerConfig,
        )
        from ray.rllib.core.learner.torch.torch_differentiable_learner import (
            TorchDifferentiableLearner,
        )
        # Construct a generic config for an algorithm that needs differentiable Learners.
        config = (
            DifferentiableAlgorithmConfig()
            .training(lr=3e-4)
            .environment(env="CartPole-v1")
            .learners(
                differentiable_learner_configs=[
                    DifferentiableLearnerConfig(
                        TorchDifferentiableLearner,
                        lr=1e-4,
                    )
                ]
            )
        )
        # The config is then used to configure a MetaLearner, see
        # `rllib/examples/algorithms/maml_lr_supervised_learning.py` for a full example.


    differentiable_learner_configsNc                 Z    t                                          |           g | _        dS )a  Initializes the DifferentiableLearnerConfig instance.

        Args:
            algo_class: An optional Algorithm class that this config class belongs to.
                Used (if provided) to build a respective Algorithm instance from this
                config.
        )r   N)r  r  r  )r  r   r  s     r\   r  z&DifferentiableAlgorithmConfig.__init__O  s2     	J/// RT+++r^   )rV  r  r  r  r  rV  rN   r  rT   r   r  r  rW   c                     t                      j        di | |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur| j                            |           |t          ur|| _        | S )ab  Sets the configurations for differentiable learners.

        Args:
            learner_class: The `Learner` class to use for (distributed) updating of the
                RLModule. Only used when `enable_rl_module_and_learner=True`.
            learner_connector: A callable taking an env observation space and an env
                action space as inputs and returning a learner ConnectorV2 (might be
                a pipeline) object.
            add_default_connectors_to_learner_pipeline: If True (default), RLlib's
                Learners automatically add the default Learner ConnectorV2
                pieces to the LearnerPipeline. These automatically perform:
                a) adding observations from episodes to the train batch, if this has not
                already been done by a user-provided connector piece
                b) if RLModule is stateful, add a time rank to the train batch, zero-pad
                the data, and add the correct state inputs, if this has not already been
                done by a user-provided connector piece.
                c) add all other information (actions, rewards, terminateds, etc..) to
                the train batch, if this has not already been done by a user-provided
                connector piece.
                Only if you know exactly what you are doing, you
                should set this setting to False.
                Note that this setting is only relevant if the new API stack is used
                (including the new EnvRunner classes).
            learner_config_dict: A dict to insert any settings accessible from within
                the Learner instance. This should only be used in connection with custom
                Learner subclasses and in case the user doesn't want to write an extra
                `AlgorithmConfig` subclass just to add a few settings to the base Algo's
                own config class.
            differentiable_learner_configs: A list of `DifferentiableLearnerConfig` instances
                defining the `DifferentiableLearner` classes used for the nested updates in
                `Algorithm`'s learner.
        r  )	r  r  r1   r	  r  r  r  r  r  )r  rV  r  r  r  r  ri   r  s          r\   r  z&DifferentiableAlgorithmConfig.learners^  s    X 	""6"""++"/DK//&7D#5[HH: ; k11$++,?@@@)<<2PD/r^   c                    t                                                       ddlm} t	          |                                 |          s+|                     d|                                  d           t          | j        t                    s-|                     dt          | j                   d           dS t          d | j        D                       s|                     d           dS dS )	r  r   rR   zQ`get_default_learner_class` must return a `MetaLearner` class or sublass but got r  zc`differentiable_learner_configs` must be a list of `DifferentiableLearnerConfig` instances, but is c              3   @   K   | ]}t          |t                    V  d S rb   )rY   r   )r  learner_cfgs     r\   r  z9DifferentiableAlgorithmConfig.validate.<locals>.<genexpr>  sA       
 
 {$?@@
 
 
 
 
 
r^   z`differentiable_learner_configs` must be a list of `DifferentiableLearnerConfig` instances, but at least one instance is not a `DifferentiableLearnerConfig`.N)r  r   /ray.rllib.core.learner.torch.torch_meta_learnerrS   rf  r  r  rY   r  r)  r[   r  )r  rS   r  s     r\   r   z&DifferentiableAlgorithmConfig.validate  sT    	 	UTTTTT$88::<LMM 	J&*&D&D&F&FJ J J  
 $=tDD 	@;<<@ @ @      
 
#B
 
 
 
 
 	 G    		 	r^   rS   c                     t           S )a\  Returns the `MetaLearner` class to use for this algorithm.

        Override this method in the sub-class to return the `MetaLearner`.

        Returns:
            The `MetaLearner` class to use for this algorithm either as a class
            type or as a string. (e.g. "ray.rllib.core.learner.torch.torch_meta_learner.TorchMetaLearner")
        NotImplementedr  s    r\   r  z7DifferentiableAlgorithmConfig.get_default_learner_class  s
     r^   rO   c                     t           S )a  Returns the `DifferentiableLearner` classes to use for this algorithm.

        Override this method in the sub-class to return the `DifferentiableLearner`.

        Returns:
            The `DifferentiableLearner` class to use for this algorithm either as a class
            type or as a string. (e.g.
            "ray.rllib.core.learner.torch.torch_meta_learner.TorchDifferentiableLearner").
        r  r  s    r\   "get_differentiable_learner_classesz@DifferentiableAlgorithmConfig.get_differentiable_learner_classes  s
     r^   c                     | j         S )a  Returns the `DifferentiableLearnerConfigs` for all `DifferentiableLearner`s.

        Override this method in the sub-class to return the `DifferentiableLearnerConfig`s.

        Returns:
            The `DifferentiableLearnerConfig` instances to use for this algorithm.
        )r  r  s    r\   "get_differentiable_learner_configsz@DifferentiableAlgorithmConfig.get_differentiable_learner_configs  s     22r^   rb   )rz   r  r  r  r	   r   __annotations__r  r1   r
   r   r   r   r  r   r  r   r  r   r  r  r  r  r  s   @r\   r  r  )  s         F %))D$EEEET T T T T T$ 4? EP8CLW; ; ;  Y0; $j\5]8K)K#LLM
	; 5=TN; &d38n5; )--H(I; 
); ; ; ; ; ;z         D	56H1I31N+O 	 	 	 		eD01367	8   3D9T4U 3 3 3 3 3 3 3 3r^   r  c                       e Zd ZdZdZdZdS )r   a{  Enumerates schemes of what parts of the TorchLearner can be compiled.

    This can be either the entire update step of the learner or only the forward
    methods (and therein the forward_train method) of the RLModule.

    .. note::
        - torch.compiled code can become slow on graph breaks or even raise
            errors on unsupported operations. Empirically, compiling
            `forward_train` should introduce little graph breaks, raise no
            errors but result in a speedup comparable to compiling the
            complete update.
        - Using `complete_update` is experimental and may result in errors.
    complete_updateforward_trainN)rz   r  r  r  rA  r   r  r^   r\   r   r     s&         " (O $MMMr^   r   )r  r  loggingr  r   enumr   typingr   r   r   r   r   r	   r
   r   r   r   	gymnasiumr-  rX  	packagingr   typing_extensionsr   r  ray._common.deprecationr   r   r   ray.rllib.callbacks.callbacksr   !ray.rllib.connectors.connector_v2r   ray.rllib.corer   ray.rllib.core.columnsr   4ray.rllib.core.learner.differentiable_learner_configr   ray.rllib.core.rl_moduler   -ray.rllib.core.rl_module.default_model_configr   (ray.rllib.core.rl_module.multi_rl_moduler   "ray.rllib.core.rl_module.rl_moduler   ray.rllib.envr   r   ray.rllib.env.multi_agent_envr   %ray.rllib.env.wrappers.atari_wrappersr    0ray.rllib.evaluation.collectors.sample_collectorr!   5ray.rllib.evaluation.collectors.simple_list_collectorr"   ray.rllib.modelsr#   ray.rllib.offline.input_readerr$   ray.rllib.offline.io_contextr%   ray.rllib.policy.policyr&   r'   ray.rllib.policy.sample_batchr)   ray.rllib.utilsr*   r+   r,   ray.rllib.utils.annotationsr-   r.   ray.rllib.utils.frameworkr/   r0   ray.rllib.utils.from_configr1   r2   &ray.rllib.utils.metrics.metrics_loggerr3   ray.rllib.utils.metrics.statsr4   #ray.rllib.utils.schedules.schedulerr5   ray.rllib.utils.serializationr6   r7   r8   ray.rllib.utils.test_utilsr9   ray.rllib.utils.torch_utilsr:   ray.rllib.utils.typingr;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   ray.tune.loggerrF   ray.tune.registryrG   ray.tune.resultrH   ray.tune.tunerI   ray.utilrJ   ray.util.placement_grouprK   r  rM   ray.rllib.core.learnerrN   -ray.rllib.core.learner.differentiable_learnerrO   rQ  rQ   r  rS   rT   rU   	getLoggerrz   r(  r]   r`   r  r  r   r  r^   r\   <module>r9     s          



                                         " " " " " " 



         
 8 7 7 7 7 7 9 9 9 9 9 9 , , , , , , * * * * * *      8 7 7 7 7 7 L L L L L L F F F F F F ; ; ; ; ; ; C C C C C C C C 7 7 7 7 7 7 : : : : : : L L L L L L U U U U U U + + + + + + 6 6 6 6 6 6 2 2 2 2 2 2 6 6 6 6 6 6 6 6 ; ; ; ; ; ; @ @ @ @ @ @ @ @ @ @        F E E E E E E E @ @ @ @ @ @ @ @ K K K K K K 3 3 3 3 3 3 9 9 9 9 9 9         
 - , , , , , F F F F F F                          # " " " " " / / / / / / & & & & & & ! ! ! ! ! !       3 3 3 3 3 3 3888888......SSSSSSAAAAAAPPPPPP;;;;;;222222		8	$	$
'7 
D 
 
 
 
x^ x^ x^ x^ x^g x^ x^ x^v}u3 u3 u3 u3 u3O u3 u3 u3p$ $ $ $ $T $ $ $ $ $r^   