
    &`iJ                     X   d dl Z d dlZd dlZd dlmZmZmZmZmZm	Z	m
Z
 d dlmZ d dlZd dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZ d d	lmZmZ d d
lmZmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z.m/Z/ d dl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZD d dlEmFZF d dlGmHZHmIZI d dlJmKZK d dlLmMZM d dlNmOZO d dlPmQZQmRZRmSZSmTZT d dlUmVZVmWZW  ejX        eY          ZZdZ[ G d de          Z\e\Z] G d de          Z^e^Z_e.d              Z`dS )!    N)DictListOptionalSetTupleTypeUnion)Self)	ObjectRef)DEPRECATED_VALUEdeprecation_warning)SampleBatch)	Algorithm)AlgorithmConfigNotProvided)AddOneTsToEpisodesAndTruncateNumpyToTensor)!COMPONENT_ENV_TO_MODULE_CONNECTOR!COMPONENT_MODULE_TO_ENV_CONNECTOR)TrainingData)RLModuleSpec)MixInMultiAgentReplayBuffer)LearnerThread)MultiGPULearnerThread)Policy)concat_samples)OldAPIStackoverride)AGGREGATOR_ACTOR_RESULTSALL_MODULESENV_RUNNER_RESULTSLEARNER_GROUPLEARNER_RESULTSLEARNER_UPDATE_TIMERMEAN_NUM_EPISODE_LISTS_RECEIVED$MEAN_NUM_LEARNER_GROUP_UPDATE_CALLED!MEAN_NUM_LEARNER_RESULTS_RECEIVEDNUM_AGENT_STEPS_SAMPLEDNUM_AGENT_STEPS_TRAINEDNUM_ENV_STEPS_SAMPLEDNUM_ENV_STEPS_SAMPLED_LIFETIMENUM_ENV_STEPS_TRAINEDNUM_ENV_STEPS_TRAINED_LIFETIMENUM_SYNCH_WORKER_WEIGHTS7NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTSSAMPLE_TIMERSYNCH_WORKER_WEIGHTS_TIMERTIMERS)LearnerInfoBuilder))DEFAULT_HISTOGRAM_BOUNDARIES_SHORT_EVENTSTimerAndPrometheusLogger)
ReplayMode)_ALL_POLICIES)	Scheduler)LearningRateOrSchedulePolicyID
ResultDictSampleBatchType)Counter	Histogramcurr_entropy_coeffc            4       f    e Zd ZdZd( fd	Z ee          eeeeeeeeeeeeeeeeeeeeeeeeeedde	e
         de	e         de	e         de	e         d	e	e         d
e	e         de	e         de	e         de	e         de	e         de	e         de	e         de	e         de	e         de	e         de	eeeeef                                    de	e         de	e         de	e         de	e         de	e         de	eeeeef                                    de	e
         de	e         def2 fd            Zeedd e	e
         d!e	e
         def fd"Z ee          d) fd#            Zedefd$            Z ee          d%             Z ee          defd&            Z ee          	 d( fd'	            Z xZS )*IMPALAConfigaI  Defines a configuration class from which an Impala can be built.

    .. testcode::

        from ray.rllib.algorithms.impala import IMPALAConfig

        config = (
            IMPALAConfig()
            .environment("CartPole-v1")
            .env_runners(num_env_runners=1)
            .training(lr=0.0003, train_batch_size_per_learner=512)
            .learners(num_learners=1)
        )
        # Build a Algorithm object from the config and run 1 training iteration.
        algo = config.build()
        algo.train()
        del algo

    .. testcode::

        from ray.rllib.algorithms.impala import IMPALAConfig
        from ray import tune

        config = (
            IMPALAConfig()
            .environment("CartPole-v1")
            .env_runners(num_env_runners=1)
            .training(lr=tune.grid_search([0.0001, 0.0002]), grad_clip=20.0)
            .learners(num_learners=1)
        )
        # Run with tune.
        tune.Tuner(
            "IMPALA",
            param_space=config,
            run_config=tune.RunConfig(stop={"training_iteration": 1}),
        ).fit()
    Nc                    ddi| _         t                                          |pt                     d| _        d| _        d| _        d| _        d| _        d| _	        d| _
        d	| _        d
| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _         d| _!        d| _"        d| _#        d| _$        d| _%        d| _&        d| _'        d| _(        tR          | _*        tR          | _+        dS )z$Initializes a IMPALAConfig instance.typeStochasticSampling)
algo_classTg      ?                 g      D@global_normg      ?g{Gz?r   2   i     gMb@?
   FNi,  adamgGz?g?),exploration_configsuper__init__IMPALAvtracevtrace_clip_rho_thresholdvtrace_clip_pg_rho_thresholdlearner_queue_sizetimeout_s_sampler_managertimeout_s_aggregator_managerbroadcast_intervalnum_gpu_loader_threads	grad_clipgrad_clip_byvf_loss_coeffentropy_coeffnum_learners!num_aggregator_actors_per_learnerrollout_fragment_lengthtrain_batch_sizenum_env_runnerslrmin_time_s_per_iteration!_dont_auto_sync_env_runner_states_env_runners_only_skip_learnerslr_scheduleentropy_coeff_schedulenum_multi_gpu_tower_stacksminibatch_buffer_sizereplay_proportionreplay_buffer_num_slotslearner_queue_timeoutopt_typedecaymomentumepsilon_separate_vf_optimizer_lr_vfnum_gpus%_tf_policy_handles_more_than_one_lossr   num_aggregation_workers,max_requests_in_flight_per_aggregator_worker)selfrE   	__class__s     v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/algorithms/impala/impala.pyrQ   zIMPALAConfig.__init__n   sx    (#
 	J$8&999 ),&,/)"#),&,/)"#&'# * ! 12.')$ # (*%
 26. "'#&*#*+'%&"!$'($%("
&+#592 (8$<L999    )rS   rT   rU   rZ   rk   rl   rm   rn   rV   ro   rW   rX   rY   r[   rp   ri   rq   rr   rs   r]   r^   rj   rt   ru   rx   ry   rS   rT   rU   rZ   rk   rl   rm   rn   rV   ro   rW   rX   rY   r[   rp   ri   rq   rr   rs   r]   r^   rj   rt   ru   returnc                   |t           k    rt          ddd           |t           k    rt          ddd            t                      j        di | |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _	        |t          ur|| _
        |t          ur|| _        |t          ur|| _        |	t          ur|	| _        |
t          ur|
| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        | S )aF  Sets the training related configuration.

        Args:
            vtrace: V-trace params (see vtrace_tf/torch.py).
            vtrace_clip_rho_threshold:
            vtrace_clip_pg_rho_threshold:
            num_gpu_loader_threads: The number of GPU-loader threads (per Learner
                worker), used to load incoming (CPU) batches to the GPU, if applicable.
                The incoming batches are produced by each Learner's LearnerConnector
                pipeline. After loading the batches on the GPU, the threads place them
                on yet another queue for the Learner thread (only one per Learner
                worker) to pick up and perform `forward_train/loss` computations.
            num_multi_gpu_tower_stacks: For each stack of multi-GPU towers, how many
                slots should we reserve for parallel data loading? Set this to >1 to
                load data into GPUs in parallel. This will increase GPU memory usage
                proportionally with the number of stacks.
                Example:
                2 GPUs and `num_multi_gpu_tower_stacks=3`:
                - One tower stack consists of 2 GPUs, each with a copy of the
                model/graph.
                - Each of the stacks will create 3 slots for batch data on each of its
                GPUs, increasing memory requirements on each GPU by 3x.
                - This enables us to preload data into these stacks while another stack
                is performing gradient calculations.
            minibatch_buffer_size: How many train batches should be retained for
                minibatching. This conf only has an effect if `num_epochs > 1`.
            replay_proportion: Set >0 to enable experience replay. Saved samples will
                be replayed with a p:1 proportion to new data samples.
            replay_buffer_num_slots: Number of sample batches to store for replay.
                The number of transitions saved total will be
                (replay_buffer_num_slots * rollout_fragment_length).
            learner_queue_size: Max queue size for train batches feeding into the
                learner.
            learner_queue_timeout: Wait for train batches to be available in minibatch
                buffer queue this many seconds. This may need to be increased e.g. when
                training with a slow environment.
            timeout_s_sampler_manager: The timeout for waiting for sampling results
                for workers -- typically if this is too low, the manager won't be able
                to retrieve ready sampling results.
            timeout_s_aggregator_manager: The timeout for waiting for replay worker
                results -- typically if this is too low, the manager won't be able to
                retrieve ready replay requests.
            broadcast_interval: Number of training step calls before weights are
                broadcasted to rollout workers that are sampled during any iteration.
            grad_clip: If specified, clip the global norm of gradients by this amount.
            opt_type: Either "adam" or "rmsprop".
            lr_schedule: Learning rate schedule. In the format of
                [[timestep, lr-value], [timestep, lr-value], ...]
                Intermediary timesteps will be assigned to interpolated learning rate
                values. A schedule should normally start from timestep 0.
            decay: Decay setting for the RMSProp optimizer, in case `opt_type=rmsprop`.
            momentum: Momentum setting for the RMSProp optimizer, in case
                `opt_type=rmsprop`.
            epsilon: Epsilon setting for the RMSProp optimizer, in case
                `opt_type=rmsprop`.
            vf_loss_coeff: Coefficient for the value function term in the loss function.
            entropy_coeff: Coefficient for the entropy regularizer term in the loss
                function.
            entropy_coeff_schedule: Decay schedule for the entropy regularizer.
            _separate_vf_optimizer: Set this to true to have two separate optimizers
                optimize the policy-and value networks. Only supported for some
                algorithms (APPO, IMPALA) on the old API stack.
            _lr_vf: If _separate_vf_optimizer is True, define separate learning rate
                for the value network.

        Returns:
            This updated AlgorithmConfig object.
        z+config.training(num_aggregation_workers=..)aO  Aggregator workers are no longer supported on the old API stack! To use aggregation (and GPU pre-loading) on the new API stack, activate the new API stack, then set `config.learners(num_aggregator_actors_per_learner=..)`. Good choices are normally 1 or 2, but this depends on your overall setup, especially your `EnvRunner` throughput.T)oldhelperrorz@config.training(max_requests_in_flight_per_aggregator_worker=..)zAggregator workers are no longer supported on the old API stack! To use aggregation (and GPU pre-loading) on the new API stack, activate the new API stack and THEN set `config.learners(max_requests_in_flight_per_aggregator_actor=..)`. )r   r   rP   trainingr   rS   rT   rU   rZ   rk   rl   rm   rn   rV   ro   rY   rW   rX   r[   rp   ri   rq   rr   rs   r]   r^   rj   rt   ru   )rz   rS   rT   rU   rZ   rk   rl   rm   rn   rV   ro   rW   rX   rY   r[   rp   ri   rq   rr   rs   r]   r^   rj   rt   ru   rx   ry   kwargsr{   s                               r|   r   zIMPALAConfig.training   sN   J #&666AA 	 	 	 	 8;KKKV
     	""6"""$$ DK$K77-FD*'{::0LD-!44*@D'%[88.HD+ 33)>D&K//%6D""+55+BD([00&8D# 33)>D&[00&8D#$K77-FD*'{::0LD-K''&DN;&&$DMk))*D##DJ;&&$DM+%%"DL++!.D++!.D!44*@D'!44*@D'$$ DKr}   )rg   rh   rg   rh   c                z     t                      j        di | |t          ur|| _        |t          ur|| _        | S )a  Sets the debugging related configuration.

        Args:
            _env_runners_only: If True, only run (remote) EnvRunner requests, discard
                their episode/training data, but log their metrics results. Aggregator-
                and Learner actors won't be used.
            _skip_learners: If True, no `update` requests are sent to the LearnerGroup
                and Learner actors. Only EnvRunners and aggregator actors (if
                applicable) are used.
        r   )rP   	debuggingr   rg   rh   )rz   rg   rh   r   r{   s       r|   r   zIMPALAConfig.debuggingi  sM    " 	##F###K//%6D",,"0Dr}   c           	         t                                                       | j        s|                     d           | j        r| j        dk    r|                     d|  d           | j        |                     d           | j        |                     d           t          j        | j	        dd	           | j
        Q| j
        | j        z  d
k    r| j
        | j        k    s.|                     d| j
         d| j         d| j         d           n:t          | j	        t                    r | j	        dk     r|                     d           | j        dv r)| j        du r"| j        du r|                     d           d S d S d S d S )NzXIMPALA and APPO do NOT support vtrace=False anymore! Set `config.training(vtrace=True)`.rG   zkThe new API stack in combination with the new EnvRunner API does NOT support a mixin replay buffer yet for z) (set `config.replay_proportion` to 0.0)!zW`lr_schedule` is deprecated and must be None! Use the `lr` setting to setup a schedule.zm`entropy_coeff_schedule` is deprecated and must be None! Use the `entropy_coeff` setting to setup a schedule.r^   zentropy coefficient)fixed_value_or_schedulesetting_namedescriptionr   z`minibatch_size` (zB) must either be None or a multiple of `rollout_fragment_length` (zL) while at the same time smaller than or equal to `total_train_batch_size` (z)!z`entropy_coeff` must be >= 0.0)tftf2TFz`_tf_policy_handles_more_than_one_loss` must be set to True, for TFPolicy to support more than one loss term/optimizer! Try setting config.training(_tf_policy_handles_more_than_one_loss=True).)rP   validaterS   _value_error"enable_env_runner_and_connector_v2replay_ratiori   rj   r8   r^   minibatch_sizera   total_train_batch_size
isinstancefloatframework_strrt   rw   rz   r{   s    r|   r   zIMPALAConfig.validate  s4    	 { 	2   2 '	D C''!!GG G G   +!!8  
 *6!!C   (,(:,1   
 ".$t'CCqHH'4+FFF!!8)< 8 848 8 3	8 8 8   $,e44 D9Kc9Q9Q!!"BCCC
 -//+t33:eCCO    	 0/33CCr}   c                 0    | j         dk    r
d| j         z  ndS )z~Returns replay ratio (between 0.0 and 1.0) based off self.replay_proportion.

        Formula: ratio = 1 / proportion
        r   rH   rG   )rm   )rz   s    r|   r   zIMPALAConfig.replay_ratio  s$     04/E/I/ID***sRr}   c                     | j         dk    rddlm} |S | j         dv rt          d          t          d| j          d          )Ntorchr   )IMPALATorchLearner)r   r   zPTensorFlow is no longer supported on the new API stack! Use `framework='torch'`.The framework z+ is not supported. Use `framework='torch'`.)r   6ray.rllib.algorithms.impala.torch.impala_torch_learnerr   
ValueError)rz   r   s     r|   get_default_learner_classz&IMPALAConfig.get_default_learner_class  s    ((      &%=00+  
 +!3 + + +  r}   c                 t    | j         dk    rddlm} t          |          S t	          d| j          d          )Nr   r   )DefaultPPOTorchRLModule)module_classr   z/ is not supported. Use either 'torch' or 'tf2'.)r   :ray.rllib.algorithms.ppo.torch.default_ppo_torch_rl_moduler   r   r   )rz   r   s     r|   get_default_rl_module_specz'IMPALAConfig.get_default_rl_module_spec  sm    ((       -DEEEE/!3 / / /  r}   c                     t                                          |||          }| j        rF|                    t	                                 | j        dk    r|                    t                     |S )Nr   )rP   build_learner_connector*add_default_connectors_to_learner_pipelineprependr   r`   remover   )rz   input_observation_spaceinput_action_spacedevice	connectorr{   s        r|   r   z$IMPALAConfig.build_learner_connector  sy     GG33#
 
	
 : 	0 ;==>>>599  ///r}   Nr~   N)__name__
__module____qualname____doc__rQ   r   r   r   r   r   boolr   intstrr   r	   r9   r
   r   r   r   propertyr   r   r   r   r   __classcell__r{   s   @r|   rA   rA   G   s       $ $LHM HM HM HM HM HMT Xo "-5@8C0;4?/:-81<,71<5@8C,7%0"-?J!,$/#.)4:EJU1<"- 05E;n n n n $,E?	n
 '/uon !)n %-SMn  (}n $E?n "*#n %SMn  (n $,E?n '/uon %SMn  E?!n" 3-#n$ d4c5j(9#:;<%n& 'n( 5/)n* %+n,  -n.   67/n0 !)d5e3D.E)F G1n2 !)3n4 5n> 
?n n n n n nf -8)4	   $D> !	 
     4 Xo@ @ @ @ @ @D Se S S S XS Xo  $ XoL     Xo
 	         r}   rA   c                   J    e Zd ZdZe ee          defd                        Ze ee          de	de
ee                  fd                        Z ee          de	f fd            Z ee          d             Zd Zd	ee         d
edeee                  fdZdeeeef                  deee                  fdZ ee           fd            Zed             Ze	 dde
e         deeeeeef         f                  fd            Zedeeeef                  dee         fd            Zedee         ddfd            Z edd            Z!ede"fd            Z#e	 d de$e         de
ee%                  ddfd            Z& ee           fd            Z' xZ(S )!rR   a  Importance weighted actor/learner architecture (IMPALA) Algorithm

    == Overview of data flow in IMPALA ==
    1. Policy evaluation in parallel across `num_env_runners` actors produces
       batches of size `rollout_fragment_length * num_envs_per_env_runner`.
    2. If enabled, the replay buffer stores and produces batches of size
       `rollout_fragment_length * num_envs_per_env_runner`.
    3. If enabled, the minibatch ring buffer stores and replays batches of
       size `train_batch_size` up to `num_epochs` times per batch.
    4. The learner thread executes data parallel SGD across `num_gpus` GPUs
       on batches of size `train_batch_size`.
    r~   c                     t                      S r   )rA   )clss    r|   get_default_configzIMPALA.get_default_config  s     ~~r}   configc                 ^    |j         dk    rddlm} |S |j         dk    rddlm} |S ddlm} |S )Nr   r   )ImpalaTorchPolicyr   )ImpalaTF1Policy)ImpalaTF2Policy)r   /ray.rllib.algorithms.impala.impala_torch_policyr   ,ray.rllib.algorithms.impala.impala_tf_policyr   r   )r   r   r   r   r   s        r|   get_default_policy_classzIMPALA.get_default_policy_class  s    
 7**      %$!T))      #"      #"r}   c                 T   t                                          |           d| j        t          <   g | _        d | _        g | _        g | _        d t          | j	        j
        pd          D             | _        | j	        j        sW| j	        j        dk    rGt          | j	        j        dk    r| j	        j        nd| j	        j        t"          j                  | _        i | _        | j	        j        s:t)          | j        | j	                  | _        | j                                         d S d S )Nr   c                     i | ]}|g S r   r   ).0is     r|   
<dictcomp>z IMPALA.setup.<locals>.<dictcomp>D  s'     9
 9
 9
Ar9
 9
 9
r}   rH   rG   )capacityr   replay_mode)rP   setup	_countersr/   data_to_place_on_learnerlocal_mixin_buffer_batch_being_built_episode_packs_being_builtranger   r_   _ma_batches_being_builtenable_rl_module_and_learnerrm   r   rn   r   r6   LOCKSTEP_resultsmake_learner_thread
env_runner_learner_threadstart)rz   r   r{   s     r|   r   zIMPALA.setup5  s8   f STNO )+%"&"$ +-'9
 9
 !9!>Q??9
 9
 9
$ {7 
	{,s22*E  ;>BB ;;!%!9 * 3+ + +' {7 	)#6t#T#TD  &&(((((	) 	)r}   c                 0   t          | j                  5  | j        j        s |                                 cd d d            S | j        j        dk    }| j                            t          t          f          5  | 
                                \  }}}}| j                            |t                     | j                            t          t          ft          |                     d d d            n# 1 swxY w Y   | j        j        r	 d d d            d S | j        j        dk    rut          | j                  5  |                     || j        j                  }| j                            t*          dft          |                     | j                            dd          }g }|D ]0}	|                    |	j        |	                                f           1| j                            t*          dft          |                     |r| j        j        | j        j        pd	z  }
|d |
         ||
d          }}| j                            d
d |D             d          }| j        j        t          |          |z
  z  }|dk    r| j                            |           | j                            t*          df|d           ||                     |          }t          |          dk    r)| j                            t          |                     n| j                             d	           | j                            t*          df| j        j        | j        j        pd	z  t          |          z  dd           d d d            n# 1 swxY w Y   n!|                     || j        j!                  }| j        j"        r	 d d d            d S | j                            t          tF          f          5  | j                            tH          t          |                     d }d}| j%        tL                   | j        j'        k    }t          | j(                  5  |D ]F}tR          | j        *                    t          tR          fd          tV          | j        *                    tX          tZ          tV          fd          i}| j        j        dk    r2t          |          | j        j        pd	k    sJ t]          |          }nt]          |          }| j/        0                    ||||| j        j1        | j        j2        | j        j3                  }d}|t          |          z  }|D ]}|4                    d|          }| j                            |tX                     H	 d d d            n# 1 swxY w Y   | j                            tj          tl          f|           d d d            n# 1 swxY w Y   | j%        tL          xx         d	z  cc<   | j                            tL          | j%        tL                   d           | j        7                    | j/        8                                tj                     |d| j%        tL          <   | j                            tr          d	d           | j                            t          tt          f          5  t          | j;                  5  | j<        =                    | j        ||| j        *                    t          tR          fd          | j>        | j?                   d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   d d d            d S # 1 swxY w Y   d S )Nr   )key)package_sizemean_num_input_packagesTget_batches)return_obj_refstagsmean_num_output_batchesrH   	get_batchc                 .    g | ]}t          |           S ))episode_refs)dict)r   ps     r|   
<listcomp>z(IMPALA.training_step.<locals>.<listcomp>  s#    #H#H#HQDa$8$8$8#H#H#Hr}   )funcr   tag)valuenum_env_steps_dropped_lifetimesum)reduce!num_env_steps_aggregated_lifetime)r   with_throughput)r   r   )default)
batch_refs)episodes_refs)training_dataasync_updatereturn_state	timesteps
num_epochsr   shuffle_batch_per_epochF_rl_module_state_after_update)stats_dictsr   mean)r   connector_statesrl_module_stateenv_steps_sampledenv_to_modulemodule_to_env)@r5   "_metrics_impala_training_step_timer   r   _training_step_old_api_stackr_   metricslog_timer2   r0    _sample_and_get_connector_states	aggregater!   	log_valuer%   lenrg   r`   ;_metrics_impala_training_step_aggregator_preprocessing_time_pre_queue_episode_refstrain_batch_size_per_learnerr   _aggregator_actor_managerfetch_ready_async_reqsappendactor_idgetforeach_actor_async/_metrics_impala_training_step_env_steps_droppedinc_pre_queue_batch_refs+_metrics_impala_training_step_input_batches0_metrics_impala_training_step_zero_input_batchesr   rh   r$   r&   r   r/   rY   5_metrics_impala_training_step_learner_group_loop_timer+   peekr-   r#   r    r   learner_groupupdater   r   r   popr"   r'   log_dict	get_statsr.   r1   8_metrics_impala_training_step_sync_env_runner_state_timeenv_runner_groupsync_env_runner_statesenv_to_module_connectormodule_to_env_connector)rz   do_async_updatesr   r   env_runner_metricsenv_runner_indices_to_updatedata_packages_for_aggregatorsma_batches_refs_remote_resultsma_batches_refscall_resultnum_aggpackssent_droppeddata_packages_for_learner_groupr   "num_learner_group_results_receivedr   batch_ref_or_episode_list_refr   r   learner_resultsresult_from_1_learners                          r|   training_stepzIMPALA.training_step_  s   %d&MNN j	 j	;; ;88::j	 j	 j	 j	 j	 j	 j	 j	
  ${7!;
 &&'=>>   99;; $&0 &&&* '    &&')HI%%                ( {, ?j	 j	 j	 j	 j	 j	 j	 j	H {<q@@-T  N N 594P4P$%)[%M 5Q 5 51 L**13LML))   6MM,0!. N   3 ')O'E  '..(1;??3D3DE    L**13LMO,,  
 8 "&+"O K49# :(7(C9'((C  =  $=QQ!,#H#H%#H#H#H -  R     $(;#KJJ-$ $a<< PTT&. U    .. 8 @ %#( /   + 8 > 7;6P6P'7 73 :;;a??HLL"%&E"F"F M     MQQ"# R    L**13VW@;38q:=>>?  %(, +   ON N N N N N N N N N N N N N Nb 372N2N t{/Q 3O 3 3/
 {) yj	 j	 j	 j	 j	 j	 j	 j	~ &&0D'EFF I I&&<=>> '    #'562 NO {56  .N  7 7
 94 45 ;DL<M<M!35S T() =N = = ;DL<M<M$3$/$B!"
 )* =N = =%	  ;H1LL#&'D#E#E $ 8 =A$ $ $ $ -9+H- - -MM
 -9.K- - -M +/*<*C*C*7)9)5&/'+{'=+/;+E48K4W +D + + (-:c/>R>RR:5D  1.C.G.G ?/ /OO ..(7 / /    c47 7 7 7 7 7 7 7 7 7 7 7 7 7 7p &&&(IJ< '   MI I I I I I I I I I I I I I IV NRSSSWXXSSSL""GVW #    L!!$"4">">"@"@m!TTT
 *  K &&'?5&QQQ\**F4N+OPP  1U    -DD#';-=,;.2l.?.?!35S T() /@ / / +/*F*.*F E 
 
 
                            yj	 j	 j	 j	 j	 j	 j	 j	 j	 j	 j	 j	 j	 j	 j	 j	 j	 j	s   ^6^8A/C3'^3C7	7^:C7	;^%^=I	N^N	^N	2^&^?A'W=&EV>2W=>WW=W+W=1^=X	^X	C ^%]3:A]]3]  ]3#] $]3'^3]7	7^:]7	;^^^c                 
   t          | j                  5  t                      }g }g }g }| j                                        }|dk    r| j                            dd| j        j        dd          }g }|D ]8}|                    |d                    |	                    |d                    9|D ]E\  }	}
}|	                    |	           |	                    |
           |	                    |           Fnp| j
                                        }	| j
                                        g}t          j        |	          g}| j
                            t           t"          g          g}d d d            n# 1 swxY w Y   ||||fS )Nr   sample_get_state_and_metricsFT)r   r   timeout_secondsr   return_actor_idsrH   )
components)r5   4_metrics_impala_sample_and_get_connector_states_timesetr!  num_healthy_remote_workers$foreach_env_runner_async_fetch_readyr   rW   addr  r   sampleget_metricsrayput	get_stater   r   )rz   r'  r   r   r&  r=  async_resultsresultsrepisodesstatesr  s               r|   r  z'IMPALA._sample_and_get_connector_statesM  s	   %E
 
 -	 -	 ,/55(L!!#%@@BB '
 *A--)NN;:(,(M(-)- O    & ) )A044QqT:::NN1Q4((((3: 7 7/Xvw ''111$++F333&--g66667  ?1133&*o&A&A&C&C%D" # 1 12O--==$ .  $ M-	 -	 -	 -	 -	 -	 -	 -	 -	 -	 -	 -	 -	 -	 -	` (	
 	
s   EE44E8;E8r   r   c                    g }|D ]|}| j                             |           t          | j                   | j        j        z  | j                                        z  |k    r!|                    | j                    g | _         }|S r   )r   r  r
  r   num_envs_per_env_runnerget_rollout_fragment_length)rz   r   r   rH  refs        r|   r  zIMPALA._pre_queue_episode_refs  s    
 +- 		5 		5C+223777D344+56+99;;<     
  ?@@@24/r}   r   c                    |D ]2\  }}| j         |         }| j        |                             |           3g }t          d | j                                        D                       rf|                    d | j                                        D                        t          d | j                                        D                       f|S )Nc              3      K   | ]}|V  d S r   r   r   learner_lists     r|   	<genexpr>z/IMPALA._pre_queue_batch_refs.<locals>.<genexpr>  s3       
 
)L
 
 
 
 
 
r}   c                 8    g | ]}|                     d           S )r   )r  rP  s     r|   r   z0IMPALA._pre_queue_batch_refs.<locals>.<listcomp>  s6       $ !$$Q''  r}   )_aggregator_actor_to_learnerr   r  allvalues)rz   r   agg_actor_idma_batch_reflearner_actor_idbatch_refs_for_learner_groups         r|   r  zIMPALA._pre_queue_batch_refs  s    +5 	P 	P&L,#@N()9:AA,OOOO ?A$ 
 
-1-I-P-P-R-R
 
 
 
 
 	 )// (,(D(K(K(M(M      
 
-1-I-P-P-R-R
 
 
 
 
 	 ,+r}   c                 T   t                                                       t          ddt          d          | _        | j                            d| j        j        i           t          ddt          d          | _        | j                            d| j        j        i           t          dd	t          d          | _	        | j	                            d| j        j        i           t          d
dt          d          | _
        | j
                            d| j        j        i           t          ddt          d          | _        | j                            d| j        j        i           t          ddd          | _        | j                            d| j        j        i           t          ddd          | _        | j                            d| j        j        i           t          ddd          | _        | j                            d| j        j        i           d S )N*rllib_algorithms_impala_training_step_timez$Time spent in IMPALA.training_step())rllib)namer   
boundariestag_keysr]  Crllib_algorithms_impala_training_step_aggregator_preprocessing_timezUTime spent preprocessing episodes with aggregator actor in the IMPALA.training_step()=rllib_algorithms_impala_training_step_learner_group_loop_timezPTime spent in the learner group update calls loop, in the IMPALA.training_step()@rllib_algorithms_impala_training_step_sync_env_runner_state_timezDTime spent on syncing EnvRunner states in the IMPALA.training_step()<rllib_algorithms_impala_sample_and_get_connector_states_timez7Time spent in IMPALA._sample_and_get_connector_states();rllib_algorithms_impala_training_step_input_batches_counterzYNumber of input batches processed and passed to the learner in the IMPALA.training_step())r^  r   r`  @rllib_algorithms_impala_training_step_zero_input_batches_counterzKNumber of times zero input batches were ready in the IMPALA.training_step()?rllib_algorithms_impala_training_step_env_steps_dropped_counterzdNumber of env steps dropped when sending data to the aggregator actors in the IMPALA.training_step())rP   _set_up_metricsr>   r4   r  set_default_tagsr{   r   r  r  r   r;  r=   r  r  r  r   s    r|   rh  zIMPALA._set_up_metrics  s   !!!2;=>@	3
 3
 3
/ 	/@@dn-.	
 	
 	
 LUVo@	L
 L
 L
H 	HYYdn-.	
 	
 	
 FOPj@	F
 F
 F
B 	BSSdn-.	
 	
 	
 IRS^@	I
 I
 I
E 	EVVdn-.	
 	
 	
 ENOQ@	E
 E
 E
A 	ARRdn-.	
 	
 	
 <CNs<
 <
 <
8
 	8IIdn-.	
 	
 	
 AHSeA
 A
 A
=
 	=NNdn-.	
 	
 	
 @GR~@
 @
 @
<
 	<MMdn-.	
 	
 	
 	
 	
r}   c                    | j                                         st          d          |                     d          }d |D             }|                     |          }|D ]M}| j        t          xx         |j        z  cc<   | j        t          xx         |	                                z  cc<   N| 
                    |           |                                  |                                 }| j        t                   5  t          |                                          }|                     ||           d d d            n# 1 swxY w Y   | j        r&| j                            | j        j        d           |S )Nz'The learner thread died while training!F)return_object_refsc                     h | ]\  }}|S r   r   )r   	worker_id_s      r|   	<setcomp>z6IMPALA._training_step_old_api_stack.<locals>.<setcomp>  s)     %
 %
 %
&)QI%
 %
 %
r}   )workers_that_need_updates
policy_idsT)r8  mark_healthy)r   is_aliveRuntimeError'_get_samples_from_workers_old_api_stack"_process_experiences_old_api_stackr   r*   countr(   agent_steps"_concatenate_batches_and_pre_queue0_place_processed_samples_on_learner_thread_queue_process_trained_results_timersr1   listkeys_update_workers_old_api_stackr  probe_unhealthy_actorsr   !env_runner_health_probe_timeout_s)rz   unprocessed_sample_batchesrp  batchesbatchtrain_resultspidss          r|   r  z#IMPALA._training_step_old_api_stack  s    #,,.. 	JHIII &*%Q%Q$ &R &
 &
"
%
 %
*D%
 %
 %
!
 99:TUU  	K 	KEN0111U[@111N2333u7H7H7J7JJ3333 	//888==???5577 \45 	 	**,,--D..*C /   	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ) 	*AA $ M! B   
 s   ?9EEEFrk  c                    | j         t                   5  | j                                        dk    rB| j                            d            | j                            | j        j        |          }nZ| j        j        dk    s| j	        rA| j	        j
        5| j	                                        }|rt          j        |          }d|fg}ng }ddd           n# 1 swxY w Y   |S )a/  Get samples from rollout workers for training.

        Args:
            return_object_refs: If True, return ObjectRefs instead of the samples
                directly. This is useful when using aggregator workers so that data
                collected on rollout workers is directly de referenced on the aggregator
                workers instead of first in the driver and then on the aggregator
                workers.

        Returns:
            a list of tuples of (worker_index, sample batch or ObjectRef to a sample
                batch)

        r   c                 *    |                                  S r   )r@  )workers    r|   <lambda>z@IMPALA._get_samples_from_workers_old_api_stack.<locals>.<lambda>W  s    6==?? r}   )r8  r   N)r|  r0   r!  r=  foreach_env_runner_asyncr  r   rW   rc   r   	async_envr@  rB  rC  )rz   rk  sample_batchessample_batchs       r|   ru  z.IMPALA._get_samples_from_workers_old_api_stack:  sD   & \,' 	$ 	$ $??AAAEE%>>22  
 )@@$(K$I$6 A    ,11 2$(O$=$I  $5577% 9#&7<#8#8L#$l"3!4 "$9	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$< s   B:CC Cworker_to_sample_batchesc                 j   d |D             }g }|D ]}t          |t                    r
J d            |                                }| j        r:| j                            |           | j                            t                    }n|                                }|r|                    |           |S )zProcess sample batches directly on the driver, for training.

        Args:
            worker_to_sample_batches: List of (worker_id, sample_batch) tuples.

        Returns:
            Batches that have been processed by the mixin buffer.

        c                     g | ]\  }}|S r   r   )r   rn  bs      r|   r   z=IMPALA._process_experiences_old_api_stack.<locals>.<listcomp>{  s    :::A1:::r}   zF`IMPALA._process_experiences_old_api_stack` can not handle ObjectRefs!)	r   r   decompress_if_neededr   r?  replayr7   copyr  )rz   r  r  processed_batchesr  s        r|   rv  z)IMPALA._process_experiences_old_api_stackm  s     ;:!9::: 	0 	0E!y  X XWX X  ..00E & %'++E222/66}EE
 

 0!((///  r}   r  Nc                       fd}|D ]s} j         j        dk    r= j         j        r1t          d |j                                        D                       rO j                            |            |             tdS )zConcatenate batches that are being returned from rollout workers

        Args:
            batches: List of batches of experiences from EnvRunners.
        c                      t          d j        D                       j        j        k    r7t	          j                  } j                            |            g _        d S d S )Nc              3   $   K   | ]}|j         V  d S r   )rw  )r   r  s     r|   rR  zaIMPALA._concatenate_batches_and_pre_queue.<locals>.aggregate_into_larger_batch.<locals>.<genexpr>  s$      ==AG======r}   )r   r   r   r   r   r   r  )batch_to_addrz   s    r|   aggregate_into_larger_batchzNIMPALA._concatenate_batches_and_pre_queue.<locals>.aggregate_into_larger_batch  ss    ==T%<=====;56 6  .d.EFF-44\BBB*,'''6 6r}   truncate_episodesc              3      K   | ]N}t           j        |v o;|t           j                 j        d          |t           j                 j        d          k    V  OdS )r   N)r   VF_PREDSshapeREWARDS)r   pbs     r|   rR  z<IMPALA._concatenate_batches_and_pre_queue.<locals>.<genexpr>  sp           (B. ;/06q9k128;<     r}   N)r   
batch_moderestart_failed_env_runnersanypolicy_batchesrV  r   r  )rz   r  r  r  s   `   r|   ry  z)IMPALA._concatenate_batches_and_pre_queue  s    	- 	- 	- 	- 	-  	* 	*E &*===K: >    $299;;      #**5111''))))7	* 	*r}   c                    t          | j                  D ]\  }}	 | j        j                            ||t          | j                  dz
  k               | j        dxx         | j        j        dk    r|	                                n|j
        z  cc<   # t          j        $ r | j        dxx         dz  cc<   Y w xY w| j                                         dS )z:Place processed samples on the learner queue for training.rH   )blocknum_samples_added_to_queuerx  num_times_learner_queue_fullN)	enumerater   r   inqueuerC  r
  r   r   count_steps_byrx  rw  queueFullclear)rz   r   r  s      r|   rz  z7IMPALA._place_processed_samples_on_learner_thread_queue  s    "$"?@@ 	D 	DHAuD$,00 s4#@AAAEE 1    ;<<<{1]BB %%'''<<<<
 : D D D=>>>!C>>>>>D 	%++-----s   A8B$B;:B;c                    d}d}g }t          | j        j                                                  D ]G}| j        j                            d          \  }}}||z  }||z  }|r|                    |           H|st          j        | j        j                  }n<t                      }	|D ]}
|	
                    |
           |	                                }| j        t          xx         |z  cc<   | j        t          xx         |z  cc<   |S )zProcess training results that are outputed by the learner thread.

        Returns:
            Aggregated results from the learner thread after an update is completed.

        r   gMbP?)timeout)r   r   outqueueqsizer  r  r  deepcopylearner_infor3   &add_learn_on_batch_results_multi_agentfinalizer   r,   r)   )rz   num_env_steps_trainednum_agent_steps_trainedlearner_infosrn  	env_stepsrx  r3  final_learner_infobuilderinfos              r|   r{  zIMPALA._process_trained_results  sM    !""#t+4::<<== 
	6 
	6A
 $-11%1@@	 "Y.!#{2# 6$$_555 	4!%t/C/P!Q!Q )**G% E E>>tDDDD!(!1!1!3!3 	,---1FF---.///3JJ///!!r}   rp  rq  c                      j         j        r j                                          j        t
                    fd|pg D             d j                            |            j         j        r j                                          j        t          xx         dz  cc<    j	        
                                dk    r j        t                    j         j        k    r|r j         j        r j                                          j                            |          } j         j        r j                                         t          j        |           j        j                                         d j        t          <    j        t$          xx         dz  cc<    j	                            fddt)          |          d           d	S d	S d	S d	S )
a<  Updates all RolloutWorkers that require updating.

        Updates only if NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS has been
        reached and the worker has sent samples in this iteration. Also only updates
        those policies, whose IDs are given via `policies` (if None, update all
        policies).

        Args:
            workers_that_need_updates: Set of worker IDs that need to be updated.
            policy_ids: Optional list of Policy IDs to update. If None, will update all
                policies on the to-be-updated workers.
        c                 @    i | ]}|j         j        |         j        S r   )r   
policy_mapnum_grad_updates)r   pidrz   s     r|   r   z8IMPALA._update_workers_old_api_stack.<locals>.<dictcomp>  s8     , , , T_/4E, , ,r}   )timestepnum_grad_updates_per_policy)rq  rH   r   c                 T    |                      t          j                            S r   )set_weightsrB  r  )wglobal_varsweights_refs    r|   r  z6IMPALA._update_workers_old_api_stack.<locals>.<lambda>3  s    q}}SW[-A-A;OO r}   F)r   local_env_runnerremote_worker_idsr8  N)r   policy_states_are_swappabler   lockr   r)   set_global_varsunlockr/   r!  num_remote_workersrY   get_weightsrB  rC  r   policy_ids_updatedr  r.   foreach_env_runnerr}  )rz   rp  rq  weightsr  r  s   `   @@r|   r  z$IMPALA._update_workers_old_api_stack   s   & ;2 	#O  """'>?, , , ,%+, , ,
 
 	''
'KKK;2 	%O""$$$ 	NOOOSTTOOO!4466::VW{-. .). {6 '$$&&&o11*==G{6 )&&(((''**K 399;;;VWDNRSN34449444!44OOOOO!&"&'@"A"A !	 5      ;:. . . .r}   c                      t                      j        |i |}| j        j        s| j                            |d          }|S )NF)overwrite_learner_info)rP   (_compile_iteration_results_old_api_stackr   r   r   add_learner_metrics)rz   argsr   resultr{   s       r|   r  z/IMPALA._compile_iteration_results_old_api_stack9  sT    AA4R6RR{7 	)==u >  F r}   )Fr   r   ))r   r   r   r   classmethodr   r   rA   r   r   r   r   r   r   r   r5  r  r   r   r   r  r   r  rh  r   r  r   r	   r<   ru  r   rv  ry  rz  r;   r{  r   r:   r  r  r   r   s   @r|   rR   rR   	  s         Xi<     [ Xi#$#	$v,	# # #  [#. Xi')O ') ') ') ') ') ')R Xik k kZ5
 5
 5
n O;>	d9o	   (,uS)^45,	d9o	, , , ,: XiM
 M
 M
 M
 M
^ 2 2 [2h  .30 0$TN0 
eCy/9::;	<0 0 0 [0d "!"&uS+-='>"?"! 
o	"! "! "! ["!H +*${:K +*PT +* +* +* [+*Z . . . [.. %"* %" %" %" [%"N  046 6#&s86 T(^,6 
	6 6 6 [6p Xi        r}   rR   c                    |d         st                               d                    |d         |d                              |d         }|d         }||k     r)t                               d| d| d| d	           ||d<   t	          | |d         |d
         |d         |d         |d         |d         |d         |d         	  	        }n,t          | |d         |d         |d         |d                   }|S )Nsimple_optimizerz:Enabling multi-GPU mode, {} GPUs, {} parallel tower-stacksrv   rk   rl   zIn multi-GPU mode you should have at least as many multi-GPU tower stacks (to load data into on one device) as you have stack-index slots in the buffer! You have configured z stacks and a buffer of size z!. Setting `minibatch_buffer_size=z`.rd   rb   r   rV   ro   rZ   )rv   rd   rb   rk   num_sgd_iterrV   ro   num_data_load_threads)rl   r  rV   ro   )loggerr  formatwarningr   r   )local_workerr   
num_stacksbuffer_sizelearner_threads        r|   r   r   F  sg   $% %
HOOz"F+G$H 	
 	
 	

 89
45##NN9 )9 9 	9 9
 +59 9 9   /9F*+.J'd|#$67'-.J'K-%&:;"()@"A"()A"B

 

 

 '"()@"A-%&:;"()@"A
 
 
 r}   )ar  loggingr  typingr   r   r   r   r   r   r	   typing_extensionsr
   rB  r   ray._common.deprecationr   r   	ray.rllibr   ray.rllib.algorithms.algorithmr   %ray.rllib.algorithms.algorithm_configr   r   ray.rllib.connectors.learnerr   r   ray.rllib.corer   r   $ray.rllib.core.learner.training_datar   "ray.rllib.core.rl_module.rl_moduler   /ray.rllib.execution.buffers.mixin_replay_bufferr   "ray.rllib.execution.learner_threadr   ,ray.rllib.execution.multi_gpu_learner_threadr   ray.rllib.policy.policyr   ray.rllib.policy.sample_batchr   ray.rllib.utils.annotationsr   r   ray.rllib.utils.metricsr   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   $ray.rllib.utils.metrics.learner_infor3   #ray.rllib.utils.metrics.ray_metricsr4   r5   8ray.rllib.utils.replay_buffers.multi_agent_replay_bufferr6   ,ray.rllib.utils.replay_buffers.replay_bufferr7   #ray.rllib.utils.schedules.schedulerr8   ray.rllib.utils.typingr9   r:   r;   r<   ray.util.metricsr=   r>   	getLoggerr   r  &LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEYrA   ImpalaConfigrR   Impalar   r   r}   r|   <module>r     s
      @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ " " " " " " 



       I I I I I I I I ! ! ! ! ! ! 4 4 4 4 4 4 N N N N N N N N U U U U U U U U        > = = = = = ; ; ; ; ; ; W W W W W W < < < < < < N N N N N N * * * * * * 8 8 8 8 8 8 = = = = = = = =                                           , D C C C C C        P O O O O O F F F F F F 9 9 9 9 9 9            0 / / / / / / /		8	$	$ *> &| | | | |? | | |~ w w w w wY w w wt 
 ' ' ' ' 'r}   