
    &`i                        d Z ddlZddlmZ ddlmZmZmZmZm	Z	m
Z
mZmZ ddlZddlmZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@ ddlAmBZB ddlCmDZDmEZEmFZFmGZG ddlHmIZImJZJmKZKmLZL  ejM        eN          ZO G d de          ZPdedeeQ         fdZR G d de          ZSdS )ap  
Deep Q-Networks (DQN, Rainbow, Parametric DQN)
==============================================

This file defines the distributed Algorithm class for the Deep Q-Networks
algorithm. See `dqn_[tf|torch]_policy.py` for the definition of the policies.

Detailed documentation:
https://docs.ray.io/en/master/rllib-algorithms.html#deep-q-networks-dqn-rainbow-parametric-dqn
    N)defaultdict)AnyCallableDictListOptionalTupleTypeUnion)Self)DEPRECATED_VALUE)	Algorithm)AlgorithmConfigNotProvided)DQNTFPolicy)DQNTorchPolicy)Learner)RLModuleSpec)synchronous_parallel_sample)multi_gpu_train_one_steptrain_one_step)Policy)MultiAgentBatch)deep_update)override)ALL_MODULESENV_RUNNER_RESULTSENV_RUNNER_SAMPLING_TIMERLAST_TARGET_UPDATE_TSLEARNER_RESULTSLEARNER_UPDATE_TIMERNUM_AGENT_STEPS_SAMPLED NUM_AGENT_STEPS_SAMPLED_LIFETIMENUM_ENV_STEPS_SAMPLEDNUM_ENV_STEPS_SAMPLED_LIFETIMENUM_TARGET_UPDATESREPLAY_BUFFER_ADD_DATA_TIMERREPLAY_BUFFER_RESULTSREPLAY_BUFFER_SAMPLE_TIMER REPLAY_BUFFER_UPDATE_PRIOS_TIMERSAMPLE_TIMERSYNCH_WORKER_WEIGHTS_TIMERTD_ERROR_KEYTIMERS)convert_to_numpy)sample_min_n_steps_from_buffer*update_priorities_in_episode_replay_buffer"update_priorities_in_replay_buffervalidate_buffer_config)LearningRateOrSchedule
ResultDictRLModuleSpecTypeSampleBatchTypec            2           e Zd ZdZd& fd	Z ee          eeeeeeeeeeeeeeeeeeeeeeeddee	         dee
         dee         deeeee	ef                                    d	ee         d
ee         dee	         dee	         dee         dee	         dee         dee         dee         dee         dee         dee	         dee         deee	ee	e	f         f                  deee         eee                  ee	         gee         f         dee         dee         dee         dee	         def0 fd            Z ee          d' fd            Z ee          d(d e	de	fd!            Z ee          defd"            Ze ee          deeef         f fd#                        Z ee          deed$         ef         fd%            Z  xZ!S ))	DQNConfiga/  Defines a configuration class from which a DQN Algorithm can be built.

    .. testcode::

        from ray.rllib.algorithms.dqn.dqn import DQNConfig

        config = (
            DQNConfig()
            .environment("CartPole-v1")
            .training(replay_buffer_config={
                "type": "PrioritizedEpisodeReplayBuffer",
                "capacity": 60000,
                "alpha": 0.5,
                "beta": 0.5,
            })
            .env_runners(num_env_runners=1)
        )
        algo = config.build()
        algo.train()
        algo.stop()

    .. testcode::

        from ray.rllib.algorithms.dqn.dqn import DQNConfig
        from ray import tune

        config = (
            DQNConfig()
            .environment("CartPole-v1")
            .training(
                num_atoms=tune.grid_search([1,])
            )
        )
        tune.Tuner(
            "DQN",
            run_config=tune.RunConfig(stop={"training_iteration":1}),
            param_space=config,
        ).fit()

    .. testoutput::
        :hide:

        ...


    Nc                     ddddd| _         t                                          |pt                     d| _        dd	g| _        d
| _        d| _        d| _        d| _	        | 
                    t          j        d                     d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        dg| _        d| _        d| _        d| _        d| _        d| _        d| _         d| _!        dddd d!| _"        d| _#        tH          | _%        tH          | _&        tH          | _'        tH          | _(        d| _)        tH          | _*        tH          | _+        tH          | _,        dS )"z!Initializes a DQNConfig instance.EpsilonGreedy      ?g{Gz?'  )typeinitial_epsilonfinal_epsilonepsilon_timesteps)
algo_classauto)r   r<   )r=   g?g      D@global_normgMb@?    F)explore)evaluation_configNi  i  g:0yE>   g      $g      $@g      ?T   huberr   PrioritizedEpisodeReplayBufferiP  g333333?g?)r>   capacityalphabeta)-exploration_configsuper__init__DQNrollout_fragment_lengthepsilon	grad_clipgrad_clip_bylrtrain_batch_size
evaluationr   	overridesmin_time_s_per_iteration"min_sample_timesteps_per_iterationtarget_network_update_freq(num_steps_sampled_before_learning_startsstore_buffer_in_checkpointsadam_epsilontau	num_atomsv_minv_maxnoisysigma0duelinghiddensdouble_qn_stepbefore_learn_on_batchtraining_intensitytd_error_loss_fn$categorical_distribution_temperatureburn_in_lenreplay_buffer_configlr_scheduler   buffer_sizeprioritized_replaylearning_startsreplay_batch_sizereplay_sequence_lengthprioritized_replay_alphaprioritized_replay_betaprioritized_replay_eps)selfrB   	__class__s     p/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/dqn.pyrQ   zDQNConfig.__init__{   s    $"!!&	#
 #
 	J$5#666
 9?$
 !-0  * " 	/*CE*R*R*RSSS )-%26/
 +.'8<5+0( 


u%)""& '471 5 %
 %
!   ,"2/!1&*#(8%'7$&6###    )r]   rp   r_   rq   rT   r`   rU   r^   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   r]   rp   r_   rq   rT   r`   rU   r^   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   returnc                b    t                      j        di | |t          ur|| _        |t          ur+t	          d| j        id|iddgdg          }|d         | _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _	        |t          ur|| _
        |t          ur|| _        |	t          ur|	| _        |
t          ur|
| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        |t          ur|| _        | S )uW  Sets the training related configuration.

        Args:
            target_network_update_freq: Update the target network every
                `target_network_update_freq` sample steps.
            replay_buffer_config: Replay buffer config.
                Examples:
                {
                "_enable_replay_buffer_api": True,
                "type": "MultiAgentReplayBuffer",
                "capacity": 50000,
                "replay_sequence_length": 1,
                }
                - OR -
                {
                "_enable_replay_buffer_api": True,
                "type": "MultiAgentPrioritizedReplayBuffer",
                "capacity": 50000,
                "prioritized_replay_alpha": 0.6,
                "prioritized_replay_beta": 0.4,
                "prioritized_replay_eps": 1e-6,
                "replay_sequence_length": 1,
                }
                - Where -
                prioritized_replay_alpha: Alpha parameter controls the degree of
                prioritization in the buffer. In other words, when a buffer sample has
                a higher temporal-difference error, with how much more probability
                should it drawn to use to update the parametrized Q-network. 0.0
                corresponds to uniform probability. Setting much above 1.0 may quickly
                result as the sampling distribution could become heavily “pointy” with
                low entropy.
                prioritized_replay_beta: Beta parameter controls the degree of
                importance sampling which suppresses the influence of gradient updates
                from samples that have higher probability of being sampled via alpha
                parameter and the temporal-difference error.
                prioritized_replay_eps: Epsilon parameter sets the baseline probability
                for sampling so that when the temporal-difference error of a sample is
                zero, there is still a chance of drawing the sample.
            store_buffer_in_checkpoints: Set this to True, if you want the contents of
                your buffer(s) to be stored in any saved checkpoints as well.
                Warnings will be created if:
                - This is True AND restoring from a checkpoint that contains no buffer
                data.
                - This is False AND restoring from a checkpoint that does contain
                buffer data.
            epsilon: Epsilon exploration schedule. In the format of [[timestep, value],
                [timestep, value], ...]. A schedule must start from
                timestep 0.
            adam_epsilon: Adam optimizer's epsilon hyper parameter.
            grad_clip: If not None, clip gradients during optimization at this value.
            num_steps_sampled_before_learning_starts: Number of timesteps to collect
                from rollout workers before we start sampling from replay buffers for
                learning. Whether we count this in agent steps or environment steps
                depends on config.multi_agent(count_steps_by=..).
            tau: Update the target by 	au * policy + (1-	au) * target_policy.
            num_atoms: Number of atoms for representing the distribution of return.
                When this is greater than 1, distributional Q-learning is used.
            v_min: Minimum value estimation
            v_max: Maximum value estimation
            noisy: Whether to use noisy network to aid exploration. This adds parametric
                noise to the model weights.
            sigma0: Control the initial parameter noise for noisy nets.
            dueling: Whether to use dueling DQN.
            hiddens: Dense-layer setup for each the advantage branch and the value
                branch
            double_q: Whether to use double DQN.
            n_step: N-step target updates. If >1, sars' tuples in trajectories will be
                postprocessed to become sa[discounted sum of R][s t+n] tuples. An
                integer will be interpreted as a fixed n-step value. If a tuple of 2
                ints is provided here, the n-step value will be drawn for each sample(!)
                in the train batch from a uniform distribution over the closed interval
                defined by `[n_step[0], n_step[1]]`.
            before_learn_on_batch: Callback to run before learning on a multi-agent
                batch of experiences.
            training_intensity: The intensity with which to update the model (vs
                collecting samples from the env).
                If None, uses "natural" values of:
                `train_batch_size` / (`rollout_fragment_length` x `num_env_runners` x
                `num_envs_per_env_runner`).
                If not None, will make sure that the ratio between timesteps inserted
                into and sampled from the buffer matches the given values.
                Example:
                training_intensity=1000.0
                train_batch_size=250
                rollout_fragment_length=1
                num_env_runners=1 (or 0)
                num_envs_per_env_runner=1
                -> natural value = 250 / 1 = 250.0
                -> will make sure that replay+train op will be executed 4x asoften as
                rollout+insert op (4 * 250 = 1000).
                See: rllib/algorithms/dqn/dqn.py::calculate_rr_weights for further
                details.
            td_error_loss_fn: "huber" or "mse". loss function for calculating TD error
                when num_atoms is 1. Note that if num_atoms is > 1, this parameter
                is simply ignored, and softmax cross entropy loss will be used.
            categorical_distribution_temperature: Set the temperature parameter used
                by Categorical action distribution. A valid temperature is in the range
                of [0, 1]. Note that this mostly affects evaluation since TD error uses
                argmax for return calculation.
            burn_in_len: The burn-in period for a stateful RLModule. It allows the
                Learner to utilize the initial `burn_in_len` steps in a replay sequence
                solely for unrolling the network and establishing a typical starting
                state. The network is then updated on the remaining steps of the
                sequence. This process helps mitigate issues stemming from a poor
                initial state - zero or an outdated recorded state. Consider setting
                this parameter to a positive integer if your stateful RLModule faces
                convergence challenges or exhibits signs of catastrophic forgetting.

        Returns:
            This updated AlgorithmConfig object.
        rp   F )rP   trainingr   r]   r   rp   r_   rq   rT   r`   rU   r^   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   )rz   r]   rp   r_   rq   rT   r`   rU   r^   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   kwargsnew_replay_buffer_configr{   s                             r|   r   zDQNConfig.training   s   ` 	""6"""%[88.HD+{22 (3')BC')=>'('(( ($ )AAW(XD%&k99/JD,k))*D+%%"DL{** ,DK''&DN3;FF8 9 k!!DHK''&DN##DJ##DJ##DJ$$ DK+%%"DL+%%"DL;&&$DM$$ DK 33)>D&[00&8D#;..$4D!/{BB4 5 k))*Dr}   c           	      b   t                                                       | j        r| j        |                     d           nc| j        st          |            | j        d         dk    r<| j        dk    r|                     d           | j	        r|                     d           | j
        dvr|                     d           | j        sI| j        d	k    r>| j        | j        k     r.|                     d
| j         d| j         d| j         d           d| j        v rFd| j        d         cxk     r| j        k    r)n n&t          d| j         d| j        d          d          ddlm} | j        rRt'          | j        d         t*                    s2t-          | j        d         |          s|                     d           d S | j        sat'          | j        d         t*                    rd| j        d         v st-          | j        d         |          r|                     d           d S d S d S )NzW`lr_schedule` is deprecated and must be None! Use the `lr` setting to setup a schedule.r>   ParameterNoisecomplete_episodeszParameterNoise Exploration requires `batch_mode` to be 'complete_episodes'. Try setting `config.env_runners(batch_mode='complete_episodes')`.zOParameterNoise Exploration and `noisy` network cannot be used at the same time!)rJ   msez,`td_error_loss_fn` must be 'huber' or 'mse'!rC   z Your `rollout_fragment_length` (z) is smaller than `n_step` (z:)! Try setting config.env_runners(rollout_fragment_length=z).max_seq_lenr   zYour defined `burn_in_len`=z" is larger or equal `max_seq_len`=zC! Either decrease the `burn_in_len` or increase your `max_seq_len`.)EpisodeReplayBufferz[When using the new `EnvRunner API` the replay buffer must be of type `EpisodeReplayBuffer`.Episodead  When using the old API stack the replay buffer must not be of type `EpisodeReplayBuffer`! We suggest you use the following config to run DQN on the old API stack: `config.training(replay_buffer_config={'type': 'MultiAgentPrioritizedReplayBuffer', 'prioritized_replay_alpha': [alpha], 'prioritized_replay_beta': [beta], 'prioritized_replay_eps': [eps], })`.)rP   validateenable_rl_module_and_learnerrq   _value_errorin_evaluationr3   rO   
batch_modere   rm   rS   rj   model_configro   
ValueError4ray.rllib.utils.replay_buffers.episode_replay_bufferr   "enable_env_runner_and_connector_v2
isinstancerp   str
issubclass)rz   r   r{   s     r|   r   zDQNConfig.validate  s    	, 	+!!8  
 % -&t,,, &v.2BBB?&999%%<  
 : %%2  
  (888LMMM "
	,66,t{::#43O # #*.+# # ;# # #   T...D%m4HHHH8HHHHHHDd.> D D!%!2=!AD D D  	
 	
 	
 	
 	
 	

 3	t8@#FF	 t8@BUVV	
 )     8 	44V<cBB	 !:6!BBB$3F;=PQQ C 	 	 	 	 		 	 CBr}   r   worker_indexc                     | j         dk    r5t          | j        t          t          f          r| j        d         n| j        S | j         S )NrC   rH   )rS   r   rj   tuplelist)rz   r   s     r|   get_rollout_fragment_lengthz%DQNConfig.get_rollout_fragment_length   sI    '611 dkE4=99!A[ //r}   c                     | j         dk    rddlm} t          || j                  S t          d| j          d          )Ntorchr   )DefaultDQNTorchRLModule)module_classr   The framework ; is not supported! Use `config.framework('torch')` instead.)framework_str:ray.rllib.algorithms.dqn.torch.default_dqn_torch_rl_moduler   r   r   r   )rz   r   s     r|   get_default_rl_module_specz$DQNConfig.get_default_rl_module_spec  s}    ((       4!.   
 ;!3 ; ; ;  r}   c           	          t                      j        | j        | j        | j        | j        | j        | j        | j        dz  S )N)ri   rg   rT   rb   std_initrd   rc   )	rP   _model_config_auto_includesri   rg   rT   rb   rf   rd   rc   )rz   r{   s    r|   r   z%DQNConfig._model_config_auto_includes  sF     ww2||ZZ6
 6
 
 	
r}   r   c                 X    | j         dk    rddlm} |S t          d| j          d          )Nr   r   )DQNTorchLearnerr   r   )r   0ray.rllib.algorithms.dqn.torch.dqn_torch_learnerr   r   )rz   r   s     r|   get_default_learner_classz#DQNConfig.get_default_learner_class)  sa    ((      #";!3 ; ; ;  r}   Nr~   N)r   )"__name__
__module____qualname____doc__rQ   r   r   r   r   intdictboolr   r   floatr4   r	   r   r
   r   r   r   r   r   r   r   r6   r   propertyr   r   r   r   __classcell__r{   s   @r|   r9   r9   K   s       - -^W7 W7 W7 W7 W7 W7r Xo 5@/:6A?J4?(3#.BM*#.!,!, +"-"-!,#.8C .9*5@K%09M M M %-SMM 'tn	M
 &.d^M d4c5j(9#:;<M 01M uoM C=M 3;3-M e_M C=M M M ~M  !M" $#M$ #%M& 4.'M( sE#s(O345)M*  (/"Df$6S	B!# 
+M2 %UO3M4 #3-5M6 /7uo7M8 c]9M< 
=M M M M M M^ XoY Y Y Y Y Yv Xo0 0 0C 0 0 0 0 Xo,<      Xo	
T#s(^ 	
 	
 	
 	
 	
  X	
 Xo5i#1E+F        r}   r9   configr~   c                 D   | j         sddgS | j        |                                 | j        z  t	          | j        dz   d          z  z  }| j         |z  }|dk     r&t          t          j        d|z                      dgS dt          t          j        |                    gS )zACalculate the round robin weights for the rollout and train stepsrH   )	rl   total_train_batch_sizer   num_envs_per_env_runnermaxnum_env_runnersr   npround)r   native_ratiosample_and_train_weights      r|   calculate_rr_weightsr   8  s    $ 1v 0**,,

(	) f$q(!
,
,		-L %7,F""BHQ!8899::A>>3rx 78899::r}   c                       e Zd Ze ee          defd                        Ze ee          dede	e
e                  fd                        Z ee          deddf fd            Z ee          d
d            Zd Zdefd	Z xZS )rR   r~   c                     t                      S r   )r9   )clss    r|   get_default_configzDQN.get_default_configT  s     {{r}   r   c                 6    |d         dk    rt           S t          S )N	frameworkr   )r   r   )r   r   s     r|   get_default_policy_classzDQN.get_default_policy_classY  s     
 +'))!!r}   Nc                 $   t                                          |           | j        j        r_| j        rZ| j        ,| j                            d dgd          d         | _        d S | j        j        	                                | _        d S d S d S )Nc                 4    | j                                         S r   )moduleis_stateful)ers    r|   <lambda>zDQN.setup.<locals>.<lambda>j  s    ry4466 r}   rH   F)remote_worker_idslocal_env_runnerr   )
rP   setupr   r   env_runner_group
env_runnerforeach_env_runner_module_is_statefulr   r   )rz   r   r{   s     r|   r   z	DQN.setupc  s    f;9 	Pd>S 	P&+/+@+S+S66'(c%* ,T , , 	,((( ,0?+A+M+M+O+O(((	P 	P 	P 	Pr}   c                 j    | j         j        s|                                 S |                                 S )a-  DQN training iteration function.

        Each training iteration, we:
        - Sample (MultiAgentBatch) from workers.
        - Store new samples in replay buffer.
        - Sample training batch (MultiAgentBatch) from replay buffer.
        - Learn on training batch.
        - Update remote workers' new policy weights.
        - Update target network every `target_network_update_freq` sample steps.
        - Return all collected metrics for the iteration.

        Returns:
            The results dict from executing the training iteration.
        )r   r   _training_step_old_api_stack_training_step_new_api_stack)rz   s    r|   training_stepzDQN.training_stepq  s7    " {= 	744666 00222r}   c                 
   t          | j                  \  }}t          |          D ]}| j                            t
          t          f          5  t          | j        d| j        j	        dd          \  }}d d d            n# 1 swxY w Y   | j        
                    |t                     | j                            t
          t          f          5  | j                            |           d d d            n# 1 swxY w Y   | j        j        dk    rHt!          | j                            t          t$          fi                                                     }n(| j                            t          t(          fd          }|| j        j        k    rt          |          D ]}| j                            t
          t,          f          5  | j                            | j        j        | j        j        | j        | j        j                            dd          z  t;          | j                  t=          | j        d          r| j        j        nd| j        j         | j        j!                            d	          d
          }| j        "                                }| j        
                    |gtF                     d d d            n# 1 swxY w Y   | j                            t
          tH          f          5  | j%        &                    |t(          | j                            t          t(          f          t$          | j                            t          t$          f          i          }tO          tP                    }	|D ]w}
|
)                                D ]`\  }}tT          |v rR|	|         +                    tY          |-                    tT                                                                         axd |	)                                D             }	| j        
                    |t\                     d d d            n# 1 swxY w Y   | j                            t
          t^          f          5  ta          | j        |	           d d d            n# 1 swxY w Y   | j                            t
          tb          f          5  te          |d         3                                          th          hz
  }| j        5                    | j%        |d d           d d d            d S # 1 swxY w Y   d S d S )NT)
worker_setconcatsample_timeout_s_uses_new_env_runners_return_metrics)keyagent_steps)defaultr   r   ro   rN   )	num_itemsrj   batch_length_Tlookbackmin_batch_length_TgammarN   sample_episodes)episodes	timestepsc                 P    i | ]#\  }}|t           t          j        |d           i$S )r   )axis)r-   r   concatenate).0	module_idss      r|   
<dictcomp>z4DQN._training_step_new_api_stack.<locals>.<dictcomp>  sB     ! ! !(Iq "L".2K2K2K#L! ! !r}   )replay_buffer	td_errors)from_worker_or_learner_grouppoliciesglobal_varsinference_only)6r   r   rangemetricslog_timer.   r   r   r   r   	aggregater   r'   local_replay_bufferaddcount_steps_bysumpeekr#   valuesr%   r^   r)   sampler   rj   r   r   getr   hasattrro   r   rp   get_metricsr(   r!   learner_groupupdater   r   itemsr-   extendr/   popr    r*   r1   r,   setkeysr   sync_weights)rz   store_weightr   _r   env_runner_results
current_tsreplay_buffer_resultslearner_resultsr   resr   module_resultsmodules_to_updates                 r|   r   z DQN._training_step_new_api_stack  s   0DT[0Q0Q-- |$$ 	7 	7A&&0I'JKK  /J#4%)[%A*.$(0 0 0,,               L""#5;M"NNN &&0L'MNN 7 7(,,X6667 7 7 7 7 7 7 7 7 7 7 7 7 7 7 ;%66!!')IJTV "  &(( JJ **#%CDa +  J
 MMM233 H H\**F4N+OPP  #7>>"&+"D#{1 !4"k6::=!LLM "%T%=!>!> #4;>>,4;+B+B"k/![=AA&II(,#  ?    H* -1,D,P,P,R,R)L**./5J +   1              : \**F4H+IJJ "Q "Q&*&8&?&?!): $ 1 1%79W$X!" !" = $ 1 1(:(H%&!" !"# '@ ' 'O( !,D 1 1I. " "9< " "5I~+~== )) 4 ; ;$4(6(:(:<(H(H(M(M(O(O%& %&!" !" !""! !,5OO,=,=! ! !I L**?*PPPE"Q "Q "Q "Q "Q "Q "Q "Q "Q "Q "Q "Q "Q "Q "QJ \**F4T+UVV  >&*&>"+                  &&0J'KLL  $'(:(?(?(A(A$B$Bk]$R!%22151C. $#'	 3                    ] NMso   'BB		B	C;;C?	C?	C2KK	K	?E QQ	Q	<RR#	&R#	AT33T7:T7c                    i }t          | j                  \  }}t          |          D ]}| j        t                   5  t          | j        d| j        j                  }ddd           n# 1 swxY w Y   |si c S | j        t          xx         |
                                z  cc<   | j        t          xx         |                                z  cc<   | j                            |           d| j        t                   i}| j        | j        j        dk    rt          nt                   }|| j        j        k    rt          |          D ]}t#          | j        | j        j        | j        j        dk              }| j                            d          pd }	 |	|| j        | j                  }| j                            d	          du rt)          | |          }nt+          | |          }t-          | j        | j        ||           | j        t.                   }
||
z
  | j        j        k    r_| j                                        }| j                            |fd
           | j        t8          xx         dz  cc<   || j        t.          <   | j        t:                   5  | j                            |           ddd           n# 1 swxY w Y   |S )zvTraining step for the old API stack.

        More specifically this training step relies on `RolloutWorker`.
        T)r   r   r   Ntimestepr   )count_by_agent_stepsrk   c                     | S r   r   )bas     r|   r   z2DQN._training_step_old_api_stack.<locals>.<lambda>7  s    UV r}   simple_optimizerc                 2    ||v o|                                  S r   )update_target)ppid	to_updates      r|   r   z2DQN._training_step_old_api_stack.<locals>.<lambda>N  s    9,B1B1B r}   rH   )r   )r   r   r   _timersr+   r   r   r   	_countersr"   r   r$   	env_stepsr   r   r   r^   r0   r   r  r   r   r2   r   r]   r   get_policies_to_trainforeach_policy_to_trainr&   r,   r  )rz   train_resultsr  r   r  new_sample_batchr   cur_tstrain_batchpost_fnlast_updater$  s               r|   r   z DQN._training_step_old_api_stack  s   
  1ET[0Q0Q--|$$ 	; 	;Al+  4O#4%)[%A5 5 5                $ 			 N23337G7S7S7U7UU333N01115E5O5O5Q5QQ111 $(()9:::: '<=

  ;->> ('*	
 DKHHH233 *P *P<,K6)-)C})T   +//*ABBW%gk43H$+VV
 ;??#566$>>$24$E$EMM$<T;$O$OM 3,K!	   #n-BCK'4;+QQQ $ E E G GIO;;1:     
 N#5666!;666<BDN#89 \"<= P P)66;6OOOP P P P P P P P P P P P P P P s#   "A++A/	2A/	!K		K	K	r   )r   r   r   classmethodr   r   r9   r   r   r   r
   r   r   r   r   r   r5   r   r   r   s   @r|   rR   rR   S  sT       Xi9     [ Xi$	$v,	    [ XiPO P P P P P P P Xi3 3 3 3,x x xtYj Y Y Y Y Y Y Y Yr}   rR   )Tr   loggingcollectionsr   typingr   r   r   r   r   r	   r
   r   numpyr   typing_extensionsr   ray._common.deprecationr   ray.rllib.algorithms.algorithmr   %ray.rllib.algorithms.algorithm_configr   r   &ray.rllib.algorithms.dqn.dqn_tf_policyr   )ray.rllib.algorithms.dqn.dqn_torch_policyr   ray.rllib.core.learnerr   "ray.rllib.core.rl_module.rl_moduler   ray.rllib.execution.rollout_opsr   ray.rllib.execution.train_opsr   r   ray.rllib.policy.policyr   ray.rllib.policy.sample_batchr   ray.rllib.utilsr   ray.rllib.utils.annotationsr   ray.rllib.utils.metricsr   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   ray.rllib.utils.numpyr/   $ray.rllib.utils.replay_buffers.utilsr0   r1   r2   r3   ray.rllib.utils.typingr4   r5   r6   r7   	getLoggerr   loggerr9   r   r   rR   r   r}   r|   <module>rI     s  	 	  # # # # # # J J J J J J J J J J J J J J J J J J J J     " " " " " " 4 4 4 4 4 4 4 4 4 4 4 4 N N N N N N N N > > > > > > D D D D D D * * * * * * ; ; ; ; ; ;             + * * * * * 9 9 9 9 9 9 ' ' ' ' ' ' 0 0 0 0 0 0                                         * 3 2 2 2 2 2                       
	8	$	$j j j j j j j jZ; ;T%[ ; ; ; ;6H H H H H) H H H H Hr}   