
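"""Connector-based environment runner used by RolloutWorker (old API stack).

`EnvRunnerV2` steps a (possibly vectorized, multi-agent) `BaseEnv`, routes raw
observations through each policy's agent connectors, evaluates policies in
batch, and emits `MultiAgentBatch` training data plus `RolloutMetrics` for
completed episodes.
"""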
import logging
import time
from collections import defaultdict
from typing import (
    TYPE_CHECKING,
    Dict,
    Iterator,
    List,
    Optional,
    Set,
    Tuple,
    Union,
)

import numpy as np
import tree  # pip install dm_tree

from ray.rllib.env.base_env import ASYNC_RESET_RETURN, BaseEnv
from ray.rllib.env.external_env import ExternalEnvWrapper
from ray.rllib.env.wrappers.atari_wrappers import MonitorEnv, get_wrapper_by_cls
from ray.rllib.evaluation.collectors.simple_list_collector import (
    _PolicyCollectorGroup,
)
from ray.rllib.evaluation.episode_v2 import EpisodeV2
from ray.rllib.evaluation.metrics import RolloutMetrics
from ray.rllib.models.preprocessors import Preprocessor
from ray.rllib.policy.policy import Policy
from ray.rllib.policy.sample_batch import (
    MultiAgentBatch,
    SampleBatch,
    concat_samples,
)
from ray.rllib.utils.annotations import OldAPIStack
from ray.rllib.utils.filter import Filter
from ray.rllib.utils.numpy import convert_to_numpy
from ray.rllib.utils.spaces.space_utils import get_original_space, unbatch
from ray.rllib.utils.typing import (
    ActionConnectorDataType,
    AgentConnectorDataType,
    AgentID,
    EnvActionType,
    EnvID,
    EnvInfoDict,
    EnvObsType,
    MultiAgentDict,
    MultiEnvDict,
    PolicyID,
    PolicyOutputType,
    SampleBatchType,
    StateBatches,
    TensorStructType,
)
from ray.util.debug import log_once

if TYPE_CHECKING:
    from gymnasium.envs.classic_control.rendering import SimpleImageViewer

    from ray.rllib.callbacks.callbacks import RLlibCallback
    from ray.rllib.evaluation.rollout_worker import RolloutWorker

logger = logging.getLogger(__name__)

MIN_LARGE_BATCH_THRESHOLD = 1000
DEFAULT_LARGE_BATCH_THRESHOLD = 5000
MS_TO_SEC = 1000.0
@OldAPIStack
class _PerfStats:
    """Sampler perf stats that will be included in rollout metrics."""

    def __init__(self, ema_coef: Optional[float] = None):
        # If `ema_coef` is not None, use an exponential moving average instead
        # of a running mean for all timing stats.
        self.ema_coef = ema_coef
        self.iters = 0
        self.raw_obs_processing_time = 0.0
        self.inference_time = 0.0
        self.action_processing_time = 0.0
        self.env_wait_time = 0.0
        self.env_render_time = 0.0

    def incr(self, field: str, value: Union[int, float]):
        if field == "iters":
            self.iters += value
            return
        if self.ema_coef is None:
            # Running total (averaged in `_get_avg()`).
            self.__dict__[field] += value
        else:
            # Exponential moving average.
            self.__dict__[field] = (1.0 - self.ema_coef) * self.__dict__[
                field
            ] + self.ema_coef * value

    def _get_avg(self):
        # Mean time per iteration, converted from seconds to milliseconds.
        factor = MS_TO_SEC / self.iters
        return {
            "mean_raw_obs_processing_ms": self.raw_obs_processing_time * factor,
            "mean_inference_ms": self.inference_time * factor,
            "mean_action_processing_ms": self.action_processing_time * factor,
            "mean_env_wait_ms": self.env_wait_time * factor,
            "mean_env_render_ms": self.env_render_time * factor,
        }

    def _get_ema(self):
        # EMA stats are already averaged; only convert seconds to milliseconds.
        return {
            "mean_raw_obs_processing_ms": self.raw_obs_processing_time * MS_TO_SEC,
            "mean_inference_ms": self.inference_time * MS_TO_SEC,
            "mean_action_processing_ms": self.action_processing_time * MS_TO_SEC,
            "mean_env_wait_ms": self.env_wait_time * MS_TO_SEC,
            "mean_env_render_ms": self.env_render_time * MS_TO_SEC,
        }

    def get(self):
        if self.ema_coef is None:
            return self._get_avg()
        else:
            return self._get_ema()


@OldAPIStack
class _NewDefaultDict(defaultdict):
    def __missing__(self, env_id):
        ret = self[env_id] = self.default_factory(env_id)
        return ret
@OldAPIStack
def _build_multi_agent_batch(
    episode_id: int,
    batch_builder: _PolicyCollectorGroup,
    large_batch_threshold: int,
    multiple_episodes_in_batch: bool,
) -> MultiAgentBatch:
    """Build MultiAgentBatch from a dict of _PolicyCollectors.

    Args:
        env_steps: total env steps.
        policy_collectors: collected training SampleBatchs by policy.

    Returns:
        Always returns a sample batch in MultiAgentBatch format.
    """
    ma_batch = {}
    for pid, collector in batch_builder.policy_collectors.items():
        if collector.agent_steps <= 0:
            continue

        if batch_builder.agent_steps > large_batch_threshold and log_once(
            "large_batch_warning"
        ):
            logger.warning(
                "More than {} observations in {} env steps for "
                "episode {} ".format(
                    batch_builder.agent_steps, batch_builder.env_steps, episode_id
                )
                + "are buffered in the sampler. If this is more than you expected, "
                "check that that you set a horizon on your environment correctly and "
                "that it terminates at some point. Note: In multi-agent environments, "
                "`rollout_fragment_length` sets the batch size based on "
                "(across-agents) environment steps, not the steps of individual "
                "agents, which can result in unexpectedly large batches."
                + (
                    "Also, you may be waiting for your Env to terminate (batch_mode="
                    "`complete_episodes`). Make sure it does at some point."
                    if not multiple_episodes_in_batch
                    else ""
                )
            )

        ma_batch[pid] = collector.build()

    return MultiAgentBatch(policy_batches=ma_batch, env_steps=batch_builder.env_steps)


@OldAPIStack
def _batch_inference_sample_batches(eval_data: List[SampleBatch]) -> SampleBatch:
    """Batch a list of input SampleBatches into a single SampleBatch.

    Args:
        eval_data: list of SampleBatches.

    Returns:
        single batched SampleBatch.
    """
    inference_batch = concat_samples(eval_data)
    if "state_in_0" in inference_batch:
        batch_size = len(eval_data)
        inference_batch[SampleBatch.SEQ_LENS] = np.ones(batch_size, dtype=np.int32)
    return inference_batch
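# Illustrative sketch (not part of the library code): what the helper above does
# when recurrent state is present. Given two single-step SampleBatches that each
# carry a "state_in_0" column, the batched result gets a SEQ_LENS column marking
# every row as its own length-1 sequence. The literal values are invented.
#
#   b1 = SampleBatch({"obs": np.array([[0.1]]), "state_in_0": np.zeros((1, 4))})
#   b2 = SampleBatch({"obs": np.array([[0.2]]), "state_in_0": np.zeros((1, 4))})
#   batched = _batch_inference_sample_batches([b1, b2])
#   # batched.count == 2
#   # batched[SampleBatch.SEQ_LENS] -> np.array([1, 1], dtype=np.int32)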
@OldAPIStack
class EnvRunnerV2:
    """Collect experiences from user environment using Connectors."""

    def __init__(
        self,
        worker: "RolloutWorker",
        base_env: BaseEnv,
        multiple_episodes_in_batch: bool,
        callbacks: "RLlibCallback",
        perf_stats: _PerfStats,
        rollout_fragment_length: int = 200,
        count_steps_by: str = "env_steps",
        render: bool = None,
    ):
        """
        Args:
            worker: Reference to the current rollout worker.
            base_env: Env implementing BaseEnv.
            multiple_episodes_in_batch: Whether to pack multiple
                episodes into each batch. This guarantees batches will be exactly
                `rollout_fragment_length` in size.
            callbacks: User callbacks to run on episode events.
            perf_stats: Record perf stats into this object.
            rollout_fragment_length: The length of a fragment to collect
                before building a SampleBatch from the data and resetting
                the SampleBatchBuilder object.
            count_steps_by: One of "env_steps" (default) or "agent_steps".
                Use "agent_steps", if you want rollout lengths to be counted
                by individual agent steps. In a multi-agent env,
                a single env_step contains one or more agent_steps, depending
                on how many agents are present at any given time in the
                ongoing episode.
            render: Whether to try to render the environment after each
                step.
        """
        self._worker = worker

        if isinstance(base_env, ExternalEnvWrapper):
            raise ValueError(
                "Policies using the new Connector API do not support ExternalEnv."
            )
        self._base_env = base_env

        self._multiple_episodes_in_batch = multiple_episodes_in_batch
        self._callbacks = callbacks
        self._perf_stats = perf_stats
        self._rollout_fragment_length = rollout_fragment_length
        self._count_steps_by = count_steps_by
        self._render = render

        self._simple_image_viewer: Optional[
            "SimpleImageViewer"
        ] = self._get_simple_image_viewer()

        # Ongoing episodes, by sub-environment (env) ID.
        self._active_episodes: Dict[EnvID, EpisodeV2] = {}
        # Per-sub-environment batch builders (_PolicyCollectorGroups).
        self._batch_builders: Dict[EnvID, _PolicyCollectorGroup] = _NewDefaultDict(
            self._new_batch_builder
        )

        self._large_batch_threshold: int = (
            max(MIN_LARGE_BATCH_THRESHOLD, rollout_fragment_length * 10)
            if rollout_fragment_length != float("inf")
            else DEFAULT_LARGE_BATCH_THRESHOLD
        )

    def _get_simple_image_viewer(self) -> Optional["SimpleImageViewer"]:
        """Maybe construct a SimpleImageViewer instance for episode rendering."""
        # Try to render the env, if required.
        if not self._render:
            return None

        try:
            from gymnasium.envs.classic_control.rendering import SimpleImageViewer

            return SimpleImageViewer()
        except (ImportError, ModuleNotFoundError):
            self._render = False  # disable rendering
            logger.warning(
                "Could not import gymnasium.envs.classic_control.rendering! "
                "Try `pip install gymnasium[all]`."
            )

        return None

    def _call_on_episode_start(self, episode: EpisodeV2, env_id: EnvID):
        # Call each (in-memory) policy's Exploration.on_episode_start method.
        for p in self._worker.policy_map.cache.values():
            if getattr(p, "exploration", None) is not None:
                p.exploration.on_episode_start(
                    policy=p,
                    environment=self._base_env,
                    episode=episode,
                    tf_sess=p.get_session(),
                )
        self._callbacks.on_episode_start(
            worker=self._worker,
            base_env=self._base_env,
            policies=self._worker.policy_map,
            episode=episode,
            env_index=env_id,
        )

    def _new_batch_builder(self, _) -> _PolicyCollectorGroup:
        """Create a new batch builder.

        We create a _PolicyCollectorGroup based on the full policy_map
        as the batch builder.
        """
        return _PolicyCollectorGroup(self._worker.policy_map)

    def run(self) -> Iterator[SampleBatchType]:
        """Samples and yields training episodes continuously.

        Yields:
            Object containing state, action, reward, terminal condition,
            and other fields as dictated by `policy`.
        """
        while True:
            outputs = self.step()
            for o in outputs:
                yield o

    def step(self) -> List[SampleBatchType]:
        """Samples training episodes by stepping through environments."""
        self._perf_stats.incr("iters", 1)

        t0 = time.time()
        # Get observations from all ready agents.
        (
            unfiltered_obs,
            rewards,
            terminateds,
            truncateds,
            infos,
            off_policy_actions,
        ) = self._base_env.poll()
        env_poll_time = time.time() - t0

        t1 = time.time()
        # Process observations and prepare for policy evaluation.
        active_envs, to_eval, outputs = self._process_observations(
            unfiltered_obs=unfiltered_obs,
            rewards=rewards,
            terminateds=terminateds,
            truncateds=truncateds,
            infos=infos,
        )
        self._perf_stats.incr("raw_obs_processing_time", time.time() - t1)

        t2 = time.time()
        # Do batched policy eval (across vectorized sub-environments).
        eval_results = self._do_policy_eval(to_eval=to_eval)
        self._perf_stats.incr("inference_time", time.time() - t2)

        t3 = time.time()
        # Process results and update episode state.
        actions_to_send = self._process_policy_eval_results(
            active_envs=active_envs,
            to_eval=to_eval,
            eval_results=eval_results,
            off_policy_actions=off_policy_actions,
        )
        self._perf_stats.incr("action_processing_time", time.time() - t3)

        # Return computed actions to ready envs.
        t4 = time.time()
        self._base_env.send_actions(actions_to_send)
        self._perf_stats.incr("env_wait_time", env_poll_time + time.time() - t4)

        self._maybe_render()

        return outputs

    def _get_rollout_metrics(
        self, episode: EpisodeV2, policy_map: Dict[PolicyID, Policy]
    ) -> List[RolloutMetrics]:
        """Get rollout metrics from completed episode."""
        atari_metrics: Optional[List[RolloutMetrics]] = _fetch_atari_metrics(
            self._base_env
        )
        if atari_metrics is not None:
            for m in atari_metrics:
                m._replace(custom_metrics=episode.custom_metrics)
            return atari_metrics

        # Fetch connector metrics for the policies involved in this episode.
        connector_metrics = {}
        active_agents = episode.get_agents()
        for agent in active_agents:
            policy_id = episode.policy_for(agent)
            policy = policy_map[policy_id]
            connector_metrics[policy_id] = policy.get_connector_metrics()

        # Otherwise, return RolloutMetrics for the episode.
        return [
            RolloutMetrics(
                episode_length=episode.length,
                episode_reward=episode.total_reward,
                agent_rewards=dict(episode.agent_rewards),
                custom_metrics=episode.custom_metrics,
                perf_stats={},
                hist_data=episode.hist_data,
                media=episode.media,
                connector_metrics=connector_metrics,
            )
        ]
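    # Illustrative sketch (not part of the library code): the per-agent payload
    # that `_process_observations()` below assembles for the agent connectors on
    # every env step. The concrete values here are invented; the keys mirror the
    # SampleBatch columns used in the method body.
    #
    #   values_dict = {
    #       SampleBatch.T: episode.length - 1,
    #       SampleBatch.ENV_ID: env_id,
    #       SampleBatch.AGENT_INDEX: episode.agent_index(agent_id),
    #       SampleBatch.REWARDS: 0.5,
    #       SampleBatch.TERMINATEDS: False,
    #       SampleBatch.TRUNCATEDS: False,
    #       SampleBatch.INFOS: {},
    #       SampleBatch.NEXT_OBS: obs,
    #   }
    #   acd = AgentConnectorDataType(env_id, agent_id, values_dict)
    #   # `acd` is then run through policy.agent_connectors before policy eval.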

        Args:
            unfiltered_obs: The unfiltered, raw observations from the BaseEnv
                (vectorized, possibly multi-agent). Dict of dict: By env index,
                then agent ID, then mapped to actual obs.
            rewards: The rewards MultiEnvDict of the BaseEnv.
            terminateds: The `terminated` flags MultiEnvDict of the BaseEnv.
            truncateds: The `truncated` flags MultiEnvDict of the BaseEnv.
            infos: The MultiEnvDict of infos dicts of the BaseEnv.

        Returns:
            A tuple of:
                A list of envs that were active during this step.
                AgentConnectorDataType for active agents for policy evaluation.
                SampleBatches and RolloutMetrics for completed agents for output.
        __all__Tz&ERROR: When a sub-environment (env-id zZ) returns an error as observation, the terminateds[__all__] flag must also be set to True!)ra   env_obs_or_exceptionis_doner   r   r   F
__common__r5   z.EnvRunnerV2 requires agent connectors to work.c                 8    g | ]\  }}t          ||          S r\   )r   ).0agent_iddatara   s      r=   
<listcomp>z5EnvRunnerV2._process_observations.<locals>.<listcomp>h  s9     : : :&$ +68TBB: : :r?   r   init_obs
init_infostr   r   r   r   r   r   )2setr   listro   r   	Exception_handle_done_episoder   create_episodehas_init_obsr   addset_last_inforT   r   boolr   Tr   ENV_IDAGENT_INDEXagent_indexREWARDSTERMINATEDS
TRUNCATEDSINFOSNEXT_OBSappendr   r   r   r   r   observation_spacesampleagent_connectorsr   add_init_obsr   raw_dictadd_action_reward_done_next_obsr   ra   r   r   on_episode_stepr   r   ._try_build_truncated_episode_multi_agent_batchr   ) r<   r   r   r   r   r   r   r   r   env_obsr   all_agents_donesample_batches_by_policyagent_terminatedsagent_truncatedsr   obsr   agent_terminatedagent_truncatedvalues_dictr   	obs_spacerewardinfobatchesacd_list	processedditemsample_batchra   s                                   @r=   r   z!EnvRunnerV2._process_observations  s   N #&%%@KD@Q@Q@B  .3355 _	5 _	5OFG
 '9-- "6*95===V    >== ))!)0  +## *    T222%)%8%8%@%@07%f--%)%:6%B '')) =++GV<<< 6"9- (F1CI1N ("&"'''' !!,f0A0A,PR0S0STTT (34'8'8$ "!!( )T )T#9,,,,&-&8&8&B&B	#''	2Wk&6I6M6Mh6W6W$ $  /?!(+"&v&y1 ?!&)--h>># # .= * ++H55 $(7   M7>  &+W-@-@-J-J  ')<)<Xs)K)K+-=  *O%uV}'8'82'F'F(#!( )3::Hk;RSSSS  *X
 !( 2 2 4 4 %X %XH *--h>>!+//%@@! #??844!
 !*1*<*<X*F*FI!\4Y?F !363K L LI %V_003??F =,,Xr::D#w~#*F#/1D1DX1N1N $+V#/#.
60B0F0FxQV0W0W#)4#,i.>.>.@.@#K -Y7>>+?VWWWW '?&D&D&F&F )8 )8"	7!%!8!C +D DCD D+: : : :*1: : : #33H==	" 8 8A"//
;; 
,,%&Z%&V_[5I%J'(v{7H'Ifokm<	 -      ??J   (	8,00UCC	8 ,//
EBB	8 #??1:66		8  6ah
AFSS	*11$777186 LLNNN
 ~!! //<!^!\4#$ 0      ))'	2Sj6H6S   / 	5#RR(0'      5NN<000 ,V4GW,,r?   ra   r   r   c                     | j         |         }| j        |         }|                    |||           | j        sBt	          |j        || j        | j                  }|r|                    |           | j        |= dS dS )zBuilds a MultiAgentSampleBatch from the episode and adds it to outputs.

        Args:
            env_id: The env id.
            is_done: Whether the env is done.
            outputs: The list of outputs to add the
        )re   r   check_donesN)r   r   postprocess_episoder   ry   rd   r   r  )r<   ra   r   r   r   re   ma_sample_batchs          r=   _build_done_episodezEnvRunnerV2._build_done_episode  s     "26:,V4##' 	$ 	
 	
 	
 / 	-6"+0	 O  0/// $V,,,	- 	-r?   r!  r   c           	         t          t                    }|                                         D ]7\  }}                    |          }	||	                             ||f           8|                                D ]\  }	}
| j        j        |	         }fd|
D             }|                    |          }|D ]}                    |j	        |j
        j        t          j                 |j
        j        t          j                 |j
        j        t          j                            ||	                             |           dS )zProcess resetted obs through agent connectors for policy eval.

        Args:
            env_id: The env id.
            obs: The Resetted obs.
            episode: New episode.
            to_eval: List of agent connector data for policy eval.
        c                     g | ]^\  }}t          |t          j        |t          j        t          j        j        t          j                            |          i          _S r\   )r   r   r  r  r
  r   r  r  )r   r   r!  ra   r   r   s      r=   r   z?EnvRunnerV2.__process_resetted_obs_for_eval.<locals>.<listcomp>  ss     6 6 6 "Hc '#,c#)5#w~#/1D1DX1N1N		 	6 6 6r?   r   N)r   r  ro   r   r  r   r   r  r  r   r   r  r   r  r  r
  )r<   ra   r!  r   r   r   per_policy_resetted_obsr   raw_obsr   
agents_obsr   r)  r*  r+  s    ` ``          r=   __process_resetted_obs_for_evalz+EnvRunnerV2.__process_resetted_obs_for_eval  sr     9DD8I8I!$V!2!2!4!4 	K 	KHg")"4"4X">">I#I.55x6IJJJJ%<%B%B%D%D 	- 	-!Iz\,Y7F6 6 6 6 6 6 &06 6 6H //99I - -$$ZV_[-AB v{/@Afokm4	 %    	"))!,,,,-%	- 	-r?   r   r   c                    t          |t                    r&|}|                    t          d                     nX| j        |         }|                    |                     || j        j                             | 	                    |||           | 
                    ||           |                     |          }	 | j                            |          \  }	}
|	&|	t          k    st          |	|         t                    sn$|                    t          d                     j| j        j        j                                        D ]}|j                            |           |	o|	t          k    rf|| j        |<   |                     ||           |                     ||	|
||           |                                 |                    |           dS dS dS )a  Handle an all-finished episode.

        Add collected SampleBatch to batch builder. Reset corresponding env, etc.

        Args:
            env_id: Environment ID.
            env_obs_or_exception: Last per-environment observation or Exception.
            env_infos: Last per-environment infos.
            is_done: If all agents are done.
            active_envs: Set of active env ids.
            to_eval: Output container for policy eval data.
            outputs: Output container for collected sample batches.
        T)episode_faulty)r   N)r   r  r  r   r   extendr   r   r   r2  end_episoder  r   	try_resetr   r   r   r  resetr   +_EnvRunnerV2__process_resetted_obs_for_evalr   r  )r<   ra   r   r   r   r   r   episode_or_exceptionnew_episoderesetted_obsresetted_infosr   s               r=   r  z EnvRunnerV2._handle_done_episode  s   , *I66 	?.B NN>>>>????.2.CF.K NN))(T\5L *     $$VWg>>> 	!5666!%!4!4V!<!<	D+/>+C+CF+K+K(L. $#555!,v"6	BB 6  ~TBBBCCC	D (.5577 	- 	-A$$V,,,, #8J(J(J,7D!&)''V<<<00   OOF##### $#(J(Jr?   c                     || j         vsJ t          || j        j        | j        j        | j        | j                  }| j                            | j        | j        | j        j        ||           |S )a  Creates a new EpisodeV2 instance and returns it.

        Calls `on_episode_created` callbacks, but does NOT reset the respective
        sub-environment yet.

        Args:
            env_id: Env ID.

        Returns:
            The newly created EpisodeV2 instance.
        )r   r   r   )r   r   r   r   policy_mapping_fnr   on_episode_createdr   )r<   ra   rA  s      r=   r  zEnvRunnerV2.create_episodel  s     T22222  L#L*<o
 
 
 	**<^\, 	+ 	
 	
 	
 r?   r@  c                    | j                             | j        | j        | j        j        ||           | j        j        j                                        D ]H}t          |dd          5|j                            || j        ||	                                           It          |t                    r'|}|j        dk    rd|j         d}t          |          || j        v r
| j        |= dS dS )zCleans up an episode that has finished.

        Args:
            env_id: Env ID.
            episode_or_exception: Instance of an episode if it finished successfully.
                Otherwise, the exception that was thrown,
        r   r   Nr   r   zData from episode z does not show any agent interactions. Hint: Make sure for at least one timestep in the episode, env.step() returns non-empty values.)r   on_episode_endr   r   r   r   r   r   r   r   r   r   total_agent_stepsrd   r   r   )r<   ra   r@  r   r   msgs         r=   r<  zEnvRunnerV2.end_episode  s)    	&&<^\,( 	' 	
 	
 	
 (.5577 	 	Aq-..:,, $0MMOO	 -    *I66 
	&*G(A--E); E E E 
 !oo% T***%f--- +*r?   re   c                    | j         dk    r|j        }|j        }n|j        }|j        }||z   | j        k    r| j         dk    r'||z   | j        k    sJ d| d| d| j         d            || j        k     r|                    |d           |j        d	k    r!t          |j        || j	        | j
                  S t          d
          rt                              d           d S )Nrm   rp   zbuilt_steps (z) + ongoing_steps (z) != rollout_fragment_length (z).F)re   r   r   no_agent_stepszoYour environment seems to be stepping w/o ever emitting agent observations (agents are never requested to act)!)r   rm   active_env_stepsrp   active_agent_stepsr   r0  ry   rd   r   r   r,   rq   rr   )r<   re   r   built_stepsongoing_stepss        r=   r  z:EnvRunnerV2._try_build_truncated_episode_multi_agent_batch  sK    ;..'1K#4MM (3K#6M &$*GGG#}44"]2d6SSSSRK R RM R R040MR R R TSS T:::++-QV+WWW (1,,/&!/4	   *++ )   tr?   c                 V     j         j        }dt          f fd}i }|                                D ]w\  }}	 t	          ||          }n# t
          $ r  ||          }Y nw xY wt          d |D                       }|                    ||j         fd|D                       ||<   x|S )a  Call compute_actions on collected episode data to get next action.

        Args:
            to_eval: Mapping of policy IDs to lists of AgentConnectorDataType objects
                (items in these lists will be the batch's items for the model
                forward pass).

        Returns:
            Dict mapping PolicyIDs to compute_actions_from_input_dict() outputs.
        rz   c                     d }| D ]M}j         |j                 }|                    |j        d          }|||k    rt	          d| d          |}Nt          j        j        |          S )NT)refreshzePolicy map changed. The list of eval data that was handled by a same policy is now handled by policy zC and {policy_id}. Please don't do this in the middle of an episode.)r   ra   r   r   r   _get_or_raiser   r   )rz   r   r+  r   rv   r<   s        r=   _try_find_policy_againz;EnvRunnerV2._do_policy_eval.<locals>._try_find_policy_again  s    I    /9((T(BB(SI-=-=$LEHL L L    		 !8)DDDr?   c                 &    g | ]}|j         j        S r\   )r   r-  )r   r+  s     r=   r   z/EnvRunnerV2._do_policy_eval.<locals>.<listcomp>"  s    888$888r?   c                 4    g | ]}j         |j                 S r\   )r   ra   )r   r   r<   s     r=   r   z/EnvRunnerV2._do_policy_eval.<locals>.<listcomp>(  s#    MMMa$/9MMMr?   )timestepepisodes)	r   r   r   ro   rT  r   r   compute_actions_from_input_dictglobal_timestep)	r<   r   r   rU  r   r   rz   r   
input_dicts	   `        r=   r   zEnvRunnerV2._do_policy_eval  s    <*	E.D 	E 	E 	E 	E 	E 	E  :<$+MMOO 	 	 IyC!.x!C!C C C C "8!7	!B!BC
 988i888 J '-&L&L/MMMM9MMM 'M ' 'L## s   AA A r   r   c           
         t          t                    }|D ]}i ||<   |                                D ]\  }}||         d         }	t          |	          }	||         d         }
||         d         }t	          |	t
                    rt          j        |	          }	t          |	          }	t          | j
        j        |          }|j        r|j        s
J d            t          |	          D ]\  }}||         j        }||         j        }||         j        j        }t'          j        |fd|
          }t'          j        |fd|          }t+          ||||||f          }|                    |          j        \  }}}||vs
|||         vr|n||         |         }t+          ||||||f          }|j                            |           |||         vsJ |||         |<   |S )a  Process the output of policy neural network evaluation.

        Records policy evaluation results into agent connectors and
        returns replies to send back to agents in the env.

        Args:
            active_envs: Set of env IDs that are still active.
            to_eval: Mapping of policy IDs to lists of AgentConnectorDataType objects.
            eval_results: Mapping of policy IDs to list of
                actions, rnn-out states, extra-action-fetches dicts.
            off_policy_actions: Doubly keyed dict of env-ids -> agent ids ->
                off-policy-action, returned by a `BaseEnv.poll()` call.

        Returns:
            Nested dict of env id -> agent id -> actions to be sent to
            Env (np.ndarrays).
        r   r      z/EnvRunnerV2 requires action connectors to work.c                     | |         S rS   r\   xis     r=   <lambda>z:EnvRunnerV2._process_policy_eval_results.<locals>.<lambda>e  
    1Q4 r?   c                     | |         S rS   r\   r`  s     r=   rc  z:EnvRunnerV2._process_policy_eval_results.<locals>.<lambda>j  rd  r?   )r   r   ro   r   r   r  r   arrayr   rT  r   r   r  action_connectors	enumeratera   r   r   r  treemap_structurer   outputon_policy_output)r<   r   r   r   r   r   ra   r   rz   actionsrnn_outextra_action_outr   rb  actionr   r\  
rnn_statesfetchesac_dataaction_to_sendaction_to_buffers                         r=   r   z(EnvRunnerV2._process_policy_eval_results-  s   0 FQQUEVEV! 	) 	)F&(OF## %,MMOO A	C A	C Iy(4Y(?(BG&w//G$0$;A$>G%1)%<Q%? '4(( ,(7+++27+;+;G*4<+BINNF'A,2,DA A@A AD 'w// -C -C	6'l1$-aL$9/8|/@/I
151C !'''2 2

 !% 2 !''')9! !
 2Hj6:w2O  7=6N6N7 7 4
G %777'9&'AAA F ,F3H=	 ! 4K%z7;	4 4 '88AAAv'>>>>>4B'11[-C^ r?   c                    | j         r| j        sdS t          j                    }| j                                        }t          |t          j                  r3t          |j	                  dk    r| j        
                    |           n|dvrt          d| j         d          | j                            dt          j                    |z
             dS )zVisualize environment.N   )TFNzThe env's (z) `try_render()` method returned an unsupported value! Make sure you either return a uint8/w x h x 3 (RGB) image or handle rendering in a window and then return `True`.r;   )r   r   r   r   
try_renderr   r   ndarrayr~   shapeimshowr   r   rD   )r<   t5rendereds      r=   r   zEnvRunnerV2._maybe_render  s     | 	4#< 	FY[[ >,,..h
++ 	HN0C0Cq0H0H%,,X66660001dn 1 1 1   	/r1ABBBBBr?   )r   rm   N)1rU   rV   rW   rX   r   r	  r2   r[   rZ   r>   r   r   r   r   r   r)   r   r   r   r   r   r   r   r   r&   r
   r	   r"   r'   r   r   r   r2  r    r$   r#   r?  r%   r  r  r  r<  r   r   r  r(   r   r   r   r\   r?   r=   r   r      sE       EE (+)<
 <
<
 <
 %)	<

 #<
 <
 "%<
 <
 <
 <
 <
 <
|  &
 
 
0>'< > > > >
Xo. 
 
 
 
9d?+ 9 9 9 9v
 
.23;.?
	n	
 
 
 
@N-$N- N- "	N-
 !N- N- 
E
Xt2334U>?234	6
N- N- N- N-`'-'- '- o&	'- '- '- '-R/-/- %gz1223/- E4 4556	/-
 /- h%; <<=/- /- /- /-bW$W$ -W$ 	W$
 ZW$ h%; <<=W$ o&W$ 
W$ W$ W$ W$r!U !y ! ! ! !F2.2.38I9M3N2. 2. 2. 2.h-2-=F-	t[/1	2- - - -^9h%; <<=9 
h((	)9 9 9 9vaZa h%; <<=a 8%556	a
 )a a a aFC C C C Cr?   r   r   c                     |                                  }|sdS g }|D ]Y}t          |t                    }|s dS |                                D ](\  }}|                    t          ||                     )Z|S )zAtari games have multiple logical episodes, one per life.

    However, for metrics reporting we count full episodes, all lives included.
    N)get_sub_environmentsr   r   next_episode_resultsr  r   )r   sub_environments	atari_outsub_envmonitoreps_reweps_lens          r=   r   r     s    
  4466 tI# ? ?$Wj99 	44 ' < < > > 	? 	?GW^GW==>>>>	?r?   mappingr   c                     || vr5t          d                    ||                                                     | |         S )a  Returns an object under key `policy_id` in `mapping`.

    Args:
        mapping (Dict[PolicyID, Union[Policy, Preprocessor, Filter]]): The
            mapping dict from policy id (str) to actual object (Policy,
            Preprocessor, etc.).
        policy_id: The policy ID to lookup.

    Returns:
        Union[Policy, Preprocessor, Filter]: The found object.

    Raises:
        ValueError: If `policy_id` cannot be found in `mapping`.
    z\Could not find policy for agent: PolicyID `{}` not found in policy map, whose keys are `{}`.)r   rs   keys)r  r   s     r=   rT  rT    sJ    " 228&GLLNN2S2S
 
 	
 9r?   )Uloggingr   collectionsr   typingr   r   r   r   r   r	   r
   r   numpyr   ri  ray.rllib.env.base_envr   r   ray.rllib.env.external_envr   %ray.rllib.env.wrappers.atari_wrappersr   r   5ray.rllib.evaluation.collectors.simple_list_collectorr   ray.rllib.evaluation.episode_v2r   ray.rllib.evaluation.metricsr   ray.rllib.models.preprocessorsr   ray.rllib.policy.policyr   ray.rllib.policy.sample_batchr   r   r   ray.rllib.utils.annotationsr   ray.rllib.utils.filterr   ray.rllib.utils.numpyr   "ray.rllib.utils.spaces.space_utilsr   r   ray.rllib.utils.typingr   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   ray.util.debugr,   r   r.   ray.rllib.callbacks.callbacksr/   #ray.rllib.evaluation.rollout_workerr0   	getLoggerrU   rq   r   r   rL   r2   r^   r[   r	  ry   r   r   r   rT  r\   r?   r=   <module>r     s^     # # # # # # S S S S S S S S S S S S S S S S S S S S      > > > > > > > > 9 9 9 9 9 9 P P P P P P P P W W W W W W 5 5 5 5 5 5 7 7 7 7 7 7 7 7 7 7 7 7 * * * * * * V V V V V V V V V V 3 3 3 3 3 3 ) ) ) ) ) ) 2 2 2 2 2 2 J J J J J J J J                                 $ # # # # # BJJJJJJ;;;;;;AAAAAA 
	8	$	$ !  $ 	 D# D# D# D# D# D# D# D#N     k    2W2W(2W 2W !%	2W
 2W 2W 2W 2Wj tK/@ [      XC XC XC XC XC XC XC XCv7 tN/C    $(E&,">??@MU
6<'(     r?   