
    &`iZ                        d dl Z d dlmZ d dlmZmZmZmZmZm	Z	m
Z
 d dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZmZmZmZ d dlmZ d dlmZ d d	lmZ d d
lm Z m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z* d dl+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z: d dl;m<Z< d dl=m>Z> d dl?m@Z@ d dlAmBZBmCZCmDZDmEZE erd dlFmGZG  e*            \  ZHZIdZJ G d deKe          ZL G d de!          ZM G d de>e'          ZNdS )    N)Enum)TYPE_CHECKING
CollectionDictIterableListOptionalUnion)DataIterator)EnvToModulePipeline)ALL_MODULES!COMPONENT_ENV_TO_MODULE_CONNECTORCOMPONENT_RL_MODULEDEFAULT_AGENT_IDDEFAULT_MODULE_ID)Columns)MultiRLModuleSpec)SingleAgentEpisode)SCHEMAOfflinePreLearner)MultiAgentBatch)override)Checkpointable)
get_devicetry_import_torch)DATASET_NUM_ITERS_EVALUATED$DATASET_NUM_ITERS_EVALUATED_LIFETIMEEPISODE_LEN_MAXEPISODE_LEN_MEANEPISODE_LEN_MINEPISODE_RETURN_MAXEPISODE_RETURN_MEANEPISODE_RETURN_MINMODULE_SAMPLE_BATCH_SIZE_MEANNUM_ENV_STEPS_SAMPLEDNUM_ENV_STEPS_SAMPLED_LIFETIMENUM_MODULE_STEPS_SAMPLED!NUM_MODULE_STEPS_SAMPLED_LIFETIMEOFFLINE_SAMPLING_TIMERWEIGHTS_SEQ_NO)MiniBatchRayDataIterator)Runnerconvert_to_torch_tensor)
DeviceType	EpisodeID	StateDict
TensorType)AlgorithmConfigtotal_eval_lossc                       e Zd ZdZdZdZdS )OfflinePolicyEvaluationTypesa  Defines the offline policy evaluation types.

    IS: Importance Sampling.
    PDIS: Per-Decision Importance Sampling. In contrast to IS this method
        weighs each reward and not the return as a whole. As a result it
        usually exhibits lower variance.
    ispdisN)__name__
__module____qualname____doc__ISPDIS     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/offline/offline_policy_evaluation_runner.pyr6   r6   F   s$          
BDDDr@   r6   c                   R    e Zd Zdeeej        f         deeej        f         fdZdS )OfflinePolicyPreEvaluatorbatchreturnc           	         | j         rdd ldd lfd|d         D             }|                     |          }| j                            |           | j                            | j        j        | j	        
                                r | j        j                            dd          nd | j                            dd          pddd          }nA| j        rt          j        | j        |dt"          | j        j        z  | j        j        	          d
         }|                     |          }| j                            |           | j                            | j        j        | j	        
                                r | j        j                            dd          nd | j                            dd          pddd          }nM|                     | j        |t"          | j        j        z  d| j        j        | j        | j                  d
         }g }|D ]}i }|                    t1          dt3          |                              |t4          j        <   |                                |t4          j        <   |                                |t4          j        <   |                     t4          j!                  |t4          j!        <   |"                    |           d
|iS )Nr   c                 l    g | ]0}t          j                            |j                             1S ))object_hook)r   
from_stateunpackbdecode).0statemnpmsgpacks     rA   
<listcomp>z6OfflinePolicyPreEvaluator.__call__.<locals>.<listcomp>\   sM     2 2 2  #-OOEszOBB 2 2 2r@   itemmax_seq_lenn_step   T)	num_itemsbatch_length_TrS   sample_episodesto_numpy)rX   schemainput_compress_columnsepisodesF)rY   rX   rZ   observation_spaceaction_space)key)#input_read_episodesrO   msgpack_numpy_validate_episodesepisode_bufferaddsampleconfigtrain_batch_size_per_learner_moduleis_statefulmodel_configgetinput_read_sample_batchesr   _map_sample_batch_to_episode_is_multi_agentr   input_read_schemarZ   _map_to_episodesr\   r]   get_observationsslicelenr   OBSget_actionsACTIONSget_rewardsREWARDSget_extra_model_outputsACTION_LOGPappend)selfrD   r[   episode_dictsepisodeepisode_dictrN   rO   s         @@rA   __call__z"OfflinePolicyPreEvaluator.__call__T   s   # G	NNN''''2 2 2 2 2 #6]	2 2 2H ..x88H##H---*11+B |//11DK,00BBB{x338q !% 2  HH + (	 ">$ =='+{'I     ..x88H##H---*11+B |//11DK,00BBB{x338q !% 2  HH 261F1F$ =='+{'I"&"8!. 2G 2 2 2H  	/ 	/G L(/(@(@q#g,,AWAW(X(XL%,3,?,?,A,AL),3,?,?,A,AL)070O0O' 1P 1 1L,-   ....M**r@   N)r9   r:   r;   r   strnumpyndarrayr   r?   r@   rA   rC   rC   S   sV        Y+d3#56 Y+4U]@R;S Y+ Y+ Y+ Y+ Y+ Y+r@   rC   c                   T   e Zd Z	 d/dddee         fdZ	 	 d0ded	ed
dfdZd
efdZ	ded	ed
dfdZ
 ee          d             Z ee          	 d/dddeeeee         f                  deeeee         f                  d
efd            Zd
efdZd1dZd1dZ ee          d             Z ee          d             Z	 	 	 d2dedededed
ef
dZ ee          ded
dfd            Zded ed
dfd!Zd"ed#efd$Z  ee          d%             Z! ee          d&             Z"e#d
e$fd'            Z%d( Z&e#d
e'fd)            Z(e#d
ee)df         fd*            Z*e#d
efd+            Z+e#d
e,ee-j.        j/        f         fd,            Z0e#d
e1fd-            Z2e#d
e3fd.            Z4dS )3OfflinePolicyEvaluationRunnerNre   r3   module_specc                 H   || _         d | _        d | _        t          j        | fd|i| t          j        |            |                    d          | _        | j        	                    | j
        | j                  | _        t          | j        d                   | _        d S )Nre   spaces)r   deviceoffline_evaluation_type)+_OfflinePolicyEvaluationRunner__module_spec0_OfflinePolicyEvaluationRunner__dataset_iterator._OfflinePolicyEvaluationRunner__batch_iteratorr,   __init__r   rj   &_OfflinePolicyEvaluationRunner__spacesre   build_env_to_module_connector_spaces_device-_OfflinePolicyEvaluationRunner__env_to_moduler6   7_OfflinePolicyEvaluationRunner__offline_evaluation_type)r{   re   r   kwargss       rA   r   z&OfflinePolicyEvaluationRunner.__init__   s     1<"& $66V6v666%%% 

8,,#{HH<  I  
  
 *FK12*
 *
&&&r@   FTexploretrainrE   c                    | j         t          |  d          | j        s | j        di | j        j        | _        | j                            t          | j
        d           | j                            t                    5  || j        j        }|                     ||          cd d d            S # 1 swxY w Y   d S )NzS doesn't have a data iterator. Can't call `run` on `OfflinePolicyEvaluationRunner`.rT   )r^   valuewindow)r   r   r?   )r   
ValueError_batch_iterator_create_batch_iteratorre   iter_batches_kwargsr   metrics	log_valuer*   _weights_seq_nolog_timer)   r   	_evaluate)r{   r   r   r   s       rA   runz!OfflinePolicyEvaluationRunner.run   s>    "* 3 3 3  
 # 	$?D$? % %+1% %D!
 	& 	 	
 	
 	
 \""#9:: 	 	+- >> "  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   %B55B9<B9c           	          ddl m dt          t          t          j        f         dt          t          t          t          t          j        f         f         fd}dt          t          t          t          t          j        f         f         dt          t          t          t          t          f         f         f fd}t          d j	        || j
        j         j
        j        d|S )	Nr   )+convert_ndarray_batch_to_torch_tensor_batch_batchrE   c                     | d         S )Nr[   r?   )r   s    rA   _collate_fnzIOfflinePolicyEvaluationRunner._create_batch_iterator.<locals>._collate_fn   s     *%%r@   c                 "    fd| D             S )Nc                 J    g | ]} |j         t          j                    S ))r   dtypes)r   torchfloat32)rL   r}   r   r{   s     rA   rP   z^OfflinePolicyEvaluationRunner._create_batch_iterator.<locals>._finalize_fn.<locals>.<listcomp>  sH         <;DL    r@   r?   )r   r   r{   s    rA   _finalize_fnzJOfflinePolicyEvaluationRunner._create_batch_iterator.<locals>._finalize_fn   s5          &	   r@   )iterator
collate_fnfinalize_fnminibatch_size	num_itersr?   )ray.air._internal.torch_utilsr   r   r   r   r   r0   r2   r+   _dataset_iteratorre   "offline_eval_batch_size_per_runner!dataset_num_iters_per_eval_runner)r{   r   r   r   r   s   `   @rA   r   z4OfflinePolicyEvaluationRunner._create_batch_iterator   s   	
 	
 	
 	
 	
 	
	&em+,	&)T#u}"4556	& 	& 	& 	&		Dem);$<<=		)T#z/223		 		 		 		 		 		 		 ( 
+"$;IkC
 
 
 
 	
r@   c                    d}t          | j                  D ]m\  }}|D ]@}| j        t                                                   }| j        t                                       |          t          j                 }|                    |          }	|		                                }
|	
                    |
          }t          j        |v r|t          j                 }n%|	
                    |t          j                           }| j        t          j        k    rt                               t                               |          t                               |          z            }|t          j                                                 }||z                                  }n| j        t          j        k    rrt                               |          t                               |          z  }t                               ||t          j                                                           }|t          j                 j        d         dz   }||z  }|                     ||           B|                     t7          |          |           o| j                            t<          t>          f|dz   d           | j                            t<          t@          f|dz   d           | j        !                                S )Nr   rT   sum)reducelifetime_sum)"	enumerater   moduler   get_inference_action_dist_clsforward_inferencer   ACTION_DIST_INPUTSfrom_logitsrd   logpry   ru   r   r6   r=   r   prodexprw   r   rQ   r>   dotshape_log_episode_metrics_log_batch_metricsrr   r   r   r   r   r   r   )r{   r   r   num_env_steps	iterationtensor_minibatchr}   action_dist_clsaction_logitsaction_distactionsaction_logpbehavior_action_logpweightepisode_returnoffline_returnweightsepisode_lens                     rA   r   z'OfflinePolicyEvaluationRunner._evaluate  s    +4T5I+J+J )	J )	J'I'+ &G &G"&+%#//11   !%,= > P P! !,!.
 .99-HH%,,..)..w77&'11+273F+G(( ,7+;+;GGO<T+U+U( 15Q5TTT"ZZ		+..;O1P1PP F
 &-W_%=%A%A%C%CN&,~&=%C%C%E%ENN26R6WWW#ii44uyyAU7V7VVG%*YYw8P%Q%Q%V%V%X%XN%go6<Q?!C,))+~FFFF##C(8$9$9=IIII 	56M 	 	
 	
 	

 	>?M! 	 	
 	
 	
 |""$$$r@   c                     dd| j         ifS )Nr?   re   )re   r{   s    rA   get_ctor_args_and_kwargsz6OfflinePolicyEvaluationRunner.get_ctor_args_and_kwargsS  s     t{#
 	
r@   )not_components
componentsr   c                   t           | j                            t           d          i}|                     t          ||          r^ | j        j        d|                     t          |          |                     t          |          d||t          <   | j        |t          <   |                     t          ||          r!| j                                        |t          <   |S )Nr   )default)r   r   r?   )r&   r   peek_check_componentr   r   	get_state_get_subcomponentsr   r*   r   _env_to_module)r{   r   r   r   rM   s        rA   r   z'OfflinePolicyEvaluationRunner.get_stateZ  s     +!!"@!!LL
   !4j.QQ 	9)>)> *223F
SS#66'   * *
 * *E%& %)$8E.!  -z>
 
 	W 8<7J7T7T7V7VE34r@   c                      t          |          S )z0Converts structs to a framework-specific tensor.r-   )r{   structs     rA   _convert_to_tensorz0OfflinePolicyEvaluationRunner._convert_to_tensorx  s    &v...r@   c                     dS )zReleases all resources used by this EnvRunner.

        For example, when using a gym.Env in this EnvRunner, you should make sure
        that its `close()` method is called.
        Nr?   r   s    rA   stopz"OfflinePolicyEvaluationRunner.stop|  s	     	r@   c                     dS )z:If this Actor is deleted, clears all resources used by it.Nr?   r   s    rA   __del__z%OfflinePolicyEvaluationRunner.__del__  s    r@   c                 8    | j         rt          | d          sJ dS )a  Checks that self.__init__() has been completed properly.

        Ensures that the instances has a `MultiRLModule` and an
        environment defined.

        Raises:
            AssertionError: If the EnvRunner Actor has NOT been properly initialized.
        r   N)r   hasattrr   s    rA   assert_healthyz,OfflinePolicyEvaluationRunner.assert_healthy  s(     %A'$*A*AAAAAAr@   c                 4    | j                                         S N)r   r   r   s    rA   get_metricsz)OfflinePolicyEvaluationRunner.get_metrics  s    |""$$$r@   rD   	to_device
pin_memory
use_streamc                     t          |j        |r| j        nd ||          }t          d |                                D                       }t          ||          }|S )N)r   r   r   c              3   4   K   | ]}t          |          V  d S r   )rr   )rL   bs     rA   	<genexpr>zDOfflinePolicyEvaluationRunner._convert_batch_type.<locals>.<genexpr>  s(      44SVV444444r@   )	env_steps)r.   policy_batchesr   maxvaluesr   )r{   rD   r   r   r   lengths         rA   _convert_batch_typez1OfflinePolicyEvaluationRunner._convert_batch_type  so     ( #,64<<$!!	
 
 
 44U\\^^44444888r@   rM   c                    t           |v r%| j                            |t                               t          |v r|                    t
          d          }|dk    s| j        |k     rU|t                   }t          |t          j	                  rt          j        |          }| j
                            |           |dk    r|| _        d S d S d S Nr   )r   r   	set_stater   rj   r*   r   
isinstanceray	ObjectRefr   )r{   rM   weights_seq_norl_module_states       rA   r   z'OfflinePolicyEvaluationRunner.set_state  s    ,55))%0Q*RSSS %'' #YY~q99N ""d&:^&K&K"'(;"<os}== ?&)go&>&>O%%o666 !!'5$$$ (' "!r@   r   r   c           	         t          dt          t          j        | j        j        | j        j        pdz                                }| j                            t          ||           | j                            t          ||           | j                            dt          f||           | j                            dt          f||           | j                            t          |d|           | j                            t          |d|           | j                            t          |d|           | j                            t           |d|           dS )	z&Logs episode metrics for each episode.rT   r   agent_episode_return_meanmodule_episode_return_meanmin)r   r   r   N)r   intmathceilre   "metrics_num_episodes_for_smoothingnum_offline_eval_runnersr   r   r   r"   r   r   r    r#   r   r!   )r{   r   r   wins       rA   r   z2OfflinePolicyEvaluationRunner._log_episode_metrics  s    	KB{;@qB  
 
 	/SIII2N3OOO(*:;^TW 	 	
 	
 	
 	)+<= 	 	
 	
 	
 	ERUVVVuS 	 	
 	
 	
 	ERUVVVuS 	 	
 	
 	
 	
 	
r@   
batch_sizer   c                    | j                             t          t          f| j        d           | j                             t          t
          f|           | j                             t          t          f|d           | j                             t          t          f|d           | j                             t          t          f|d           | j                             t          t          f|d           | j                             t          t          f|d           | j                             t          t          f|dd           d	S )
z'Logs batch metrics for each mini batch.rT   r  )r^   r   r   )r^   r   r   r   T)r^   r   r   with_throughputN)r   r   r   r*   r   r$   r'   r(   r   r%   r&   )r{   r  r   s      rA   r   z0OfflinePolicyEvaluationRunner._log_batch_metrics  s    	/  	 	
 	
 	
 	"$AB 	 	
 	
 	

 	"$<= 	 	
 	
 	

 	"$EF! 	 	
 	
 	
 	67 	 	
 	
 	

 	?@! 	 	
 	
 	
 	34 	 	
 	
 	

 	<=! 	 	 	
 	
 	
 	
 	
r@   c                     	 t          | j        | j        sdn| j        j                  | _        d S # t
          $ r d | _        Y d S w xY wr   )r   re   worker_index num_gpus_per_offline_eval_runner&_OfflinePolicyEvaluationRunner__deviceNotImplementedErrorr   s    rA   
set_devicez(OfflinePolicyEvaluationRunner.set_device  se    
	!&  ,FAAE DMMM # 	! 	! 	! DMMMM	!s   -1 AAc                 f    	 ddl m}  j        sN j                             j        j        | j        j         j        j        fi j        j                   _	         j        
                                 _         j                             fd           d S # t          $ r d  _        Y d S w xY w)Nr   )INPUT_ENV_SPACES)envr   inference_onlyc                 z    t          |t          j        j                  r|                    j                  n|S r   )r   r   nnModuletor   )midmodr{   s     rA   <lambda>z;OfflinePolicyEvaluationRunner.make_module.<locals>.<lambda>D  s2    ,6sEHO,L,LUCFF4<(((RU r@   )ray.rllib.envr  _module_specre   get_multi_rl_module_specr  r\   r]   %offline_eval_rl_module_inference_onlyr   buildr   foreach_moduler  )r{   r  s   ` rA   make_modulez)OfflinePolicyEvaluationRunner.make_module*  s     	666666$ %)[%I%I( K9 K4+ $(;#T &J 
& 
&" +1133DK K&&        # 	 	 	DKKKK	s   BB B0/B0c                     | j         S )zReturns the dataset iterator.r   r   s    rA   r   z/OfflinePolicyEvaluationRunner._dataset_iteratorN  s     &&r@   c                     || _         dS )zSets the dataset iterator.Nr,  )r{   r   s     rA   set_dataset_iteratorz2OfflinePolicyEvaluationRunner.set_dataset_iteratorS  s    "*r@   c                     | j         S r   )r   r   s    rA   r   z-OfflinePolicyEvaluationRunner._batch_iteratorW  s    $$r@   c                     | j         S r   )r  r   s    rA   r   z%OfflinePolicyEvaluationRunner._device[  s
    }r@   c                     | j         S )z1Returns the `MultiRLModuleSpec` of this `Runner`.)r   r   s    rA   r%  z*OfflinePolicyEvaluationRunner._module_spec_  s     !!r@   c                     | j         S )z$Returns the spaces of thsi `Runner`.)r   r   s    rA   r   z%OfflinePolicyEvaluationRunner._spacesd  s     }r@   c                     | j         S )z4Returns the env-to-module pipeline of this `Runner`.)r   r   s    rA   r   z,OfflinePolicyEvaluationRunner._env_to_modulei  s     ##r@   c                     | j         S )z5Returns the offline evaluation type of this `Runner`.)r   r   s    rA   _offline_evaluation_typez6OfflinePolicyEvaluationRunner._offline_evaluation_typen  s     --r@   r   )FT)rE   N)TFF)5r9   r:   r;   r	   r   r   boolr   r   r   r   r   r   r   r
   r   r   r1   r   r2   r   r   r   r,   r   r   r   r   r   r
  floatr   r   r  r*  propertyr   r   r.  r+   r   r/   r   r%  r   gymr   Spacer   r   r   r   r5  r?   r@   rA   r   r      sy        48
 
!
 /0
 
 
 
8 ! !! !
 
! ! ! !F#
( #
 #
 #
 #
J>%>% >% 
	>% >% >% >%@ Xn
 
 
 Xn =A AE	  U3
3#789 !sJsO';!<=	 
   :/J / / / /       Xf
B 
B 
B Xf% % %      	
  
   $ Xn6y 6T 6 6 6 6,&
 &
U &
t &
 &
 &
 &
P0
S 0
 0
 0
 0
 0
d Xf! ! ! Xf! ! !F '< ' ' ' X'+ + + %!9 % % % X% z4/0    X "/ " " " X" c3:#334    X $ 3 $ $ $ X$ .$ . . . X. . .r@   r   )Or  enumr   typingr   r   r   r   r   r	   r
   	gymnasiumr9  r   r  ray.data.iteratorr   "ray.rllib.connectors.env_to_moduler   ray.rllib.corer   r   r   r   r   ray.rllib.core.columnsr   (ray.rllib.core.rl_module.multi_rl_moduler   "ray.rllib.env.single_agent_episoder   $ray.rllib.offline.offline_prelearnerr   r   ray.rllib.policy.sample_batchr   ray.rllib.utils.annotationsr   ray.rllib.utils.checkpointsr   ray.rllib.utils.frameworkr   r   ray.rllib.utils.metricsr   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   ray.rllib.utils.minibatch_utilsr+   ray.rllib.utils.runners.runnerr,   ray.rllib.utils.torch_utilsr.   ray.rllib.utils.typingr/   r0   r1   r2   %ray.rllib.algorithms.algorithm_configr3   r   _TOTAL_EVAL_LOSS_KEYr   r6   rC   r   r?   r@   rA   <module>rQ     s                                



 * * * * * * B B B B B B              + * * * * * F F F F F F A A A A A A J J J J J J J J 9 9 9 9 9 9 0 0 0 0 0 0 6 6 6 6 6 6 B B B B B B B B                                 " E D D D D D 1 1 1 1 1 1 ? ? ? ? ? ?             FEEEEEEq' 
 
 
 
 
3 
 
 
Z+ Z+ Z+ Z+ Z+ 1 Z+ Z+ Z+zA. A. A. A. A.FN A. A. A. A. A.r@   