
    &`i1                         d dl Z d dlZd dlmZmZmZmZ d dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZmZ d dlmZ d dlmZ  e j                    Z e G d de                      Z!dS )    N)AnyDictListOptional)Dataset)FQETorchModel)OffPolicyEstimator)compute_q_and_v_values)OfflineEvaluator)Policy)SampleBatch convert_ma_batch_to_sample_batch)DeveloperAPIoverride)convert_to_numpy)SampleBatchTypec                       e Zd ZdZ ee          	 	 ddedededee	         f fd            Z
 ee          d	ed
e	eef         fd            Z ee          ded
e	eee         f         fd            Zd Z ee          ded
e	eef         fd            Z ee          dddeded
e	eef         fd            Z xZS )DirectMethoda  The Direct Method estimator.

    Let s_t, a_t, and r_t be the state, action, and reward at timestep t.

    This method trains a Q-model for the evaluation policy \pi_e on behavior
    data generated by \pi_b. Currently, RLlib implements this using
    Fitted-Q Evaluation (FQE). You can also implement your own model
    and pass it in as `q_model_config = {"type": your_model_class, **your_kwargs}`.

    This estimator computes the expected return for \pi_e for an episode as:
    V^{\pi_e}(s_0) = \sum_{a \in A} \pi_e(a | s_0) Q(s_0, a)
    and returns the mean and standard deviation over episodes.

    For more information refer to https://arxiv.org/pdf/1911.06854.pdf        Npolicygammaepsilon_greedyq_model_configc                 \   t                                          |||           t          |d          r)|j                            dd          dk    s
J d            |pi }|                    dt                    } |d
||d|| _        t          | j        d          s
J d            d	S )a  Initializes a Direct Method OPE Estimator.

        Args:
            policy: Policy to evaluate.
            gamma: Discount factor of the environment.
            epsilon_greedy: The probability by which we act acording to a fully random
                policy during deployment. With 1-epsilon_greedy we act according the
                target policy.
            q_model_config: Arguments to specify the Q-model. Must specify
                a `type` key pointing to the Q-model class.
                This Q-model is trained in the train() method and is used
                to compute the state-value estimates for the DirectMethod estimator.
                It must implement `train` and `estimate_v`.
                TODO (Rohan138): Unify this with RLModule API.
        config	frameworktorchz,Framework must be torch to use DirectMethod.type)r   r   
estimate_vz'self.model must implement `estimate_v`!N )super__init__hasattrr   getpopr   model)selfr   r   r   r   	model_cls	__class__s         ~/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/offline/estimators/direct_method.pyr"   zDirectMethod.__init__&   s    0 	777 68$$ 	>!!+w777BBB= CBB (-2"&&v}==	Y 

 
 
 


 J
 
 	5 	54	5 	5 
 	5 	5    episodereturnc                     i }|d         }d}t          |j                  D ]}|||         | j        |z  z  z  }|                     |d d                   }||d<   ||d<   |S )Nrewardsr      
v_behaviorv_target)rangecountr   _compute_v_target)r'   r,   estimates_per_epsioder/   r1   tr2   s          r*   estimate_on_single_episodez'DirectMethod.estimate_on_single_episodeR   s     ")$
w}%% 	5 	5A'!*tz1}44JJ))'"1"+66.8l+,4j)$$r+   batchc                 \    i }|d         }|}|                      |          }||d<   ||d<   |S )Nr/   r1   r2   )r5   )r'   r9   r6   r/   r1   r2   s         r*   estimate_on_single_step_samplesz,DirectMethod.estimate_on_single_step_samplesb   sH     !#	"
))%00.8l+,4j)$$r+   c                 X    | j                             |          }t          |          }|S )N)r&   r   r   )r'   	init_stepr2   s      r*   r5   zDirectMethod._compute_v_targetq   s)    :((33#H--r+   c                     t          |          }| j                            |          }dt          j        |          iS )zTrains self.model on the given batch.

        Args:
            batch: A SampleBatchType to train on

        Returns:
            A dict with key "loss" and value as the mean training loss.
        loss)r   r&   trainnpmean)r'   r9   lossess      r*   r@   zDirectMethod.trainv   s9     177!!%((((r+   .)n_parallelismdatasetrD   c          	         t          |                                |z  d          }|                    t          |d| j        j        | j                                        dd          }|                    d          }|                    d          }||z  }|                    d          |z  t          j
        |                                          z  }||||dS )	ao  Calculates the Direct Method estimate on the given dataset.

        Note: This estimate works for only discrete action spaces for now.

        Args:
            dataset: Dataset to compute the estimate on. Each record in dataset should
                include the following columns: `obs`, `actions`, `action_prob` and
                `rewards`. The `obs` on each row shoud be a vector of D dimensions.
            n_parallelism: The number of parallel workers to use.

        Returns:
            Dictionary with the following keys:
                v_target: The estimated value of the target policy.
                v_behavior: The estimated value of the behavior policy.
                v_gain: The estimated gain of the target policy over the behavior
                    policy.
                v_std: The standard deviation of the estimated value of the target.
        r0   pandasF)model_classmodel_statecompute_q_values)
batch_sizebatch_format	fn_kwargsr/   v_values)r1   r2   v_gain_mean
v_gain_ste)maxr4   map_batchesr
   r&   r)   	get_staterB   stdmathsqrt)	r'   rE   rD   rK   
updated_dsr1   r2   rO   rP   s	            r*   estimate_on_datasetz DirectMethod.estimate_on_dataset   s    . M91==
(("!!#z3#z3355$) 	 ) 	
 	

  __Y//
??:..+NN:&&3di6P6PP 	
 % &$	
 
 	
r+   )r   N)__name__
__module____qualname____doc__r   r	   r   floatr   r   r"   r   strr   r8   r   r;   r5   r   r@   r   r   intrX   __classcell__)r)   s   @r*   r   r      s       J J X !!
 !$)-)5 )5)5 )5 	)5
 !)5 )5 )5 )5 )5 "!)5V X !!%+ %$sCx. % % % "!% X !!% %	c4;	% % % "!%  
 X !!)? )tCH~ ) ) ) "!) X8;.
 .
 .
.
25.
	c3h.
 .
 .
  .
 .
 .
 .
 .
r+   r   )"loggingrU   typingr   r   r   r   numpyrA   ray.datar   ,ray.rllib.offline.estimators.fqe_torch_modelr   1ray.rllib.offline.estimators.off_policy_estimatorr	   *ray.rllib.offline.offline_evaluation_utilsr
   #ray.rllib.offline.offline_evaluatorr   ray.rllib.policyr   ray.rllib.policy.sample_batchr   r   ray.rllib.utils.annotationsr   r   ray.rllib.utils.numpyr   ray.rllib.utils.typingr   	getLoggerloggerr   r    r+   r*   <module>rp      sb     , , , , , , , , , , , ,           F F F F F F P P P P P P M M M M M M @ @ @ @ @ @ # # # # # # W W W W W W W W > > > > > > > > 2 2 2 2 2 2 2 2 2 2 2 2				 ]
 ]
 ]
 ]
 ]
% ]
 ]
 ]
 ]
 ]
r+   