
    &`i%                        d dl Z d dlZd dlmZmZmZmZ d dlZd dl	Z
d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZmZ d dlmZ d dl m!Z!  e j"                    Z#e G d de                      Z$dS )    N)AnyDictListOptional)Dataset)FQETorchModel)OffPolicyEstimator)compute_is_weightscompute_q_and_v_values)OfflineEvaluator)Policy)SampleBatch convert_ma_batch_to_sample_batch)DeveloperAPIoverride)convert_to_numpy)SampleBatchTypec                       e Zd ZdZ ee          	 	 	 ddedededed	e	e
         f
 fd
            Z ee          dede
eef         fd            Z ee          dede
eee         f         fd            Z ee          dede
eef         fd            Z ee          dddedede
eef         fd            Z xZS )DoublyRobusta.  The Doubly Robust estimator.

    Let s_t, a_t, and r_t be the state, action, and reward at timestep t.

    This method trains a Q-model for the evaluation policy \pi_e on behavior
    data generated by \pi_b. Currently, RLlib implements this using
    Fitted-Q Evaluation (FQE). You can also implement your own model
    and pass it in as `q_model_config = {"type": your_model_class, **your_kwargs}`.

    For behavior policy \pi_b and evaluation policy \pi_e, define the
    cumulative importance ratio at timestep t as:
    p_t = \sum_{t'=0}^t (\pi_e(a_{t'} | s_{t'}) / \pi_b(a_{t'} | s_{t'})).

    Consider an episode with length T. Let V_T = 0.
    For all t in {0, T - 1}, use the following recursive update:
    V_t^DR = (\sum_{a \in A} \pi_e(a | s_t) Q(s_t, a))
        + p_t * (r_t + \gamma * V_{t+1}^DR - Q(s_t, a_t))

    This estimator computes the expected return for \pi_e for an episode as:
    V^{\pi_e}(s_0) = V_0^DR
    and returns the mean and standard deviation over episodes.

    For more information refer to https://arxiv.org/pdf/1911.06854.pdf        TNpolicygammaepsilon_greedynormalize_weightsq_model_configc                 `   t                                          |||           |pi }||d<   |                    dt                    | _        || _        || _         | j        d	d|i|| _        t          | j        d          s
J d            t          | j        d          s
J d            dS )
a=  Initializes a Doubly Robust OPE Estimator.

        Args:
            policy: Policy to evaluate.
            gamma: Discount factor of the environment.
            epsilon_greedy: The probability by which we act acording to a fully random
                policy during deployment. With 1-epsilon_greedy we act
                according the target policy.
            normalize_weights: If True, the inverse propensity scores are normalized to
                their sum across the entire dataset. The effect of this is similar to
                weighted importance sampling compared to standard importance sampling.
            q_model_config: Arguments to specify the Q-model. Must specify
                a `type` key pointing to the Q-model class.
                This Q-model is trained in the train() method and is used
                to compute the state-value and Q-value estimates
                for the DoublyRobust estimator.
                It must implement `train`, `estimate_q`, and `estimate_v`.
                TODO (Rohan138): Unify this with RLModule API.
        r   typer   
estimate_vz'self.model must implement `estimate_v`!
estimate_qz'self.model must implement `estimate_q`!N )	super__init__popr   
_model_cls_model_configs_normalize_weightsmodelhasattr)selfr   r   r   r   r   	__class__s         ~/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/offline/estimators/doubly_robust.pyr"   zDoublyRobust.__init__3   s    : 	777'-2"'w(,,V]CC,"3$T_ 
 


 

 J
 
 	5 	54	5 	5 
 J
 
 	5 	54	5 	5 
 	5 	5    episodereturnc                 R   i }|d         |d         }}|                      |          }||z  }d}d}| j                            |          }	t          |	          }	| j                            |          }
t          |
          }
|	j        |
j        cxk    r|j        fk    sn J t          t          |j                            D ]C}||         | j	        |z  z   }|
|         ||         ||         | j	        |z  z   |	|         z
  z  z   }D|
                                }||d<   ||d<   |S )Nrewardsaction_probr   
v_behaviorv_target)compute_action_probsr'   r   r   r   shapecountreversedranger   item)r)   r-   estimates_per_epsioder0   old_probnew_probweightr2   r3   q_valuesv_valuests               r+   estimate_on_single_episodez'DoublyRobust.estimate_on_single_episodec   sL    "#I.0F,,W55H$
:((11#H--:((11#H--~CCCCGM3CCCCCCC%..// 	 	A dj:&==J{VAY
TZ(22Xa[@& HH ==??.8l+,4j)$$r,   batchc                 0   i }|d         |d         }}|                      |          }| j                            |          }t          |          }| j                            |          }t          |          }|}||z  }	||	||z
  z  z   }
||d<   |
|d<   |S )Nr0   r1   r2   r3   )r4   r'   r   r   r   )r)   rB   r:   r0   r;   r<   r>   r?   r2   r=   r3   s              r+   estimate_on_single_step_samplesz,DoublyRobust.estimate_on_single_step_samples   s     !#!),eM.B,,U33:((//#H--:((//#H--
H$f((:;;.8l+,4j)$$r,   c                     t          |          }| j                            |          }dt          j        |          iS )zTrains self.model on the given batch.

        Args:
        batch: A SampleBatch or MultiAgentbatch to train on

        Returns:
            A dict with key "loss" and value as the mean training loss.
        loss)r   r'   trainnpmean)r)   rB   lossess      r+   rG   zDoublyRobust.train   s9     177!!%((((r,   .)n_parallelismdatasetrK   c                &   t          |                                |z  d          }|                    t          |d| j                                        | j        d          }t          |                                |z  d          }|                    t          |d| j        j        | j                                        d          }ddt          j
        dt          fd	}| j        r|                    d
          nd}|                    ||dd|i          }|                    d          }|                    d          }||z  }	|                    d          |z  |z  t          j        |                                          z  }
|||	|
dS )a  Estimates the policy value using the Doubly Robust estimator.

        The doubly robust estimator uses normalization of importance sampling weights
        (aka. propensity ratios) to the average of the importance weights across the
        entire dataset. This is done to reduce the variance of the estimate (similar to
        weighted importance sampling). You can disable this by setting
        `normalize_weights=False` in the constructor.

        Note: This estimate works for only discrete action spaces for now.

        Args:
            dataset: Dataset to compute the estimate on. Each record in dataset should
                include the following columns: `obs`, `actions`, `action_prob` and
                `rewards`. The `obs` on each row shoud be a vector of D dimensions.
            n_parallelism: Number of parallelism to use for the computation.

        Returns:
            A dict with the following keys:
                v_target: The estimated value of the target policy.
                v_behavior: The estimated value of the behavior policy.
                v_gain: The estimated gain of the target policy over the behavior
                    policy.
                v_std: The standard deviation of the estimated value of the target.
           pandas)policy_stateestimator_class)
batch_sizebatch_format	fn_kwargs)model_classmodel_state      ?rB   
normalizerc                 r    | d         |z  }| d         || d         | d         z
  z  z   | d<   | d         | d<   | S )Nweightsr?   r0   r>   r3   r2   r    )rB   rX   rZ   s      r+   compute_v_targetz:DoublyRobust.estimate_on_dataset.<locals>.compute_v_target   sT    I&3G %j 1Gi 5#445 !E* #(	"2E,Lr,   rZ   r2   r3   )r2   r3   v_gain_mean
v_gain_ste)rW   )maxr6   map_batchesr
   r   	get_stater*   r   r'   pd	DataFramefloatr&   rI   stdmathsqrt)r)   rL   rK   rR   
updated_dsr[   rX   r2   r3   r\   r]   s              r+   estimate_on_datasetz DoublyRobust.estimate_on_dataset   s   < M91==
((!! $ 5 5 7 7#'> 	 ) 
 

 ))++}<a@@
++"!!#z3#z3355 	 , 
 

	 	BL 	e 	 	 	 	 483JSZ__Y///PS
++!!#Z0	 , 
 

  __\22
??:..+NN:&& i(() 	 % &$	
 
 	
r,   )r   TN)__name__
__module____qualname____doc__r   r	   r   rc   boolr   r   r"   r   strr   rA   r   rD   r   rG   r   r   intrh   __classcell__)r*   s   @r+   r   r      s       J J0 X !!
 !$"&)--5 -5-5 -5 	-5
  -5 !-5 -5 -5 -5 -5 "!-5^ X !!%+ %$sCx. % % % "!%8 X !!% %	c4;	% % % "!%. X !!)? )tCH~ ) ) ) "!) X8;T
 T
 T
T
25T
	c3hT
 T
 T
  T
 T
 T
 T
 T
r,   r   )%loggingre   typingr   r   r   r   numpyrH   rO   ra   ray.datar   ,ray.rllib.offline.estimators.fqe_torch_modelr   1ray.rllib.offline.estimators.off_policy_estimatorr	   *ray.rllib.offline.offline_evaluation_utilsr
   r   #ray.rllib.offline.offline_evaluatorr   ray.rllib.policyr   ray.rllib.policy.sample_batchr   r   ray.rllib.utils.annotationsr   r   ray.rllib.utils.numpyr   ray.rllib.utils.typingr   	getLoggerloggerr   r    r,   r+   <module>r      s     , , , , , , , , , , , ,               F F F F F F P P P P P P        A @ @ @ @ @ # # # # # # W W W W W W W W > > > > > > > > 2 2 2 2 2 2 2 2 2 2 2 2				 a
 a
 a
 a
 a
% a
 a
 a
 a
 a
r,   