
    &`i                     f   d dl mZmZmZmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZ erd dlmZ d dlmZ e	 dd
ej        ded         deeef         dedej        f
d            Zed
ej        deeef         ded         dej        fd            Zed
ej        dej        fd            ZdS )    )TYPE_CHECKINGAnyDictTypeN)Policy)SampleBatch)DeveloperAPI)convert_to_numpy)FQETorchModel)OffPolicyEstimatorTbatchmodel_classr   model_statecompute_q_valuesreturnc           	         |                     |          }t          t          j        t          j        | t          j                           t          j        t          j        | t          j                                               d          i          }|                    |          }t          |          }|| d<   |r)|	                    |          }t          |          }|| d<   | S )a  Computes the Q and V values for the given batch of samples.

    This function is to be used with map_batches() to perform a batch prediction on a
    dataset of records with `obs` and `actions` columns.

    Args:
        batch: A sub-batch from the dataset.
        model_class: The model class to use for the prediction. This class should be a
            sub-class of FQEModel that implements the estimate_q() and estimate_v()
            methods.
        model_state: The state of the model to use for the prediction.
        compute_q_values: Whether to compute the Q values or not. If False, only the V
            is computed and returned.

    Returns:
        The modified batch with the Q and V values added as columns.
    v_valuesq_values)

from_stater   OBSnpvstackACTIONSsqueeze
estimate_vr
   
estimate_q)r   r   r   r   modelsample_batchr   r   s           ~/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/offline/offline_evaluation_utils.pycompute_q_and_v_valuesr!      s    0 "";//EORYu[_'=>>51D+E!F!F!N!Nr!R!R	
 L --H))H E* %##L11#H--$jL    policy_stateestimator_classr   c                    t          j        |          } ||dd          }t          t          j        t	          j        | d         j                  t          j        t	          j        | d         j                                      d          t          j	        t	          j        | d         j                                      d          t          j
        t	          j        | d         j                                      d          i          }|                    |          }|t          j	                 }|t          j
                 }||z  }	|	|z  }
|	| d<   |
| d	<   || d
<   || d<   | S )a%  Computes the importance sampling weights for the given batch of samples.

    For a lot of off-policy estimators, the importance sampling weights are computed as
    the propensity score ratio between the new and old policies
    (i.e. new_pi(act|obs) / old_pi(act|obs)). This function is to be used with
    map_batches() to perform a batch prediction on a dataset of records with `obs`,
    `actions`, `action_prob` and `rewards` columns.

    Args:
        batch: A sub-batch from the dataset.
        policy_state: The state of the policy to use for the prediction.
        estimator_class: The estimator class to use for the prediction. This class

    Returns:
        The modified batch with the importance sampling weights, weighted rewards, new
        and old propensities added as columns.
    r   )policygammaepsilon_greedyobsactionsr   action_probrewardsweightsweighted_rewardsnew_probold_prob)r   r   r   r   r   r   valuesr   r   ACTION_PROBREWARDScompute_action_probs)r   r#   r$   r&   	estimatorr   r/   r0   r,   r-   r.   s              r    compute_is_weightsr6   =   sA   . |,,FvQqIIIIORYuU|':;;5+;+B!C!C!K!KB!O!O#RYu]/C/J%K%K%S%STV%W%W5+;+B!C!C!K!KB!O!O		
 L --l;;HK34H;./G!G(E) 0E
 E* E*Lr"   c                     t           j        t           j        t           j        t           j        t           j        t           j        h}| j        D ]%}||v r| |                             d           | |<   &| S )a  Removes the time dimension from the given sub-batch of the dataset.

    If each row in a dataset has a time dimension ([T, D]), and T=1, this function will
    remove the T dimension to convert each row to of shape [D]. If T > 1, the row is
    left unchanged. This function is to be used with map_batches().

    Args:
        batch: The batch to remove the time dimension from.
    Returns:
        The modified batch with the time dimension removed (when applicable)
    c                 <    t          |           dk    r| d         n| S )N   r   )len)xs    r    <lambda>z!remove_time_dim.<locals>.<lambda>   s    A!! r"   )	r   r   r   r2   r3   NEXT_OBSDONEScolumnsapply)r   BATCHED_KEYSks      r    remove_time_dimrC   l   sq     	L ] L LQx~~&J&JKKE!HLr"   )T)typingr   r   r   r   numpyr   pandaspdray.rllib.policyr   ray.rllib.policy.sample_batchr   ray.rllib.utils.annotationsr	   ray.rllib.utils.numpyr
   ,ray.rllib.offline.estimators.fqe_torch_modelr   1ray.rllib.offline.estimators.off_policy_estimatorr   	DataFramestrboolr!   r6   rC    r"   r    <module>rR      s   1 1 1 1 1 1 1 1 1 1 1 1         # # # # # # 5 5 5 5 5 5 4 4 4 4 4 4 2 2 2 2 2 2 UJJJJJJTTTTTT 
 "	) )<)o&) c3h) 	)
 \) ) ) )X +<+sCx.+ ./+ \	+ + + +\ 2< BL      r"   