
    &`ic/                        d dl mZmZ d dlZd dlZd dlmZ d dl	m
Z
 d dlmZmZ d dlmZ d dlmZmZ e G d d	                      Zed
edede
ddfd            Ze	 	 	 	 	 	 d!de
dededededededefd            Ze	 	 d"dede
deeee
f                  de
fd            Zede
dede
fd            Zedej        dedej        fd             ZdS )#    )DictOptionalN)Policy)SampleBatch)DeveloperAPIOldAPIStack)convert_to_numpy)AgentID
TensorTypec                       e Zd ZdZdZdZdS )Postprocessingz(Constant definitions for postprocessing.
advantagesvalue_targetsN)__name__
__module____qualname____doc__
ADVANTAGESVALUE_TARGETS     w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/evaluation/postprocessing.pyr   r      s        22J#MMMr   r   n_stepgammabatchreturnc           
      ,   |                                 s
J d            t          |          }t          j        |t          j                 | d         t          j        |t          j                 d         gt          | |          z            gd          |t          j        <   t          j        |t          j	                 | dz
  d         t          j
        |t          j	                 d         t          | dz
  |                    gd          |t          j	        <   t          j        |v rzt          j        |t          j                 | dz
  d         t          j
        |t          j                 d         t          | dz
  |                    gd          |t          j        <   t          |          D ]]}t          d|           D ]J}||z   |k     r?|t          j                 |xx         ||z  |t          j                 ||z            z  z  cc<   K^dS )a  Rewrites `batch` to encode n-step rewards, terminateds, truncateds, and next-obs.

    Observations and actions remain unaffected. At the end of the trajectory,
    n is truncated to fit in the traj length.

    Args:
        n_step: The number of steps to look ahead and adjust.
        gamma: The discount factor.
        batch: The SampleBatch to adjust (in place).

    Examples:
        n-step=3
        Trajectory=o0 r0 d0, o1 r1 d1, o2 r2 d2, o3 r3 d3, o4 r4 d4=True o5
        gamma=0.9
        Returned trajectory:
        0: o0 [r0 + 0.9*r1 + 0.9^2*r2 + 0.9^3*r3] d3 o0'=o3
        1: o1 [r1 + 0.9*r2 + 0.9^2*r3 + 0.9^3*r4] d4 o1'=o4
        2: o2 [r2 + 0.9*r3 + 0.9^2*r4] d4 o1'=o5
        3: o3 [r3 + 0.9*r4] d4 o3'=o5
        4: o4 r4 d4 o4'=o5
    z8Unexpected terminated|truncated in middle of trajectory!Nr   axis   )is_single_trajectorylennpconcatenater   OBSstackNEXT_OBSminTERMINATEDStile
TRUNCATEDSrangeREWARDS)r   r   r   len_ijs         r   adjust_nstepr2      s   2 	""$$B BAB B$ u::D #%.+/"677+HeK01"56VT9J9JJKK	
 # # #E+
 &(^+)*6A:<<8GE+1226FQJ8M8MNN	
 & & &E+
!" &&(*k,-fqjll;k45b93vz4;P;PQQ )
 )
 )
k$% 4[[  q&!! 	 	A1ut||k)*1---1Hu[%89!a%@@---	 r   ?      ?Trolloutlast_rlambda_use_gae
use_criticrewardsvf_predsc                    t           j        | v s|r
J d            |s|r
J d            t          |          }|| t           j                 }||r| t           j                 }|rt	          j        |t	          j        |g          g          }|||dd         z  z   |dd         z
  }	t          |	||z            | t          j	        <   | t          j	                 |z   
                    t          j                  | t          j        <   nt	          j        |t	          j        |g          g          }
t          |
|          dd         
                    t          j                  }|r"||z
  | t          j	        <   || t          j        <   n@|| t          j	        <   t	          j        | t          j	                           | t          j        <   | t          j	                 
                    t          j                  | t          j	        <   | S )a  Given a rollout, compute its value targets and the advantages.

    Args:
        rollout: SampleBatch of a single trajectory.
        last_r: Value estimation for last observation.
        gamma: Discount factor.
        lambda_: Parameter for GAE.
        use_gae: Using Generalized Advantage Estimation.
        use_critic: Whether to use critic (value estimates). Setting
            this to False will use 0 as baseline.
        rewards: Override the reward values in rollout.
        vf_preds: Override the value function predictions in rollout.

    Returns:
        SampleBatch with experience from rollout and processed rewards.
    z$use_critic=True but values not foundz,Can't use gae without using a value functionNr!   r   )r   VF_PREDSr	   r.   r$   r%   arraydiscount_cumsumr   r   astypefloat32r   
zeros_like)r5   r6   r   r7   r8   r9   r:   r;   vpred_tdelta_trewards_plus_vdiscounted_returnss               r   compute_advantagesrG   U   s   8 	''z''- 	('9TWTT&TTT$f%%F+-.J;/0 .(BHfX,>,>!?@@EGABBK//'#2#,> .=Wego-V-V)*N-.9
&

 	,-- "(F82D2D(EFF,^UCCCRCHOOJ
 
  	1Ch1NGN-.4FGN0111CGN-.46M125 5GN01 *11J)K)R)R

* *GN%& Nr   policysample_batchother_agent_batchesc                    t          ||           }t          j        |t          j                           }t          j        |t          j                           }t          |j                  dk    rA|j        |j        k    sJ t          j        |d          }t          j        |d          }d}nd}t          ||t          j
                 d         | j        d         | j        d         | j        d	         | j                            d
d          ||          }|r3t          j        |t          j                 d          |t          j        <   |S )a  Adds GAE (generalized advantage estimations) to a trajectory.

    The trajectory contains only data from one episode and from one agent.
    - If  `config.batch_mode=truncate_episodes` (default), sample_batch may
    contain a truncated (at-the-end) episode, in case the
    `config.rollout_fragment_length` was reached by the sampler.
    - If `config.batch_mode=complete_episodes`, sample_batch will contain
    exactly one episode (no matter how long).
    New columns can be added to sample_batch and existing ones may be altered.

    Args:
        policy: The Policy used to generate the trajectory (`sample_batch`)
        sample_batch: The SampleBatch to postprocess.
        other_agent_batches: Optional dict of AgentIDs mapping to other
            agents' trajectory data (from the same episode).
            NOTE: The other agents use the same policy.
        episode: Optional multi-agent episode object in which the agents
            operated.

    Returns:
        The postprocessed, modified SampleBatch (or a new one).
       r!   r   TFr   r   lambdar8   r9   )r5   r6   r   r7   r8   r9   r;   r:   )compute_bootstrap_valuer$   r>   r   r=   r.   r#   shapesqueezerG   VALUES_BOOTSTRAPPEDconfiggetexpand_dimsr   r   )rH   rI   rJ   episoder;   r:   squeezedr   s           r   compute_gae_for_sample_batchrW      sA   > +<@@Lx[%9:;;Hh|K$7899G 8>a~....:hQ///*W1--- K;<R@mG$h'i(=$$\488	 	 	E  
 ,.>.+,1,
 ,
 ,
n'( Lr   c                    | t           j                 d         rd}n)|                     |j        d          } |j        di |}t          j        | t           j                           }t          |j	                  dk    rt          j
        |d          }d}nd	}t          j        t          |dd
                   t          j        t          |          gt
          j                  gd          | t           j        <   |rVt          j        |d          | t           j        <   t          j        | t           j                 d          | t           j        <   | S )a  Performs a value function computation at the end of a trajectory.

    If the trajectory is terminated (not truncated), will not use the value function,
    but assume that the value of the last timestep is 0.0.
    In all other cases, will use the given policy's value function to compute the
    "bootstrapped" value estimate at the end of the given trajectory. To do so, the
    very last observation (sample_batch[NEXT_OBS][-1]) and - if applicable -
    the very last state output (sample_batch[STATE_OUT][-1]) wil be used as inputs to
    the value function.

    The thus computed value estimate will be stored in a new column of the
    `sample_batch`: SampleBatch.VALUES_BOOTSTRAPPED. Thereby, values at all timesteps
    in this column are set to 0.0, except or the last timestep, which receives the
    computed bootstrapped value.
    This is done, such that in any loss function (which processes raw, intact
    trajectories, such as those of IMPALA and APPO) can use this new column as follows:

    Example: numbers=ts in episode, '|'=episode boundary (terminal),
    X=bootstrapped value (!= 0.0 b/c ts=12 is not a terminal).
    ts=5 is NOT a terminal.
    T:                     8   9  10  11  12 <- no terminal
    VF_PREDS:              .   .   .   .   .
    VALUES_BOOTSTRAPPED:   0   0   0   0   X

    Args:
        sample_batch: The SampleBatch (single trajectory) for which to compute the
            bootstrap value at the end. This SampleBatch will be altered in place
            (by adding a new column: SampleBatch.VALUES_BOOTSTRAPPED).
        policy: The Policy object, whose value function to use.

    Returns:
         The altered SampleBatch (with the extra SampleBatch.VALUES_BOOTSTRAPPED
         column).
    r   g        last)indexrL   r!   r   TFN)dtyper   r   )r   r*   get_single_step_input_dictview_requirements_valuer$   r>   r=   r#   rO   rP   r%   r	   rA   rQ   rT   )rI   rH   r6   
input_dictr;   rV   s         r   rN   rN      s`   J K+,R0 - "<<$F = 
 

 ,,,,x[%9:;;H
8>a:hQ///
 57NXabb\**H&v../rzBBB	
 5 5 5L01  
-/^H1-M-M-M[)*8:899
 9
 9
[45 r   xc                     t           j                            dgdt          |           g| ddd         d          ddd         S )a  Calculates the discounted cumulative sum over a reward sequence `x`.

    y[t] - discount*y[t+1] = x[t]
    reversed(y)[t] - discount*reversed(y)[t-1] = reversed(x)[t]

    Args:
        gamma: The discount factor gamma.

    Returns:
        The sequence containing the discounted cumulative sums
        for each individual reward in `x` till the end of the trajectory.

     .. testcode::
        :skipif: True

        x = np.array([0.0, 1.0, 2.0, 3.0])
        gamma = 0.9
        discount_cumsum(x, gamma)

    .. testoutput::

        array([0.0 + 0.9*1.0 + 0.9^2*2.0 + 0.9^3*3.0,
               1.0 + 0.9*2.0 + 0.9^2*3.0,
               2.0 + 0.9*3.0,
               3.0])
    r!   Nr   r   r   )scipysignallfilterfloat)r`   r   s     r   r?   r?   ,  sG    8 <av%744R4qII$$B$OOr   )r3   r4   TTNN)NN)typingr   r   numpyr$   scipy.signalrb   ray.rllib.policy.policyr   ray.rllib.policy.sample_batchr   ray.rllib.utils.annotationsr   r   ray.rllib.utils.numpyr	   ray.rllib.utils.typingr
   r   r   intre   r2   boolrG   rW   rN   ndarrayr?   r   r   r   <module>rq      s   ! ! ! ! ! ! ! !         * * * * * * 5 5 5 5 5 5 A A A A A A A A 2 2 2 2 2 2 6 6 6 6 6 6 6 6 $ $ $ $ $ $ $ $ < <U <; <4 < < < <~  A AAA A 	A
 A A A A A A AH  AE	@ @@@ "$w';"<=@
 @ @ @ @F J+ Jv J+ J J J JZ Prz P% PBJ P P P P P Pr   