
    &`itS                       d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZmZ d dlZd dlZd dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
l m!Z!m"Z" d dl#m$Z$  e$d           G d d                      Z%dS )    N)defaultdict)AnyDictListOptionalSupportsFloatUnion)ActTypeObsType)
Deprecated)Columns)InfiniteLookbackBuffer)SampleBatch)gym_space_from_dictgym_space_to_dict)AgentIDModuleID)	PublicAPIalpha)	stabilityc            !       \   e Zd ZdZdZ	 dTddddddddddddddddee         deeee	         e
f                  d	eej                 d
eeee         e
f                  deeee         e
f                  deej                 deeee         e
f                  dededeeeef                  dee         deeef         dee         dee         dee         fdZ	 dTde	d
ee         ddfdZ	 dTddddde	deded
eeeef                  dededeeeef                  ddfdZdUdZed             Zedefd             Zedefd!            Zedefd"            ZdVd#ZdWd%ZdXdedd fd'Z 	 dTdddd(d)eeeee         e!f                  d*ed+ee         d,edef
d-Z"	 dTddd.d)eeeee         e!f                  d*ed+ee         defd/Z#	 dTdddd(d)eeeee         e!f                  d*ed+ee         d,edef
d0Z$	 dTddd.d)eeeee         e!f                  d*ed+ee%         defd1Z&	 dTddd.d2ed)eeeee         e!f                  d*ed+ee         def
d3Z'ddd4d5eeeee         e!f                  d*eddfd6Z(ddd4d5eeeee         e!f                  d*eddfd7Z)ddd4d5eeeee         e!f                  d*eddfd8Z*ddd4d5eeeee         e!f                  d*eddfd9Z+dd:d;e!dee         dd fd<Z!d= Z,de-fd>Z.de%fd?Z/de%fd@Z0defdAZ1defdBZ2deeef         fdCZ3e4dDeeef         dd fdE            Z5edF             Z6e6j7        dG             Z6edH             Z8e8j7        dI             Z8defdJZ9dK Z:dLe!dd fdMZ; e<dNdOP          dQ             Z= e<dRdOP          dS             Z>dS )YSingleAgentEpisodea.  A class representing RL environment episodes for individual agents.

    SingleAgentEpisode stores observations, info dicts, actions, rewards, and all
    module outputs (e.g. state outs, action logp, etc..) for an individual agent within
    some single-agent or multi-agent environment.
    The two main APIs to add data to an ongoing episode are the `add_env_reset()`
    and `add_env_step()` methods, which should be called passing the outputs of the
    respective gym.Env API calls: `env.reset()` and `env.step()`.

    A SingleAgentEpisode might also only represent a chunk of an episode, which is
    useful for cases, in which partial (non-complete episode) sampling is performed
    and collected episode data has to be returned before the actual gym.Env episode has
    finished (see `SingleAgentEpisode.cut()`). In order to still maintain visibility
    onto past experiences within such a "cut" episode, SingleAgentEpisode instances
    can have a "lookback buffer" of n timesteps at their beginning (left side), which
    solely exists for the purpose of compiling extra data (e.g. "prev. reward"), but
    is not considered part of the finished/packaged episode (b/c the data in the
    lookback buffer is already part of a previous episode chunk).

    Powerful getter methods, such as `get_observations()` help collect different types
    of data from the episode at individual time indices or time ranges, including the
    "lookback buffer" range described above. For example, to extract the last 4 rewards
    of an ongoing episode, one can call `self.get_rewards(slice(-4, None))` or
    `self.rewards[-4:]`. This would work, even if the ongoing SingleAgentEpisode is
    a continuation chunk from a much earlier started episode, as long as it has a
    lookback buffer size of sufficient size.

    Examples:

    .. testcode::

        import gymnasium as gym
        import numpy as np

        from ray.rllib.env.single_agent_episode import SingleAgentEpisode

        # Construct a new episode (without any data in it yet).
        episode = SingleAgentEpisode()
        assert len(episode) == 0

        # Fill the episode with some data (10 timesteps).
        env = gym.make("CartPole-v1")
        obs, infos = env.reset()
        episode.add_env_reset(obs, infos)

        # Even with the initial obs/infos, the episode is still considered len=0.
        assert len(episode) == 0
        for _ in range(5):
            action = env.action_space.sample()
            obs, reward, term, trunc, infos = env.step(action)
            episode.add_env_step(
                observation=obs,
                action=action,
                reward=reward,
                terminated=term,
                truncated=trunc,
                infos=infos,
            )
        assert len(episode) == 5

        # We can now access information from the episode via the getter APIs.

        # Get the last 3 rewards (in a batch of size 3).
        episode.get_rewards(slice(-3, None))  # same as `episode.rewards[-3:]`

        # Get the most recent action (single item, not batched).
        # This works regardless of the action space or whether the episode has
        # been numpy'ized or not (see below).
        episode.get_actions(-1)  # same as episode.actions[-1]

        # Looking back from ts=1, get the previous 4 rewards AND fill with 0.0
        # in case we go over the beginning (ts=0). So we would expect
        # [0.0, 0.0, 0.0, r0] to be returned here, where r0 is the very first received
        # reward in the episode:
        episode.get_rewards(slice(-4, 0), neg_index_as_lookback=True, fill=0.0)

        # Note the use of fill=0.0 here (fill everything that's out of range with this
        # value) AND the argument `neg_index_as_lookback=True`, which interprets
        # negative indices as being left of ts=0 (e.g. -1 being the timestep before
        # ts=0).

        # Assuming we had a complex action space (nested gym.spaces.Dict) with one or
        # more elements being Discrete or MultiDiscrete spaces:
        # 1) The `fill=...` argument would still work, filling all spaces (Boxes,
        # Discrete) with that provided value.
        # 2) Setting the flag `one_hot_discrete=True` would convert those discrete
        # sub-components automatically into one-hot (or multi-one-hot) tensors.
        # This simplifies the task of having to provide the previous 4 (nested and
        # partially discrete/multi-discrete) actions for each timestep within a training
        # batch, thereby filling timesteps before the episode started with 0.0s and
        # one-hot'ing the discrete/multi-discrete components in these actions:
        episode = SingleAgentEpisode(action_space=gym.spaces.Dict({
            "a": gym.spaces.Discrete(3),
            "b": gym.spaces.MultiDiscrete([2, 3]),
            "c": gym.spaces.Box(-1.0, 1.0, (2,)),
        }))

        # ... fill episode with data ...
        episode.add_env_reset(observation=0)
        # ... from a few steps.
        episode.add_env_step(
            observation=1,
            action={"a":0, "b":np.array([1, 2]), "c":np.array([.5, -.5], np.float32)},
            reward=1.0,
        )

        # In your connector
        prev_4_a = []
        # Note here that len(episode) does NOT include the lookback buffer.
        for ts in range(len(episode)):
            prev_4_a.append(
                episode.get_actions(
                    indices=slice(ts - 4, ts),
                    # Make sure negative indices are interpreted as
                    # "into lookback buffer"
                    neg_index_as_lookback=True,
                    # Zero-out everything even further before the lookback buffer.
                    fill=0.0,
                    # Take care of discrete components (get ready as NN input).
                    one_hot_discrete=True,
                )
            )

        # Finally, convert from list of batch items to a struct (same as action space)
        # of batched (numpy) arrays, in which all leafs have B==len(prev_4_a).
        from ray.rllib.utils.spaces.space_utils import batch

        prev_4_actions_col = batch(prev_4_a)
    )actionsagent_idextra_model_outputsid_infosis_terminatedis_truncated	module_idmulti_agent_episode_idobservationsrewardst	t_started_action_space_last_added_observation_last_added_infos_last_step_time_observation_space_start_time_custom_dataNFauto)r"   observation_spacer   r   action_spacer#   
terminated	truncatedr   r%   len_lookback_bufferr   r    r!   r   r"   r.   r   r   r/   r#   r0   r1   r   r%   r2   r   r    r!   c          
         |pt          j                    j        | _        || _        || _        || _        |t          |          nd}|dk    s||k    r|}|p'd t          t          |pg                     D             }d| _	        t          |t                    r|| _        nt          ||          | _        || _        t          |t                    r|| _        nt          ||          | _        d| _        t          |t                    r|| _        nt          ||          | _        || _        t          |t                    r|| _        n[t          ||t&          j                            t-          d          t-          d          dt.          j                  	          | _        || _        |	| _        i | _        |
pi                                 D ]>\  }}t          |t                    r|| j        |<   %t          ||          | j        |<   ?|pd| _        t          | j                  | j        z   | _        i | _        d| _         d| _!        d| _"        d| _#        | $                                 dS )
a  Initializes a SingleAgentEpisode instance.

        This constructor can be called with or without already sampled data, part of
        which might then go into the lookback buffer.

        Args:
            id_: Unique identifier for this episode. If no ID is provided the
                constructor generates a unique hexadecimal code for the id.
            observations: Either a list of individual observations from a sampling or
                an already instantiated `InfiniteLookbackBuffer` object (possibly
                with observation data in it). If a list, will construct the buffer
                automatically (given the data and the `len_lookback_buffer` argument).
            observation_space: An optional gym.Space, which all individual observations
                should abide to. If not None and this SingleAgentEpisode is numpy'ized
                (via the `self.to_numpy()` method), and data is appended or set, the new
                data will be checked for correctness.
            infos: Either a list of individual info dicts from a sampling or
                an already instantiated `InfiniteLookbackBuffer` object (possibly
                with info dicts in it). If a list, will construct the buffer
                automatically (given the data and the `len_lookback_buffer` argument).
            actions: Either a list of individual info dicts from a sampling or
                an already instantiated `InfiniteLookbackBuffer` object (possibly
                with info dict] data in it). If a list, will construct the buffer
                automatically (given the data and the `len_lookback_buffer` argument).
            action_space: An optional gym.Space, which all individual actions
                should abide to. If not None and this SingleAgentEpisode is numpy'ized
                (via the `self.to_numpy()` method), and data is appended or set, the new
                data will be checked for correctness.
            rewards: Either a list of individual rewards from a sampling or
                an already instantiated `InfiniteLookbackBuffer` object (possibly
                with reward data in it). If a list, will construct the buffer
                automatically (given the data and the `len_lookback_buffer` argument).
            extra_model_outputs: A dict mapping string keys to either lists of
                individual extra model output tensors (e.g. `action_logp` or
                `state_outs`) from a sampling or to already instantiated
                `InfiniteLookbackBuffer` object (possibly with extra model output data
                in it). If mapping is to lists, will construct the buffers automatically
                (given the data and the `len_lookback_buffer` argument).
            terminated: A boolean indicating, if the episode is already terminated.
            truncated: A boolean indicating, if the episode has been truncated.
            t_started: Optional. The starting timestep of the episode. The default
                is zero. If data is provided, the starting point is from the last
                observation onwards (i.e. `t_started = len(observations) - 1`). If
                this parameter is provided the episode starts at the provided value.
            len_lookback_buffer: The size of the (optional) lookback buffers to keep in
                front of this Episode for each type of data (observations, actions,
                etc..). If larger than 0, the first `len_lookback_buffer`
                items of each type of data are interpreted as NOT part of this actual
                episode chunk, but instead serve as "historical" record that may be
                viewed and used to derive new data from. For example, it might be
                necessary to have a lookback buffer of four if you would like to do
                observation frame stacking and your episode has been cut and you're now
                operating on a new chunk (continuing from the cut one). Then, for the
                first 3 items, you would have to be able to look back into the old
                chunk's data.
                If `len_lookback_buffer` is "auto" (default), will interpret all
                provided data in the constructor as part of the lookback buffers.
            agent_id: An optional AgentID indicating which agent this episode belongs
                to. This information is stored under `self.agent_id` and only serves
                reference purposes.
            module_id: An optional ModuleID indicating which RLModule this episode
                belongs to. Normally, this information is obtained by querying an
                `agent_to_module_mapping_fn` with a given agent ID. This information
                is stored under `self.module_id` and only serves reference purposes.
            multi_agent_episode_id: An optional EpisodeID of the encapsulating
                `MultiAgentEpisode` that this `SingleAgentEpisode` belongs to.
        Nr   r-   c                     g | ]}i S  r5   ).0_s     v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/env/single_agent_episode.py
<listcomp>z/SingleAgentEpisode.__init__.<locals>.<listcomp>  s    EEE"EEE    datalookbackz-infinfr5   r<   r=   space)%uuiduuid4hexr   r   r    r!   lenranger*   
isinstancer   r"   r.   r   r&   r   r/   r#   gymspacesBoxfloatnpfloat32r   r   r   itemsr%   r$   r,   r+   r)   r'   r(   validate)selfr   r"   r.   r   r   r/   r#   r0   r1   r   r%   r2   r   r    r!   len_rewardskvs                      r8   __init__zSingleAgentEpisode.__init__   s   l *$*,,* "&<# '.&9c'lllq&((,?+,M,M"-EEEeC0B,C,C&D&DEEE #'l$:;; 	 ,D 6!,! ! !D "3 e344 	DJJ/,  DJ "g566 	"DLL1,  DL ) g566 	"DLL1,jnnU6]]E%LL"bjQQ  DL ( & $& (.B5577 
	 
	DAq!344 	./(++ /E%8/ / /(++ #aT\""T^3   #'+$!% 	r:   observationreturnc                 x   | j         rJ | j        rJ t          | j                  dk    sJ | j        | j        cxk    rdk    sn J |pi }| j                            |           | j                            |           || _        || _	        | 
                                 t          j                    | _        dS )a/  Adds the initial data (after an `env.reset()`) to the episode.

        This data consists of initial observations and initial infos.

        Args:
            observation: The initial observation returned by `env.reset()`.
            infos: An (optional) info dict returned by `env.reset()`.
        r   N)is_resetis_donerD   r"   r$   r%   appendr   r'   r(   rN   timeperf_counterr+   )rO   rT   r   s      r8   add_env_resetz SingleAgentEpisode.add_env_resetf  s     =   <4$%%**** v,,,,1,,,,,,  ---
%   '2$!& 	  ,..r:   )r0   r1   r   actionrewardc                   | j         r
J d            | j                            |           | j                            |           | j                            |           |pi }| j                            |           | xj        dz  c_        |\|                                D ]G\  }}	|| j        vrt          |	g          | j        |<   '| j        |                             |	           H|| _
        || _        || _        || _        | j        rx| j        dz  rn| j        0| j                            |          sJ d| d| j         d            | j        0| j                            |          sJ d| d	| j         d            |                                  t'          j                    | _        | j        | j        | _        dS dS )
a  Adds results of an `env.step()` call (including the action) to this episode.

        This data consists of an observation and info dict, an action, a reward,
        terminated/truncated flags, and extra model outputs (e.g. action probabilities
        or RNN internal state outputs).

        Args:
            observation: The next observation received from the environment after(!)
                taking `action`.
            action: The last action used by the agent during the call to `env.step()`.
            reward: The last reward received by the agent after taking `action`.
            infos: The last info received from the environment after taking `action`.
            terminated: A boolean indicating, if the environment has been
                terminated (after taking `action`).
            truncated: A boolean indicating, if the environment has been
                truncated (after taking `action`).
            extra_model_outputs: The last timestep's specific model outputs.
                These are normally outputs of an RLModule that were computed along with
                `action`, e.g. `action_logp` or `action_dist_inputs`.
        z?The agent is already done: no data can be added to its episode.   Nd   z`observation` z6 does NOT fit SingleAgentEpisode's observation_space: !z	`action` z1 does NOT fit SingleAgentEpisode's action_space: )rX   r"   rY   r   r#   r   r$   rM   r   r   r   r   r'   r(   is_numpyr.   containsr/   rN   rZ   r[   r)   r+   )
rO   rT   r]   r^   r   r0   r1   r   rQ   rR   s
             r8   add_env_stepzSingleAgentEpisode.add_env_step  s@   B 	M 	ML	M 	M 	  ---F###F###
%   !*+1133 : :1D4442H!2M2MD,Q//,Q/66q9999'%'2$!& = 
	TVc\ 
	%1-66{CC  D[ D D*.*@D D D C  ,(11&99  : : :%)%6: : : 9 	  $022##3D $#r:   c                    t          | j                  t          | j                  k    sJ t          | j                  dk    rt          | j                  t          | j                  cxk    rt          | j                  cxk    rdk    sn J | j                                        D ]8\  }}t          |          dk    s J |||j        t          |          f            9dS t          | j                  t          | j                  cxk    r7t          | j                  dz   cxk    rt          | j                  dz   k    sXn J t          | j                  t          | j                  t          | j                  t          | j                  f            | j                                        D ]/\  }}t          |          t          | j                  dz
  k    sJ 0dS )zValidates the episode's data.

        This function ensures that the data stored to a `SingleAgentEpisode` is
        in order (e.g. that the correct number of observations, actions, rewards
        are there).
        r   r`   N)rD   r"   r   r#   r   r   rM   r<   rO   rQ   rR   s      r8   rN   zSingleAgentEpisode.validate  s	    4$%%TZ8888t !!Q&&tz??c$,&7&7QQQQ3t|;L;LQQQQPQQQQQQQ06688 ; ;11vv{{{Q163q66$:{{{{; ; D%&&tz??) ) ) )t|$$q() ) ) ) t|$$q() ) ) ) )
 D%&&DJDL!!DL!!		) ) ) 06688 < <11vvT%6!7!7!!;;;;;;< <r:   c                     | j         S N)r,   rO   s    r8   custom_datazSingleAgentEpisode.custom_data  s      r:   c                 2    t          | j                  dk    S )z?Returns True if `self.add_env_reset()` has already been called.r   )rD   r"   rj   s    r8   rW   zSingleAgentEpisode.is_reset  s     4$%%))r:   c                     | j         j        S )zDTrue, if the data in this episode is already stored as numpy arrays.)r#   	finalizedrj   s    r8   rc   zSingleAgentEpisode.is_numpy  s    
 |%%r:   c                     | j         p| j        S )a  Whether the episode is actually done (terminated or truncated).

        A done episode cannot be continued via `self.add_timestep()` or being
        concatenated on its right-side with another episode chunk or being
        succeeded via `self.create_successor()`.
        )r   r   rj   s    r8   rX   zSingleAgentEpisode.is_done   s     !6T%66r:   c                 >   | j                                          t          |           dk    rp| j                                         | j                                         | j                                        D ]$\  }}| j        |                                          %| S )a
  Converts this Episode's list attributes to numpy arrays.

        This means in particular that this episodes' lists of (possibly complex)
        data (e.g. if we have a dict obs space) will be converted to (possibly complex)
        structs, whose leafs are now numpy arrays. Each of these leaf numpy arrays will
        have the same length (batch dimension) as the length of the original lists.

        Note that the data under the Columns.INFOS are NEVER numpy'ized and will remain
        a list (normally, a list of the original, env-returned dicts). This is due to
        the herterogenous nature of INFOS returned by envs, which would make it unwieldy
        to convert this information to numpy arrays.

        After calling this method, no further data may be added to this episode via
        the `self.add_env_step()` method.

        Examples:

        .. testcode::

            import numpy as np

            from ray.rllib.env.single_agent_episode import SingleAgentEpisode

            episode = SingleAgentEpisode(
                observations=[0, 1, 2, 3],
                actions=[1, 2, 3],
                rewards=[1, 2, 3],
                # Note: terminated/truncated have nothing to do with an episode
                # being numpy'ized or not (via the `self.to_numpy()` method)!
                terminated=False,
                len_lookback_buffer=0,  # no lookback; all data is actually "in" episode
            )
            # Episode has not been numpy'ized yet.
            assert not episode.is_numpy
            # We are still operating on lists.
            assert episode.get_observations([1]) == [1]
            assert episode.get_observations(slice(None, 2)) == [0, 1]
            # We can still add data (and even add the terminated=True flag).
            episode.add_env_step(
                observation=4,
                action=4,
                reward=4,
                terminated=True,
            )
            # Still NOT numpy'ized.
            assert not episode.is_numpy

            # Numpy'ized the episode.
            episode.to_numpy()
            assert episode.is_numpy

            # We cannot add data anymore. The following would crash.
            # episode.add_env_step(observation=5, action=5, reward=5)

            # Everything is now numpy arrays (with 0-axis of size
            # B=[len of requested slice]).
            assert isinstance(episode.get_observations([1]), np.ndarray)  # B=1
            assert isinstance(episode.actions[0:2], np.ndarray)  # B=2
            assert isinstance(episode.rewards[1:4], np.ndarray)  # B=3

        Returns:
             This `SingleAgentEpisode` object with the converted numpy data.
        r   )r"   finalizerD   r   r#   r   rM   rg   s      r8   to_numpyzSingleAgentEpisode.to_numpy
  s    B 	""$$$t99q==L!!###L!!###06688 7 71(+446666r:   otherc           
         |j         | j         k    sJ | j        rJ | j        |j        k    sJ |                                 t          j        |j        d         | j        d                    t          j	        t          j
        t          j        t          j        |j        d         | j        d                                       sJ | j                                         | j                                         | j                            |                                           | j                            |                                           | j                            |                                           | j                            |                                           |j        | _        |j        rd| _        n|j        rd| _        |j                                        D ]@}|| j        v sJ | j        |                             |                    |                     A| j                            |j                   |                                  dS )a  Adds the given `other` SingleAgentEpisode to the right side of `self`.

        In order for this to work, both chunks (`self` and `other`) must fit
        together. This is checked by the IDs (must be identical), the time step counters
        (`self.env_t` must be the same as `episode_chunk.env_t_started`), as well as the
        observations/infos at the concatenation boundaries. Also, `self.is_done` must
        not be True, meaning `self.is_terminated` and `self.is_truncated` are both
        False.

        Args:
            other: The other `SingleAgentEpisode` to be concatenated to this one.

        Returns:
            A `SingleAgentEpisode` instance containing the concatenated data
            from both episodes (`self` and `other`).
        r   TN)r   rX   r$   r%   rN   treeassert_same_structurer"   rK   allflattenmap_structurearray_equalpopr   extendget_observationsr   get_actionsr#   get_rewards	get_infosr   r   r   keysget_extra_model_outputsrk   update)rO   rs   keys      r8   concat_episodez!SingleAgentEpisode.concat_episodeT  s?   " yDH$$$$ <v(((( 	"5#5a#8$:KB:OPPP vL"NE$6q$94;LR;P  
 
 	
 	
 
 	
 	  !7!7!9!9:::E--//000E--//000
%//++,,, 	%!%D 	% $D,1133 	U 	UC$22222$S)001N1Ns1S1STTTT
 	 1222 	r:   r   c                      j         s|dk    sJ t          | dz
  d          }|dk    rt          | d          nt          dd          t           j                             |           j                             |                                          j         	                               fd j
                                        D              j        d
  
        }t          j         j                  |_        |S )a}  Returns a successor episode chunk (of len=0) continuing from this Episode.

        The successor will have the same ID as `self`.
        If no lookback buffer is requested (len_lookback_buffer=0), the successor's
        observations will be the last observation(s) of `self` and its length will
        therefore be 0 (no further steps taken yet). If `len_lookback_buffer` > 0,
        the returned successor will have `len_lookback_buffer` observations (and
        actions, rewards, etc..) taken from the right side (end) of `self`. For example
        if `len_lookback_buffer=2`, the returned successor's lookback buffer actions
        will be identical to `self.actions[-2:]`.

        This method is useful if you would like to discontinue building an episode
        chunk (b/c you have to return it from somewhere), but would like to have a new
        episode instance to continue building the actual gym.Env episode at a later
        time. Vie the `len_lookback_buffer` argument, the continuing chunk (successor)
        will still be able to "look back" into this predecessor episode's data (at
        least to some extend, depending on the value of `len_lookback_buffer`).

        Args:
            len_lookback_buffer: The number of timesteps to take along into the new
                chunk as "lookback buffer". A lookback buffer is additional data on
                the left side of the actual episode data for visibility purposes
                (but without actually being part of the new chunk). For example, if
                `self` ends in actions 5, 6, 7, and 8, and we call
                `self.cut(len_lookback_buffer=2)`, the returned chunk will have
                actions 7 and 8 already in it, but still `t_started`==t==8 (not 7!) and
                a length of 0. If there is not enough data in `self` yet to fulfil
                the `len_lookback_buffer` request, the value of `len_lookback_buffer`
                is automatically adjusted (lowered).

        Returns:
            The successor Episode chunk of this one with the same ID and state and the
            only observation being the last observation in self.
        r   r`   N)indicesc                 >    i | ]}|                     |          S r5   r   )r6   rQ   indices_restrO   s     r8   
<dictcomp>z*SingleAgentEpisode.cut.<locals>.<dictcomp>  s;     ! ! ! 4//<@@! ! !r:   r-   )
r   r"   r.   r   r   r/   r#   r   r%   r2   )rX   slicer   r   r~   r.   r   r   r/   r   r   r   r$   copydeepcopyrk   r,   )rO   r2   indices_obs_and_infos
sa_episoder   s   `   @r8   cutzSingleAgentEpisode.cut  sE   F <<$71$<$<$<< !&':&:Q&> E E #Q&& &&---tQ 	 (..7L.MM"4..)>.??$$\$::*$$\$::! ! ! ! !16688! ! !
 f &!
 
 

& #'-0@"A"A
r:   )neg_index_as_lookbackfillone_hot_discreter   r   r   r   c                >    | j                             ||||          S )a7  Returns individual observations or batched ranges thereof from this episode.

        Args:
            indices: A single int is interpreted as an index, from which to return the
                individual observation stored at this index.
                A list of ints is interpreted as a list of indices from which to gather
                individual observations in a batch of size len(indices).
                A slice object is interpreted as a range of observations to be returned.
                Thereby, negative indices by default are interpreted as "before the end"
                unless the `neg_index_as_lookback=True` option is used, in which case
                negative indices are interpreted as "before ts=0", meaning going back
                into the lookback buffer.
                If None, will return all observations (from ts=0 to the end).
            neg_index_as_lookback: If True, negative values in `indices` are
                interpreted as "before ts=0", meaning going back into the lookback
                buffer. For example, an episode with observations [4, 5, 6,  7, 8, 9],
                where [4, 5, 6] is the lookback buffer range (ts=0 item is 7), will
                respond to `get_observations(-1, neg_index_as_lookback=True)`
                with `6` and to
                `get_observations(slice(-2, 1), neg_index_as_lookback=True)` with
                `[5, 6,  7]`.
            fill: An optional value to use for filling up the returned results at
                the boundaries. This filling only happens if the requested index range's
                start/stop boundaries exceed the episode's boundaries (including the
                lookback buffer on the left side). This comes in very handy, if users
                don't want to worry about reaching such boundaries and want to zero-pad.
                For example, an episode with observations [10, 11,  12, 13, 14] and
                lookback buffer size of 2 (meaning observations `10` and `11` are part
                of the lookback buffer) will respond to
                `get_observations(slice(-7, -2), fill=0.0)` with
                `[0.0, 0.0, 10, 11, 12]`.
            one_hot_discrete: If True, will return one-hot vectors (instead of
                int-values) for those sub-components of a (possibly complex) observation
                space that are Discrete or MultiDiscrete.  Note that if `fill=0` and the
                requested `indices` are out of the range of our data, the returned
                one-hot vectors will actually be zero-hot (all slots zero).

        Examples:

        .. testcode::

            import gymnasium as gym

            from ray.rllib.env.single_agent_episode import SingleAgentEpisode
            from ray.rllib.utils.test_utils import check

            episode = SingleAgentEpisode(
                # Discrete(4) observations (ints between 0 and 4 (excl.))
                observation_space=gym.spaces.Discrete(4),
                observations=[0, 1, 2, 3],
                actions=[1, 2, 3], rewards=[1, 2, 3],  # <- not relevant for this demo
                len_lookback_buffer=0,  # no lookback; all data is actually "in" episode
            )
            # Plain usage (`indices` arg only).
            check(episode.get_observations(-1), 3)
            check(episode.get_observations(0), 0)
            check(episode.get_observations([0, 2]), [0, 2])
            check(episode.get_observations([-1, 0]), [3, 0])
            check(episode.get_observations(slice(None, 2)), [0, 1])
            check(episode.get_observations(slice(-2, None)), [2, 3])
            # Using `fill=...` (requesting slices beyond the boundaries).
            check(episode.get_observations(slice(-6, -2), fill=-9), [-9, -9, 0, 1])
            check(episode.get_observations(slice(2, 5), fill=-7), [2, 3, -7])
            # Using `one_hot_discrete=True`.
            check(episode.get_observations(2, one_hot_discrete=True), [0, 0, 1, 0])
            check(episode.get_observations(3, one_hot_discrete=True), [0, 0, 0, 1])
            check(episode.get_observations(
                slice(0, 3),
                one_hot_discrete=True,
            ), [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0]])
            # Special case: Using `fill=0.0` AND `one_hot_discrete=True`.
            check(episode.get_observations(
                -1,
                neg_index_as_lookback=True,  # -1 means one left of ts=0
                fill=0.0,
                one_hot_discrete=True,
            ), [0, 0, 0, 0])  # <- all 0s one-hot tensor (note difference to [1 0 0 0]!)

        Returns:
            The collected observations.
            As a 0-axis batch, if there are several `indices` or a list of exactly one
            index provided OR `indices` is a slice object.
            As single item (B=0 -> no additional 0-axis) if `indices` is a single int.
        r   r   r   r   )r"   getrO   r   r   r   r   s        r8   r~   z#SingleAgentEpisode.get_observations  s1    x  $$"7-	 % 
 
 	
r:   )r   r   c                <    | j                             |||          S )a  Returns individual info dicts or list (ranges) thereof from this episode.

        Args:
            indices: A single int is interpreted as an index, from which to return the
                individual info dict stored at this index.
                A list of ints is interpreted as a list of indices from which to gather
                individual info dicts in a list of size len(indices).
                A slice object is interpreted as a range of info dicts to be returned.
                Thereby, negative indices by default are interpreted as "before the end"
                unless the `neg_index_as_lookback=True` option is used, in which case
                negative indices are interpreted as "before ts=0", meaning going back
                into the lookback buffer.
                If None, will return all infos (from ts=0 to the end).
            neg_index_as_lookback: If True, negative values in `indices` are
                interpreted as "before ts=0", meaning going back into the lookback
                buffer. For example, an episode with infos
                [{"l":4}, {"l":5}, {"l":6},  {"a":7}, {"b":8}, {"c":9}], where the
                first 3 items are the lookback buffer (ts=0 item is {"a": 7}), will
                respond to `get_infos(-1, neg_index_as_lookback=True)` with
                `{"l":6}` and to
                `get_infos(slice(-2, 1), neg_index_as_lookback=True)` with
                `[{"l":5}, {"l":6},  {"a":7}]`.
            fill: An optional value to use for filling up the returned results at
                the boundaries. This filling only happens if the requested index range's
                start/stop boundaries exceed the episode's boundaries (including the
                lookback buffer on the left side). This comes in very handy, if users
                don't want to worry about reaching such boundaries and want to
                auto-fill. For example, an episode with infos
                [{"l":10}, {"l":11},  {"a":12}, {"b":13}, {"c":14}] and lookback buffer
                size of 2 (meaning infos {"l":10}, {"l":11} are part of the lookback
                buffer) will respond to `get_infos(slice(-7, -2), fill={"o": 0.0})`
                with `[{"o":0.0}, {"o":0.0}, {"l":10}, {"l":11}, {"a":12}]`.

        Examples:

        .. testcode::

            from ray.rllib.env.single_agent_episode import SingleAgentEpisode

            episode = SingleAgentEpisode(
                infos=[{"a":0}, {"b":1}, {"c":2}, {"d":3}],
                # The following is needed, but not relevant for this demo.
                observations=[0, 1, 2, 3], actions=[1, 2, 3], rewards=[1, 2, 3],
                len_lookback_buffer=0,  # no lookback; all data is actually "in" episode
            )
            # Plain usage (`indices` arg only).
            episode.get_infos(-1)  # {"d":3}
            episode.get_infos(0)  # {"a":0}
            episode.get_infos([0, 2])  # [{"a":0},{"c":2}]
            episode.get_infos([-1, 0])  # [{"d":3},{"a":0}]
            episode.get_infos(slice(None, 2))  # [{"a":0},{"b":1}]
            episode.get_infos(slice(-2, None))  # [{"c":2},{"d":3}]
            # Using `fill=...` (requesting slices beyond the boundaries).
            # TODO (sven): This would require a space being provided. Maybe we can
            #  skip this check for infos, which don't have a space anyways.
            # episode.get_infos(slice(-5, -3), fill={"o":-1})  # [{"o":-1},{"a":0}]
            # episode.get_infos(slice(3, 5), fill={"o":-2})  # [{"d":3},{"o":-2}]

        Returns:
            The collected info dicts.
            As a 0-axis batch, if there are several `indices` or a list of exactly one
            index provided OR `indices` is a slice object.
            As single item (B=0 -> no additional 0-axis) if `indices` is a single int.
        r   r   r   )r   r   rO   r   r   r   s       r8   r   zSingleAgentEpisode.get_infosI  s+    N z~~"7  
 
 	
r:   c                >    | j                             ||||          S )a  Returns individual actions or batched ranges thereof from this episode.

        Args:
            indices: A single int is interpreted as an index, from which to return the
                individual action stored at this index.
                A list of ints is interpreted as a list of indices from which to gather
                individual actions in a batch of size len(indices).
                A slice object is interpreted as a range of actions to be returned.
                Thereby, negative indices by default are interpreted as "before the end"
                unless the `neg_index_as_lookback=True` option is used, in which case
                negative indices are interpreted as "before ts=0", meaning going back
                into the lookback buffer.
                If None, will return all actions (from ts=0 to the end).
            neg_index_as_lookback: If True, negative values in `indices` are
                interpreted as "before ts=0", meaning going back into the lookback
                buffer. For example, an episode with actions [4, 5, 6,  7, 8, 9], where
                [4, 5, 6] is the lookback buffer range (ts=0 item is 7), will respond
                to `get_actions(-1, neg_index_as_lookback=True)` with `6` and
                to `get_actions(slice(-2, 1), neg_index_as_lookback=True)` with
                `[5, 6,  7]`.
            fill: An optional value to use for filling up the returned results at
                the boundaries. This filling only happens if the requested index range's
                start/stop boundaries exceed the episode's boundaries (including the
                lookback buffer on the left side). This comes in very handy, if users
                don't want to worry about reaching such boundaries and want to zero-pad.
                For example, an episode with actions [10, 11,  12, 13, 14] and
                lookback buffer size of 2 (meaning actions `10` and `11` are part
                of the lookback buffer) will respond to
                `get_actions(slice(-7, -2), fill=0.0)` with `[0.0, 0.0, 10, 11, 12]`.
            one_hot_discrete: If True, will return one-hot vectors (instead of
                int-values) for those sub-components of a (possibly complex) action
                space that are Discrete or MultiDiscrete. Note that if `fill=0` and the
                requested `indices` are out of the range of our data, the returned
                one-hot vectors will actually be zero-hot (all slots zero).

        Examples:

        .. testcode::

            import gymnasium as gym
            from ray.rllib.env.single_agent_episode import SingleAgentEpisode

            episode = SingleAgentEpisode(
                # Discrete(4) actions (ints between 0 and 4 (excl.))
                action_space=gym.spaces.Discrete(4),
                actions=[1, 2, 3],
                observations=[0, 1, 2, 3], rewards=[1, 2, 3],  # <- not relevant here
                len_lookback_buffer=0,  # no lookback; all data is actually "in" episode
            )
            # Plain usage (`indices` arg only).
            episode.get_actions(-1)  # 3
            episode.get_actions(0)  # 1
            episode.get_actions([0, 2])  # [1, 3]
            episode.get_actions([-1, 0])  # [3, 1]
            episode.get_actions(slice(None, 2))  # [1, 2]
            episode.get_actions(slice(-2, None))  # [2, 3]
            # Using `fill=...` (requesting slices beyond the boundaries).
            episode.get_actions(slice(-5, -2), fill=-9)  # [-9, -9, 1, 2]
            episode.get_actions(slice(1, 5), fill=-7)  # [2, 3, -7, -7]
            # Using `one_hot_discrete=True`.
            episode.get_actions(1, one_hot_discrete=True)  # [0 0 1 0] (action=2)
            episode.get_actions(2, one_hot_discrete=True)  # [0 0 0 1] (action=3)
            episode.get_actions(
                slice(0, 2),
                one_hot_discrete=True,
            )   # [[0 1 0 0], [0 0 0 1]] (actions=1 and 3)
            # Special case: Using `fill=0.0` AND `one_hot_discrete=True`.
            episode.get_actions(
                -1,
                neg_index_as_lookback=True,  # -1 means one left of ts=0
                fill=0.0,
                one_hot_discrete=True,
            )  # [0 0 0 0]  <- all 0s one-hot tensor (note difference to [1 0 0 0]!)

        Returns:
            The collected actions.
            As a 0-axis batch, if there are several `indices` or a list of exactly one
            index provided OR `indices` is a slice object.
            As single item (B=0 -> no additional 0-axis) if `indices` is a single int.
        r   )r   r   r   s        r8   r   zSingleAgentEpisode.get_actions  s0    p |"7-	   
 
 	
r:   c                <    | j                             |||          S )aE  Returns individual rewards or batched ranges thereof from this episode.

        Args:
            indices: A single int is interpreted as an index, from which to return the
                individual reward stored at this index.
                A list of ints is interpreted as a list of indices from which to gather
                individual rewards in a batch of size len(indices).
                A slice object is interpreted as a range of rewards to be returned.
                Thereby, negative indices by default are interpreted as "before the end"
                unless the `neg_index_as_lookback=True` option is used, in which case
                negative indices are interpreted as "before ts=0", meaning going back
                into the lookback buffer.
                If None, will return all rewards (from ts=0 to the end).
            neg_index_as_lookback: Negative values in `indices` are interpreted as
                 as "before ts=0", meaning going back into the lookback buffer.
                 For example, an episode with rewards [4, 5, 6,  7, 8, 9], where
                 [4, 5, 6] is the lookback buffer range (ts=0 item is 7), will respond
                 to `get_rewards(-1, neg_index_as_lookback=True)` with `6` and
                 to `get_rewards(slice(-2, 1), neg_index_as_lookback=True)` with
                 `[5, 6,  7]`.
            fill: An optional float value to use for filling up the returned results at
                the boundaries. This filling only happens if the requested index range's
                start/stop boundaries exceed the episode's boundaries (including the
                lookback buffer on the left side). This comes in very handy, if users
                don't want to worry about reaching such boundaries and want to zero-pad.
                For example, an episode with rewards [10, 11,  12, 13, 14] and
                lookback buffer size of 2 (meaning rewards `10` and `11` are part
                of the lookback buffer) will respond to
                `get_rewards(slice(-7, -2), fill=0.0)` with `[0.0, 0.0, 10, 11, 12]`.

        Examples:

        .. testcode::

            from ray.rllib.env.single_agent_episode import SingleAgentEpisode

            episode = SingleAgentEpisode(
                rewards=[1.0, 2.0, 3.0],
                observations=[0, 1, 2, 3], actions=[1, 2, 3],  # <- not relevant here
                len_lookback_buffer=0,  # no lookback; all data is actually "in" episode
            )
            # Plain usage (`indices` arg only).
            episode.get_rewards(-1)  # 3.0
            episode.get_rewards(0)  # 1.0
            episode.get_rewards([0, 2])  # [1.0, 3.0]
            episode.get_rewards([-1, 0])  # [3.0, 1.0]
            episode.get_rewards(slice(None, 2))  # [1.0, 2.0]
            episode.get_rewards(slice(-2, None))  # [2.0, 3.0]
            # Using `fill=...` (requesting slices beyond the boundaries).
            episode.get_rewards(slice(-5, -2), fill=0.0)  # [0.0, 0.0, 1.0, 2.0]
            episode.get_rewards(slice(1, 5), fill=0.0)  # [2.0, 3.0, 0.0, 0.0]

        Returns:
            The collected rewards.
            As a 0-axis batch, if there are several `indices` or a list of exactly one
            index provided OR `indices` is a slice object.
            As single item (B=0 -> no additional 0-axis) if `indices` is a single int.
        r   )r#   r   r   s       r8   r   zSingleAgentEpisode.get_rewards  s-    B |"7   
 
 	
r:   r   c                z    | j         |         }t          |t                    r|                    |||          S J )a1  Returns extra model outputs (under given key) from this episode.

        Args:
            key: The `key` within `self.extra_model_outputs` to extract data for.
            indices: A single int is interpreted as an index, from which to return an
                individual extra model output stored under `key` at index.
                A list of ints is interpreted as a list of indices from which to gather
                individual actions in a batch of size len(indices).
                A slice object is interpreted as a range of extra model outputs to be
                returned. Thereby, negative indices by default are interpreted as
                "before the end" unless the `neg_index_as_lookback=True` option is
                used, in which case negative indices are interpreted as "before ts=0",
                meaning going back into the lookback buffer.
                If None, will return all extra model outputs (from ts=0 to the end).
            neg_index_as_lookback: If True, negative values in `indices` are
                interpreted as "before ts=0", meaning going back into the lookback
                buffer. For example, an episode with
                extra_model_outputs['a'] = [4, 5, 6,  7, 8, 9], where [4, 5, 6] is the
                lookback buffer range (ts=0 item is 7), will respond to
                `get_extra_model_outputs("a", -1, neg_index_as_lookback=True)` with
                `6` and to `get_extra_model_outputs("a", slice(-2, 1),
                neg_index_as_lookback=True)` with `[5, 6,  7]`.
            fill: An optional value to use for filling up the returned results at
                the boundaries. This filling only happens if the requested index range's
                start/stop boundaries exceed the episode's boundaries (including the
                lookback buffer on the left side). This comes in very handy, if users
                don't want to worry about reaching such boundaries and want to zero-pad.
                For example, an episode with
                extra_model_outputs["b"] = [10, 11,  12, 13, 14] and lookback buffer
                size of 2 (meaning `10` and `11` are part of the lookback buffer) will
                respond to
                `get_extra_model_outputs("b", slice(-7, -2), fill=0.0)` with
                `[0.0, 0.0, 10, 11, 12]`.
                TODO (sven): This would require a space being provided. Maybe we can
                automatically infer the space from existing data?

        Examples:

        .. testcode::

            from ray.rllib.env.single_agent_episode import SingleAgentEpisode

            episode = SingleAgentEpisode(
                extra_model_outputs={"mo": [1, 2, 3]},
                len_lookback_buffer=0,  # no lookback; all data is actually "in" episode
                # The following is needed, but not relevant for this demo.
                observations=[0, 1, 2, 3], actions=[1, 2, 3], rewards=[1, 2, 3],
            )

            # Plain usage (`indices` arg only).
            episode.get_extra_model_outputs("mo", -1)  # 3
            episode.get_extra_model_outputs("mo", 1)  # 0
            episode.get_extra_model_outputs("mo", [0, 2])  # [1, 3]
            episode.get_extra_model_outputs("mo", [-1, 0])  # [3, 1]
            episode.get_extra_model_outputs("mo", slice(None, 2))  # [1, 2]
            episode.get_extra_model_outputs("mo", slice(-2, None))  # [2, 3]
            # Using `fill=...` (requesting slices beyond the boundaries).
            # TODO (sven): This would require a space being provided. Maybe we can
            #  automatically infer the space from existing data?
            # episode.get_extra_model_outputs("mo", slice(-5, -2), fill=0)  # [0, 0, 1]
            # episode.get_extra_model_outputs("mo", slice(2, 5), fill=-1)  # [3, -1, -1]

        Returns:
            The collected extra_model_outputs[`key`].
            As a 0-axis batch, if there are several `indices` or a list of exactly one
            index provided OR `indices` is a slice object.
            As single item (B=0 -> no additional 0-axis) if `indices` is a single int.
        r   )r   rF   r   r   )rO   r   r   r   r   values         r8   r   z*SingleAgentEpisode.get_extra_model_outputs<  sP    X (-e344 	99&;     	ur:   )
at_indicesr   r   c                @    | j                             |||           dS )a
  Overwrites all or some of this Episode's observations with the provided data.

        Note that an episode's observation data cannot be written to directly as it is
        managed by a `InfiniteLookbackBuffer` object. Normally, individual, current
        observations are added to the episode either by calling `self.add_env_step` or
        more directly (and manually) via `self.observations.append|extend()`.
        However, for certain postprocessing steps, the entirety (or a slice) of an
        episode's observations might have to be rewritten, which is when
        `self.set_observations()` should be used.

        Args:
            new_data: The new observation data to overwrite existing data with.
                This may be a list of individual observation(s) in case this episode
                is still not numpy'ized yet. In case this episode has already been
                numpy'ized, this should be (possibly complex) struct matching the
                observation space and with a batch size of its leafs exactly the size
                of the to-be-overwritten slice or segment (provided by `at_indices`).
            at_indices: A single int is interpreted as one index, which to overwrite
                with `new_data` (which is expected to be a single observation).
                A list of ints is interpreted as a list of indices, all of which to
                overwrite with `new_data` (which is expected to be of the same size
                as `len(at_indices)`).
                A slice object is interpreted as a range of indices to be overwritten
                with `new_data` (which is expected to be of the same size as the
                provided slice).
                Thereby, negative indices by default are interpreted as "before the end"
                unless the `neg_index_as_lookback=True` option is used, in which case
                negative indices are interpreted as "before ts=0", meaning going back
                into the lookback buffer.
            neg_index_as_lookback: If True, negative values in `at_indices` are
                interpreted as "before ts=0", meaning going back into the lookback
                buffer. For example, an episode with
                observations = [4, 5, 6,  7, 8, 9], where [4, 5, 6] is the
                lookback buffer range (ts=0 item is 7), will handle a call to
                `set_observations(individual_observation, -1,
                neg_index_as_lookback=True)` by overwriting the value of 6 in our
                observations buffer with the provided "individual_observation".

        Raises:
            IndexError: If the provided `at_indices` do not match the size of
                `new_data`.
        new_datar   r   N)r"   setrO   r   r   r   s       r8   set_observationsz#SingleAgentEpisode.set_observations  s8    b 	!"7 	 	
 	
 	
 	
 	
r:   c                @    | j                             |||           dS )af
  Overwrites all or some of this Episode's actions with the provided data.

        Note that an episode's action data cannot be written to directly as it is
        managed by a `InfiniteLookbackBuffer` object. Normally, individual, current
        actions are added to the episode either by calling `self.add_env_step` or
        more directly (and manually) via `self.actions.append|extend()`.
        However, for certain postprocessing steps, the entirety (or a slice) of an
        episode's actions might have to be rewritten, which is when
        `self.set_actions()` should be used.

        Args:
            new_data: The new action data to overwrite existing data with.
                This may be a list of individual action(s) in case this episode
                is still not numpy'ized yet. In case this episode has already been
                numpy'ized, this should be (possibly complex) struct matching the
                action space and with a batch size of its leafs exactly the size
                of the to-be-overwritten slice or segment (provided by `at_indices`).
            at_indices: A single int is interpreted as one index, which to overwrite
                with `new_data` (which is expected to be a single action).
                A list of ints is interpreted as a list of indices, all of which to
                overwrite with `new_data` (which is expected to be of the same size
                as `len(at_indices)`).
                A slice object is interpreted as a range of indices to be overwritten
                with `new_data` (which is expected to be of the same size as the
                provided slice).
                Thereby, negative indices by default are interpreted as "before the end"
                unless the `neg_index_as_lookback=True` option is used, in which case
                negative indices are interpreted as "before ts=0", meaning going back
                into the lookback buffer.
            neg_index_as_lookback: If True, negative values in `at_indices` are
                interpreted as "before ts=0", meaning going back into the lookback
                buffer. For example, an episode with
                actions = [4, 5, 6,  7, 8, 9], where [4, 5, 6] is the
                lookback buffer range (ts=0 item is 7), will handle a call to
                `set_actions(individual_action, -1,
                neg_index_as_lookback=True)` by overwriting the value of 6 in our
                actions buffer with the provided "individual_action".

        Raises:
            IndexError: If the provided `at_indices` do not match the size of
                `new_data`.
        r   N)r   r   r   s       r8   set_actionszSingleAgentEpisode.set_actions  7    b 	!"7 	 	
 	
 	
 	
 	
r:   c                @    | j                             |||           dS )a*
  Overwrites all or some of this Episode's rewards with the provided data.

        Note that an episode's reward data cannot be written to directly as it is
        managed by a `InfiniteLookbackBuffer` object. Normally, individual, current
        rewards are added to the episode either by calling `self.add_env_step` or
        more directly (and manually) via `self.rewards.append|extend()`.
        However, for certain postprocessing steps, the entirety (or a slice) of an
        episode's rewards might have to be rewritten, which is when
        `self.set_rewards()` should be used.

        Args:
            new_data: The new reward data to overwrite existing data with.
                This may be a list of individual reward(s) in case this episode
                is still not numpy'ized yet. In case this episode has already been
                numpy'ized, this should be a np.ndarray with a length exactly
                the size of the to-be-overwritten slice or segment (provided by
                `at_indices`).
            at_indices: A single int is interpreted as one index, which to overwrite
                with `new_data` (which is expected to be a single reward).
                A list of ints is interpreted as a list of indices, all of which to
                overwrite with `new_data` (which is expected to be of the same size
                as `len(at_indices)`).
                A slice object is interpreted as a range of indices to be overwritten
                with `new_data` (which is expected to be of the same size as the
                provided slice).
                Thereby, negative indices by default are interpreted as "before the end"
                unless the `neg_index_as_lookback=True` option is used, in which case
                negative indices are interpreted as "before ts=0", meaning going back
                into the lookback buffer.
            neg_index_as_lookback: If True, negative values in `at_indices` are
                interpreted as "before ts=0", meaning going back into the lookback
                buffer. For example, an episode with
                rewards = [4, 5, 6,  7, 8, 9], where [4, 5, 6] is the
                lookback buffer range (ts=0 item is 7), will handle a call to
                `set_rewards(individual_reward, -1,
                neg_index_as_lookback=True)` by overwriting the value of 6 in our
                rewards buffer with the provided "individual_reward".

        Raises:
            IndexError: If the provided `at_indices` do not match the size of
                `new_data`.
        r   N)r#   r   r   s       r8   set_rewardszSingleAgentEpisode.set_rewards  r   r:   c                b    || j         v sJ | j         |                             |||           dS )af  Overwrites all or some of this Episode's extra model outputs with `new_data`.

        Note that an episode's `extra_model_outputs` data cannot be written to directly
        as it is managed by a `InfiniteLookbackBuffer` object. Normally, individual,
        current `extra_model_output` values are added to the episode either by calling
        `self.add_env_step` or more directly (and manually) via
        `self.extra_model_outputs[key].append|extend()`. However, for certain
        postprocessing steps, the entirety (or a slice) of an episode's
        `extra_model_outputs` might have to be rewritten or a new key (a new type of
        `extra_model_outputs`) must be inserted, which is when
        `self.set_extra_model_outputs()` should be used.

        Args:
            key: The `key` within `self.extra_model_outputs` to override data on or
                to insert as a new key into `self.extra_model_outputs`.
            new_data: The new data to overwrite existing data with.
                This may be a list of individual reward(s) in case this episode
                is still not numpy'ized yet. In case this episode has already been
                numpy'ized, this should be a np.ndarray with a length exactly
                the size of the to-be-overwritten slice or segment (provided by
                `at_indices`).
            at_indices: A single int is interpreted as one index, which to overwrite
                with `new_data` (which is expected to be a single reward).
                A list of ints is interpreted as a list of indices, all of which to
                overwrite with `new_data` (which is expected to be of the same size
                as `len(at_indices)`).
                A slice object is interpreted as a range of indices to be overwritten
                with `new_data` (which is expected to be of the same size as the
                provided slice).
                Thereby, negative indices by default are interpreted as "before the end"
                unless the `neg_index_as_lookback=True` option is used, in which case
                negative indices are interpreted as "before ts=0", meaning going back
                into the lookback buffer.
            neg_index_as_lookback: If True, negative values in `at_indices` are
                interpreted as "before ts=0", meaning going back into the lookback
                buffer. For example, an episode with
                rewards = [4, 5, 6,  7, 8, 9], where [4, 5, 6] is the
                lookback buffer range (ts=0 item is 7), will handle a call to
                `set_rewards(individual_reward, -1,
                neg_index_as_lookback=True)` by overwriting the value of 6 in our
                rewards buffer with the provided "individual_reward".

        Raises:
            IndexError: If the provided `at_indices` do not match the size of
                `new_data`.
        r   N)r   r   )rO   r   r   r   r   s        r8   set_extra_model_outputsz*SingleAgentEpisode.set_extra_model_outputsB  sQ    n d..... %))!"7 	* 	
 	
 	
 	
 	
r:   )r2   slice_c                   |j         }|j        }|d}n|dk     rt          |           |z   }|t          |           }n|dk     rt          |           |z   }|j        |j        nd}|t          |           k    }| j        |z   }||n| j        j        }|dk    r+||z
  dk     r"| j        j        ||z
  k     r| j        j        |z   }t          |                     t          ||z
  |dz   |          d          || j
                  }	||n| j        j        }|dk    r+||z
  dk     r"| j        j        ||z
  k     r| j        j        |z   }t          |                     t          ||z
  |dz   |          d          |          }
||n| j        j        }|dk    r+||z
  dk     r"| j        j        ||z
  k     r| j        j        |z   }t          |                     t          ||z
  ||          d          || j                  }||n| j        j        }|dk    r+||z
  dk     r"| j        j        ||z
  k     r| j        j        |z   }t          |                     t          ||z
  ||          d          |          }i }| j                                        D ]s\  }}||n|j        }|dk    r!||z
  dk     r|j        ||z
  k     r
|j        |z   }t          |                     |t          ||z
  ||          d          |          ||<   tt+          | j        |	| j
        |
|| j        |||r| j        nd	|r| j        nd	|
          S )a]  Returns a slice of this episode with the given slice object.

        For example, if `self` contains o0 (the reset observation), o1, o2, o3, and o4
        and the actions a1, a2, a3, and a4 (len of `self` is 4), then a call to
        `self.slice(slice(1, 3))` would return a new SingleAgentEpisode with
        observations o1, o2, and o3, and actions a2 and a3. Note here that there is
        always one observation more in an episode than there are actions (and rewards
        and extra model outputs) due to the initial observation received after an env
        reset.

        .. testcode::

            from ray.rllib.env.single_agent_episode import SingleAgentEpisode
            from ray.rllib.utils.test_utils import check

            # Generate a simple multi-agent episode.
            observations = [0, 1, 2, 3, 4, 5]
            actions = [1, 2, 3, 4, 5]
            rewards = [0.1, 0.2, 0.3, 0.4, 0.5]
            episode = SingleAgentEpisode(
                observations=observations,
                actions=actions,
                rewards=rewards,
                len_lookback_buffer=0,  # all given data is part of the episode
            )
            slice_1 = episode[:1]
            check(slice_1.observations, [0, 1])
            check(slice_1.actions, [1])
            check(slice_1.rewards, [0.1])

            slice_2 = episode[-2:]
            check(slice_2.observations, [3, 4, 5])
            check(slice_2.actions, [4, 5])
            check(slice_2.rewards, [0.4, 0.5])

        Args:
            slice_: The slice object to use for slicing. This should exclude the
                lookback buffer, which will be prepended automatically to the returned
                slice.
            len_lookback_buffer: If not None, forces the returned slice to try to have
                this number of timesteps in its lookback buffer (if available). If None
                (default), tries to make the returned slice's lookback as large as the
                current lookback buffer of this episode (`self`).

        Returns:
            The new SingleAgentEpisode representing the requested slice.
        Nr   r`   T)r   r?   r;   )r   r   r   F)r   r"   r.   r   r   r/   r#   r   r0   r1   r%   )startstoprD   stepr%   r"   r=   r   r~   r   r.   r   r   r   r   r/   r#   r   r   rM   r   r   r   r   r   )rO   r   r2   r   r   r   	keep_doner%   _lbr"   r   r   r#   r   rQ   rR   s                   r8   r   zSingleAgentEpisode.slice  s>   n { =EEQYYII%E <t99DDAXXt99t#D$k5v{{1 CII%	NU*	 #.  "+ 	 QJJa!*cEk::#,u4C-&&eck4!8T22&* '   (
 
 
 #.  $ 	
 A::%#+//dj.AS5[.Q.Q*%-C&eck4!8T22&*     
 
 
 #.  & 	
 A::%#+//dl.CsU{.S.S,'%/C(!!eck4..&* "   #
 
 
 #.  & 	
 A::%#+//dl.CsU{.S.S,'%/C(!!eck4..&* "   
 
 
 !,2244 	 	DAq)<)H%%ajCzzeckAoo!*e2L2Lj5(%;11!%#+tT::*. 2  
 & & &"" "%"4* 3.7B**U,5@t((5
 
 
 	
r:   c                     t          t           j         j                            }dgt	                     dz
  z   j        gz   }dgt	                     dz
  z   j        gz   } j        gt	                     z  } j        rPt          j
        |          }t          j
        |          }t          j
        |          }t          j
        |          }t          t          j        |t          j        |t          j        |t          j        |t          j                             t'          dd                    t          j                             t'          dd                    t          j                                         t          j                                         ifi  fd j                                        D             S )a<  Converts a SingleAgentEpisode into a data dict mapping str keys to data.

        The keys used are:
        Columns.EPS_ID, T, OBS, INFOS, ACTIONS, REWARDS, TERMINATEDS, TRUNCATEDS,
        and those in `self.extra_model_outputs`.

        Returns:
            A data dict mapping str keys to data records.
        Fr`   Nru   c                 <    i | ]}|                     |          S r5   r   )r6   rQ   rO   s     r8   r   z4SingleAgentEpisode.get_data_dict.<locals>.<dictcomp>U  s9        4//22  r:   )listrE   r%   r$   rD   r   r   r   rc   rK   arraydictr   TERMINATEDS
TRUNCATEDSTEPS_IDOBSr~   r   INFOSr   ACTIONSr   REWARDSr   r   r   )rO   r$   terminateds
truncatedseps_ids   `    r8   get_data_dictz SingleAgentEpisode.get_data_dict/  s    t~tv..//gTQ/43E2FFWD		A.$2C1DD
(c$ii'= 	&A(;//K*--JXf%%F #["J	1
 T225r??CCt~~eD"oo>>!1!1!3!3!1!1!3!3
 
"   16688  #
 
 	
r:   c                 D    t          |                                           S )zConverts this `SingleAgentEpisode` into a `SampleBatch`.

        Returns:
            A SampleBatch containing all of this episode's data.
        )r   r   rj   s    r8   get_sample_batchz#SingleAgentEpisode.get_sample_batch[  s     4--//000r:   c                 D    t          |                                           S )a9  Calculates an episode's return, excluding the lookback buffer's rewards.

        The return is computed by a simple sum, neglecting the discount factor.
        Note that if `self` is a continuation chunk (resulting from a call to
        `self.cut()`), the previous chunk's rewards are NOT counted and thus NOT
        part of the returned reward sum.

        Returns:
            The sum of rewards collected during this episode, excluding possible data
            inside the lookback buffer and excluding possible data in a predecessor
            chunk.
        )sumr   rj   s    r8   
get_returnzSingleAgentEpisode.get_returnc  s     4##%%&&&r:   c                 2    | j         dS | j         | j        z
  S )z8Returns the duration of this Episode (chunk) in seconds.Ng        )r)   r+   rj   s    r8   get_duration_sz!SingleAgentEpisode.get_duration_sr  s!    '3#d&666r:   c                      t          |           S )zReturns the number of environment steps.

        Note, this episode instance could be a chunk of an actual episode.

        Returns:
            An integer that counts the number of environment steps this episode instance
            has seen.
        )rD   rj   s    r8   	env_stepszSingleAgentEpisode.env_stepsx  s     4yyr:   c                 *    |                                  S )a  Returns the number of agent steps.

        Note, these are identical to the environment steps for a single-agent episode.

        Returns:
            An integer counting the number of agent steps executed during the time this
            episode instance records.
        )r   rj   s    r8   agent_stepszSingleAgentEpisode.agent_steps  s     ~~r:   c                    | j                                         }t          j        d |d         D                       |d<   i d| j        d| j        d| j        d| j        d| j                                        d| j	                                        d	| j
                                        d
| j                                         dt          | j                  dk    r#d | j                                        D             ndd| j        d| j        d| j        d| j        d| j        rt'          | j                  ndd| j        rt'          | j                  ndd| j        d| j        d| j        iS )a%  Returns the pickable state of an episode.

        The data in the episode is stored into a dictionary. Note that episodes
        can also be generated from states (see `SingleAgentEpisode.from_state()`).

        Returns:
            A dict containing all the data from the episode.
        c                     g | ]}|r|nd 	S ri   r5   )r6   infos     r8   r9   z0SingleAgentEpisode.get_state.<locals>.<listcomp>  s!    !S!S!ST$"8$$D!S!S!Sr:   r<   r   r   r    r!   r"   r   r#   r   r   r   c                 F    i | ]\  }}||r|                                 n|S r5   )	get_stater6   rQ   rR   s      r8   r   z0SingleAgentEpisode.get_state.<locals>.<dictcomp>  s?     $ $ $Aq A,1;;===1$ $ $r:   Nr   r   r%   r$   r*   r&   r+   r)   rk   )r   r   rK   r   r   r   r    r!   r"   r   r#   rD   r   rM   r   r   r%   r$   r*   r   r&   r+   r)   rk   )rO   r   s     r8   r   zSingleAgentEpisode.get_state  s    
$$&&!S!SU6]!S!S!STTf
48

 
 %d&A	
 D-7799
 t|--//
 t|--//
 TZ))++
 " 4+,,q00	$ $ 4::<<$ $ $ $
 
  T/!
" D-#
$ %
& '
( !&#"3D4K"L"L"L-
. !.t/ABBB3
4 4+5
6 t37
8 4+9
 
 	
r:   statec                 *   t          | d                   }| d         |_        | d         |_        | d         |_        t	          j        | d                   |_        t	          j        | d                   |_        t	          j        | d                   |_        t	          j        | d	                   |_	        | d
         rUt          t          j        t          |j        j                  d | d
                                         D                       n1t          t          j        t          |j        j                            |_        | d         |_        | d         |_        | d         |_        | d         |_        | d         rt)          | d                   nd|_        | d         rt)          | d                   nd|_        | d         |_        | d         |_        |                     di           |_        |                                 |S )a  Creates a new `SingleAgentEpisode` instance from a state dict.

        Args:
            state: The state dict, as returned by `self.get_state()`.

        Returns:
            A new `SingleAgentEpisode` instance with the data from the state dict.
        r   )r   r   r    r!   r"   r   r#   r   r   )r=   c                 >    i | ]\  }}|t          j        |          S r5   )r   
from_stater   s      r8   r   z1SingleAgentEpisode.from_state.<locals>.<dictcomp>  s:       1 -8;;  r:   r   r   r%   r$   r*   Nr&   r+   r)   rk   )r   r   r    r!   r   r   r"   r   r#   r   r   	functoolspartialr=   rM   r   r   r   r%   r$   r   r*   r&   r+   r)   r   r,   rN   )r   episodes     r8   r   zSingleAgentEpisode.from_state  s    %u666 ,!+.)./G)H&5@~AVWW0;E)<LMM0;E)<LMM.9%.II *+K!*W5I5R    %&; < B B D D  	   !*W5I5R    	#" !&o 6$^4!+.#J	 )*&: ;<<< 	" _%o 6777 	
 $M2"'(9":$yy;;r:   c                     | j         S ri   )r*   rj   s    r8   r.   z$SingleAgentEpisode.observation_space  s    &&r:   c                 ,    |x| _         | j        _        d S ri   )r*   r"   r@   rO   r   s     r8   r.   z$SingleAgentEpisode.observation_space  s    <AA$"3"9"9"9r:   c                     | j         S ri   )r&   rj   s    r8   r/   zSingleAgentEpisode.action_space  s    !!r:   c                 ,    |x| _         | j        _        d S ri   )r&   r   r@   r   s     r8   r/   zSingleAgentEpisode.action_space  s    277T\///r:   c                      | j         | j        z
  S )ao  Returning the length of an episode.

        The length of an episode is defined by the length of its data, excluding
        the lookback buffer data. The length is the number of timesteps an agent has
        stepped through an environment thus far.

        The length is 0 in case of an episode whose env has NOT been reset yet, but
        also 0 right after the `env.reset()` data has been added via
        `self.add_env_reset()`. Only after the first call to `env.step()` (and
        `self.add_env_step()`, the length will be 1.

        Returns:
            An integer, defining the length of an episode.
        )r$   r%   rj   s    r8   __len__zSingleAgentEpisode.__len__  s     v&&r:   c           	      r    dt          |            d| j         d|                                  d| j         d	S )Nz
SAEps(len=z done=z R=z id_=))rD   rX   r   r   rj   s    r8   __repr__zSingleAgentEpisode.__repr__  sW    5T 5 5$, 5 5""5 5)-5 5 5	
r:   itemc                 ~    t          |t                    r|                     |          S t          d| d          )zGEnable squared bracket indexing- and slicing syntax, e.g. episode[-4:].)r   z2SingleAgentEpisode does not support getting item 'z>'! Only slice objects allowed with the syntax: `episode[a:b]`.)rF   r   NotImplementedError)rO   r   s     r8   __getitem__zSingleAgentEpisode.__getitem__  sT    dE"" 	::T:***%NT N N N  r:   z.SingleAgentEpisode.custom_data[some-key] = ...T)newerrorc                     d S ri   r5   rj   s    r8   add_temporary_timestep_dataz.SingleAgentEpisode.add_temporary_timestep_data&      r:   z(SingleAgentEpisode.custom_data[some-key]c                     d S ri   r5   rj   s    r8   get_temporary_timestep_dataz.SingleAgentEpisode.get_temporary_timestep_data*  r   r:   ri   )rU   N)rU   r   )rs   r   rU   N)r   )?__name__
__module____qualname____doc__	__slots__r   strr	   r   r   r   rG   Spacer   r
   r   boolr   intr   r   rS   r\   re   rN   propertyrk   rW   rc   rX   rr   r   r   r   r~   r   r   rJ   r   r   r   r   r   r   r   r   r   r   r   r   r   r   staticmethodr   r.   setterr/   r   r   r   r   r   r   r5   r:   r8   r   r      s	       @ @DI2 "s PT15EIJN,0PT 8<#'/5&*(,04#s s sc]s uT']4J%JKL	s
 $CI.s d4j*@@ABs %W/E EFGs sy)s %] 35K KLMs s s &d38n5s C=s #38_s 7#s  H%!s" !)#s s s sp !% /  / / ~ / 
	 /  /  /  /N +/I4 !8<I4 I4 I4I4 I4 	I4
 S#X'I4 I4 I4 &d38n5I4 
I4 I4 I4 I4V< < < <: ! ! X! *$ * * * X* &$ & & & X& 7 7 7 7 X7H H H HTB B B BHC Cs C3G C C C C` ;?a
 ',"!&a
 a
 a
%T#Y 567a
  $	a

 sma
 a
 
a
 a
 a
 a
J ;?K
 ',"K
 K
 K
%T#Y 567K
  $	K

 smK
 
K
 K
 K
 K
^ ;?]
 ',"!&]
 ]
 ]
%T#Y 567]
  $	]

 sm]
 ]
 
]
 ]
 ]
 ]
B ;?E
 ', $E
 E
 E
%T#Y 567E
  $	E

 uoE
 
E
 E
 E
 E
T ;?_

 ',"_
 _
 _
_
 %T#Y 567_

  $_
 sm_
 
_
 _
 _
 _
J >B&+5
 5
 5
 U3S	5#89:	5

  $5
 
5
 5
 5
 5
v >B&+5
 5
 5
 U3S	5#89:	5

  $5
 
5
 5
 5
 5
v >B&+5
 5
 5
 U3S	5#89:	5

  $5
 
5
 5
 5
 5
x >B&+<
 <
 <

 U3S	5#89:<
  $<
 
<
 <
 <
 <
D .2	m
 m
 m
m
 &c]	m

 
m
 m
 m
 m
^*
 *
 *
X1+ 1 1 1 1'E ' ' ' '7 7 7 7 7	3 	 	 	 		 S 	  	  	  	 (
4S> (
 (
 (
 (
T :$sCx. :-A : : : \:x ' ' X' B B B " " X" 8 8 8' ' ' ' '"
 
 
 *>     ZDDQQQ  RQ Z>dKKK  LK  r:   r   )&r   r   rZ   rA   collectionsr   typingr   r   r   r   r   r	   	gymnasiumrG   numpyrK   rv   gymnasium.corer
   r   ray._common.deprecationr   ray.rllib.core.columnsr   ,ray.rllib.env.utils.infinite_lookback_bufferr   ray.rllib.policy.sample_batchr   ray.rllib.utils.serializationr   r   ray.rllib.utils.typingr   r   ray.util.annotationsr   r   r5   r:   r8   <module>r     s          # # # # # # B B B B B B B B B B B B B B B B          + + + + + + + + . . . . . . * * * * * * O O O O O O 5 5 5 5 5 5 P P P P P P P P 4 4 4 4 4 4 4 4 * * * * * * WU U U U U U U U U Ur:   