
    &`ix                     B   d dl Z d dlZd dlZd dlmZmZmZmZmZ d dl	Z
d dlZd dlZd dlmZmZ d dlmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZ d d	lmZmZm Z m!Z!m"Z" erd d
l#m$Z$m%Z% d dl&m'Z' d dl(m)Z)  e j*        e+          Z, e            \  Z-Z.dZ/dZ0e-r ej1        d          Z2n e3d          Z2edddede"dee4e"f         fd            Z5eddddddee6         de4de"fd            Z7eddde"fd             Z8eded!         dee4e"f         fd"            Z9e	 	 	 	 dWd$ee4         d%e:d&e:d'eed(                  fd)            Z;edXd*e!d$ee4         fd+            Z<ed,e"d-e"de"fd.            Z=e	 	 dYd/e!d0ee          d1e:de"fd2            Z>ed3ee"         de"fd4            Z?edZd*e"d6e6de"fd7            Z@ed*e"de"fd8            ZAed*e"d9e
jB        de"fd:            ZCedXd*e"d;eeD         de"fd<            ZEe	 	 	 d[d=e"d>eeD         d?e:de"fd@            ZFedAedBedCe6ddfdD            ZGedddEe"ddfdF            ZHedXdGeeD         ddfdH            ZIedIe"dJe"de"fdK            ZJed\dM            ZKed]dN            ZLe	 	 	 	 d^ddLdReDdSe6dTe6d$ee4         f
dU            ZMdV ZNdS )_    N)TYPE_CHECKINGDictListOptionalUnion)DiscreteMultiDiscrete)version)RepeatedValues)DeveloperAPIOldAPIStack	PublicAPI)try_import_torch)SMALL_NUMBER)LocalOptimizerNetworkTypeSpaceStructTensorStructType
TensorType)	ParamDict	ParamList)TorchPolicy)TorchPolicyV2gߌ3gߌ3Gz2.0.0zFtorch is not installed. TORCH_COMPILE_REQUIRED_VERSION is not defined.policyr   	optimizerlossreturnc                    d}| j         d         | j         d         }nt          j        }d}|j        D ]}t	          t          d |d                             }|rtt          j                            ||          }t          |t          j                  r&|                                                                }|t          ||          z  }|dz  }|t          |j                  k    ri S d|iS )aU  Applies gradient clipping to already computed grads inside `optimizer`.

    Note: This function does NOT perform an analogous operation as
    tf.clip_by_global_norm. It merely clips by norm (per gradient tensor) and
    then computes the global norm across all given tensors (but without clipping
    by that global norm).

    Args:
        policy: The TorchPolicy, which calculated `loss`.
        optimizer: A local torch optimizer object.
        loss: The torch loss tensor.

    Returns:
        An info dict containing the "grad_norm" key and the resulting clipped
        gradients.
    r   	grad_clipNc                     | j         d uS N)grad)ps    o/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/utils/torch_utils.py<lambda>z%apply_grad_clipping.<locals>.<lambda>K   s    qvT'9     params   
grad_gnorm)confignpinfparam_groupslistfilternnutilsclip_grad_norm_
isinstancetorchTensorcpunumpyminlen)	r   r   r   r)   
clip_valuenum_none_gradsparam_groupr'   global_norms	            r$   apply_grad_clippingr>   -   s    ( J}[!-];/

V
N -     f99;x;PQQRR 
	  (226:FFK+u|44 8)oo//5577#k:666JJaNN Y34444	*%%r&   value)r   grad_clip_bygradients_dictr   r   r@   c                2   |dS |dvrt          d| d          |dk    r@|                                 D ])\  }}|dnt                              || |          | |<   *dS |dk    rf|                                 D ]O\  }}|H|                    d                              dd	
          }||k    r|                    ||z             PdS t          |                                           }t          |          }t          |          dk    r|S |t                              |dz   |          z  }t                              |d          }	|D ]C}
|
?|
                                                    |	                    |
j                             D|S )a?  Performs gradient clipping on a grad-dict based on a clip value and clip mode.

    Changes the provided gradient dict in place.

    Args:
        gradients_dict: The gradients dict, mapping str to gradient tensors.
        grad_clip: The value to clip with. The way gradients are clipped is defined
            by the `grad_clip_by` arg (see below).
        grad_clip_by: One of 'value', 'norm', or 'global_norm'.

    Returns:
        If `grad_clip_by`="global_norm" and `grad_clip` is not None, returns the global
        norm of all tensors, otherwise returns None.
    N)r?   normr=   z`grad_clip_by` (z*) must be one of [value|norm|global_norm]!r?   rC          e    eAneginfposinfr   gư>)r8         ?)max)
ValueErroritemsr4   cliprC   
nan_to_nummul_r.   valuescompute_global_normr9   clampdetachtodevice)rA   r   r@   kvrC   gradients_list
total_norm
clip_coeffclip_coeff_clampedgs              r$   clip_gradientsr^   `   s   , ;;;W|WWW
 
 	

 w"((** 	 	DAq	uzz!iZ'K'K 1	 	 
		"((** 	- 	-DAq}vvayy++5+FF)##FF9t+,,,	- 	- n335566(88
~!## Z$->I!N!NN

 #[[[== 	A 	AA}

 2 5 5ah ? ?@@@r&   rY   r   c                    dt          |           dk    rt                              d          S t                              t                              fd| D                                                     dd          }|S )zComputes the global norm for a gradients dict.

    Args:
        gradients_list: The gradients list containing parameters.

    Returns:
        Returns the global norm of all tensors in `gradients_list`.
           @r           c                     g | ]F}|t                               |                                                              dd          GS )NrE   rF   rG   )r4   rC   rT   rO   ).0r]   	norm_types     r$   
<listcomp>z'compute_global_norm.<locals>.<listcomp>   sQ        = 

188::y11 566 ==r&   rE   rF   rG   )r9   r4   tensorrC   stackrO   )rY   rZ   rd   s     @r$   rR   rR      s     I
>a||C        (  
	
 
	
 	  jdj++   r&   )r   r   c                      t                                fd j        D             d          }| _        |t                               |          dS )az  Concatenates multi-GPU (per-tower) TD error tensors given TorchPolicy.

    TD-errors are extracted from the TorchPolicy via its tower_stats property.

    Args:
        policy: The TorchPolicy to extract the TD-error values from.

    Returns:
        A dict mapping strings "td_error" and "mean_td_error" to the
        corresponding concatenated and mean-reduced values.
    c                     g | ]N}|j                             d t                              dg                                        j                  OS )td_errorra   )tower_statsgetr4   rf   rU   rV   )rc   tr   s     r$   re   z.concat_multi_gpu_td_errors.<locals>.<listcomp>   sY     	
 	
 	
 Mj%,,u*=*=>>AA&-PP	
 	
 	
r&   r   dim)rj   mean_td_error)r4   catmodel_gpu_towersrj   mean)r   rj   s   ` r$   concat_multi_gpu_td_errorsrt      su     yy	
 	
 	
 	
,	
 	
 	
    H FOH--  r&   FrV   
pin_memory
use_streamstream)ztorch.cuda.Streamztorch.cuda.classes.Streamc                 8   t                                         nt                               d          j        dk    ot           j                                        r|rrQt          t           j        j        t           j        j        j        f          sJ dt                     d            nBt           j                                        n#t           j                                      ndfdt          j
        |           S )aj  
    Converts any (possibly nested) structure to torch.Tensors.

    Args:
        x: The input structure whose leaves will be converted.
        device: The device to create the tensor on (e.g. "cuda:0" or "cpu").
        pin_memory: If True, calls `pin_memory()` on the created tensors.
        use_stream: If True, uses a separate CUDA stream for `Tensor.to()`.
        stream: An optional CUDA stream for the host-to-device copy in `Tensor.to()`.

    Returns:
        A new structure with the same layout as `x` but with all leaves converted
        to torch.Tensors. Leaves that are None are left unchanged.
    Nr6   cudaz-`stream` must be a torch.cuda.Stream but got .rV   c                 f   | | S t          | t                    r3t          t          j        | j                  | j        | j                  S t                              |           r| }nt          | t          j
                  r| j        t          k    s| j        j        t          j        u r| S | j        j        sZt#          j                    5  t#          j        d           t                              |           }d d d            n# 1 swxY w Y   nGt                              |           }n,t                              t          j        |                     }|                                r)|j        t          j        k    r|                                }rr|                                }riOt          j                                      5  |                    d          }d d d            n# 1 swxY w Y   n-|                    d          }n|                              }|S )NignoreT)non_blocking)r3   r   treemap_structurerQ   lengthsmax_lenr4   	is_tensorr+   ndarraydtypeobjecttypestr_flags	writeablewarningscatch_warningssimplefilter
from_numpyasarrayis_floating_pointfloat16floatru   ry   rw   rU   )itemrf   rV   is_cudamappingru   rw   s     r$   r   z(convert_to_torch_tensor.<locals>.mapping  s   <K dN++ 	!"7DK88   ??4   	8FFbj)) 	8zV##tz"''A'A:' 0,.. 4 4)(333"--d33F4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 ))$//%%bj&6&677F ##%% 	$&,%-*G*G\\^^F  	)' 	)&&((F  	'!Z&&v.. B B#YYvDYAAFB B B B B B B B B B B B B B B  6== YYv&&Fs$   /D		DDG88G<?G<)r4   rV   r   ry   is_availabler3   Streamclassesdefault_streamr   r   )xrV   ru   rv   rw   r   r   s    `` `@@r$   convert_to_torch_tensorr      s;   0 &,%7U\\&!!!U\\%=P=PF{f$C%**A*A*C*CG   		>!!UZ.
0B0IJ  S SR4<<RRRS S  S **,,Z..f.==FF2 2 2 2 2 2 2 2 2h gq)))r&   r   c                 8    fd}t          j        ||           S )a  Creates a copy of `x` and makes deep copies torch.Tensors in x.

    Also moves the copied tensors to the specified device (if not None).

    Note if an object in x is not a torch.Tensor, it will be shallow-copied.

    Args:
        x : Any (possibly nested) struct possibly containing torch.Tensors.
        device : The device to move the tensors to.

    Returns:
        Any: A new struct with the same structure as `x`, but with all
            torch.Tensors deep-copied and moved to the specified device.

    c                     t          | t          j                  rU,t                              |                                           n&|                                                               S | S r!   )r3   r4   r5   clonerT   rU   )r   rV   s    r$   r   z#copy_torch_tensors.<locals>.mappingX  s_    dEL)) 	 > DKKMM***[[]]%%f-- Kr&   )r   r   )r   rV   r   s    ` r$   copy_torch_tensorsr   F  s1    $     gq)))r&   ypredc                    |                                  }t                              |d          }t                              ||                                 z
  d          }t                              dg                              |j                  }t                              |d||t          z   z  z
            d         S )a,  Computes the explained variance for a pair of labels and predictions.

    The formula used is:
    max(-1.0, 1.0 - (std(y - pred)^2 / std(y)^2))

    Args:
        y: The labels.
        pred: The predictions.

    Returns:
        The explained variance given a pair of labels and predictions.
    r   rn   g      r(   )squeezer4   varrf   rU   rV   rK   r   )r   r   
squeezed_yy_vardiff_varmin_s         r$   explained_variancer   e  s     JIIjaI((Eyydllnn4!y<<H<<""4;//D99T1EL,@ ABCCAFFr&   inputsspaces_struct	time_axisc                    t          j        |           }|t          j        |          ndgt          |          z  }d}d}g }t          ||          D ]d\  }}	||j        d         }|r|j        d         }t          |	t                    rW|rt                              |||z  g          }|	                    t          ||	                                                     t          |	t                    rX|r t                              |||z  dg          }|	                    t          ||	                                                     |r!t                              |||z  dg          }nt                              ||dg          }|	                    |                                           ft                              |d          }
|rt                              |
||dg          }
|
S )ao	  Flattens arbitrary input structs according to the given spaces struct.

    Returns a single 1D tensor resulting from the different input
    components' values.

    Thereby:
    - Boxes (any shape) get flattened to (B, [T]?, -1). Note that image boxes
    are not treated differently from other types of Boxes and get
    flattened as well.
    - Discrete (int) values are one-hot'd, e.g. a batch of [1, 0, 3] (B=3 with
    Discrete(4) space) results in [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1]].
    - MultiDiscrete values are multi-one-hot'd, e.g. a batch of
    [[0, 2], [1, 4]] (B=2 with MultiDiscrete([2, 5]) space) results in
    [[1, 0,  0, 0, 1, 0, 0], [0, 1,  0, 0, 0, 0, 1]].

    Args:
        inputs: The inputs to be flattened.
        spaces_struct: The structure of the spaces that behind the input
        time_axis: Whether all inputs have a time-axis (after the batch axis).
            If True, will keep not only the batch axis (0th), but the time axis
            (1st) as-is and flatten everything from the 2nd axis up.

    Returns:
        A single 1D tensor resulting from concatenating all
        flattened/one-hot'd input components. Depending on the time_axis flag,
        the shape is (B, n) or (B, T, n).

    .. testcode::

        from gymnasium.spaces import Discrete, Box
        from ray.rllib.utils.torch_utils import flatten_inputs_to_1d_tensor
        import torch
        struct = {
            "a": np.array([1, 3]),
            "b": (
                np.array([[1.0, 2.0], [4.0, 5.0]]),
                np.array(
                    [[[8.0], [7.0]], [[5.0], [4.0]]]
                ),
            ),
                "c": {
                    "cb": np.array([1.0, 2.0]),
                },
        }
        struct_torch = tree.map_structure(lambda s: torch.from_numpy(s), struct)
        spaces = dict(
            {
                "a": gym.spaces.Discrete(4),
                "b": (gym.spaces.Box(-1.0, 10.0, (2,)), gym.spaces.Box(-1.0, 1.0, (2,
                        1))),
                "c": dict(
                    {
                        "cb": gym.spaces.Box(-1.0, 1.0, ()),
                    }
                ),
            }
        )
        print(flatten_inputs_to_1d_tensor(struct_torch, spaces_struct=spaces))

    .. testoutput::

        tensor([[0., 1., 0., 0., 1., 2., 8., 7., 1.],
                [0., 0., 0., 1., 4., 5., 5., 4., 2.]])

    Nr   r(   rn   )r   flattenr9   zipshaper3   r   r4   reshapeappendone_hotr   r	   rq   )r   r   r   flat_inputsflat_spacesBToutinput_spacemergeds              r$   flatten_inputs_to_1d_tensorr   z  s   P ,v&&K $ 	]###Vc+&&&  	AA
C[+66 ' '9QA $LO eX&& 	' 8vAw77JJwvu--33556666}-- 
	' <vAr{;;JJwvu--33556666  8vAr{;;v2w77JJv||~~&&&&YYsY##F 3v1bz22Mr&   tensorsc                 ~    d | D             }t                               t          d |D                       d          S )aN  Returns the global L2 norm over a list of tensors.

    output = sqrt(SUM(t ** 2 for t in tensors)),
        where SUM reduces over all tensors and over all elements in tensors.

    Args:
        tensors: The list of tensors to calculate the global norm over.

    Returns:
        The global L2 norm over the given tensor list.
    c           
          g | ]N}t                               t                               t                               |d                     d          OS )r`         ?r4   powsum)rc   rm   s     r$   re   zglobal_norm.<locals>.<listcomp>  s@    PPP1%))EIIeii3&7&788#>>PPPr&   c              3   L   K   | ]}t                               |d           V   dS )r`   N)r4   r   )rc   l2s     r$   	<genexpr>zglobal_norm.<locals>.<genexpr>  s0      AA2s++AAAAAAr&   r   r   )r   
single_l2ss     r$   r=   r=     sC     QPPPPJ99SAAjAAAAA3GGGr&   rJ   deltac                     t                               t                               |           |k     t                               | d          dz  |t                               |           d|z  z
  z            S )a  Computes the huber loss for a given term and delta parameter.

    Reference: https://en.wikipedia.org/wiki/Huber_loss
    Note that the factor of 0.5 is implicitly included in the calculation.

    Formula:
        L = 0.5 * x^2  for small abs x (delta threshold)
        L = delta * (abs(x) - 0.5*delta)  for larger abs x (delta threshold)

    Args:
        x: The input term, e.g. a TD error.
        delta: The delta parmameter in the above formula.

    Returns:
        The Huber loss resulting from `x` and `delta`.
    r`   r   )r4   whereabsr   )r   r   s     r$   
huber_lossr      s]    $ ;;		!u		!SC1e+,  r&   c                 n    dt                               t                               | d                    z  S )zComputes half the L2 norm over a tensor's values without the sqrt.

    output = 0.5 * sum(x ** 2)

    Args:
        x: The input tensor.

    Returns:
        0.5 times the L2 norm over the given tensor's values (w/o sqrt).
    r   r`   )r4   r   r   r   s    r$   l2_lossr     s)     599Q,,----r&   r   c                 B    t          |t                    r7t          j                                                             |j                  S t          |t                    rt          |j        d         t          j
                  r;t          j        |j                  }                      j        d         d           n|j        }t                               fdt!          |          D             d          S t#          d                    |                    )a  Returns a one-hot tensor, given and int tensor and a space.

    Handles the MultiDiscrete case as well.

    Args:
        x: The input tensor.
        space: The space to use for generating the one-hot tensor.

    Returns:
        The resulting one-hot tensor.

    Raises:
        ValueError: If the given space is not a discrete one.

    .. testcode::

        import torch
        import gymnasium as gym
        from ray.rllib.utils.torch_utils import one_hot
        x = torch.IntTensor([0, 3])  # batch-dim=2
        # Discrete space with 4 (one-hot) slots per batch item.
        s = gym.spaces.Discrete(4)
        print(one_hot(x, s))
        x = torch.IntTensor([[0, 1, 2, 3]])  # batch-dim=1
        # MultiDiscrete space with 5 + 4 + 4 + 7 = 20 (one-hot) slots
        # per batch item.
        s = gym.spaces.MultiDiscrete([5, 4, 4, 7])
        print(one_hot(x, s))

    .. testoutput::

        tensor([[1, 0, 0, 0],
                [0, 0, 0, 1]])
        tensor([[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0]])
    r   r   c                     g | ]A\  }}t           j                            d d |f                                         |          BS r!   )r0   
functionalr   long)rc   inr   s      r$   re   zone_hot.<locals>.<listcomp>V  sC    RRR$!QR]""1QQQT7<<>>155RRRr&   rn   z#Unsupported space for `one_hot`: {})r3   r   r0   r   r   r   r   r	   nvecr+   r   ravelr   r   r4   rq   	enumeraterL   format)r   r   r   s   `  r$   r   r   (  s    J %"" N}$$QVVXXuw777	E=	)	) NejmRZ00 	8EJ''D		!'!*b))AA:DyyRRRR)D//RRR  
 
 	

 >EEeLLMMMr&   axisc                 L   t                               | t          d                    }t                               || t                               |                     }t                               ||          t                               |                                |          z  S )zSame as torch.mean() but ignores -inf values.

    Args:
        x: The input tensor to reduce mean over.
        axis: The axis over which to reduce. None for all axes.

    Returns:
        The mean reduced inputs, ignoring inf values.
    z-inf)r4   ner   r   
zeros_liker   )r   r   maskx_zeroeds       r$   reduce_mean_ignore_infr   ]  so     88AuV}}%%D{{4E$4$4Q$7$788H99Xt$$uyyt'D'DDDr&   r   maxlen
time_majorc                    ||                                  }t                              t          | j                  t          |          fz             }|                    | j                                      d          	                                | k     }|s|	                                }|
                    |pt          j                   |S )al  Offers same behavior as tf.sequence_mask for torch.

    Thanks to Dimitris Papatheodorou
    (https://discuss.pytorch.org/t/pytorch-equivalent-for-tf-sequence-mask/
    39036).

    Args:
        lengths: The tensor of individual lengths to mask by.
        maxlen: The maximum length to use for the time axis. If None, use
            the max of `lengths`.
        dtype: The torch dtype to use for the resulting mask.
        time_major: Whether to return the mask as [B, T] (False; default) or
            as [T, B] (True).

    Returns:
         The sequence mask resulting from the given input and parameters.
    Nr(   rn   )rK   r4   onestupler   intrU   rV   cumsumrm   r   bool)r   r   r   r   r   s        r$   sequence_maskr   m  s    2 ~::eGM**c&kk^;<<DWWW^$$+++224466@AD vvxx 	IIe!uz"""Kr&   main_net
target_nettauc                     |                                  fd|                                                                 D             }|                    |           dS )a  Updates a torch.nn.Module target network using Polyak averaging.

    .. code-block:: text

        new_target_net_weight = (
            tau * main_net_weight + (1.0 - tau) * current_target_net_weight
        )

    Args:
        main_net: The nn.Module to update from.
        target_net: The target network to update.
        tau: The tau value to use in the Polyak averaging formula.
    c                 @    i | ]\  }}||         z  d z
  |z  z   S )r(    )rc   rW   rX   
state_dictr   s      r$   
<dictcomp>z)update_target_network.<locals>.<dictcomp>  sE       Aq 	
3A!c'Q.  r&   N)r   rM   load_state_dict)r   r   r   new_state_dictr   s     ` @r$   update_target_networkr     sx    ( $$&&J    ))++1133  N
 ~.....r&   kl_divergencec                     |                                  r0|                                rt                              d           d S d S d S )Na}  KL divergence is non-finite, this will likely destabilize your model and the training process. Action(s) in a specific state have near-zero probability. This can happen naturally in deterministic environments where the optimal policy has zero mass for a specific action. To fix this issue, consider setting the coefficient for the KL loss term to zero or increasing policy entropy.)loss_initializedisinfloggerwarning)r   r   s     r$   warn_if_infinite_kl_divergencer     sd    
    
]%8%8%:%: 
*	
 	
 	
 	
 	

 
 
 
r&   seedc                    | <t           r6t                               |            t           j        j        }|pt	          t           j        j                  dk    rNdt
          j        d<   t           j                            |            t           j                            |            nit          j        t           j	                  t          j        d          k    rt           
                    d           nt                               d           dt           j        j        _        dt           j        j        _        dS dS dS )ztSets the torch random seed to the given value.

    Args:
        seed: The seed to use or None for no seeding.
    Ngffffff$@z:4096:8CUBLAS_WORKSPACE_CONFIGz1.8.0TF)r4   manual_seedr
   ry   r   osenvironmanual_seed_allVersion__version__use_deterministic_algorithmsset_deterministicbackendscudnndeterministic	benchmark)r   cuda_versions     r$   set_torch_seedr    s    E$})#em.@(A(AT(I(I4=BJ01J""4(((J&&t,,,,u011W_W5M5MMM2248888''----1* */&&&) r&   logitslabelsc                 |    t                               | t          j                            | d          z  d          S )zSame behavior as tf.nn.softmax_cross_entropy_with_logits.

    Args:
        x: The input predictions.
        labels: The labels corresponding to `x`.

    Returns:
        The resulting softmax cross-entropy given predictions and labels.
    r   )r4   r   r0   r   log_softmax)r  r  s     r$   !softmax_cross_entropy_with_logitsr    s1     99fWr}88DDDbIIIr&   torch.Tensorc                     t                               |           t                               t                               |           dz             z  S )zThe symlog function as described in [1]:

    [1] Mastering Diverse Domains through World Models - 2023
    D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
    https://arxiv.org/pdf/2301.04104v1.pdf
    r(   )r4   signlogr   r   s    r$   symlogr    s4     ::a==599UYYq\\A%56666r&   c                     t                               |           t                               t                               |                     dz
  z  S )zInverse of the `symlog` function as desribed in [1]:

    [1] Mastering Diverse Domains through World Models - 2023
    D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
    https://arxiv.org/pdf/2301.04104v1.pdf
    r(   )r4   r  expr   )r   s    r$   inverse_symlogr    s4    ( ::a==EIIeiill33a788r&            4      4@num_bucketslower_boundupper_boundc                    t                               | ||          } t                               d| j        d         |                                          }||z
  |dz
  z  }| | z   |z  }t                               |          }t                               |          }	t                               |                    |	          |	dz   |	          }	t                               |	                    |          |	dz
  |	          }	|||z  z   }
||	|z  z   }| |z
  |
|z
  z  }d|z
  }t           	                    ||gd          }t           	                    ||	gd          }t           
                    ||gd                                          }t           
                    ||gd          }t                               | j        d         ||          }|||dddf         |dddf         f<   |S )	a+  Returns a two-hot vector of dim=num_buckets with two entries that are non-zero.

    See [1] for more details:
    [1] Mastering Diverse Domains through World Models - 2023
    D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
    https://arxiv.org/pdf/2301.04104v1.pdf

    Entries in the vector represent equally sized buckets within some fixed range
    (`lower_bound` to `upper_bound`).
    Those entries not 0.0 at positions k and k+1 encode the actual `value` and sum
    up to 1.0. They are the weights multiplied by the buckets values at k and k+1 for
    retrieving `value`.

    Example:
        num_buckets=11
        lower_bound=-5
        upper_bound=5
        value=2.5
        -> [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0]
        -> [-5   -4   -3   -2   -1   0    1    2    3    4    5] (0.5*2 + 0.5*3=2.5)

    Example:
        num_buckets=5
        lower_bound=-1
        upper_bound=1
        value=0.1
        -> [0.0, 0.0, 0.8, 0.2, 0.0]
        -> [-1  -0.5   0   0.5   1] (0.2*0.5 + 0.8*0=0.1)

    Args:
        value: The input tensor of shape (B,) to be two-hot encoded.
        num_buckets: The number of buckets to two-hot encode into.
        lower_bound: The lower bound value used for the encoding. If input values are
            lower than this boundary, they will be encoded as `lower_bound`.
        upper_bound: The upper bound value used for the encoding. If input values are
            higher than this boundary, they will be encoded as `upper_bound`.

    Returns:
        The two-hot encoded tensor of shape (B, num_buckets).
    r   r{   r(   rJ   r`   r   rn   N)r4   rS   aranger   r   floorceilr   eqrg   rq   r   zeros)r?   r  r  r   rV   batch_indicesbucket_deltaidxrW   kp1values_k
values_kp1	weights_kweights_kp1	indices_kindices_kp1indicesupdatesoutputs                      r$   two_hotr4    s   b KK{K88ELLEKN6LBBHHJJM+-+/BL<%<
/CCA
**S//C ++add3iisC
0
0C
 ++cff[))39c
:
:CQ--Hs\11J#:(=>I	/K ]A.B77I++}c2+;;KiiK0ai88==??GiiK0ai88G [[QV[DDF+2F7111a4='!!!Q$-'(Mr&   c                  6    	 dd l m}  dS # t          $ r Y dS w xY w)Nr   TF)torch._dynamo_dynamoImportError)dynamos    r$   _dynamo_is_availabler:  n  s@    &&&&&&t   uus   
 
)NFFNr!   )NF)rJ   )NNF)r   r  r   r  )r   r  r   r  )r  r  r  N)Ologgingr  r   typingr   r   r   r   r   	gymnasiumgymr7   r+   r   gymnasium.spacesr   r	   	packagingr
    ray.rllib.models.repeated_valuesr   ray.rllib.utils.annotationsr   r   r   ray.rllib.utils.frameworkr   ray.rllib.utils.numpyr   ray.rllib.utils.typingr   r   r   r   r   ray.rllib.core.learner.learnerr   r   ray.rllib.policy.torch_policyr    ray.rllib.policy.torch_policy_v2r   	getLogger__name__r   r4   r0   	FLOAT_MIN	FLOAT_MAXparseTORCH_COMPILE_REQUIRED_VERSIONrL   strr>   r   r^   rR   rt   r   r   r   r   r   r=   r   r   Spacer   r   r   r   r   r   r  r  r  r  r4  r:  r   r&   r$   <module>rQ     sz    				  = = = = = = = = = = = = = =          4 4 4 4 4 4 4 4       ; ; ; ; ; ; L L L L L L L L L L 6 6 6 6 6 6 . . . . . .               ?CCCCCCCC999999>>>>>>		8	$	$	r 		 %2W]7%;%;""%/ZP& &"
 /&/&&4/&<F/&	#z//& /& /& /&d  "&	@ @ @@ @ 	@
 @ @ @ @F    
        F 01	#z/   8  !PT]* ]*SM]* ]* 	]*
 UKLM]* ]* ]* ]*@ * ** *HSM * * * *< G* GJ G: G G G G(  ,0o ooK(o o 	o o o od Hj) Hj H H H H$  * U Z    0 .z .j . . . . 1Nz 1N#) 1N
 1N 1N 1N 1Nh E Ej E E E E E E  !
	% %%SM% 	%
 % % % %P /// 
/ 
	/ / / /: 


 

 
 
 
 / /# /$ / / / /: JJJ J J J J  7 7 7 7 9 9 9 9,   U UUU U 	U
 SMU U U Up    r&   