
    &`i2                     >   d dl Z d dlmZmZ d dlmZmZmZmZm	Z	m
Z
mZ d dlZd dlZd dlZd dlmZmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& erd dl'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0  e j1        e2          Z3 e            \  Z4Z5Z6eddddde	e7         de8de	e7         fd            Z9ede&de&de&fd            Z:e	 	 d[de%de	e$         de;de&fd            Z<edee8         fd            Z=eddddd d!d"e	ej>                 d#e	e         d$e	e8         de;d%e;dd&fd'            Z?ed(e
d)         d*ed+e#f         de
ed,                  fd-            Z@ed\d/e&d0e7de&fd1            ZAed/e&de&fd2            ZBe	 d]d3e	d4         d5e;defd6            ZCe	 d^d8e d9e&d:ed;         d<e7de!f
d=            ZDed/e&d"ej>        de&fd>            ZEed_d/e&d?e	eF         de&fd@            ZGe	 d]dAee8dBf         dCe;ded;         fdD            ZHed`dF            ZIedadG            ZJe	 	 	 	 dbd#dEdKeFdLe7dMe7fdN            ZKedOe"dPe"dQe7ddfdR            ZLedSe%de&fdT            ZMedUe
d)         dVe&ddfdW            ZNdX ZOe G dY dZ                      ZPdS )c    N)OrderedDictdeque)TYPE_CHECKINGAnyCallableListOptionalTypeUnion)DiscreteMultiDiscrete)
force_list)DeveloperAPI	PublicAPI)try_import_tf)SMALL_NUMBER)get_base_struct_from_space)LocalOptimizerModelGradientsNetworkTypePartialAlgorithmConfigDictSpaceStructTensorStructType
TensorType)AlgorithmConfig)	ParamDictEagerTFPolicyEagerTFPolicyV2TFPolicy)	grad_clipgradients_dictr   r#   grad_clip_byreturnc                b   |dS |dk    rN|                                                                  D ]%\  }}t                              || |          | |<   &dS |dk    rL|                                                                  D ]#\  }}t                              ||          | |<   $dS |dk    sJ t                              t          |                                           |          \  }}t          |                                  	                                |          D ]
\  }}|| |<   |S )a?  Performs gradient clipping on a grad-dict based on a clip value and clip mode.

    Changes the provided gradient dict in place.

    Args:
        gradients_dict: The gradients dict, mapping str to gradient tensors.
        grad_clip: The value to clip with. The way gradients are clipped is defined
            by the `grad_clip_by` arg (see below).
        grad_clip_by: One of 'value', 'norm', or 'global_norm'.

    Returns:
        If `grad_clip_by`="global_norm" and `grad_clip` is not None, returns the global
        norm of all tensors, otherwise returns None.
    Nvaluenormglobal_norm)
copyitemstfclip_by_valueclip_by_normclip_by_global_normlistvalueszipkeys)r$   r#   r%   kvclipped_gradsr*   s          l/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/utils/tf_utils.pyclip_gradientsr9   $   se   ,  w"''))//11 	K 	KDAq " 0 0YJ	 J JN1	K 	K 
		"''))//11 	> 	>DAq "9 = =N1	> 	>
 },,,,%'%;%;&&(())9&
 &
"{ ++--2244mDD 	" 	"DAq !N1     ypredc                     t           j                            | dg          \  }}t           j                            | |z
  dg          \  }}t                               dd||t          z   z  z
            S )a,  Computes the explained variance for a pair of labels and predictions.

    The formula used is:
    max(-1.0, 1.0 - (std(y - pred)^2 / std(y)^2))

    Args:
        y: The labels.
        pred: The predictions.

    Returns:
        The explained variance given a pair of labels and predictions.
    r   )axesg         )r-   nnmomentsmaximumr   )r;   r<   _y_vardiff_vars        r8   explained_variancerF   U   se     u}}QaS}))HAu%--Ds-33KAx::dAU\-A!BCDDDr:   Finputsspaces_struct	time_axisc           	         t          j        |           }|t          j        |          ndgt          |          z  }d}d}g }t          ||          D ]\  }}	t                              |          }t                              |          }
||
d         }|r|
d         }t          |	t                    rh|rt          	                    |||z  g          }|
                    t                              t          ||	          t          j                             t          |	t                    rj|r t          	                    |||z  dg          }|
                    t                              t          ||	          t          j                             J|r!t          	                    |||z  dg          }nt          	                    ||dg          }|
                    t                              |t          j                             t                              |d          }|rt          	                    |||dg          }|S )aK  Flattens arbitrary input structs according to the given spaces struct.

    Returns a single 1D tensor resulting from the different input
    components' values.

    Thereby:
    - Boxes (any shape) get flattened to (B, [T]?, -1). Note that image boxes
    are not treated differently from other types of Boxes and get
    flattened as well.
    - Discrete (int) values are one-hot'd, e.g. a batch of [1, 0, 3] (B=3 with
    Discrete(4) space) results in [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1]].
    - MultiDiscrete values are multi-one-hot'd, e.g. a batch of
    [[0, 2], [1, 4]] (B=2 with MultiDiscrete([2, 5]) space) results in
    [[1, 0,  0, 0, 1, 0, 0], [0, 1,  0, 0, 0, 0, 1]].

    Args:
        inputs: The inputs to be flattened.
        spaces_struct: The structure of the spaces that behind the input
        time_axis: Whether all inputs have a time-axis (after the batch axis).
            If True, will keep not only the batch axis (0th), but the time axis
            (1st) as-is and flatten everything from the 2nd axis up.

    Returns:
        A single 1D tensor resulting from concatenating all
        flattened/one-hot'd input components. Depending on the time_axis flag,
        the shape is (B, n) or (B, T, n).

    .. testcode::
        :skipif: True

        # B=2
        from ray.rllib.utils.tf_utils import flatten_inputs_to_1d_tensor
        from gymnasium.spaces import Discrete, Box
        out = flatten_inputs_to_1d_tensor(
            {"a": [1, 0], "b": [[[0.0], [0.1]], [1.0], [1.1]]},
            spaces_struct=dict(a=Discrete(2), b=Box(shape=(2, 1)))
        )
        print(out)

        # B=2; T=2
        out = flatten_inputs_to_1d_tensor(
            ([[1, 0], [0, 1]],
             [[[0.0, 0.1], [1.0, 1.1]], [[2.0, 2.1], [3.0, 3.1]]]),
            spaces_struct=tuple([Discrete(2), Box(shape=(2, ))]),
            time_axis=True
        )
        print(out)

    .. testoutput::

        [[0.0, 1.0,  0.0, 0.1], [1.0, 0.0,  1.0, 1.1]]  # B=2 n=4
        [[[0.0, 1.0, 0.0, 0.1], [1.0, 0.0, 1.0, 1.1]],
        [[1.0, 0.0, 2.0, 2.1], [0.0, 1.0, 3.0, 3.1]]]  # B=2 T=2 n=4
    Nr   r?   axis)treeflattenlenr3   r-   convert_to_tensorshape
isinstancer   reshapeappendcastone_hotfloat32r   concat)rG   rH   rI   flat_inputsflat_spacesBToutinput_spacerR   mergeds               r8   flatten_inputs_to_1d_tensorrb   h   s   z ,v&&K $ 	]###Vc+&&&  	AA
C[+66 4 4%%f--  9aA !H eX&& 	4 5FQUG44JJrwwwvu55rzBBCCCC}-- 
	4 9FQUBK88JJrwwwvu55rzBBCCCC  5FQUBK88FQG44JJrwwvrz223333YYsY$$F 0FQ2J//Mr:   c                     t           dk    rddlm}  |                                 }nS	 t          j                                        }n3# t          $ r& t          j        j                                        }Y nw xY wd |D             S )zReturns a list of GPU device names, e.g. ["/gpu:0", "/gpu:1"].

    Supports both tf1.x and tf2.x.

    Returns:
        List of GPU device names (str).
    r?   r   )
device_libc                 .    g | ]}d |j         v |j        S )GPU)device_typename).0ds     r8   
<listcomp>z#get_gpu_devices.<locals>.<listcomp>   s&    >>>qu'='=AF'='='=r:   )	tfvtensorflow.python.clientrd   list_local_devicesr-   configlist_physical_devices	Exceptionexperimental)rd   devicess     r8   get_gpu_devicesrt      s     axx777777//11	Ei5577GG 	E 	E 	Ei,BBDDGGG	E ?>G>>>>s   A -A76A7T)r`   r(   rh   rI   rO   r`   r(   rh   rO   ztf1.placeholderc                    ddl m} | t          | t          j        j        t          j        j        f          r=|r|                    | d          S t          j	        fdt          |                     S t                              d|rdndz   | j        z   | j        t          j        k    rt"          j        n| j                  S |J |j        dd         }t                              d|rdndz   t          |t&                    r|n t'          |                                          z   |j        t          j        k    rt"          j        n|j                  S )	a  Returns a tf1.placeholder object given optional hints, such as a space.

    Note that the returned placeholder will always have a leading batch
    dimension (None).

    Args:
        space: An optional gym.Space to hint the shape and dtype of the
            placeholder.
        value: An optional value to hint the shape and dtype of the
            placeholder.
        name: An optional name for the placeholder.
        time_axis: Whether the placeholder should also receive a time
            dimension (None).
        flatten: Whether to flatten the given space into a plain Box space
            and then create the placeholder from the resulting space.

    Returns:
        The tf1 placeholder.
    r   )ModelCatalogNc                 l    t          |dz   d                    d | D                       z             S )N.c                 ,    g | ]}t          |          S  strri   ps     r8   rk   z5get_placeholder.<locals>.<lambda>.<locals>.<listcomp>  s    3I3I3IqCFF3I3I3Ir:   )r`   rh   )get_placeholderjoin)path	componentrh   s     r8   <lambda>z!get_placeholder.<locals>.<lambda>  s@    O'!CZ#((3I3ID3I3I3I*J*JJ- - - r:   Nrz   )rR   dtyperh   r?   )ray.rllib.models.catalogrv   rS   gymspacesDictTupleget_action_placeholderrN   map_structure_with_pathr   tf1placeholderrR   r   npfloat64r-   rX   tupleas_list)r`   r(   rh   rI   rO   rv   rR   s     `    r8   r   r      sx   8 655555ecjosz/?@AA 
	 	#::5$GGG3    /u55   	9WWr:U[H %rz 9 9"**u{  
 
 	
    ABB#+ww-"5%00LuueEMMOO6L6LN !&rz 9 9"**u{  
 
 	
r:   orig_clsr"   ro   r   )r"   r   r    c                 |   | }|                     dd          }|dv rt          st          d          |dk    rt                                          st                                           t                                          sJ ddlm} ddlm} dd	l	m
} t          | d
          r%t          | |          s|                                 }n3t          | |          sn"t          d                    |                     |                     d          r&t          |||f          r|                                }|S )a[  Returns the corresponding tf-eager class for a given TFPolicy class.

    Args:
        orig_cls: The original TFPolicy class to get the corresponding tf-eager
            class for.
        config: The Algorithm config dict or AlgorithmConfig object.

    Returns:
        The tf eager policy class corresponding to the given TFPolicy class.
    	frameworkr-   )tf2r-   zCould not import tensorflow!r   r   r   r   r!   as_eagerz0This policy does not support eager execution: {}eager_tracing)getr   ImportErrorexecuting_eagerlyenable_eager_execution ray.rllib.policy.eager_tf_policyr   #ray.rllib.policy.eager_tf_policy_v2r    ray.rllib.policy.tf_policyr"   hasattr
issubclassr   
ValueErrorformatwith_tracing)r   ro   clsr   r   r    r"   s          r8   get_tf_eager_cls_if_necessaryr   #  su    C

;--IM!!#!8999E$$&& 	)&&((($$&&&&&BBBBBBGGGGGG777777 8Z(( 		Hm1T1T 		##%%CC Hh// 	BII(SS  
 ::o&& 	%:-1,
 ,
 	% ""$$CJr:         ?xdeltac                     t                               t                               |           |k     t           j                            |           dz  |t                               |           d|z  z
  z            S )a  Computes the huber loss for a given term and delta parameter.

    Reference: https://en.wikipedia.org/wiki/Huber_loss
    Note that the factor of 0.5 is implicitly included in the calculation.

    Formula:
        L = 0.5 * x^2  for small abs x (delta threshold)
        L = delta * (abs(x) - 0.5*delta)  for larger abs x (delta threshold)

    Args:
        x: The input term, e.g. a TD error.
        delta: The delta parmameter in the above formula.

    Returns:
        The Huber loss resulting from `x` and `delta`.
          ?)r-   whereabsmathsquare)r   r   s     r8   
huber_lossr   U  s]    $ 88
q		E
qCS5[()  r:   c                 n    dt                               t                               | d                    z  S )zComputes half the L2 norm over a tensor's values without the sqrt.

    output = 0.5 * sum(x ** 2)

    Args:
        x: The input tensor.

    Returns:
        0.5 times the L2 norm over the given tensor's values (w/o sqrt).
    r          @)r-   
reduce_sumpowr   s    r8   l2_lossr   n  s'     rvva~~....r:   session_or_noneztf1.Sessiondynamic_shapec                 Z     t                                           r J n J  fd}|S )aL  Returns a function that can be executed in either graph or eager mode.

    The function must take only positional args.

    If eager is enabled, this will act as just a function. Otherwise, it
    will build a function that executes a session run with placeholders
    internally.

    Args:
        session_or_none: tf.Session if in graph mode, else None.
        dynamic_shape: True if the placeholders should have a dynamic
            batch dimension. Otherwise they will be fixed shape.

    Returns:
        A function that can be called in either eager or static-graph mode.
    Nc                 :     g i d g fd}|S  S )Nc                    	 g }| D ]C}t          |          t          u r|                    |           .|                    |           D|} d         j                                        5  fd}t          j        ||           }t          j        |          D ]}
                    |           t          j        ||          }|	                                D ]
\  }}||<    
i d<   d d d            n# 1 swxY w Y   t          t          
t          j        |                               	t          j        	fd|                               d         	          }|S )Nr   c                     r.t          |j                  dk    rd|j        dd          z   }n
d}n|j        }t                              |j        |d                    d | D                                 S )Nr   r   r?   rz   rx   c                 ,    g | ]}t          |          S rz   r{   r}   s     r8   rk   zfmake_tf_callable.<locals>.make_wrapper.<locals>.call.<locals>._create_placeholders.<locals>.<listcomp>  s    .D.D.D!s1vv.D.D.Dr:   )r   rR   rh   )rP   rR   r   r   r   r   )r   r(   rR   r   s      r8   _create_placeholderszRmake_tf_callable.<locals>.make_wrapper.<locals>.call.<locals>._create_placeholders  s    , 4#&u{#3#3a#7#7,3ek!""o,EEE,.EE(-#&??&+k&+%(XX.D.Dt.D.D.D%E%E $3 $ $ r:   c                 0                         | |          S r   )__setitem__)phr6   	feed_dicts     r8   r   zFmake_tf_callable.<locals>.make_wrapper.<locals>.call.<locals>.<lambda>  s    )"7"7A">"> r:   )typer1   extendrU   graph
as_defaultrN   r   rO   r,   dictr3   map_structurerun)argskwargs	args_flatar   placeholdersr   r5   retr   args_placeholdersr   fnkwargs_placeholdersr   symbolic_outs            @r8   callz4make_tf_callable.<locals>.make_wrapper.<locals>.call  s   	 , ,AAww$!((++++!((++++ 
  ?*(.99;; X X     (,'C0$( ( #',|"<"< 9 9B-44R8888'+'C0&( ( &2%7%7%9%9 8 8EAr57/22*,".?*WCV*W*WQ9X X X X X X X X X X X X X X X: !%6T8J8J!K!KLL	">>>>'  
 &)),q/9EE
s   .BD  DDrz   )r   r   r   r   r   r   r   s   ` @@@r8   make_wrapperz&make_tf_callable.<locals>.make_wrapper  sc     & ""$ 6L1 1 1 1 1 1 1 1 1 1f K Ir:   )r-   r   )r   r   r   s   `` r8   make_tf_callabler   }  sb    * 
 +&&&&***@ @ @ @ @ @D r:         $@	optimizer	objectivevar_listztf.Variableclip_valc           
      0   dk    s
J             t                                           rF| j        }t          t	          t          |                    ||                    |                    }n|                     ||          }fd|D             S )a  Computes, then clips gradients using objective, optimizer and var list.

    Ensures the norm of the gradients for each variable is clipped to
    `clip_val`.

    Args:
        optimizer: Either a shim optimizer (tf eager) containing a
            tf.GradientTape under `self.tape` or a tf1 local optimizer
            object.
        objective: The loss tensor to calculate gradients on.
        var_list: The list of tf.Variables to compute gradients over.
        clip_val: The global norm clip value. Will clip around -clip_val and
            +clip_val.

    Returns:
        The resulting model gradients (list or tuples of grads + vars)
        corresponding to the input `var_list`.
    N        )r   c                 \    g | ](\  }}|t                               |          n||f)S r   )r-   r/   )ri   gr6   r   s      r8   rk   z%minimize_and_clip.<locals>.<listcomp>   sG       Q= *2)=H	%	%	%1aH==r:   )r-   r   taper1   r3   gradientcompute_gradients)r   r   r   r   r   grads_and_varss      `  r8   minimize_and_clipr     s    4 x#~~~x~~-	 S~c$t}}Y'I'I"J"JHUUVV"44Y4RR   $   r:   c                 :    t          |t                    r,t                               |j        t          j                  S t          |t                    rt          |j        d         t          j	                  rBt          j
        |j                  }t                                j        d         df           n|j        }t                               fdt          |          D             d          S t          d                    |                    )aj  Returns a one-hot tensor, given and int tensor and a space.

    Handles the MultiDiscrete case as well.

    Args:
        x: The input tensor.
        space: The space to use for generating the one-hot tensor.

    Returns:
        The resulting one-hot tensor.

    Raises:
        ValueError: If the given space is not a discrete one.

    .. testcode::
        :skipif: True

        import gymnasium as gym
        import tensorflow as tf
        from ray.rllib.utils.tf_utils import one_hot
        x = tf.Variable([0, 3], dtype=tf.int32)  # batch-dim=2
        # Discrete space with 4 (one-hot) slots per batch item.
        s = gym.spaces.Discrete(4)
        one_hot(x, s)

    .. testoutput::

        <tf.Tensor 'one_hot:0' shape=(2, 4) dtype=float32>

    .. testcode::
        :skipif: True

        x = tf.Variable([[0, 1, 2, 3]], dtype=tf.int32)  # batch-dim=1
        # MultiDiscrete space with 5 + 4 + 4 + 7 = 20 (one-hot) slots
        # per batch item.
        s = gym.spaces.MultiDiscrete([5, 4, 4, 7])
        one_hot(x, s)

    .. testoutput::

        <tf.Tensor 'concat:0' shape=(1, 20) dtype=float32>
    r   r   rK   c                 x    g | ]6\  }}t                               d d |f         |t           j                  7S )Nr   )r-   rW   rX   )ri   inr   s      r8   rk   zone_hot.<locals>.<listcomp><  s=    RRR$!QRZZ!!!Q$"*Z55RRRr:   rL   z#Unsupported space for `one_hot`: {})rS   r   r-   rW   r   rX   r   nvecr   ndarrayravelrT   rR   rY   	enumerater   r   )r   r`   r   s   `  r8   rW   rW     s    X %"" Nzz!UWBJz777	E=	)	) NejmRZ00 	8EJ''D

1qwqz2.//AA:DyyRRRR)D//RRR  
 
 	

 >EEeLLMMMr:   rM   c                    t                               | t           j        j                  }t                               || t                               |                     }t           j                            ||          t           j                            t                               |t           j                  |          z  S )zSame as tf.reduce_mean() but ignores -inf values.

    Args:
        x: The input tensor to reduce mean over.
        axis: The axis over which to reduce. None for all axes.

    Returns:
        The mean reduced inputs, ignoring inf values.
    )	r-   	not_equalrX   minr   
zeros_liker   r   rV   )r   rM   maskx_zeroeds       r8   reduce_mean_ignore_infr   C  s     <<2:>**Dxxaq!1!122H7h--0B0B
bj!!41 1  r:   scopeztf1.VariableScopetrainable_onlyc                     t                               |rt           j        j        nt           j        j        t          | t                    r| n| j                  S )a  Get variables inside a given scope.

    Args:
        scope: Scope in which the variables reside.
        trainable_only: Whether or not to return only the variables that were
            marked as trainable.

    Returns:
        The list of variables in the given `scope`.
    )r   )r   get_collection	GraphKeysTRAINABLE_VARIABLES	VARIABLESrS   r|   rh   )r   r   s     r8   
scope_varsr   U  sS     	%))]$!%--=ee5:	    r:   	tf.Tensorc                     t           j                            |           t           j                            t           j                            |           dz             z  S )zThe symlog function as described in [1]:

    [1] Mastering Diverse Domains through World Models - 2023
    D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
    https://arxiv.org/pdf/2301.04104v1.pdf
    r?   )r-   r   signlogr   r   s    r8   symlogr  k  s:     7<<??RW[[Q!);<<<<r:   c                     t           j                            |           t           j                            t           j                            |                     dz
  z  S )zInverse of the `symlog` function as desribed in [1]:

    [1] Mastering Diverse Domains through World Models - 2023
    D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
    https://arxiv.org/pdf/2301.04104v1.pdf
    r?   )r-   r   r   expr   )r;   s    r8   inverse_symlogr  v  s:    ( 7<<??bgkk"'++a..99A=>>r:            4      4@num_bucketslower_boundupper_boundc                 d   t                               | ||          } t                               t                               dt                               |           d                   |pt           j                  }||z
  |dz
  z  }| | z   |z  }t           j                            |          }t           j                            |          }	t           	                    t           
                    ||	          |	dz   |	          }	t           	                    t           
                    |	|          |	dz
  |	          }	|||z  z   }
||	|z  z   }| |z
  |
|z
  z  }d|z
  }t                               ||gd          }t                               ||	gd          }t                               ||gd          }t                               ||gd          }t                               t                               |t           j                  |t                               |           d         |f          S )a+  Returns a two-hot vector of dim=num_buckets with two entries that are non-zero.

    See [1] for more details:
    [1] Mastering Diverse Domains through World Models - 2023
    D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
    https://arxiv.org/pdf/2301.04104v1.pdf

    Entries in the vector represent equally sized buckets within some fixed range
    (`lower_bound` to `upper_bound`).
    Those entries not 0.0 at positions k and k+1 encode the actual `value` and sum
    up to 1.0. They are the weights multiplied by the buckets values at k and k+1 for
    retrieving `value`.

    Example:
        num_buckets=11
        lower_bound=-5
        upper_bound=5
        value=2.5
        -> [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0]
        -> [-5   -4   -3   -2   -1   0    1    2    3    4    5] (0.5*2 + 0.5*3=2.5)

    Example:
        num_buckets=5
        lower_bound=-1
        upper_bound=1
        value=0.1
        -> [0.0, 0.0, 0.8, 0.2, 0.0]
        -> [-1  -0.5   0   0.5   1] (0.2*0.5 + 0.8*0=0.1)

    Args:
        value: The input tensor of shape (B,) to be two-hot encoded.
        num_buckets: The number of buckets to two-hot encode into.
        lower_bound: The lower bound value used for the encoding. If input values are
            lower than this boundary, they will be encoded as `lower_bound`.
        upper_bound: The upper bound value used for the encoding. If input values are
            higher than this boundary, they will be encoded as `upper_bound`.

    Returns:
        The two-hot encoded tensor of shape (B, num_buckets).
    r   r   r?   r   r   rK   )rR   )r-   r.   rV   rangerR   rX   r   floorceilr   equalstackrY   
scatter_ndint32)r(   r  r	  r
  r   batch_indicesbucket_deltaidxr5   kp1values_k
values_kp1	weights_kweights_kp1	indices_kindices_kp1indicesupdatess                     r8   two_hotr    s   b UK==EGG
BHHUOOA&''!rz   M
  +-+/BL<%<
/C
cA
',,s

C ((288As##S3Y
4
4C
 ((288C--sSy#
>
>CQ--Hs\11J#:(=>I	/K -+R00I((M3/44KiiK0!44GiiK0!44G ==
""xxq!;/    r:   main_net
target_nettauc                     t          |j        | j                  D ](\  }}||z  d|z
  |z  z   }|                    |           )dS )a`  Updates a keras.Model target network using Polyak averaging.

    new_target_net_weight = (
        tau * main_net_weight + (1.0 - tau) * current_target_net_weight
    )

    Args:
        main_net: The keras.Model to update from.
        target_net: The target network to update.
        tau: The tau value to use in the Polyak averaging formula.
    r   N)r3   	variablesassign)r   r!  r"  old_varcurrent_varupdated_vars         r8   update_target_networkr)    s_    " !$J$8(:L M M $ $K'39*??{####$ $r:   actionsc                     t          j        |           d         }t                              |t          j                  }t          |j                  dk    r$|dddf         }t          |j                  dk    $|S )a  Helper function useful for returning dummy logp's (0) for some actions.

    Args:
        actions: The input actions. This can be any struct
            of complex action components or a simple tensor of different
            dimensions, e.g. [B], [B, 2], or {"a": [B, 4, 5], "b": [B]}.

    Returns:
        A 1D tensor of 0.0 (dummy logp's) matching the batch
        dim of `actions` (shape=[B]).
    r   r   r?   N)rN   rO   r-   r   rX   rP   rR   )r*  action_componentlogp_s      r8   zero_logps_from_actionsr.     su     |G,,Q/MM*"*M==E
 ek

Q

aaad ek

Q

Lr:   policymean_klc                     d |                                  r@t                              t          j                            |          d fd           d S d S )Nc                  j    t                               d           t                              d          S )Na}  KL divergence is non-finite, this will likely destabilize your model and the training process. Action(s) in a specific state have near-zero probability. This can happen naturally in deterministic environments where the optimal policy has zero mass for a specific action. To fix this issue, consider setting the coefficient for the KL loss term to zero or increasing policy entropy.r   )loggerwarningr-   constantrz   r:   r8   print_warningz5warn_if_infinite_kl_divergence.<locals>.print_warning  s1    *	
 	
 	
 {{3r:   c                  6    t                               d          S )Nr   )r-   r5  rz   r:   r8   r   z0warn_if_infinite_kl_divergence.<locals>.<lambda>,  s    R[[-- r:   c                                    S r   rz   )r6  s   r8   r   z0warn_if_infinite_kl_divergence.<locals>.<lambda>-  s    MMOO r:   )false_fntrue_fn)loss_initializedr-   condr   is_inf)r/  r0  r6  s     @r8   warn_if_infinite_kl_divergencer>    sz    	  	  	     

GNN7##--++++ 	 	
 	
 	
 	
 	

 
r:   c                    d}g }|D ]\}t          j        |t           j                  }| |||z                                |          }|                    |           ||z  }]t          |           |k    s
J d            |S )Nr   r   z.Passed weight does not have the correct shape.)r   prodint_rT   rU   rP   )vectorshapesr   arraysrR   sizearrays          r8   
_unflattenrG  1  s    	AF  wuBG,,,qAH~&..u55e	T	v;;!MMr:   c                   D    e Zd ZdZddZd Zd Zd Zd Zde	fd	Z
d
 ZdS )TensorFlowVariablesa  A class used to set and get weights for Tensorflow networks.

    Attributes:
        sess (tf.Session): The tensorflow session used to run assignment.
        variables (Dict[str, tf.Variable]): Extracted variables from the loss
            or additional variables that are passed in.
        placeholders (Dict[str, tf.placeholders]): Placeholders for weights.
        assignment_nodes (Dict[str, tf.Tensor]): Nodes that assign weights.
    Nc                    || _         t          |          }t          |          }g t          |          }t	          |          dk    r|                                }|*t          |d          r|j        }|j        D ]0}||vr*|	                    |           |
                    |           1|j        D ]0}||vr*|	                    |           |
                    |           1d|j        j        v sd|j        j        v r	                    |j        j                   t	          |          dk    t                      | _        fdt                                           D             }	||	|z  }	t                                           s|	D ]}
|
| j        |
j        j        j        <   i | _        i | _        | j                                        D ]\  }}t                               |                                j        |                                                                d|z             | j        |<   |                    | j        |                   | j        |<   dS |	D ]}
|
| j        |
j        <   dS )	aU  Creates TensorFlowVariables containing extracted variables.

        The variables are extracted by performing a BFS search on the
        dependency graph with loss as the root node. After the tree is
        traversed and those variables are collected, we append input_variables
        to the collected variables. For each variable in the list, the
        variable has a placeholder and assignment operation created for it.

        Args:
            output (tf.Operation, List[tf.Operation]): The tensorflow
                operation to extract all variables from.
            sess (Optional[tf.Session]): Optional tf.Session used for running
                the get and set methods in tf graph mode.
                Use None for tf eager.
            input_variables (List[tf.Variables]): Variables to include in the
                list.
        r   NopVariable	VarHandlec                 :    g | ]}|j         j        j        v |S rz   )rK  node_defrh   )ri   r6   variable_namess     r8   rk   z0TensorFlowVariables.__init__.<locals>.<listcomp>y  s1     
 
 
1C~1U1UA1U1U1Ur:   Placeholder_)rh   )sessr   r   setrP   popleftr   rK  rG   rU   addcontrol_inputsrO  rh   r   r$  r   global_variablesr   r   assignment_nodesr,   r   r(   r   	get_shaper   r%  )selfoutputrR  input_variablesqueueexplored_inputstf_objinput_opcontrolvariable_listr6   r5   varrP  s                @r8   __init__zTensorFlowVariables.__init__I  s   $ 	F##ff++ %jjAoo]]__F~ vt$$ #"M 2 2?22LL***#''111 "0 1 1/11LL)))#''000V_///;&/BT3T3T%%fo&:;;;) %jjAoo* %
 
 
 
++--
 
 
 &_,M$$&& 	+" 7 756qt}122 "D$&D! ...00 L L3'*IIKK%MMOO++--'!+ (7 ( (!!$
 ,/::d6G6J+K+K%a((L L # + +)*qv&&+ +r:   c                 b    t          d | j                                        D                       S )zReturns the total length of all of the flattened variables.

        Returns:
            The length of all flattened variables concatenated.
        c              3      K   | ]<}t          j        |                                                                          V  =d S r   )r   r@  rY  r   ri   r6   s     r8   	<genexpr>z4TensorFlowVariables.get_flat_size.<locals>.<genexpr>  s@      UU271;;==002233UUUUUUr:   )sumr$  r2   rZ  s    r8   get_flat_sizez!TensorFlowVariables.get_flat_size  s/     UUT^=R=R=T=TUUUUUUr:   c                       j         s5t          j        d  j                                        D                       S t          j         fd j                                        D                       S )zGets the weights and returns them as a flat array.

        Returns:
            1D Array containing the flattened weights.
        c                 Z    g | ](}|                                                                 )S rz   )numpyrO   rg  s     r8   rk   z0TensorFlowVariables.get_flat.<locals>.<listcomp>  s,    FFF""$$FFFr:   c                 j    g | ]/}|                     j                                                   0S ))session)evalrR  rO   ri   r6   rZ  s     r8   rk   z0TensorFlowVariables.get_flat.<locals>.<listcomp>  s5    RRRQQVVDIV&&..00RRRr:   )rR  r   concatenater$  r2   rj  s   `r8   get_flatzTensorFlowVariables.get_flat  s}     y 	>FFdn.C.C.E.EFFF   ~RRRR$.:O:O:Q:QRRR
 
 	
r:   c           	          d  j                                         D             }t          ||          } j        sDt	           j                                         |          D ]\  }}|                    |           dS  fd j                                         D             } j                            t           j	                                                  t          t	          ||                               dS )a;  Sets the weights to new_weights, converting from a flat array.

        Note:
            You can only set all weights in the network using this function,
            i.e., the length of the array must match get_flat_size.

        Args:
            new_weights (np.ndarray): Flat array containing weights.
        c                 Z    g | ](}|                                                                 )S rz   )rY  r   rg  s     r8   rk   z0TensorFlowVariables.set_flat.<locals>.<listcomp>  s,    KKKa!++--''))KKKr:   c                 0    g | ]\  }}j         |         S rz   )r   )ri   r5   r6   rZ  s      r8   rk   z0TensorFlowVariables.set_flat.<locals>.<listcomp>  s%    TTTTQD-a0TTTr:   r   N)r$  r2   rG  rR  r3   r%  r,   r   r1   rX  r   )rZ  new_weightsrC  rD  r6   r   r   s   `      r8   set_flatzTensorFlowVariables.set_flat  s    LK4>3H3H3J3JKKKK00y 	DN1133V<<  1  UTTTT^=Q=Q=S=STTTLIMMT*113344s<8899      r:   c                 \    | j         s| j        S | j                             | j                  S )zReturns a dictionary containing the weights of the network.

        Returns:
            Dictionary mapping variable names to their weights.
        )rR  r$  r   rj  s    r8   get_weightszTensorFlowVariables.get_weights  s,     y 	">!y}}T^,,,r:   ry  c                     | j         <| j                                        D ] \  }}|                    ||                    !dS |                     |          \  }}| j                             ||           dS )a  Sets the weights to new_weights.

        Note:
            Can set subsets of variables as well, by only passing in the
            variables you want to be set.

        Args:
            new_weights: Dictionary mapping variable names to their
                weights.
        Nrx  )rR  r$  r,   r%  _assign_weightsr   )rZ  ry  rh   rc  assign_listr   s         r8   set_weightszTensorFlowVariables.set_weights  s     9!^1133 . .	c

;t,----. . &*%9%9+%F%F"KIMM+M;;;;;r:   c                    	
 g i t           j                                                  d 	 fd}|                                D ]j\  
v r |
           	fdD             } 
fdt	          |                                d           D             }|r ||d         
           ks
J d            t                    t          |          k    s
J d	             fd
D             fS )a'  Sets weigths using exact or closest assignable variable name

        Args:
            weights: Dictionary mapping variable names to their
                weights.
        Returns:
            Tuple[List, Dict]: assigned variables list, dict of
                placeholders and weights
        c                 :    t          fd| D                       S )Nc                     g | ]}|v |	S rz   rz   )ri   el2s     r8   rk   zOTensorFlowVariables._assign_weights.<locals>.nb_common_elem.<locals>.<listcomp>  s    111abr:   )rP   )l1r  s    `r8   nb_common_elemz;TensorFlowVariables._assign_weights.<locals>.nb_common_elem  s&    11112111222r:   c                 |    |j         |          <                       |                                |            d S r   )r   rU   remove)rh   r(   
assignableassignedr   rZ  s     r8   r%  z3TensorFlowVariables._assign_weights.<locals>.assign  sA    16Id'-.OOD!!!d#####r:   c           	      v    i | ]5}|                      d           |                     d                     6S )/)split)ri   rc  rh   r  s     r8   
<dictcomp>z7TensorFlowVariables._assign_weights.<locals>.<dictcomp>  sK        

33HH  r:   c                 \    g | ](\  }}|d k    j         j        |         j         k    &|)S )r   )rR   rX  )ri   	close_varcnrZ  r(   s      r8   rk   z7TensorFlowVariables._assign_weights.<locals>.<listcomp>  sG       %	2Avv%+1Fy1Q1W"W"W "W"W"Wr:   c                     | d          S )Nr?   rz   )r   s    r8   r   z5TensorFlowVariables._assign_weights.<locals>.<lambda>  s    qQRte r:   )keyr   zNo variables in the input matched those in the network. Possible cause: Two networks were defined in the same TensorFlow graph. To fix this, place each network definition in its own tf.Graph.z^All weights couldn't be assigned because no variable had an exact/close name or had same shapec                 *    g | ]}j         |         S rz   )rX  rr  s     r8   rk   z7TensorFlowVariables._assign_weights.<locals>.<listcomp>  s!    ;;;Q%a(;;;r:   )rS  rX  r4   r,   sortedrP   )rZ  weightsr%  commonselectr  r  r   rh   r  r(   s   `    @@@@@@r8   r~  z#TensorFlowVariables._assign_weights  s    	.335566
	3 	3 	3	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$
 #==?? 	- 	-KD%z!!tU####    )      )/OO)T)T)T  
  -F6!9e,,, 	
 	
.	
 	
x 8}}G,,,8 -,,
 <;;;(;;;YFFr:   )NN)__name__
__module____qualname____doc__rd  rk  rt  rz  r|  r   r  r~  rz   r:   r8   rI  rI  =  s         G+ G+ G+ G+RV V V
 
 
   ,
- 
- 
-<t < < < <$3G 3G 3G 3G 3Gr:   rI  )NF)r   )F)r   r   )r   r   r&   r   )r;   r   r&   r   )r  r  r  N)Qloggingcollectionsr   r   typingr   r   r   r   r	   r
   r   	gymnasiumr   rn  r   rN   gymnasium.spacesr   r   ray.rllib.utilsr   ray.rllib.utils.annotationsr   r   ray.rllib.utils.frameworkr   ray.rllib.utils.numpyr   "ray.rllib.utils.spaces.space_utilsr   ray.rllib.utils.typingr   r   r   r   r   r   r   %ray.rllib.algorithms.algorithm_configr   ray.rllib.core.learner.learnerr   r   r   r   r    r   r"   	getLoggerr  r3  r   r-   rl   floatr|   r9   rF   boolrb   rt   Spacer   r   r   r   r   r   rW   intr   r   r  r  r  r)  r.  r>  rG  rI  rz   r:   r8   <module>r     sj    * * * * * * * * L L L L L L L L L L L L L L L L L L          4 4 4 4 4 4 4 4 & & & & & & ? ? ? ? ? ? ? ? 3 3 3 3 3 3 . . . . . . I I I I I I                   4EEEEEE888888>>>>>>CCCCCC333333		8	$	$}R  "&- - -- - 	-
 e_- - - -` E* EJ E: E E E E$  ,0e eeK(e e 	e e e eP ?c ? ? ? ?,  "&7
 7
 7
CI7
 C=7
 3-	7

 7
 7
 7
 7
 7
 7
t .:.#%??@. 
%>
?@. . . .b  * U Z    0 /z /j / / / / DI[ [m,[=A[[ [ [ [@ 
 	% %%% =!% 	%
 % % % %P 8Nz 8N#) 8N
 8N 8N 8N 8Nv  j      " CH ))*<@	-   * = = = = ? ? ? ?,  
Y YYY Y 	Y Y Y Yx $$$ 
$ 
	$ $ $ $* %5 *    2 

'1
	
 
 
 
,	 	 	 SG SG SG SG SG SG SG SG SG SGr:   