
    &`i'B              +          d dl mZmZmZmZmZmZmZ d dlZ	d dl
mZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z'm(Z(  e"            \  Z)Z*Z+eddddddddddddddddddedddde,deeeee         egee(ee(         f         f         deedge&f                  deeeegee,e(f         f                  deeee&gdf                  deeede(ge'f                  deeede'gdf                  deeeee'gee,e(f         f                  deeegee,e(f         f                  deeegee,e(f         f                  deeee	j-        e	j-        e&gdf                  deeee	j-        e	j-        e&gdf                  deeee	j.        j-        e	j.        j-        e&gdf                  d eeee	j-        e	j-        e&gdf                  d!eeee	j.        j-        e	j.        j-        e&gef                  d"eee(ee(         gee(e(f         f                  d#eeeee(e(e(gee(e/ee(         f         f                  d$eee/                  d%eeege0f                  d&ee         f(d'            Z1dS )(    )CallableDictListOptionalTupleTypeUnionN)DEPRECATED_VALUEdeprecation_warning)ModelV2)TFActionDistribution)eager_tf_policy)DynamicTFPolicy)Policy)SampleBatch)TFPolicy)
add_mixins
force_list)OldAPIStackoverride)try_import_tf)LEARNER_STATS_KEY)AlgorithmConfigDictModelGradients
TensorType)get_default_configpostprocess_fnstats_fnoptimizer_fncompute_gradients_fnapply_gradients_fngrad_stats_fnextra_action_out_fnextra_learn_fetches_fnvalidate_spacesbefore_initbefore_loss_init
after_init
make_modelaction_sampler_fnaction_distribution_fnmixinsget_batch_divisibility_reqobs_include_prev_action_rewardextra_action_fetches_fngradients_fnnameloss_fnr   r   r   ztf.keras.optimizers.Optimizerr    r!   ztf.Operationr"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   returnc                  	
 t                                                      t          t          |          |t          k    rt          dd           |t          ddd           |t          dd	d            G 	
fd
d          }fd}fd}t          |          |_        t          |          |_        | |_	        | |_
        |S )aS  Helper function for creating a dynamic tf policy at runtime.

    Functions will be run in this order to initialize the policy:
        1. Placeholder setup: postprocess_fn
        2. Loss init: loss_fn, stats_fn
        3. Optimizer init: optimizer_fn, gradients_fn, apply_gradients_fn,
                           grad_stats_fn

    This means that you can e.g., depend on any policy attributes created in
    the running of `loss_fn` in later functions such as `stats_fn`.

    In eager mode, the following functions will be run repeatedly on each
    eager execution: loss_fn, stats_fn, gradients_fn, apply_gradients_fn,
    and grad_stats_fn.

    This means that these functions should not define any variables internally,
    otherwise they will fail in eager mode execution. Variable should only
    be created in make_model (if defined).

    Args:
        name: Name of the policy (e.g., "PPOTFPolicy").
            loss_fn (Callable[[
                Policy, ModelV2, Type[TFActionDistribution], SampleBatch],
                Union[TensorType, List[TensorType]]]): Callable for calculating a
                loss tensor.
            get_default_config (Optional[Callable[[None], AlgorithmConfigDict]]):
                Optional callable that returns the default config to merge with any
                overrides. If None, uses only(!) the user-provided
                PartialAlgorithmConfigDict as dict for this Policy.
            postprocess_fn (Optional[Callable[[Policy, SampleBatch,
                Optional[Dict[AgentID, SampleBatch]], Episode], None]]):
                Optional callable for post-processing experience batches (called
                after the parent class' `postprocess_trajectory` method).
            stats_fn (Optional[Callable[[Policy, SampleBatch],
                Dict[str, TensorType]]]): Optional callable that returns a dict of
                TF tensors to fetch given the policy and batch input tensors. If
                None, will not compute any stats.
            optimizer_fn (Optional[Callable[[Policy, AlgorithmConfigDict],
                "tf.keras.optimizers.Optimizer"]]): Optional callable that returns
                a tf.Optimizer given the policy and config. If None, will call
                the base class' `optimizer()` method instead (which returns a
                tf1.train.AdamOptimizer).
            compute_gradients_fn (Optional[Callable[[Policy,
                "tf.keras.optimizers.Optimizer", TensorType], ModelGradients]]):
                Optional callable that returns a list of gradients. If None,
                this defaults to optimizer.compute_gradients([loss]).
            apply_gradients_fn (Optional[Callable[[Policy,
                "tf.keras.optimizers.Optimizer", ModelGradients],
                "tf.Operation"]]): Optional callable that returns an apply
                gradients op given policy, tf-optimizer, and grads_and_vars. If
                None, will call the base class' `build_apply_op()` method instead.
            grad_stats_fn (Optional[Callable[[Policy, SampleBatch, ModelGradients],
                Dict[str, TensorType]]]): Optional callable that returns a dict of
                TF fetches given the policy, batch input, and gradient tensors. If
                None, will not collect any gradient stats.
            extra_action_out_fn (Optional[Callable[[Policy],
                Dict[str, TensorType]]]): Optional callable that returns
                a dict of TF fetches given the policy object. If None, will not
                perform any extra fetches.
            extra_learn_fetches_fn (Optional[Callable[[Policy],
                Dict[str, TensorType]]]): Optional callable that returns a dict of
                extra values to fetch and return when learning on a batch. If None,
                will call the base class' `extra_compute_grad_fetches()` method
                instead.
            validate_spaces (Optional[Callable[[Policy, gym.Space, gym.Space,
                AlgorithmConfigDict], None]]): Optional callable that takes the
                Policy, observation_space, action_space, and config to check
                the spaces for correctness. If None, no spaces checking will be
                done.
            before_init (Optional[Callable[[Policy, gym.Space, gym.Space,
                AlgorithmConfigDict], None]]): Optional callable to run at the
                beginning of policy init that takes the same arguments as the
                policy constructor. If None, this step will be skipped.
            before_loss_init (Optional[Callable[[Policy, gym.spaces.Space,
                gym.spaces.Space, AlgorithmConfigDict], None]]): Optional callable to
                run prior to loss init. If None, this step will be skipped.
            after_init (Optional[Callable[[Policy, gym.Space, gym.Space,
                AlgorithmConfigDict], None]]): Optional callable to run at the end of
                policy init. If None, this step will be skipped.
            make_model (Optional[Callable[[Policy, gym.spaces.Space,
                gym.spaces.Space, AlgorithmConfigDict], ModelV2]]): Optional callable
                that returns a ModelV2 object.
                All policy variables should be created in this function. If None,
                a default ModelV2 object will be created.
            action_sampler_fn (Optional[Callable[[TensorType, List[TensorType]],
                Tuple[TensorType, TensorType]]]): A callable returning a sampled
                action and its log-likelihood given observation and state inputs.
                If None, will either use `action_distribution_fn` or
                compute actions by calling self.model, then sampling from the
                so parameterized action distribution.
            action_distribution_fn (Optional[Callable[[Policy, ModelV2, TensorType,
                TensorType, TensorType],
                Tuple[TensorType, type, List[TensorType]]]]): Optional callable
                returning distribution inputs (parameters), a dist-class to
                generate an action distribution object from, and internal-state
                outputs (or an empty list if not applicable). If None, will either
                use `action_sampler_fn` or compute actions by calling self.model,
                then sampling from the so parameterized action distribution.
            mixins (Optional[List[type]]): Optional list of any class mixins for
                the returned policy class. These mixins will be applied in order
                and will have higher precedence than the DynamicTFPolicy class.
            get_batch_divisibility_req (Optional[Callable[[Policy], int]]):
                Optional callable that returns the divisibility requirement for
                sample batches. If None, will assume a value of 1.

    Returns:
        Type[DynamicTFPolicy]: A child class of DynamicTFPolicy based on the
            specified args.
    r.   T)olderrorNr/   r#   )r5   newr6   r0   r    c                   J   e Zd Z	 	 d	 
fd	Z ee          	 d	fd	            Z ee          fd            Z ee          fd            Z	 ee          fd            Z
 ee          fd            Z ee          	fd            ZdS )
#build_tf_policy.<locals>.policy_clsNc                     r | |||           
r 
| |||           fd}t          j        | ||||||           	r 	| |||           d| _        d S )Nc                     r | |||           | j         ri }n |           }t          | d          r| j                            |           d S || _        d S )N_extra_action_fetches)	_is_towerhasattrr<   update)policy	obs_spaceaction_spaceconfigextra_action_fetchesr'   r#   s        w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/policy/tf_policy_template.pybefore_loss_init_wrapperzNbuild_tf_policy.<locals>.policy_cls.__init__.<locals>.before_loss_init_wrapper   s    # N$$VYfMMM&.&2B.+-((+>+>v+F+F(6#:;; H0778LMMMMM3GF000    )rA   rB   rC   r2   r   r"   r'   r)   r*   r+   existing_inputsexisting_modelr-   r   )r   __init__global_timestep)selfrA   rB   rC   rI   rH   rF   r+   r*   r(   r&   r'   r#   r-   r"   r2   r)   r   r%   s          rE   rJ   z,build_tf_policy.<locals>.policy_cls.__init__   s      GivFFF CD)\6BBBH H H H H H $#)!+!9%"3'= /-+E   "  B
4L&AAA $%D   rG   c                 R    t          j        | |          }r | |||          S |S N)r   postprocess_trajectory)rL   sample_batchother_agent_batchesepisoder   s       rE   rO   z:build_tf_policy.<locals>.policy_cls.postprocess_trajectory
  s=    
 "8|LLL X%~dL:MwWWWrG   c                     r | | j                   }n                    |           }t          |          }| j        r| j                            |          }|sd S | j         d         r|S |d         S N%_tf_policy_handles_more_than_one_lossr   )rC   	optimizerr   explorationget_exploration_optimizer)rL   
optimizersbaser   s     rE   rV   z-build_tf_policy.<locals>.policy_cls.optimizer  s     2)\$<<

!^^D11
#J//J T!-GG
SS
  %t DE %!! "!}$rG   c                     t          |          }t          |          }r3| j        d         r | ||          S  | |d         |d                   S                     | ||          S rT   )r   rC   	gradients)rL   rV   lossrY   lossesrZ   r    s        rE   r\   z-build_tf_policy.<locals>.policy_cls.gradients)  s    #I..J%%F# 	@ ;FG P//j&III 0/jmVAYOOO~~dJ???rG   c                 P    r | ||          S                      | ||          S rN   )build_apply_op)rL   rV   grads_and_varsr!   rZ   s      rE   r`   z2build_tf_policy.<locals>.policy_cls.build_apply_op9  s9    ! L))$	>JJJ**4NKKKrG   c                 N    t                              |           fi | j        S rN   )dictextra_compute_action_fetchesr<   )rL   rZ   s    rE   rd   z@build_tf_policy.<locals>.policy_cls.extra_compute_action_fetches@  s6    11$77 ;?;U  rG   c                 l    rt          t          i ifi  |           S                     |           S rN   )rc   r   extra_compute_grad_fetches)rL   rZ   r$   s    rE   rf   z>build_tf_policy.<locals>.policy_cls.extra_compute_grad_fetchesF  sI    % 
= .3TT7M7Md7S7STTT66t<<<rG   )NN)__name__
__module____qualname__rJ   r   r   rO   r   rV   r\   r`   rd   rf   )r+   r*   r(   r!   rZ   r&   r'   r    r#   r$   r-   r"   r2   r)   r   r   r   r%   s   rE   
policy_clsr9      s          1	% 1	% 1	% 1	% 1	% 1	% 1	% 1	% 1	% 1	% 1	% 1	% 1	% 1	% 1	% 1	% 1	%f 
&		BF	  	  	  	  	  
		  
(			% 	% 	% 	% 	% 
		%( 
(			@ 	@ 	@ 	@ 	@ 
		@ 
(			L 	L 	L 	L 	L 
		L 
(			 	 	 	 
		
 
(			= 	= 	= 	= 	= 
		= 	= 	=rG   rj   c            	      2    t          di t          fi | S )a  Allows creating a TFPolicy cls based on settings of another one.

        Keyword Args:
            **overrides: The settings (passed into `build_tf_policy`) that
                should be different from the class that this method is called
                on.

        Returns:
            type: A new TFPolicy sub-class.

        Examples:
        >> MySpecialDQNPolicyClass = DQNTFPolicy.with_updates(
        ..    name="MySpecialDQNPolicyClass",
        ..    loss_function=[some_new_loss_function],
        .. )
         )build_tf_policyrc   )	overridesoriginal_kwargss    rE   with_updatesz%build_tf_policy.<locals>.with_updatesT  s)    " DDo!C!C!C!CDDDrG   c                  &    t          j        di  S )Nrl   )r   _build_eager_tf_policy)ro   s   rE   as_eagerz!build_tf_policy.<locals>.as_eagerg  s    5HHHHHrG   )localscopyr   r   r
   r   staticmethodrp   rs   rg   ri   )r1   r2   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   rj   rp   rs   rZ   ro   s    ` ``````````````` `      @@rE   rm   rm      s   V hhmmooOov..D%)999 @MMMM*)/DD	
 	
 	
 	
 4JRVWWWW|= |= |= |= |= |= |= |= |= |= |= |= |= |= |= |= |= |= |= |= |= |= |= |=T |= |= |=|E E E E E&I I I I I +<88J&x00JJ"JrG   )2typingr   r   r   r   r   r   r	   	gymnasiumgymray._common.deprecationr
   r   ray.rllib.models.modelv2r   "ray.rllib.models.tf.tf_action_distr   ray.rllib.policyr   "ray.rllib.policy.dynamic_tf_policyr   ray.rllib.policy.policyr   ray.rllib.policy.sample_batchr   ray.rllib.policy.tf_policyr   ray.rllib.utilsr   r   ray.rllib.utils.annotationsr   r   ray.rllib.utils.frameworkr   $ray.rllib.utils.metrics.learner_infor   ray.rllib.utils.typingr   r   r   tf1tftfvstrSpacespacestypeintrm   rl   rG   rE   <module>r      s5   E E E E E E E E E E E E E E E E E E            - , , , , , C C C C C C , , , , , , > > > > > > * * * * * * 5 5 5 5 5 5 / / / / / / 2 2 2 2 2 2 2 2 = = = = = = = = 3 3 3 3 3 3 B B B B B B          }R  KOQU 	 	
 	 	OSRV 	 	
 	 	
 	 	 	#'DH#3 wP P P
P 	$34kBj$z**+	-P !4&2E*E!FGP x 5tCO7L LMNP &-.0OOPP #&9:FVWP  !4nE~U	
!P* &+~6S*_8MMN+P0 "(F8T#z/5J+J"KL1P2 %XvhS*_8M.M%NO3P4 &#)SY0CDdJK5P: &#)SY0CDdJK;P@ SZ%sz'79LMtS	
APJ &#)SY0CDdJKKPP SZ%sz'79LMwV	
QPZ  *d:./z:7M1NNO[P` %Wj*jA*dD$4457	
aPl T$Z mPn !)6(C-)@ AoPx 
/yP P P P P PrG   