
    &`iN              3       z   d dl mZmZmZmZmZmZmZmZ d dl	Z
d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZmZ d dlm Z m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z(m)Z)  e             \  Z*Z+ e!            \  Z,Z+edddddddddddddddddddddde-de-deeeeee         egee)ee)         f         f                  deeg e'f                  deeeegee-e)f         f                  deeeeeeeef                  ee         gef                  deeeee-e)f         ee)         eegee-e)f         f                  deeede)gee-e)f         f                  deeegee-e)f         f                  deeee'gdf                  deeee
j.        e
j.        e'gdf                  deeee
j.        e
j.        e'gdf                  deeee
j/        j.        e
j/        j.        e'gdf                  deeee
j.        e
j.        e'gdf                  d eeee
j/        j.        e
j/        j.        e'gdf                  d!eee)ee)         gee)e)f         f                  d"eeeee)e)e)gee)e0ee)         f         f                  d#eeee
j/        j.        e
j/        j.        e'gef                  d$eeee
j/        j.        e
j/        j.        e'geeee         f         f                  d%eeeegee(e1f         f                  d&eeedgdf                  d'eee0                  d(eeege2f                  d)ee         f0d*            Z3dS )+    )AnyCallableDictListOptionalTupleTypeUnionN)ModelCatalog)ModelV2)TorchDistributionWrapper)TorchModelV2)Policy)SampleBatch)TorchPolicy)NullContextManager
add_mixins)OldAPIStackoverride)try_import_jaxtry_import_torch)LEARNER_STATS_KEY)convert_to_numpy)AlgorithmConfigDictModelGradients
TensorType)get_default_configstats_fnpostprocess_fnextra_action_out_fnextra_grad_process_fnextra_learn_fetches_fnoptimizer_fnvalidate_spacesbefore_initbefore_loss_init
after_init_after_loss_initaction_sampler_fnaction_distribution_fn
make_modelmake_model_and_action_distcompute_gradients_fnapply_gradients_fnmixinsget_batch_divisibility_reqname	frameworkloss_fnr   r   r   r    r!   ztorch.optim.Optimizerr"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   returnc                6  	
 t                                                      t          t          |          } G 	
fdd|          }fd}t	          |          |_        | |_        | |_        |S )a  Helper function for creating a new Policy class at runtime.

    Supports frameworks JAX and PyTorch.

    Args:
        name: name of the policy (e.g., "PPOTorchPolicy")
        framework: Either "jax" or "torch".
            loss_fn (Optional[Callable[[Policy, ModelV2,
                Type[TorchDistributionWrapper], SampleBatch], Union[TensorType,
                List[TensorType]]]]): Callable that returns a loss tensor.
            get_default_config (Optional[Callable[[None], AlgorithmConfigDict]]):
                Optional callable that returns the default config to merge with any
                overrides. If None, uses only(!) the user-provided
                PartialAlgorithmConfigDict as dict for this Policy.
            postprocess_fn (Optional[Callable[[Policy, SampleBatch,
                Optional[Dict[Any, SampleBatch]], Optional[Any]],
                SampleBatch]]): Optional callable for post-processing experience
                batches (called after the super's `postprocess_trajectory` method).
            stats_fn (Optional[Callable[[Policy, SampleBatch],
                Dict[str, TensorType]]]): Optional callable that returns a dict of
                values given the policy and training batch. If None,
                will use `TorchPolicy.extra_grad_info()` instead. The stats dict is
                used for logging (e.g. in TensorBoard).
            extra_action_out_fn (Optional[Callable[[Policy, Dict[str, TensorType],
                List[TensorType], ModelV2, TorchDistributionWrapper]], Dict[str,
                TensorType]]]): Optional callable that returns a dict of extra
                values to include in experiences. If None, no extra computations
                will be performed.
            extra_grad_process_fn (Optional[Callable[[Policy,
                "torch.optim.Optimizer", TensorType], Dict[str, TensorType]]]):
                Optional callable that is called after gradients are computed and
                returns a processing info dict. If None, will call the
                `TorchPolicy.extra_grad_process()` method instead.
            # TODO: (sven) dissolve naming mismatch between "learn" and "compute.."
            extra_learn_fetches_fn (Optional[Callable[[Policy],
                Dict[str, TensorType]]]): Optional callable that returns a dict of
                extra tensors from the policy after loss evaluation. If None,
                will call the `TorchPolicy.extra_compute_grad_fetches()` method
                instead.
            optimizer_fn (Optional[Callable[[Policy, AlgorithmConfigDict],
                "torch.optim.Optimizer"]]): Optional callable that returns a
                torch optimizer given the policy and config. If None, will call
                the `TorchPolicy.optimizer()` method instead (which returns a
                torch Adam optimizer).
            validate_spaces (Optional[Callable[[Policy, gym.Space, gym.Space,
                AlgorithmConfigDict], None]]): Optional callable that takes the
                Policy, observation_space, action_space, and config to check for
                correctness. If None, no spaces checking will be done.
            before_init (Optional[Callable[[Policy, gym.Space, gym.Space,
                AlgorithmConfigDict], None]]): Optional callable to run at the
                beginning of `Policy.__init__` that takes the same arguments as
                the Policy constructor. If None, this step will be skipped.
            before_loss_init (Optional[Callable[[Policy, gym.spaces.Space,
                gym.spaces.Space, AlgorithmConfigDict], None]]): Optional callable to
                run prior to loss init. If None, this step will be skipped.
            after_init (Optional[Callable[[Policy, gym.Space, gym.Space,
                AlgorithmConfigDict], None]]): DEPRECATED: Use `before_loss_init`
                instead.
            _after_loss_init (Optional[Callable[[Policy, gym.spaces.Space,
                gym.spaces.Space, AlgorithmConfigDict], None]]): Optional callable to
                run after the loss init. If None, this step will be skipped.
                This will be deprecated at some point and renamed into `after_init`
                to match `build_tf_policy()` behavior.
            action_sampler_fn (Optional[Callable[[TensorType, List[TensorType]],
                Tuple[TensorType, TensorType]]]): Optional callable returning a
                sampled action and its log-likelihood given some (obs and state)
                inputs. If None, will either use `action_distribution_fn` or
                compute actions by calling self.model, then sampling from the
                so parameterized action distribution.
            action_distribution_fn (Optional[Callable[[Policy, ModelV2, TensorType,
                TensorType, TensorType], Tuple[TensorType,
                Type[TorchDistributionWrapper], List[TensorType]]]]): A callable
                that takes the Policy, Model, the observation batch, an
                explore-flag, a timestep, and an is_training flag and returns a
                tuple of a) distribution inputs (parameters), b) a dist-class to
                generate an action distribution object from, and c) internal-state
                outputs (empty list if not applicable). If None, will either use
                `action_sampler_fn` or compute actions by calling self.model,
                then sampling from the parameterized action distribution.
            make_model (Optional[Callable[[Policy, gym.spaces.Space,
                gym.spaces.Space, AlgorithmConfigDict], ModelV2]]): Optional callable
                that takes the same arguments as Policy.__init__ and returns a
                model instance. The distribution class will be determined
                automatically. Note: Only one of `make_model` or
                `make_model_and_action_dist` should be provided. If both are None,
                a default Model will be created.
            make_model_and_action_dist (Optional[Callable[[Policy,
                gym.spaces.Space, gym.spaces.Space, AlgorithmConfigDict],
                Tuple[ModelV2, Type[TorchDistributionWrapper]]]]): Optional
                callable that takes the same arguments as Policy.__init__ and
                returns a tuple of model instance and torch action distribution
                class.
                Note: Only one of `make_model` or `make_model_and_action_dist`
                should be provided. If both are None, a default Model will be
                created.
            compute_gradients_fn (Optional[Callable[
                [Policy, SampleBatch], Tuple[ModelGradients, dict]]]): Optional
                callable that the sampled batch an computes the gradients w.r.
                to the loss function.
                If None, will call the `TorchPolicy.compute_gradients()` method
                instead.
            apply_gradients_fn (Optional[Callable[[Policy,
                "torch.optim.Optimizer"], None]]): Optional callable that
                takes a grads list and applies these to the Model's parameters.
                If None, will call the `TorchPolicy.apply_gradients()` method
                instead.
            mixins (Optional[List[type]]): Optional list of any class mixins for
                the returned policy class. These mixins will be applied in order
                and will have higher precedence than the TorchPolicy class.
            get_batch_divisibility_req (Optional[Callable[[Policy], int]]):
                Optional callable that returns the divisibility requirement for
                sample batches. If None, will assume a value of 1.

    Returns:
        Type[TorchPolicy]: TorchPolicy child class constructed from the
            specified args.
    c                       e Zd ZfdZ ee          	 d fd	            Z e          
fd            Z e          fd            Z e          fd            Z	 e          fd            Z
 e          	fd            Z e          fd	            Z e          fd
            Zd Zd Z xZS )&build_policy_class.<locals>.policy_clsc                    || _         x| _        | j         d<   r | ||| j                    r | ||| j                    rE
J d             | |||          | _        t          j        || j         d                   \  }}ngr | |||          \  | _        }nNt          j        || j         d                   \  }}t          j        |||| j         d                   | _        t          }t          | j        |          s
J d            | _        | j        	                    | |||| j        | j         d         rd n|
|d         d         	           | j
                            | j        j
                   p}|r || | j        | j        |           |                     d
| j         d         rd n           	r 	| |||           d| _        d S )Nr2   zAEither `make_model` or `make_model_and_action_dist` must be None!model)r2   )	obs_spaceaction_spacenum_outputsmodel_configr2   z5ERROR: Generated Model must be a TorchModelV2 object!in_evaluationmax_seq_len)
observation_spacer;   configr9   lossaction_distribution_classr)   r*   r?   r0   T)auto_remove_unneeded_view_reqsr   r   )rA   r2   r9   r   get_action_distget_model_v2r   
isinstance
parent_cls__init__view_requirementsupdater@   r;   !_initialize_loss_from_dummy_batchglobal_timestep)selfr:   r;   rA   
dist_class_	logit_dim	model_cls_before_loss_initr(   r*   r)   r'   r%   r&   r2   r0   r3   r+   r,   rH   r   r$   s            t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/policy/policy_template.pyrI   z/build_policy_class.<locals>.policy_cls.__init__   s    DK 9BADNT[5  Lit{KKK  HD)\4;GGG  199% :99 (ZivNN
 , < $+g"6)! ! !
AA , )C)C)\6* *&
JJ
 )5(D $+g"6)) ) )%
I *6'!- )!%W!5'  
 %I
I  G GFG G 
 )DOO$$"+)j![9FTTw*4"3'="7OM:+E %    "))$**FGGG 0 >J  !!$0$2CV  
 22/3!%_!=K8 3   
   H  y,GGG $%D       Nc                     |                                  5  t                                          |||          }r | |||          cd d d            S |cd d d            S # 1 swxY w Y   d S N)_no_grad_contextsuperpostprocess_trajectory)rN   sample_batchother_agent_batchesepisode	__class__r   s       rT   rZ   z=build_policy_class.<locals>.policy_cls.postprocess_trajectoryL  s     &&(( 
$ 
$$ww== "5w    " )>l,? 
$ 
$ 
$ 
$ 
$ 
$ 
$ 
$ $
$ 
$ 
$ 
$ 
$ 
$ 
$ 
$ 
$ 
$ 
$ 
$ 
$ 
$ 
$ 
$ 
$ 
$s   3A$A$$A(+A(c                 P    r | ||          S                      | ||          S )zCalled after optimizer.zero_grad() and loss.backward() calls.

            Allows for gradient processing before optimizer.step() is called.
            E.g. for gradient clipping.
            )extra_grad_process)rN   	optimizerrB   r!   rH   s      rT   r`   z9build_policy_class.<locals>.policy_cls.extra_grad_process_  s;     % L,,T9dCCC!44T9dKKKrU   c                     r,t           |                     }t          t          i ifi |S                     |           S rW   )r   dictr   extra_compute_grad_fetches)rN   fetchesr"   rH   s     rT   rd   zAbuild_policy_class.<locals>.policy_cls.extra_compute_grad_fetchesk  sU    % C*+A+A$+G+GHH.3??w???!<<TBBBrU   c                 L    r | |          S                      | |          S rW   )compute_gradients)rN   batchr-   rH   s     rT   rg   z8build_policy_class.<locals>.policy_cls.compute_gradientst  s5    # A++D%888!33D%@@@rU   c                 T    r | |           d S                      | |           d S rW   )apply_gradients)rN   	gradientsr.   rH   s     rT   rj   z6build_policy_class.<locals>.policy_cls.apply_gradients{  s@    ! <""433333**4;;;;;rU   c                     |                                  5  r | ||||          }n                    | ||||          }|                     |          cd d d            S # 1 swxY w Y   d S rW   )rX   extra_action_out_convert_to_numpy)rN   
input_dictstate_batchesr9   action_dist
stats_dictr    rH   s         rT   rm   z7build_policy_class.<locals>.policy_cls.extra_action_out  s    &&(( 	: 	:& !4!4j-" "JJ ",!<!<j-" "J --j99	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	:s   A A##A'*A'c                 Z    r | | j                   }n                    |           }|S rW   )rA   ra   )rN   
optimizersr#   rH   s     rT   ra   z0build_policy_class.<locals>.policy_cls.optimizer  s9     8)\$<<

'11$77
rU   c                     |                                  5  r | |          }n| j                            | |          }|                     |          cd d d            S # 1 swxY w Y   d S rW   )rX   rH   extra_grad_inforn   )rN   train_batchrr   r   s      rT   rv   z6build_policy_class.<locals>.policy_cls.extra_grad_info  s    &&(( : : T!)$!<!<JJ!%!@!@{!S!SJ--j99: : : : : : : : : : : : : : : : : :s   ?A""A&)A&c                 f    | j         dk    rt                                          S t                      S Ntorch)r2   rz   no_gradr   )rN   s    rT   rX   z7build_policy_class.<locals>.policy_cls._no_grad_context  s)    ~((}}&%'''rU   c                 :    | j         dk    rt          |          S |S ry   )r2   r   )rN   datas     rT   rn   z8build_policy_class.<locals>.policy_cls._convert_to_numpy  s"    ~(('---KrU   )NN)__name__
__module____qualname__rI   r   r   rZ   r`   rd   rg   rj   rm   ra   rv   rX   rn   __classcell__)r^   r(   r*   r)   r'   r.   r%   r&   r-   r    r!   r"   r2   r0   r3   r+   r,   r#   rH   r   r   r$   s   @rT   
policy_clsr7      sh       S	% S	% S	% S	% S	% S	% S	% S	% S	% S	% S	% S	% S	% S	% S	% S	% S	% S	%j 
&		BF	$ 	$ 	$ 	$ 	$ 	$ 
		$$ 
*				L 		L 		L 		L 		L 
			L 
*			C 	C 	C 	C 	C 
		C 
*			A 	A 	A 	A 	A 
		A 
*			< 	< 	< 	< 	< 
		< 
*		
	: 
	: 
	: 
	: 
	: 
	
	: 
*			 	 	 	 	 
		 
*			: 	: 	: 	: 
		:	( 	( 	(
	 	 	 	 	 	 	rU   r   c            	      2    t          di t          fi | S )a  Creates a Torch|JAXPolicy cls based on settings of another one.

        Keyword Args:
            **overrides: The settings (passed into `build_torch_policy`) that
                should be different from the class that this method is called
                on.

        Returns:
            type: A new Torch|JAXPolicy sub-class.

        Examples:
        >> MySpecialDQNPolicyClass = DQNTorchPolicy.with_updates(
        ..    name="MySpecialDQNPolicyClass",
        ..    loss_function=[some_new_loss_function],
        .. )
         )build_policy_classrc   )	overridesoriginal_kwargss    rT   with_updatesz(build_policy_class.<locals>.with_updates  s)    " "GGD$F$FI$F$FGGGrU   )localscopyr   r   staticmethodr   r~   r   )r1   r2   r3   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   baser   r   r   rH   s    `` ````````````````` `   @@rT   r   r       s=   d hhmmooOJj&))Dr r r r r r r r r r r r r r r r r r r r r r r r r r rT r r rhH H H H H& +<88JJ"JrU   )4typingr   r   r   r   r   r   r	   r
   	gymnasiumgymray.rllib.models.catalogr   ray.rllib.models.modelv2r   (ray.rllib.models.torch.torch_action_distr   $ray.rllib.models.torch.torch_modelv2r   ray.rllib.policy.policyr   ray.rllib.policy.sample_batchr   ray.rllib.policy.torch_policyr   ray.rllib.utilsr   r   ray.rllib.utils.annotationsr   r   ray.rllib.utils.frameworkr   r   $ray.rllib.utils.metrics.learner_infor   ray.rllib.utils.numpyr   ray.rllib.utils.typingr   r   r   jaxrP   rz   strSpacespacestyperc   intr   r   rU   rT   <module>r      sa  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	     1 1 1 1 1 1 , , , , , , M M M M M M = = = = = = * * * * * * 5 5 5 5 5 5 5 5 5 5 5 5 : : : : : : : : = = = = = = = = F F F F F F F F B B B B B B 2 2 2 2 2 2 R R R R R R R R R R			Qq  GKQU 	 	 	RV 	 	 	
 	 	
 	 	 	
 	 	 	 	#'DHs_ _ _
__ Wd#;<kJ*d:../1	
	_ !".A*A!BC_ x 5tCO7L LMN_ c;./0	 	

_. "S*_%Z ( j!#		
/_F $&1:>S*_@UUVG_N %XvhS*_8M.M%NOO_P &-.0GGHQ_V &#)SY0CDdJKW_\ &#)SY0CDdJK]_b SZ%sz'79LMtS	
c_l &#)SY0CDdJKm_r SZ%sz'79LMtS	
s_|  *d:./z:7M1NNO}_B %Wj*jA*dD$4457	
C_N SZ%sz'79LMwV	
O_X !)SZ%sz'79LM'4 899:<	
!Y_d #&+&nd.B(CCDe_j !&12D89k_p T$Z q_r !)6(C-)@ As_t 
+u_ _ _ _ _ _rU   