
    &`i3                        d dl Z d dlZd dlmZmZmZmZmZ d dlZ	d dl
Zd dlZd dlZd dlmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3m4Z4m5Z5m6Z6 d dl7m8Z8  e$            \  Z9Z:Z; e j<        e=          Z>e G d de                      Z?dS )    N)DictListOptionalTupleUnion)
Deprecated)ModelV2)Policy
PolicySpecPolicyState)#pad_batch_to_sequences_of_same_size)SampleBatch)
force_list)OldAPIStackoverride)	summarize))ERR_MSG_TF_POLICY_CANNOT_SAVE_KERAS_MODEL)try_import_tf)'DIFF_NUM_GRAD_UPDATES_VS_SAMPLER_POLICYNUM_AGENT_STEPS_TRAINEDNUM_GRAD_UPDATES_LIFETIMELEARNER_STATS_KEY)normalize_action)_TFRunBuilder)TensorFlowVariablesget_gpu_devices)AlgorithmConfigDictLocalOptimizerModelGradients
TensorType)log_oncec            2       
    e Zd ZdZdZed             Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dhdej        j	        dej        j	        d	e
d
ddededeeee         f         deeeef                  dee         dee         dee         dee         dee         dee         deee                  deee                  dee         dee         dee         dededee         dee         dee         f0 fd Z ee          	 	 	 did!eeeeef         f         dedee         d"eeee         eeef         f         fd#            Z ee          	 	 	 	 	 	 	 djd$eee         ef         d%eee                  d&eee         ef         d'eee         ef         d(eeeef                  dee         dee         fd)            Z ee          	 	 	 	 dkd+eee         ef         d$eee         ef         d%eee                  d&eeee         ef                  d'eeee         ef                  d,ed"efd-            Z ee          d.ed"eeef         fd/            Z ee          d.ed"eeeeef         f         fd0            Z ed1e!d"d2fd3            Z" ee          d4ed"dfd5            Z# ee          d"eeeef         ee         f         fd6            Z$ ee          dld7            Z% ee          d"eeef         fd8            Z& e'd9d*:          d"eeef         fd;            Z( ee          d"efd<            Z) ee          d"efd=            Z* ee          d"e!f fd>            Z+ ee          d1e!d"df fd?            Z, ee          dmd@edAee         d"dfdB            Z- ee          dCed"dfdD            Z. ee          d"ed         fdE            Z/dF Z0dndHZ1d"efdIZ2dJee         deeeef                  d"dfdKZ3dLeeedGf                  d"d fdMZ4d"eeef         fdNZ5d"eeef         fdOZ6d"eeef         fdPZ7d"eeef         fdQZ8d"eee9f         fdRZ:dodTZ;dUee<ee<         f         deeee         f         d"eee         eee                  f         fdVZ=dUee<ee<         f         dWeeee         f         d"dXfdYZ>dZ Z?d[ Z@d\ ZAd] ZBd^ ZCddddddddd_d`ZDda ZEdb ZFdc ZGdd ZHdeedfefdgZI xZJS )pTFPolicyaK  An agent policy and loss implemented in TensorFlow.

    Do not sub-class this class directly (neither should you sub-class
    DynamicTFPolicy), but rather use
    rllib.policy.tf_policy_template.build_tf_policy
    to generate your custom tf (graph-mode or eager) Policy classes.

    Extending this class enables RLlib to perform TensorFlow specific
    optimizations on the policy, e.g., parallelization across gpus or
    fusing multiple graphs together in the multi-agent setting.

    Input tensors are typically shaped like [BATCH_SIZE, ...].

    .. testcode::
        :skipif: True

        from ray.rllib.policy import TFPolicy
        class TFPolicySubclass(TFPolicy):
            ...

        sess, obs_input, sampled_action, loss, loss_inputs = ...
        policy = TFPolicySubclass(
            sess, obs_input, sampled_action, loss, loss_inputs)
        print(policy.compute_actions([1, 0, 2]))
        print(policy.postprocess_trajectory(SampleBatch({...})))

    .. testoutput::

        (array([0, 1, 1]), [], {})
        SampleBatch({"action": ..., "advantages": ..., ...})

    r   c                  J    t           xj        dz  c_        dt           j         S )N   
var_scope_)r$   tf_var_creation_scope_counter     n/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/policy/tf_policy.pynext_tf_var_scope_namezTFPolicy.next_tf_var_scope_nameR   s)     	..!3..DHBDDDr*   N   r&   observation_spaceaction_spaceconfigsessztf1.Session	obs_inputsampled_actionlossloss_inputsmodelsampled_action_logpaction_inputlog_likelihooddist_inputs
dist_classstate_inputsstate_outputsprev_action_inputprev_reward_inputseq_lensmax_seq_lenbatch_divisibility_req
update_opsexploretimestepc                 B	   d| _         t                                          |||           |                                 t	                      }t
                              dt          |           d           |d         sdk    s|s@d t          t          t          j                            pd          D             | _        nt          j        j                                        t          j        j        j        k    rt          j                    }t          |          k     rt'          d| d	 d
          fdt)          |          D             | _        t*          j        | j        v rud| j        t*          j                 _        d| j        t*          j                 _        | j        d                             dd          rd| j        t*          j                 _        |	Ct9          |	t:          t<          j        j         f          sJ d!                    |	                      |	| _"        | j"        | #                                 |dur| $                                nd| _%        || _&        || _'        || _(        || _)        || _*        | +                                | _,        ||ntZ          .                    ddd          | _/        |
| _0        | j0        $t<          j
        1                    | j0                  nd| _2        || _3        || _4        || _5        d| _6        |pg | _7        |pg | _8        || _9        || _:        | j7        r| j9        t'          d          || _;        || _<        d| _=        i | _>        ||n@tZ          .                    t<          ?                    dt<          j@                  dd          | _A        g | _B        d| _C        g | _D        g | _E        d| _F        d| _G        g | _H        d| _I        i | _J        t          |          }t          |          dk    r| L                    ||           || _M        | jM        M| j4        H| j5        C| 5                    | j4        | j"                  N                    | j3                  | _M        dS dS dS dS )a  Initializes a Policy object.

        Args:
            observation_space: Observation space of the policy.
            action_space: Action space of the policy.
            config: Policy-specific configuration data.
            sess: The TensorFlow session to use.
            obs_input: Input placeholder for observations, of shape
                [BATCH_SIZE, obs...].
            sampled_action: Tensor for sampling an action, of shape
                [BATCH_SIZE, action...]
            loss: Scalar policy loss output tensor or a list thereof
                (in case there is more than one loss).
            loss_inputs: A (name, placeholder) tuple for each loss input
                argument. Each placeholder name must
                correspond to a SampleBatch column key returned by
                postprocess_trajectory(), and has shape [BATCH_SIZE, data...].
                These keys will be read from postprocessed sample batches and
                fed into the specified placeholders during loss computation.
            model: The optional ModelV2 to use for calculating actions and
                losses. If not None, TFPolicy will provide functionality for
                getting variables, calling the model's custom loss (if
                provided), and importing weights into the model.
            sampled_action_logp: log probability of the sampled action.
            action_input: Input placeholder for actions for
                logp/log-likelihood calculations.
            log_likelihood: Tensor to calculate the log_likelihood (given
                action_input and obs_input).
            dist_class: An optional ActionDistribution class to use for
                generating a dist object from distribution inputs.
            dist_inputs: Tensor to calculate the distribution
                inputs/parameters.
            state_inputs: List of RNN state input Tensors.
            state_outputs: List of RNN state output Tensors.
            prev_action_input: placeholder for previous actions.
            prev_reward_input: placeholder for previous rewards.
            seq_lens: Placeholder for RNN sequence lengths, of shape
                [NUM_SEQUENCES].
                Note that NUM_SEQUENCES << BATCH_SIZE. See
                policy/rnn_sequencing.py for more information.
            max_seq_len: Max sequence length for LSTM training.
            batch_divisibility_req: pad all agent experiences batches to
                multiples of this value. This only has an effect if not using
                a LSTM model.
            update_ops: override the batchnorm update ops
                to run when applying gradients. Otherwise we run all update
                ops found in the current variable scope.
            explore: Placeholder for `explore` parameter into call to
                Exploration.get_exploration_action. Explicitly set this to
                False for not creating any Exploration component.
            timestep: Placeholder for the global sampling timestep.
        tfzFound z visible cuda devices.
_fake_gpusr   c                     g | ]}d S )z/cpu:0r)   ).0_s     r+   
<listcomp>z%TFPolicy.__init__.<locals>.<listcomp>   s    SSSHSSSr*   r&   z4TFPolicy was not able to find enough GPU IDs! Found z, but num_gpus=.c                 ,    g | ]\  }}|k     d | S )z/gpu:r)   )rJ   irK   num_gpuss      r+   rL   z%TFPolicy.__init__.<locals>.<listcomp>   s(    WWWDAq!h,,KAKK,,,r*   Foutput_configstore_infosTNz]Model classes for TFPolicy other than `ModelV2|tf.keras.Model` not allowed! You passed in {}.r)   is_exploringnamez9seq_lens tensor must be given if state inputs are defined)dtyperE   )O	frameworksuper__init___get_num_gpus_for_policyr   loggerinfolenrangeintmathceildevicesray_privateworker_modeWORKER_MODEget_gpu_ids
ValueError	enumerater   INFOSview_requirementsused_for_compute_actionsused_for_trainingr0   get
isinstancer	   rG   kerasModelformatr6   /_update_model_view_requirements_from_init_state_create_explorationexploration_sess
_obs_input_prev_action_input_prev_reward_input_sampled_action_get_is_training_placeholder_is_trainingtf1placeholder_with_default_is_exploring_sampled_action_logpexp_sampled_action_prob_action_input_dist_inputsr;   _cached_extra_action_out_state_inputs_state_outputs	_seq_lens_max_seq_len_batch_divisibility_req_update_ops	_apply_op_stats_fetcheszerosint64	_timestep_optimizers
_optimizer_grads_and_vars_grads
_variables_optimizer_variables_losses_loss_loss_input_dictr   _initialize_loss_log_likelihoodlogp)selfr.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   gpu_idslossesrP   	__class__s                              @r+   rY   zTFPolicy.__init__X   s   ^ *L&AAA 0022!##AS\\AAABBB , 	X8q===SSeC	(8K8K4L4L4QPQ.R.RSSSDLL |"((**cl.A.MMM/++7||h&& ;; ;/7; ; ;  
 XWWWIg4F4FWWWDL  666QVD";#45NJOD";#45G{?+//uEE SNR&{'89K}
57BHN2K L L}}--3VE]] }L 
:!@@BBB :A9M9M433555SW
#"3"3- ==?? " G--dB^-LL 	
 %8! (4 GKK1222 	!
 *'$(,%)/R+1r!' 	$."8K   (>$%  # H--28,,bz .   	 24LNCE  DH!  
 "D!!v;;??!!&+666  . (!-+#'??43Ddj#Q#Q#V#V"$ $D   	 )(--++r*   
input_dictreturnc                 (   ||n| j         d         }||n| j        }t          |t                    r|                    d           nd|d<   t          |                                 d          }|t          j                 }|                     ||||          }|	                    |          }	| xj        t          |t                    rt          |          n0t          |t                    rt          |          n|j        d         z  c_        |	S )NrD   Fis_trainingcompute_actions_from_input_dictr   rD   rE   r   )r0   global_timesteprp   r   set_trainingr   get_sessionOBS_build_compute_actionsro   listr]   shape)
r   r   rD   rE   episodekwargsbuilder	obs_batchto_fetchfetcheds
             r+   r   z(TFPolicy.compute_actions_from_input_dict+  s     %0''dk)6L'3889M j+.. 	.##E**** ).J}% 0 0 2 24UVV{/	..
Gh / 
 

 ++h'' 	)T**$C	NNN *k22$Z#	
 r*   r   state_batchesprev_action_batchprev_reward_batch
info_batchc	                 .   ||n| j         d         }||n| j        }t          |                                 d          }
t          j        |ddi}|rt          |          D ]\  }}||d| <   |||t          j        <   |||t          j        <   | 	                    |
|||          }|

                    |          }| xj        t          |t                    rt          |          n$t          j        |          d         j        d         z  c_        |S )NrD   compute_actionsr   F	state_in_r   r   )r0   r   r   r   r   r   rj   PREV_ACTIONSPREV_REWARDSr   ro   rp   r   r]   treeflattenr   )r   r   r   r   r   r   episodesrD   rE   r   r   r   rO   sr   r   s                   r+   r   zTFPolicy.compute_actionsR  s;    %0''dk)6L'3889M 0 0 2 24EFF!oy-G
 	0!-00 0 01./
?q??++(3DJ{/0(3DJ{/0..
Gh / 
 

 ++h'' 	)T**5C	NNNi((+1!4	
 r*   Tactionsactions_normalizedc                    | j         t          d          | j                            d|                                            t          |                                 d          }|du r"| j        d         rt          || j                  }|	                    | j
        |i           |	                    | j        |i           |pg }t          | j                  t          |          k    r(t          d                    | j        |                    |	                    t          t!          | j        |                               |r;|	                    | j        t%          j        t          |                    i           | j        ||	                    | j        |i           | j        ||	                    | j        |i           |                    | j         g          }	|                    |	          d         S )NzACannot compute log-prob/likelihood w/o a self._log_likelihood op!F)rD   tf_sesscompute_log_likelihoodsnormalize_actionsz:Must pass in RNN state batches for placeholders {}, got {}r   )r   ri   rv   before_compute_actionsr   r   r0   r   action_space_structadd_feed_dictr   rx   r]   r   rs   dictzipr   nponesry   rz   add_fetchesro   )
r   r   r   r   r   r   r   r   r   fetchess
             r+   r   z TFPolicy.compute_log_likelihoods}  s    'S  
 	//4#3#3#5#5 	0 	
 	
 	
   0 0 2 24MNN &&4;7J+K&&w0HIIG 	t17;<<<t	:;;;%+t!""c-&8&888LSS&   
 	d3t'9=#I#IJJKKK 	M!!4>273y>>3J3J"KLLL".3D3P!!4#:<M"NOOO".3D3P!!4#:<M"NOOO%%t';&<=={{7##A&&r*   postprocessed_batchc                    |                                  sJ |                    d           t          |                                 d          }i }| j                            | ||           |                     ||          }|                    |          }| xj        dz  c_        |	                    d|t          |j        t          | j        t          | j        dz
  |j        pdz
  i           |S )NTlearn_on_batch)policytrain_batchresultr&   custom_metricsr   )loss_initializedr   r   r   	callbackson_learn_on_batch_build_learn_on_batchro   num_grad_updatesupdater   countr   r   )r   r   r   learn_statsr   statss         r+   r   zTFPolicy.learn_on_batch  s   $$&&&&& 	((... 0 0 2 24DEE ((%8 	) 	
 	
 	
 ,,W6IJJG$$" +')<)B)4+@7)*;@qB
	
 	
 	
 r*   c                     |                                  sJ |                    d           t          |                                 d          }|                     ||          }|                    |          S )NTcompute_gradients)r   r   r   r   _build_compute_gradientsro   )r   r   r   r   s       r+   r   zTFPolicy.compute_gradients  st     $$&&&&&((... 0 0 2 24GHH//9LMM{{7###r*   stater
   c                    |                      d          }|t          d          t          j        |          }t                              t                                                    5  |                    |j	        |j
        |j                  }ddd           n# 1 swxY w Y   |                    |            |S )a  Recovers a TFPolicy from a state object.

        The `state` of an instantiated TFPolicy can be retrieved by calling its
        `get_state` method. Is meant to be used by the Policy.from_state() method to
        aid with tracking variable creation.

        Args:
            state: The state to recover a new TFPolicy instance from.

        Returns:
            A new TFPolicy instance.
        policy_specNzJNo `policy_spec` key was found in given `state`! Cannot create new Policy.)ro   ri   r   deserializer~   variable_scoper$   r,   policy_classr.   r/   r0   	set_state)r   serialized_pol_specpol_spec
new_policys       r+   _tf1_from_state_helperzTFPolicy._tf1_from_state_helper  s    /4ii.F.F&,   )*=>> ? ? A ABB 		 		!.. *% J		 		 		 		 		 		 		 		 		 		 		 		 		 		 		 	U### s   ,'BB#&B#	gradientsc                     |                                  sJ t          |                                 d          }|                     ||          }|                    |           d S )Napply_gradients)r   r   r   _build_apply_gradientsro   )r   r   r   r   s       r+   r   zTFPolicy.apply_gradients  sc    $$&&&&& 0 0 2 24EFF--gyAAGr*   c                 4    | j                                         S N)r   get_weightsr   s    r+   r   zTFPolicy.get_weights  s    **,,,r*   c                 6    | j                             |          S r   )r   set_weights)r   weightss     r+   r   zTFPolicy.set_weights  s    **7333r*   c                 \    | j                             |                                           S )N)r1   )rv   	get_stater   r   s    r+   get_exploration_statezTFPolicy.get_exploration_state  s(    ))t/?/?/A/A)BBBr*   r   )newerrorc                 *    |                                  S r   )r   r   s    r+   get_exploration_infozTFPolicy.get_exploration_info  s    ))+++r*   c                 2    t          | j                  dk    S )Nr   r]   r   r   s    r+   is_recurrentzTFPolicy.is_recurrent  s    4%&&**r*   c                 *    t          | j                  S r   r  r   s    r+   num_state_tensorszTFPolicy.num_state_tensors!  s    4%&&&r*   c                 H   t                                                      }t          | j        j                  dk    r4|                                                     | j        j                  |d<   | j                            |                                           |d<   |S )Nr   r   _exploration_state)rX   r   r]   r   	variablesr   runrv   )r   r   r   s     r+   r   zTFPolicy.get_state%  s     !!##t(233a77,0,<,<,>,>,B,B)3- -E() '+&6&@&@AQAQASAS&T&T"#r*   c                 X   |                     dd           }|| j                            |           t          | d          r8d|v r4| j                            |d         |                                            |d         | _        t                                          |           d S )Nr   rv   r  )r   r1   r   )	ro   r   r   hasattrrv   r   r   r   rX   )r   r   optimizer_varsr   s      r+   r   zTFPolicy.set_state2  s     #94@@%%11.AAA4'' 	,@E,I,I&&018H8H8J8J '   
  %%67 	%     r*   
export_dironnxc                    |r	 ddl }n"# t          $ r}t          d          |d}~ww xY w|                                 j                                        5  |                                 }|t          j        j	        j
                 }d |j                                        D             }d |j                                        D             }ddl m}	 |	                    |                                 ||          }
ddd           n# 1 swxY w Y   t                              t"                                                    5 }t"                              |
d	
           |j                            |j        |||          }|                    d          }|j                            |di |           ddd           dS # 1 swxY w Y   dS t3          | d          rt3          | j        d          rt7          | j        j        t"          j        j                  r|                                 j                                        5  	 | j        j                            |d           n/# t@          $ r" tB          "                    tF                     Y nw xY wddd           dS # 1 swxY w Y   dS tB          "                    tF                     dS )z2Export tensorflow graph to export_dir for serving.r   NzmConverting a TensorFlow model to ONNX requires `tf2onnx` to be installed. Install with `pip install tf2onnx`.c                 "    g | ]\  }}|j         S r)   rT   rJ   kvs      r+   rL   z)TFPolicy.export_model.<locals>.<listcomp>W  s    ???TQ!&???r*   c                 "    g | ]\  }}|j         S r)   rT   r  s      r+   rL   z)TFPolicy.export_model.<locals>.<listcomp>X  s    AAAda16AAAr*   )	tf_loader)input_namesoutput_names)graph rT   )r  r  inputs_as_nchw
onnx_modelr6   )	feed_dictmodel_proto
base_modelrG   )filepathsave_format)$tf2onnxImportErrorRuntimeErrorr   r  
as_default_build_signature_defr~   saved_modelsignature_constants!DEFAULT_SERVING_SIGNATURE_DEF_KEYinputsitemsoutputsr  freeze_sessionSessionrG   Graphimport_graph_deftfonnxprocess_tf_graph
make_modelutilssave_onnx_modelr  r6   rp   r  rq   rr   save	Exceptionr[   warningr   )r   r  r  r!  esignature_def_mapsdr)  r+  r  frozen_graph_defsessiongr  s                 r+   export_modelzTFPolicy.export_modelD  s     4	F   "-  	 !!##)4466  $($=$=$?$?!&O7Y @?RY__->->???AAbj.>.>.@.@AAA------#,#;#;$$&&F $< $ $                288::.. '##$42#>>>N33M &!(#)	 4    ll<88--2; .                    " D'""	F
L11	F 4:0"(.AA	F
 !!##)4466 N NNJ)..
PT.UUUU  N N NNN#LMMMMMNN N N N N N N N N N N N N N N N N N NNDEEEEEsl   
 
)$)B'D

DDA3GGGJ-!I10J-1)JJ-JJ--J14J1import_filec                    | j         t          d          |                                 j                                        5  |                                                                 5  | j                             |          cddd           cddd           S # 1 swxY w Y   	 ddd           dS # 1 swxY w Y   dS )zImports weights into tf model.NzNo `self.model` to import into!)r6   NotImplementedErrorr   r  r$  import_from_h5)r   r?  s     r+   import_model_from_h5zTFPolicy.import_model_from_h5}  sU    :%&GHHH %0022 	> 	>!!##..00 > >z00==> > > > > > >	> 	> 	> 	> 	> 	> 	> 	>> > > > > > > > >	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	>s6   'B5)BB5B 	 B5#B 	$B55B9<B9c                     | j         S )z6Returns a reference to the TF session for this policy.)rw   r   s    r+   r   zTFPolicy.get_session  s     zr*   c                     | j         t          d          t          | j         t          j        j                  r| j         j        S | j                                         S )z9Return the list of all savable variables for this policy.Nz%No `self.model` to get variables for!)r6   rA  rp   rG   rq   rr   r  r   s    r+   r  zTFPolicy.variables  sP    :%&MNNN
BHN33 	*:'':'')))r*   tf1.placeholderc                     |t           j        k    r| j        S |t           j        k    r| j        S |t           j        k    r| j        S | j        s
J d            | j        |         S )a  Returns the given action or loss input placeholder by name.

        If the loss has not been initialized and a loss input placeholder is
        requested, an error is raised.

        Args:
            name: The name of the placeholder to return. One of
                SampleBatch.CUR_OBS|PREV_ACTION/REWARD or a valid key from
                `self._loss_input_dict`.

        Returns:
            tf1.placeholder: The placeholder under the given str key.
        zUYou need to populate `self._loss_input_dict` before `get_placeholder()` can be called)r   CUR_OBSrx   r   ry   r   rz   r   )r   rU   s     r+   get_placeholderzTFPolicy.get_placeholder  sz     ;&&&?"[---**[---**$ 	
 	
0	
 	
$ $T**r*   c                 2    t          | j                  dk    S )z7Returns whether the loss term(s) have been initialized.r   )r]   r   r   s    r+   r   zTFPolicy.loss_initialized  s    4<  1$$r*   r   c                     t          |           _         fd j                                        D              _        t	           j                  D ]"\  }}| j        d                    |          <   # j        rt           j        t          j
        j                  sft           j                            | j                             _         j                            d j                                        i           n| _         j         j        d         nd _         j        sAt                                                      _         j        r j        d         nd _         j        d         r|g  _        g  _                              j         j                  D ]L}d |D             } j                            |            j                            d |D                        MnEd	                       j         j                  D              _        d
  j        D              _         j        r:t7          g                                                                              _        t?           j                   dk    r j!        sLtD          #                    tD          j$        j%        tD          &                                j'                   _!         j!        r2tP          )                    d                     j!                             tD          *                     j!                  5   +                     j        d         r j        n j         j                   _,        ddd           n# 1 swxY w Y   t[          d          r0tP          .                    dt_           j                   d                                            0                    tD          1                                           t7          d  j        D                                                         _2        dS )aX  Initializes the loss op from given loss tensor and placeholders.

        Args:
            loss (List[TensorType]): The list of loss ops returned by some
                loss function.
            loss_inputs (List[Tuple[str, TensorType]]): The list of Tuples:
                (name, tf1.placeholders) needed for calculating the loss.
        c                 D    i | ]\  }}|j         v|j        k    ||S r)   )r   r   )rJ   r  r  r   s      r+   
<dictcomp>z-TFPolicy._initialize_loss.<locals>.<dictcomp>  sC     (
 (
 (
1+++T^0C0C q0C0C0Cr*   state_in_{}r6   Nr   %_tf_policy_handles_more_than_one_lossc                      g | ]\  }}|||fS r   r)   rJ   r=  r  s      r+   rL   z-TFPolicy._initialize_loss.<locals>.<listcomp>  s!    GGGfq!Aq6r*   c                     g | ]\  }}|S r)   r)   rJ   r=  rK   s      r+   rL   z-TFPolicy._initialize_loss.<locals>.<listcomp>  s    #<#<#<&1aA#<#<#<r*   c                      g | ]\  }}|||fS r   r)   rQ  s      r+   rL   z-TFPolicy._initialize_loss.<locals>.<listcomp>  s-     $ $ $Q= A ==r*   c                     g | ]\  }}|S r)   r)   rS  s      r+   rL   z-TFPolicy._initialize_loss.<locals>.<listcomp>  s    @@@!Q1@@@r*   r&   )scopez'Update ops to run on apply gradient: {})	optimizergrads_and_vars	loss_usedz/These tensors were used in the loss functions:

c                 @    g | ]}|                                 D ]}|S r)   )r  )rJ   or  s      r+   rL   z-TFPolicy._initialize_loss.<locals>.<listcomp>  s-    @@@1!++--@@QQ@@@@r*   )3r   r   r*  _loss_input_dict_no_rnnrj   r   rs   r6   rp   rG   rq   rr   r   custom_lossr   r   r   metricsr   r   rW  r   r0   r   r   r   appendr   r   r  r   r]   rb   r   r~   get_collection	GraphKeys
UPDATE_OPSget_variable_scoperU   r[   r\   control_dependenciesbuild_apply_opr   r"   debugr   r	  global_variables_initializerr   )r   r   r5   rO   phgroupg_and_vs   `      r+   r   zTFPolicy._initialize_loss  sz    !%[ 1 1(
 (
 (
 (
-3355(
 (
 (
$
 t122 	@ 	@EAr=?D!-"6"6q"9"9::: 	"jRX^DD 	"%
&&vt/DEE DL &&1C1C1E1E'FGGGG!DL(,(@T\!__d
 	P)$..*:*:;;D595EOd.q114DO ;>? 	A#%D DK(8$,GG > >GGGGG$++G444""#<#<G#<#<#<====>$ $"nnT_djII$ $ $D 
 A@4+?@@@DK: 	1D$$&&(8(8 DO
 t|!!# #&#5#5M,C4J4J4L4L4Q $6 $ $   =DDTEUVV   ))$*:;;  !%!4!4{#JK)d..#'#7	 "5 " "               K   	LL:t455: : :  
 	s??AABBB %8@@(@@@$BRBRBTBT%
 %
!!!s   ;NNNexisting_inputsc                     t           )a  Creates a copy of self using existing input placeholders.

        Optional: Only required to work with the multi-GPU optimizer.

        Args:
            existing_inputs (List[Tuple[str, tf1.placeholder]]): Dict mapping
                names (str) to tf1.placeholders to re-use (share) with the
                returned copy of self.

        Returns:
            TFPolicy: A copy of self.
        )rA  )r   rl  s     r+   copyzTFPolicy.copy  s
     "!r*   c                     i S )zExtra dict to pass to the compute actions session run.

        Returns:
            Dict[TensorType, TensorType]: A feed dict to be added to the
                feed_dict passed to the compute_actions session.run() call.
        r)   r   s    r+   extra_compute_action_feed_dictz'TFPolicy.extra_compute_action_feed_dict  	     	r*   c                 P    | j         s|                                 | _         | j         S r   )r   extra_action_out_fnr   s    r+   extra_compute_action_fetchesz%TFPolicy.extra_compute_action_fetches&  s-    
 , 	G,0,D,D,F,FD),,r*   c                     i }| j         (| j        |t          j        <   | j         |t          j        <   | j        | j        |t          j        <   |S )aS  Extra values to fetch and return from compute_actions().

        By default we return action probability/log-likelihood info
        and action distribution inputs (if present).

        Returns:
             Dict[str, TensorType]: An extra fetch-dict to be passed to and
                returned from the compute_actions() call.
        )r   r   r   ACTION_PROBACTION_LOGPr   ACTION_DIST_INPUTS)r   extra_fetchess     r+   rs  zTFPolicy.extra_action_out_fn/  sQ     $0595NM+12595NM+12(<@<MM+89r*   c                     i S )zExtra dict to pass to the compute gradients session run.

        Returns:
            Dict[TensorType, TensorType]: Extra feed_dict to be passed to the
                compute_gradients Session.run() call.
        r)   r   s    r+   extra_compute_grad_feed_dictz%TFPolicy.extra_compute_grad_feed_dictC  rq  r*   c                     t           i iS )zExtra values to fetch and return from compute_gradients().

        Returns:
            Dict[str, any]: Extra fetch dict to be added to the fetch dict
                of the compute_gradients Session.run() call.
        r   r   s    r+   extra_compute_grad_fetchesz#TFPolicy.extra_compute_grad_fetchesL  s     "2&&r*   tf.keras.optimizers.Optimizerc                     t          | d          r4d| j        v r+t          j                            | j        d                   S t          j                                        S )zTF optimizer to use for policy optimization.

        Returns:
            tf.keras.optimizers.Optimizer: The local optimizer to use for this
                Policy's Model.
        r0   lr)learning_rate)r  r0   r~   trainAdamOptimizerr   s    r+   rW  zTFPolicy.optimizerU  sV     4"" 	-tt{':':9**T9J*KKK9**,,,r*   rW  c                    t          |          }t          |          }| j        d         rBg }t          ||          D ]-\  }}|                    |                    |                     .dS |d                             |d                   S )a  Override this for a custom gradient computation behavior.

        Args:
            optimizer (Union[LocalOptimizer, List[LocalOptimizer]]): A single
                LocalOptimizer of a list thereof to use for gradient
                calculations. If more than one optimizer given, the number of
                optimizers must match the number of losses provided.
            loss (Union[TensorType, List[TensorType]]): A single loss term
                or a list thereof to use for gradient calculations.
                If more than one loss given, the number of loss terms must
                match the number of optimizers provided.

        Returns:
            Union[List[ModelGradients], List[List[ModelGradients]]]: List of
                ModelGradients (grads and vars OR just grads) OR List of List
                of ModelGradients in case we have more than one
                optimizer/loss.
        rO  r   N)r   r0   r   r`  r   )r   rW  r4   
optimizersr   gradsoptimloss_s           r+   r   zTFPolicy.gradientsa  s    .  	**
D!! ;>? 	>E #J 7 7 = =uU44U;;<<<<= = a=226!9===r*   rX  ztf.Operationc                    t          |          }| j        d         r}g }t          |          D ]Q\  }}|                    |                    ||         t
          j                                                             Rt          	                    |          S |d                             |t
          j                                                  S )aT  Override this for a custom gradient apply computation behavior.

        Args:
            optimizer (Union[LocalOptimizer, List[LocalOptimizer]]): The local
                tf optimizer to use for applying the grads and vars.
            grads_and_vars (Union[ModelGradients, List[ModelGradients]]): List
                of tuples with grad values and the grad-value's corresponding
                tf.variable in it.

        Returns:
            tf.Operation: The tf op that applies all computed gradients
                (`grads_and_vars`) to the model(s) via the given optimizer(s).
        rO  )global_stepr   )
r   r0   rj   r`  r   r~   r  get_or_create_global_steprG   rj  )r   rW  rX  r  opsrO   r  s          r+   rf  zTFPolicy.build_apply_op  s    $  	**
 ;>? 	C%j11  5 

))&q)$'I$G$G$I$I *      88C==  a=00CI,O,O,Q,Q 1   r*   c                 t    t          | d          s"t                              ddd          | _        | j        S )zGet the placeholder for _is_training, i.e., for batch norm layers.

        This can be called safely before __init__ has run.
        r}   Fr)   r   rT   )r  r~   r   r}   r   s    r+   r|   z%TFPolicy._get_is_training_placeholder  sE    
 t^,, 	 # < <r != ! !D   r*   c                 8   t          d          r| j        d         rA| j        D ]7}|D ]2\  }}t                              d                    |                     38d S | j        D ]4\  }}t                              d                    |                     3d S d S )N	grad_varsrO  zOptimizing variable {})r"   r0   r   r[   r\   rs   )r   rj  rK   r  s       r+   _debug_varszTFPolicy._debug_vars  s    K   	D{BC D!1 H HE % H H1$<$C$CA$F$FGGGGHH H !0 D DDAqKK 8 ? ? B BCCCC	D 	DD Dr*   c                 f    |                                  }d |                                D             S )zvExtra input signatures to add when exporting tf model.
        Inferred from extra_compute_action_feed_dict()
        c                 b    i | ],}|j         t          j        j                            |          -S r)   )rU   r~   r&  r3  build_tensor_info)rJ   r  s     r+   rM  z7TFPolicy._extra_input_signature_def.<locals>.<dictcomp>  s<     
 
 
CDAFCO);;A>>
 
 
r*   )rp  keys)r   r  s     r+   _extra_input_signature_defz#TFPolicy._extra_input_signature_def  s@     7799	
 
HQHXHX
 
 
 	
r*   c                 l    |                                  fd                                D             S )zuExtra output signatures to add when exporting tf model.
        Inferred from extra_compute_action_fetches()
        c                 f    i | ]-}|t           j        j                            |                   .S r)   )r~   r&  r3  r  )rJ   r  r   s     r+   rM  z8TFPolicy._extra_output_signature_def.<locals>.<dictcomp>  sB     
 
 
 s$66wqzBB
 
 
r*   )rt  r  r   r   s    @r+   _extra_output_signature_defz$TFPolicy._extra_output_signature_def  sI     3355
 
 
 
\\^^
 
 
 	
r*   c                    |                                  }t          j        j                            | j                  |d<   | j        6t          j        j                            | j                  |t          j        <   | j	        ,t          j        j                            | j	                  |d<   | j
        ,t          j        j                            | j
                  |d<   t          j        j                            | j                  |d<   | j        ,t          j        j                            | j                  |d<   | j        D ].}t          j        j                            |          ||j        <   /|                                 }t!          t"          j                            | j                            D ]?\  }}t          j        j                            |          |d                    |          <   @| j        D ].}t          j        j                            |          ||j        <   /t          j        j                            ||t          j        j        j                  }t          j        j        j        }||i}	|	S )z9Build signature def map for tensorflow SavedModelBuilder.observationsNprev_actionprev_rewardr   rE   z
actions_{})r  r~   r&  r3  r  rx   r   r   SEQ_LENSry   rz   r}   r   r   rU   r  rj   rG   nestr   r{   rs   r   signature_def_utilsbuild_signature_defr'  PREDICT_METHOD_NAMEr(  )
r   input_signaturestate_inputoutput_signaturerO   astate_outputsignature_defsignature_def_keyr9  s
             r+   r%  zTFPolicy._build_signature_def  sk    99;;*-/*?*Q*QO+
 +
' >% %77GG $ ".-0_-B-T-T'. .OM* ".-0_-B-T-T'. .OM* *-)>)P)P*
 *
& >%*-/*?*Q*Q+ +OJ'  - 	 	K030E0W0W1 1OK,--
  ;;==bgood.BCCDD 	; 	;DAq %77:: ##A&&  !/ 	F 	FL %77EE !  ;OOO/C
 
 O/Q 	 />  r*   )r   r   r   r   r   r   rD   rE   c          	         ||n| j         d         }|	|	n| j        }	| j                            |	||                                                                |                                            t          | d          rI|                                D ]2\  }
}|
| j	        v r$t          j        fd| j	        |
         |           3n:                    | j        |t          j                 i           t          j        |v r,                    | j        |t          j                 i           t          j        |v r,                    | j        |t          j                 i           g }d}d                    |          |v rJ|                    |d                    |                              |dz  }d                    |          |v J                    t+          t-          | j        |                               d|v rOt          j        |vrA                    | j        t5          j        t9          |d                             i                               | j        |i           |	                    | j        |	i           | j        g| j         z   | !                                gz   }"                    |          }|d         |dd	         |d	         fS )
NrD   )rE   rD   r   _input_dictc                 2                         | |i          S r   )r   )r  r  r   s     r+   <lambda>z1TFPolicy._build_compute_actions.<locals>.<lambda>)  s    W%:%:Aq6%B%B r*   r   rN  r&   
state_in_0)#r0   r   rv   r   r   r   rp  r  r*  r  r   map_structurerx   r   r   r   ry   r   rz   rs   r`  r   r   r   r  r   r   r   r]   r   r   r{   r   rt  r   )r   r   r   r   r   r   r   r   rD   rE   keyvaluerO   r   r   s    `             r+   r   zTFPolicy._build_compute_actions  s     %0''dk)6L'3889M 	//w8H8H8J8J 	0 	
 	
 	
 	dAACCDDD 4'' 	P(..00  
U$***&BBBB(-   !!4?J{4O"PQQQ':55%%,j9Q.RS   ':55%%,j9Q.RS   MA&&q))Z77$$Z0D0DQ0G0G%HIIIQ  &&q))Z77 !!$s4+=}'M'M"N"NOOO:%%+*>j*P*P!!Z-E)F)F!G!GH   	t17;<<<!!4>8"<=== !"!"002234 	 %%h//qz71R4='"+55r*   c                 H   |                                   |                    |                                            |                    |                     |d                     |                    | j        |                                 g          }|d         |d         fS )NFshuffler   r&   )r  r   r{  _get_loss_inputs_dictr   r   _get_grad_and_stats_fetchesr   r   r   r   s       r+   r   z!TFPolicy._build_compute_gradientsS  s    d??AABBB&&':E&JJ	
 	
 	
 %%t{D4T4T4V4V&WXXqz71:%%r*   c                    t          |          t          | j                  k    r(t          d                    || j                            |                    | j        di           |                    t          t          | j        |                               |                    | j	        g          }|d         S )Nz6Unexpected number of gradients to apply, got {} for {}Tr   )
r]   r   ri   rs   r   r}   r   r   r   r   )r   r   r   r   s       r+   r   zTFPolicy._build_apply_gradients\  s    y>>S----HOOt{   
 	t0$7888d3t{I#>#>??@@@%%t~&677qzr*   c                 8   |                                   |                    |                                            |                    |                     |d                     |                    | j        |                                 g          }|d         S )NFr  r&   )r  r   r{  r  r   r   r  r  s       r+   r   zTFPolicy._build_learn_on_batchh  s    d??AABBB&&':E&JJ	
 	
 	
 %%0022
 
 qzr*   c                     |                                  }t          |vrt          d          | j        r%t	          | j        fi |t                   |t          <   |S )Nz0Grad fetches should contain 'stats': {...} entry)r}  r   ri   r   r   r  s     r+   r  z$TFPolicy._get_grad_and_stats_fetchesw  sm    1133G++OPPP 	)-#* *'./@'A* *G%& r*   r   r  c           
      @   t          |t                    r|j        sHt          || j        || j        t          | j                                                  | j	                   |
                    d           i | j                                        D ]%\  }}t          j        fd|||                   }~&d t          t!          | j                            D             }|D ]}||         | j        |         <   |r|t          j                 | j        <   S )ax  Return a feed dict from a batch.

        Args:
            train_batch: batch of data to derive inputs from.
            shuffle: whether to shuffle batch sequences. Shuffle may
                be done in-place. This only makes sense if you're further
                applying minibatch SGD after getting the outputs.

        Returns:
            Feed dict of data.
        )rA   r  rB   feature_keysrl   Tc                 0                         | |          S r   )__setitem__)ri  r  r  s     r+   r  z0TFPolicy._get_loss_inputs_dict.<locals>.<lambda>  s    i33B:: r*   c                 8    g | ]}d                      |          S )rN  )rs   )rJ   rO   s     r+   rL   z2TFPolicy._get_loss_inputs_dict.<locals>.<listcomp>  s&    VVV!m**1--VVVr*   )rp   r   zero_paddedr   r   r   r   r]  r  rl   r   r   r*  r   r  r^   r]   r   r  r   )r   r   r  r  placeholdersr  
state_keysr  s          @r+   r  zTFPolicy._get_loss_inputs_dict  sN    +{33 	;;R 	/ -'+'C!$">"C"C"E"EFF"&"8    	  &&& 	!%!6!<!<!>!> 	 	C"::::C  A
 VVuSAS=T=T7U7UVVV
 	E 	EC4?4DId+C011 	J(3K4H(IIdn%r*   )NNNNNNNNNNNr-   r&   NNN)NNN)NNNNNNN)NNNT)r   Nr   )r   rF  )r   r~  )K__name__
__module____qualname____doc__r(   staticmethodr,   gymspacesSpacer   r!   r   r   r   strr   r	   typer_   rY   r   r
   r   r   boolr   r   r   r   r   r    r   r   r   r   r   r   r   r   r   r  r  r   r   r>  rC  r   r  rI  r   r   rn  rp  rt  rs  r{  anyr}  rW  r   r   rf  r|   r  r  r  r%  r   r   r   r   r  r  __classcell__)r   s   @r+   r$   r$   )   s        J %&!E E \E $(48-1/3,0%)37482626)-&''+(,)-3Q Q:+Q j&Q $	Q
 Q Q #Q JZ 001Q %Z01Q  Q &j1Q z*Q !,Q j)Q TNQ  tJ/0!Q"  Z 01#Q$ $J/%Q& $J/'Q( :&)Q* +Q, !$-Q. $/Q0 *%1Q2 :&3Q Q Q Q Q Qf Xf "&$ $+tCO'<<=$ $ 3-	$ 
z4
+T#z/-BB	C$ $ $ $L Xf 59AEAE04"&"&( (j):56(  Z 01( !j!1:!=>	(
 !j!1:!=>( T#t)_-( $( 3-( ( ( (T Xf
 59KOKO#'0' 0'tJ'340' j):560'  Z 01	0'
 $E$z*:J*F$GH0' $E$z*:J*F$GH0' !0' 
0' 0' 0' 0'd Xf +  $sJBW        D Xf$#.$	~tCO44	5$ $ $ $ %k %h % % % \%N Xf D     Xf-U4Z#8$z:J#JK - - - - Xf4 4 4 4 XfCtCO'< C C C C Z+4888,d3
?&; , , , 98, Xf+d + + + + Xf'3 ' ' ' ' Xf
; 
 
 
 
 
 
 Xf!{ !t ! ! ! ! ! !" Xf6F 6Fs 6F(3- 6F4 6F 6F 6F 6Fp Xf> > > > > > XfXm4    * * *+ + + +6%$ % % % %W
:&W
59%Z:P5QW
	W
 W
 W
 W
r"Ds4E/E)F$G "J " " " "Z5K0L    -d3
?.C - - - -T#z/%:    (d:z3I.J    'DcN ' ' ' '
- 
- 
- 
-!>n)==>!> JZ 001!> 
tN#T$~*>%??	@	!> !> !> !>F%n)==>% nd>.BBC% 
	% % % %N	! 	! 	!D D D
 
 

 
 
7! 7! 7!z E6 E6 E6 E6 E6N& & &
 
 
    , ,t , , , , , , , ,r*   r$   )@loggingr`   typingr   r   r   r   r   	gymnasiumr  numpyr   r   rc   ray._common.deprecationr   ray.rllib.models.modelv2r	   ray.rllib.policy.policyr
   r   r   ray.rllib.policy.rnn_sequencingr   ray.rllib.policy.sample_batchr   ray.rllib.utilsr   ray.rllib.utils.annotationsr   r   ray.rllib.utils.debugr   ray.rllib.utils.errorr   ray.rllib.utils.frameworkr   ray.rllib.utils.metricsr   r   r   $ray.rllib.utils.metrics.learner_infor   "ray.rllib.utils.spaces.space_utilsr   ray.rllib.utils.tf_run_builderr   ray.rllib.utils.tf_utilsr   r   ray.rllib.utils.typingr   r   r    r!   ray.util.debugr"   r~   rG   tfv	getLoggerr  r[   r$   r)   r*   r+   <module>r     sq     5 5 5 5 5 5 5 5 5 5 5 5 5 5          



 . . . . . . , , , , , , C C C C C C C C C C O O O O O O 5 5 5 5 5 5 & & & & & & = = = = = = = = + + + + + + K K K K K K 3 3 3 3 3 3         
 C B B B B B ? ? ? ? ? ? 8 8 8 8 8 8 I I I I I I I I            $ # # # # #}R		8	$	$ C C C C Cv C C C C Cr*   