
    &`i                        d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	m
Z
 d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dl m!Z!m"Z"m#Z#m$Z$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5m6Z6m7Z7 d dl8m9Z9  e)            \  Z:Z;Z< e j=        e>          Z?e! G d de                      Z@dS )    N)OrderedDict)DictListOptionalTupleTypeUnion)ModelCatalog)ModelV2)TFActionDistributionTFMultiGPUTowerStack)Policy)SampleBatch)TFPolicy)ViewRequirement)
force_list)OldAPIStackOverrideToImplementCustomLogic5OverrideToImplementCustomLogic_CallToSuperRecommendedis_overriddenoverride)	summarize)try_import_tf)'DIFF_NUM_GRAD_UPDATES_VS_SAMPLER_POLICYNUM_GRAD_UPDATES_LIFETIME)LEARNER_STATS_KEY)get_dummy_batch_for_space)get_placeholder)AlgorithmConfigDictLocalOptimizerModelGradients
TensorType)log_oncec                       e Zd ZdZddddej        j        dej        j        dedee	e
df                  d	ee         f
 fd
Zed             Zedej        j        dej        j        defd            Ze ee          deedf         dee         dedeeee         f         fd                        Zedede	e
ef         fd            Zededede	e
ef         fd            Zedefd            Zedededefd            Zedddeddfd            Z edededede!eeeee         f         fd            Z"edededede!ee#ee         f         fd             Z$ede%fd!            Z& ee'          e(de	e
ef         f fd"                        Z)e(de	e
ef         fd#            Z* ee'          d$             Z+ ee          e(	 	 dAd%ed&ee         fd'                        Z, ee'          ededed         f         f fd(                        Z-d) Z.d* Z/de	e
df         fd+Z0de	e
df         de!ee%ef         ee1ef         f         fd,Z2d- Z3d.ee%ef         d/ee1ef         de!eeee#e	e
ef         f         fd0Z4d1 Z5d2 Z6 ee          	 dBd4e1ddfd5            Z7defd6Z8 ee'          dee!e
df                  de'fd7            Z9 ee          dee         fd8            Z: ee          	 dCd:ed;e%de%fd<            Z; ee          dCd;e%de%fd=            Z< ee          dDd>e%d;e%fd?            Z= ee'           fd@            Z> xZ?S )EDynamicTFPolicyV2zA TFPolicy that auto-defines placeholders dynamically at runtime.

    This class is intended to be used and extended by sub-classing.
    Nexisting_inputsexisting_model	obs_spaceaction_spaceconfigr(   ztf1.placeholderr)   c                h   || _         || _        || _        d| _        d | _        |d u| _        |                     |||           |                                 | _        |rlt          |t                    rW|d         | _        t          dt          |                    D ]+}t          | ||         d         ||         d                    ,n|                                 | _        |                                  |                     |           |                                  |                     |          \  }}|                     ||          \  }	}
}| _        t,                                          p4t,                              t-          j        d
i | j        d                   }|                                 }t6          j        | j        j        v r| j        t6          j                 nd }t6          j        | j        j        v r| j        t6          j                 nd }tA                      !                    ||||| j        t6          j"                 | j        t6          j#                 |	|
|| j        d g | j        | j$        | j%        ||| j        |d         &                    dd          |||	           d S )Ntfr      tf_session_args)r,   modelmax_seq_len   )observation_spacer+   r,   sess	obs_inputaction_inputsampled_actionsampled_action_logpdist_inputs
dist_classlossloss_inputsr1   state_inputsstate_outputsprev_action_inputprev_reward_inputseq_lensr2   batch_divisibility_reqexploretimestep )'r4   r+   r,   	framework	_seq_lens	_is_towervalidate_spaces_init_dist_classr;   
isinstancelistr1   rangelensetattr
make_model/_update_model_view_requirements_from_init_state_init_state_inputs_init_view_requirements _init_input_dict_and_dummy_batch_init_action_fetches_policy_extra_action_fetchestf1get_default_sessionSessionConfigProtoget_batch_divisibility_reqr   PREV_ACTIONS_input_dictaccessed_keysPREV_REWARDSsuper__init__OBSACTIONS_state_inputs
_state_outget)selfr*   r+   r,   r(   r)   irE   rD   r8   r9   r:   r5   rC   r@   rA   	__class__s                   y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/policy/dynamic_tf_policy_v2.pyrb   zDynamicTFPolicyV2.__init__6   s    "+((4Yf===//11 	+j>> 	+'*DJ1c.1122 J JnQ/2N14Ea4HIIIIJ **DJ<<>>>000$$&&& AA/RR' %%h88	
- &&(( 
CKK?DDT[1B%CDD -8 -
 -
 "&!@!@!B!B '4+;+III [566 	 '4+;+III [566 	 	'%&{7)+*=>) 3#*+///^w++M2>>#9- 	 	
 	
 	
 	
 	
    c                      d S NrF   rF   rl   rk   #enable_eager_execution_if_necessaryz5DynamicTFPolicyV2.enable_eager_execution_if_necessary   s	     	rl   c                     i S rn   rF   )rh   r*   r+   r,   s       rk   rJ   z!DynamicTFPolicyV2.validate_spaces   	     	rl   r1   ztf.keras.Modelr;   train_batchreturnc                     t           )a1  Constructs loss computation graph for this TF1 policy.

        Args:
            model: The Model to calculate the loss for.
            dist_class: The action distr. class.
            train_batch: The training data.

        Returns:
            A single loss tensor or a list of loss tensors.
        )NotImplementedError)rh   r1   r;   rr   s       rk   r<   zDynamicTFPolicyV2.loss   s
    $ "!rl   c                     i S )zStats function. Returns a dict of statistics.

        Args:
            train_batch: The SampleBatch (already) used for training.

        Returns:
            The stats dict.
        rF   )rh   rr   s     rk   stats_fnzDynamicTFPolicyV2.stats_fn   s	     	rl   gradsc                     i S )zGradient stats function. Returns a dict of statistics.

        Args:
            train_batch: The SampleBatch (already) used for training.

        Returns:
            The stats dict.
        rF   )rh   rr   rx   s      rk   grad_stats_fnzDynamicTFPolicyV2.grad_stats_fn   s	     	rl   c                     t          j        | j        | j        d                   \  }}t          j        | j        | j        || j        d         d          S )zoBuild underlying model for this Policy.

        Returns:
            The Model for the Policy to use.
        r1   r.   )r*   r+   num_outputsmodel_configrG   )r
   get_action_distr+   r,   get_model_v2r4   )rh   _	logit_dims      rk   rQ   zDynamicTFPolicyV2.make_model   sa     $3t{73
 
9 (,*!W-
 
 
 	
rl   	optimizerr<   c                     dS )a  Gradients computing function (from loss tensor, using local optimizer).

        Args:
            policy: The Policy object that generated the loss tensor and
                that holds the given local optimizer.
            optimizer: The tf (local) optimizer object to
                calculate the gradients with.
            loss: The loss tensor for which gradients should be
                calculated.

        Returns:
            ModelGradients: List of the possibly clipped gradients- and variable
                tuples.
        NrF   )rh   r   r<   s      rk   compute_gradients_fnz&DynamicTFPolicyV2.compute_gradients_fn   s	    $ trl   ztf.keras.optimizers.Optimizerztf.Operationc                     dS )aY  Gradients computing function (from loss tensor, using local optimizer).

        Args:
            optimizer: The tf (local) optimizer object to
                calculate the gradients with.
            grads: The gradient tensor to be applied.

        Returns:
            "tf.Operation": TF operation that applies supplied gradients.
        NrF   )rh   r   rx   s      rk   apply_gradients_fnz$DynamicTFPolicyV2.apply_gradients_fn   s	      trl   	obs_batchstate_batchesc                    dS )ae  Custom function for sampling new actions given policy.

        Args:
            model: Underlying model.
            obs_batch: Observation tensor batch.
            state_batches: Action sampling state batch.

        Returns:
            Sampled action
            Log-likelihood
            Action distribution inputs
            Updated state
        )NNNNrF   rh   r1   r   r   kwargss        rk   action_sampler_fnz#DynamicTFPolicyV2.action_sampler_fn   s
    , &%rl   c                    dS )aC  Action distribution function for this Policy.

        Args:
            model: Underlying model.
            obs_batch: Observation tensor batch.
            state_batches: Action sampling state batch.

        Returns:
            Distribution input.
            ActionDistribution class.
            State outs.
        )NNNrF   r   s        rk   action_distribution_fnz(DynamicTFPolicyV2.action_distribution_fn  s
    *  rl   c                     dS )zrGet batch divisibility request.

        Returns:
            Size N. A sample batch must be of size K*N.
        r/   rF   rh   s    rk   r\   z,DynamicTFPolicyV2.get_batch_divisibility_req.  s	     qrl   c                 |    t                                                      }|                    | j                   |S )zExtra values to fetch and return from compute_actions().

        Returns:
             Dict[str, TensorType]: An extra fetch-dict to be passed to and
                returned from the compute_actions() call.
        )ra   extra_action_out_fnupdaterW   )rh   extra_action_fetchesrj   s     rk   r   z%DynamicTFPolicyV2.extra_action_out_fn8  s8      %ww::<<##D$EFFF##rl   c                     i S )zExtra stats to be reported after gradient computation.

        Returns:
             Dict[str, TensorType]: An extra fetch-dict.
        rF   r   s    rk   extra_learn_fetches_fnz(DynamicTFPolicyV2.extra_learn_fetches_fnE  rq   rl   c                 N    t          t          i ifi |                                 S rn   )dictr   r   r   s    rk   extra_compute_grad_fetchesz,DynamicTFPolicyV2.extra_compute_grad_fetchesN  s*    &+MMt/J/J/L/LMMMrl   sample_batchother_agent_batchesc                 ,    t          j        | |          S )a  Post process trajectory in the format of a SampleBatch.

        Args:
            sample_batch: sample_batch: batch of experiences for the policy,
                which will contain at most one episode trajectory.
            other_agent_batches: In a multi-agent env, this contains a
                mapping of agent ids to (policy, agent_batch) tuples
                containing the policy and experiences of the other agents.
            episode: An optional multi-agent episode object to provide
                access to all of the internal episode state, which may
                be useful for model-based or multi-agent algorithms.

        Returns:
            The postprocessed sample batch.
        )r   postprocess_trajectory)rh   r   r   episodes       rk   r   z(DynamicTFPolicyV2.postprocess_trajectoryR  s    . ,T<@@@rl   c                 D    t                                                      S )zTF optimizer to use for policy optimization.

        Returns:
            A local optimizer or a list of local optimizers to use for this
                Policy's Model.
        )ra   r   )rh   rj   s    rk   r   zDynamicTFPolicyV2.optimizerk  s     ww  """rl   c                     t          | j                  st          | j                  r%t          | j                  st	          d          d S t          j        | j        | j        d                   \  }}|S )NzT`make_model` is required if `action_sampler_fn` OR `action_distribution_fn` is givenr1   )	r   r   r   rQ   
ValueErrorr
   r~   r+   r,   )rh   r;   r   s      rk   rK   z"DynamicTFPolicyV2._init_dist_classx  s    /00 	M'5
 5
 	 !11  8   4(8!4;w#7 MJ rl   c                    t          | dd           rd S |                                 | _        | j                            | j        j                   t
          j        | j        v rd| j        t
          j                 _        d S d S )Nview_requirementsF)getattr_get_default_view_requirementsr   r   r1   r   INFOSused_for_trainingr   s    rk   rT   z)DynamicTFPolicyV2._init_view_requirements  s    4,d33 	F "&!D!D!F!F 	%%dj&BCCC 666JOD";#45GGG 76rl   c                 Z   |rEd |                                 D             | _        | j        r|t          j                 | _        dS dS d | j        j                                         D             | _        | j        r/t                              t          j
        dgd          | _        dS dS )zjInitialize input placeholders.

        Args:
            existing_inputs: existing placeholders.
        c                 B    g | ]\  }}|                     d           |S )	state_in_)
startswith).0kvs      rk   
<listcomp>z8DynamicTFPolicyV2._init_state_inputs.<locals>.<listcomp>  s=     " " "ak9R9R"" " "rl   c           	          g | ]J\  }}|                     d           t          |j        t          |j        t
                     |          KS )r   )space	time_axisname)r   r   r   rL   shiftint)r   r   vrs      rk   r   z8DynamicTFPolicyV2._init_state_inputs.<locals>.<listcomp>  si     " " " Ar<<,,"(",RXs";";;  " " "rl   NrB   )dtypeshaper   )itemsre   r   SEQ_LENSrH   r1   r   rX   placeholderr.   int32)rh   r(   s     rk   rS   z$DynamicTFPolicyV2._init_state_inputs  s      	" "-3355" " "D ! G!01E!FG G" " "Z9??AA" " "D ! !$(4&z "1 " " rl   c                    | j         r7|J |d         }d}|                     | j        |          \  | _        | _        nt
                              t                              dt          j	                  dd          }t
                              ddd          }|                     | j        i           \  | _        | _        | j        
                    |                                            ||fS )	aQ  Initialized input_dict and dummy_batch data.

        Args:
            existing_inputs: When copying a policy, this specifies an existing
                dict of placeholders to use instead of defining new ones.

        Returns:
            timestep: training timestep.
            explore: whether this policy should explore.
        NrE   FrF   )r   )r   Tis_exploring)rI   "_create_input_dict_and_dummy_batchr   r^   _dummy_batchrX   placeholder_with_defaultr.   zerosint64set_training_get_is_training_placeholder)rh   r(   rE   rD   s       rk   rU   z2DynamicTFPolicyV2._init_input_dict_and_dummy_batch  s    > 	T"...&z2HG 77&  !! 3328,,bz 4  H 224.2QQG 778NPRSS ! 	%%d&G&G&I&IJJJ  rl   c                    i }|                                 D ]%\  }}t          j        d|          }|1| j        t	          |                    d                             ||<   N|                    d          rd|t          j        k    ru||v r||         ||<   t          |j
        t                     }|j        r~| j                            d          r|t          j        t          j        fv rd}n,|t          j        t          j        fv r| j        d         rd}nd}t%          |j        |||	          ||<   '|                     d
          }	t          || j                  |	fS )a4  Creates input_dict and dummy_batch for loss initialization.

        Used for managing the Policy's input placeholders and for loss
        initialization.
        Input_dict: Str -> tf.placeholders, dummy_batch: str -> np.arrays.

        Args:
            view_requirements: The view requirements dict.
            existing_inputs (Dict[str, tf.placeholder]): A dict of already
                existing placeholders.

        Returns:
            Tuple[Dict[str, tf.placeholder], Dict[str, np.ndarray]]: The
                input_dict/dummy_batch tuple.
        zstate_in_(\d+)Nr/   
state_out__disable_action_flatteningF_disable_preprocessor_apiT)r   r   r   flatten    
batch_size)rB   )r   rematchre   r   groupr   r   ACTION_DIST_INPUTSrL   r   r   r,   rg   rd   r]   rc   NEXT_OBSr   r   '_get_dummy_batch_from_view_requirementsrH   )
rh   r   r(   
input_dictview_colview_reqmor   r   dummy_batchs
             rk   r   z4DynamicTFPolicyV2._create_input_dict_and_dummy_batch  s     
"3"9"9";"; (	 (	Hh+X66B~'+'9#bhhqkk:J:J'K
8$$$$\22 "[;;;_,,'6x'@
8$$ !+8>3 ? ??	-  {'CDD '#+#0V J J #( ![_k6J$KKK K(CD L #( #'+:&n%"+ '	, , ,Jx( BBbBQQ:???LLrl   rE   rD   c           
         d}d}d}i }d| _         | j        s|                                 | _        t	          | j                  r|                     | j        | j        t          j	                 | j
        | j        | j                            t          j                  | j                            t          j                  || j        j                  \  }}}| _         nt	          | j                  rV| j        }|                     | j        |t          j	                 | j
        | j        |||j                  \  }| _        | _         njt%          | j        t&          j        j                  r$|                     | j                  \  }| _         }n"|                     | j                  \  }| _         |                     || j                  }| j                            |||          \  }}|||t          j        <   |Y||t          j        <   t&                              t&                              |t&          j                            |t          j        <   ||||fS )zECreate action related fields for base Policy and loss initialization.N)r   r   rB   prev_action_batchprev_reward_batchrD   is_training)r   r   rB   rD   rE   r   )action_distributionrE   rD   )rf   rI   _create_explorationexplorationr   r   r1   r^   r   rc   re   rH   rg   r]   r`   r   r   r;   rL   r.   kerasModelget_exploration_actionr   ACTION_LOGPexpcastfloat32ACTION_PROB)	rh   rE   rD   r8   r9   r:   r   in_dictaction_dists	            rk   rV   z&DynamicTFPolicyV2._init_action_fetches  se    "!~ :	#7799D T344 5 **J".{?"&"4!^&*&6&:&:;;S&T&T&*&6&:&:;;S&T&T# $ 0 < + 	 	"'OO !!<== T #.G
 33
")+/":&*&8!% '!)$+$7 4  	# "$*bhn== TMQZZ ,N NJT_6J6J 8<zz$BR7S7S4T_"ook4:FF $;;(3hPW <  "'
 "CN !?@*<O !89<>FF+RZ88= = !89
  	
 	
rl   c                     t          |                                           }| j        r| j                            |          }|sd S || _        |d         | _        d S )Nr   )r   r   r   get_exploration_optimizer_optimizers
_optimizer)rh   
optimizerss     rk   _init_optimizersz"DynamicTFPolicyV2._init_optimizerst  se       0 011
 	P)CCJOOJ  	F &$Q-rl   c                      j         r@                                                     t                                                     d S                                                        d           t           j                  dk    st          d  j        D                       rwt          
                    dt          j                  5   fdt           j                            dd                    D              _        d d d            n# 1 swxY w Y                                                        t                                                     d S )	NT)auto_remove_unneeded_view_reqsr/   c              3      K   | ]}d |v V  	dS )gpuNrF   )r   ds     rk   	<genexpr>zHDynamicTFPolicyV2.maybe_initialize_optimizer_and_loss.<locals>.<genexpr>  s&      'I'Iq
'I'I'I'I'I'Irl    )reusec                 0    g | ]}t                     S ))policyr   )r   r   rh   s     rk   r   zIDynamicTFPolicyV2.maybe_initialize_optimizer_and_loss.<locals>.<listcomp>  s4     / / / )555/ / /rl   num_multi_gpu_tower_stacks)rI   get_sessionrunrX   global_variables_initializerr   !_initialize_loss_from_dummy_batchrO   devicesanyvariable_scope
AUTO_REUSErN   r,   rg   multi_gpu_tower_stacksr   s   `rk   #maybe_initialize_optimizer_and_lossz5DynamicTFPolicyV2.maybe_initialize_optimizer_and_loss  s   > 	""3#C#C#E#EFFFF 	..d.SSS t|q  C'I'IDL'I'I'I$I$I 
 ##Bcn#==  / / / /"4;??3OQR#S#STT/ / /+               	s??AABBBBBs   :DDDTr   c                 >                                                          t                                                      j                                        D ]/\  }}|                    d          s| j        j        vrd|_	        0 
                                                                D ]-\  }}t          t          j                            dd|j                                        dd          |j        j                  t'           j                             j        |<   t+          ||           j        |<   | j        vrt,                              d	                    |                     t3          t          j                            dd|j                                        dd          |j        j                  d
           j        |<   / j        }t,                              d            j                             |                                                                  |          }|j        D ]}| j        vrt+          ||         |           j        |<   | j        vrXt3          t          j                            dd||         j        dd          ||         j                  d
           j        |<   t;          t=           j        fi  j        d          } j         rJ j!        |t:          j"        <    j        #                    t:          j"        |t:          j"                 i            j        #                    t=          |                     tI          d          r:t,          %                    d                    tM          |                                '                    |          }|j        |j        z  |j        z  tQ           j)        j        *                                          z  tW          j,         |fd|                                D             t:          j"        |v rt:          j"        |t:          j"                 fgng z              d j        v r j        d=  j-        #                     .                    | j/                             |r3|j        |j        z  |j        D ]}||j        vr| j)        j        vr|t:          j0        t:          j1        t:          j2        t:          j3        t:          j4        t:          j5        t:          j6        t:          j7        t:          j8        f	vr,| j        v rd j        |         _9        | j        v r j        |= tu           j        *                                          D ]}|vr|t:          j0        t:          j1        t:          j2        t:          j3        t:          j4        t:          j5        t:          j6        t:          j7        fvrk| j)        j        vr]||j;        v r.t,          <                    d                    |                     n j=        d          j        |= | j        v r j        |= tu           j        *                                          D ]U} j        |         }	|	j>        ?|	j>         j        vr1|	j>        |j        v }
t3          |	j?        |
           j        |	j>        <   V fd j                                        D              _@        d S )Nr   Fg      g      ?r/   )r   r   r   )valuer   z,Adding extra-action-fetch `{}` to view-reqs.)r   used_for_compute_actionsz0Testing `postprocess_trajectory` w/ dummy batch.T)_is_training	loss_initz1Initializing loss function with dummy input:

{}
c                 &    g | ]\  }}|v 	||fS rF   rF   )r   r   r   all_accessed_keyss      rk   r   zGDynamicTFPolicyV2._initialize_loss_from_dummy_batch.<locals>.<listcomp>  s,    NNN1q<M7M7MaV7M7M7Mrl   r   zSampleBatch key '{}' was deleted manually in postprocessing function! RLlib will automatically remove non-used items from the data stream. Remove the `del` from your postprocessing function.output)r   r   c                 D    i | ]\  }}|j         v|j        k    ||S rF   )re   rH   )r   r   r   rh   s      rk   
<dictcomp>zGDynamicTFPolicyV2._initialize_loss_from_dummy_batch.<locals>.<dictcomp>R  sC     (
 (
 (
1+++T^0C0C q0C0C0Crl   )Ar   r   rX   r   r   r   r   r^   r_   r  r   r   gymspacesBoxr   as_listr   r   rO   r   r   loggerinfoformatr   r   r   
added_keysr   r   _loss_input_dictre   rH   r   r   r$   debugr   _do_loss_initsetr1   keysr   _initialize_loss_stats_fetchesrz   _gradsEPS_IDAGENT_INDEX	UNROLL_IDTERMINATEDS
TRUNCATEDSREWARDSr   T
OBS_EMBEDSr   rM   deleted_keyswarningr,   data_colr   _loss_input_dict_no_rnn)rh   r   keyr   r  r   r   rr   lossesr   r   r  s   `          @rk   r   z3DynamicTFPolicyV2._initialize_loss_from_dummy_batch  sP   
 	s??AABBB "399;; 	: 	:MCNN;//:t/===4912244::<< 	 	JC%>
#U[%8%8%:%:122%>ekFV    t011	& & &Dc" %4%c$J$J$JDS!$000JQQRUVVWWW.=*..#k1133ABB7#k.	 )   .3/ / /&s+ 'FGGG//k4CSCSCUCUVVV''44) 	 	C$***(7%c*) ) ) % $000.=*..)#.4QRR8)#.4	 )   .3/ / /&s+ "!;;T%:;;
 
 

  	04K,-!((%{;3G'HI   	$$T+%6%6777K   	LLFMMk**    ##K00 %'($% $*.3355667 	 	!NNNN 1 1 3 3NNN ';66 &K4H(IJKK			
 		
 		
 D111%m4 	""4#5#5k4;#O#OPPP * J	 + 9K<U U"0 7 7{8884:#???#*#/#-#/#.#+#)##.
  d444HM.s3Ed333 1#6
 D27799:: !7 !7000#*#/#-#/#.#+#)#	
 
 4:#???
 k6667 8>vc{{    X.6 237d333 1#6 D27799:: 	 	+C0K+4+AAA(*{7P(P%:I h:K; ; ;D*2;7(
 (
 (
 (
-3355(
 (
 (
$$$rl   c                 R   |                      | j        | j        |          }t          |          }| j                            |                     |                     g | _        t          | j        t          j
        j                  s| j                                        | _        |S rn   )r<   r1   r;   r   r  r   rw   _update_opsrL   r.   r   r   
update_ops)rh   rr   r,  s      rk   r  zDynamicTFPolicyV2._do_loss_initX  s    4:tDDF##""4==#=#=>>>$*bhn55 	7#z4466Drl   c                    t          j        | j                  }t          j        | j                  }t	          |          t	          |          k    rt          d| j        | j        |          t          |          D ]a\  }}|j        	                                ||         j        	                                k    r"t          d||j        ||         j                  bg }t          t	          | j                            D ]B}|                    d                    |          |t	          |          |z            f           C|r'|                    t          j        |d         f           t          j        | j        |dt	          |                             t!          d| j        fd| j        fgfdt          | j                                                  D             z   |z             }|                     | j        | j        | j        || j        d	t3          | d	d          fd
t3          | d
d          fg          }||_        |                    t          |                    }	fdt          | j                                                  D             }
t7          j        ||	|
           |j                            |                    ||j                              |S )z9Creates a copy of self using existing input placeholders.zTensor list mismatchzTensor shape mismatchzstate_in_{}Nr   rE   c                 *    g | ]\  }}||         fS rF   rF   r   ri   r   existing_inputs_unflatteneds      rk   r   z*DynamicTFPolicyV2.copy.<locals>.<listcomp>  s8       Aq /23  rl   target_q_modeltarget_modelr'   c                 *    g | ]\  }}||         fS rF   rF   r3  s      rk   r   z*DynamicTFPolicyV2.copy.<locals>.<listcomp>  s8     
 
 
1 +A./
 
 
rl   )!treer   r  r*  rO   r   re   	enumerater   r  rN   appendr  r   r   unflatten_asr   _is_exploring	_timestepr  rj   r4   r+   r,   r1   r   r  r   r  r  r   rz   r  )rh   r(   flat_loss_inputsflat_loss_inputs_no_rnnri   r   
rnn_inputsr   instancer,  r=   r4  s              @rk   copyzDynamicTFPolicyV2.copyb  s8     <(=>>"&,t/K"L"L   C$8$888&%"	   566 	 	DAqw  OA$6$<$D$D$F$FFF +Q9K9Q   G 
s4-..// 	 	A!((++#C(?$@$@1$DE     	K{3_R5HIJJJ&*&7(:c"9:::;'
 '
# !d01J3OP   %d&B&G&G&I&IJJ  
 
 

 >>"K&
 "741A4#H#HI~t!D!DE " 
 
 %/!''J(?(?@@
 
 
 
!$">"C"C"E"EFF
 
 

 	!(FK@@@&&"":x??	
 	
 	
 rl   c                 F    | j         r| j                                         S g S rn   )r1   get_initial_stater   s    rk   rD  z#DynamicTFPolicyV2.get_initial_state  s%    : 	://111Irl   r   batchbuffer_indexc                    |                     d           t          | j                  dk    r/| j        d         dk    r|dk    sJ || _        t          |          S |                     |d          t          j        | j                  }| j        r| j        | j	        gz   }ng }fd|D             }fd|D             }| j
        |                             |                                 |||j        	          S )
NTr/   r   /cpu:0F)shufflec                      g | ]
}|         S rF   rF   r   r   r   s     rk   r   z<DynamicTFPolicyV2.load_batch_into_buffer.<locals>.<listcomp>  s    333A*Q-333rl   c                      g | ]
}|         S rF   rF   rK  s     rk   r   z<DynamicTFPolicyV2.load_batch_into_buffer.<locals>.<listcomp>  s    :::!
1:::rl   )r5   inputsr>   num_grad_updates)r   rO   r   _loaded_single_cpu_batch_get_loss_inputs_dictr8  r   r*  re   rH   r  	load_datar   rN  )rh   rE  rF  	data_keys
state_keysrM  r>   r   s          @rk   load_batch_into_bufferz(DynamicTFPolicyV2.load_batch_into_buffer  s&    	4    t|!!dl1o&A&A1$$$$,1D)u:://u/EE
L!=>>	 	+t~.>>JJJ3333333::::z:::*<8BB!!##%"3	 C 
 
 	
rl   c                     t          | j                  dk    r6| j        d         dk    r%|dk    sJ | j        t          | j                  ndS | j        |         j        S )Nr/   r   rH  )rO   r   rO  r  num_tuples_loaded)rh   rF  s     rk   "get_num_samples_loaded_into_bufferz4DynamicTFPolicyV2.get_num_samples_loaded_into_buffer  ss     t|!!dl1o&A&A1$$$$ 0< D1222 *<8JJrl   offsetc                    t          | j                  dk    r| j        d         dk    r|dk    sJ | j        t          d          | j                            d          }|&| j                            d| j        d                   }|t          | j                  k    r| j        }n| j                            |||z             }|                     |          S | j        |         }|	                    | 
                                |          }| xj        dz  c_        |                    t          | j        t          | j        dz
  |j        pdz
  i           |S )	Nr/   r   rH  zPMust call Policy.load_batch_into_buffer() before Policy.learn_on_loaded_batch()!minibatch_sizesgd_minibatch_sizetrain_batch_size)startend)rO   r   rO  r   r,   rg   slicelearn_on_batchr  optimizer   rN  r   r   r   )rh   rX  rF  r   sliced_batchtower_stackresultss          rk   learn_on_loaded_batchz'DynamicTFPolicyV2.learn_on_loaded_batch  s    t|!!dl1o&A&A1$$$$,4 6   )9::J!![__($+6H*I 
 S!>????#<#<BB fz&9  C     &&|4441,?&&t'7'7'9'96BB")4+@7)A-1M1RQRS		
 	
 	
 rl   c                 6   t          |          }t          |          }t          | j                  rE| j        d         r|                     ||          S |                     |d         |d                   S t	                                          ||          S )N%_tf_policy_handles_more_than_one_lossr   )r   r   r   r,   ra   	gradients)rh   r   r<   r   r,  rj   s        rk   rh  zDynamicTFPolicyV2.gradients
  s    	**
D!!233 		9 {BC K00VDDD 00Aq	JJJ77$$Z888rl   )NN)T)r   )r   r   )@__name__
__module____qualname____doc__r  r  Spacer    r   r   strr   rb   staticmethodro   r   rJ   r   r   r	   r   r   r   r#   r   r<   rw   r"   rz   rQ   r!   r   r   r   r   typer   r   r\   r   r   r   r   r   r   r   rK   rT   rS   boolrU   r   rV   r   r  r   r  rB  rD  rT  rW  re  rh  __classcell__)rj   s   @rk   r&   r&   /   s         CG,0Q
 Q
 Q
:#Q
 j&Q
 $	Q
 "$s,='=">?Q
 !)Q
 Q
 Q
 Q
 Q
 Q
f   \
 $:# j& $	   $# $Xf"W../" -." !	"
 
z4
++	," " "  $#"$ $	K 	Dj4I 	 	 	 $#	 $&/=	c:o	   $# $
G 
 
 
 $#
$ $'/9	   $#& $2  
	   $#" $&& 	&
 "& 
z:z4
3CC	D& & & $#&. $   	 
 "  
z4j!11	2      $# , $C    $# Xh:	$T#z/%: 	$ 	$ 	$ 	$ 	$ ;: 	$ ;S*_(=    ;: XhN N N Xf: 6:	A A!A &k2A A A ;: A. Xh#	#	.5T0UU	V	# 	# 	# 	# 	# $# 	#   P P P $s<M7M2N    <'!#C):$:;'!	uS*_%uT:-='>>	?'! '! '! '!R<M <M <M|U
c:o.U
9>tZ?O9PU
	z:z4c:o9NN	OU
 U
 U
 U
n( ( ( C C C4 Xf59w
 w
.2w
	w
 w
 w
 w
r     XhEDs4E/E)F$G EH E E E EN Xf4
#3     Xf 
 

 
 
	
 
 
 
> XfK Ks K3 K K K K Xf( (C (3 ( ( ( (T Xh9 9 9 9 9 9 9 9 9rl   r&   )Aloggingr   collectionsr   typingr   r   r   r   r   r	   	gymnasiumr  r8  ray.rllib.models.catalogr
   ray.rllib.models.modelv2r   "ray.rllib.models.tf.tf_action_distr   "ray.rllib.policy.dynamic_tf_policyr   ray.rllib.policy.policyr   ray.rllib.policy.sample_batchr   ray.rllib.policy.tf_policyr   !ray.rllib.policy.view_requirementr   ray.rllib.utilsr   ray.rllib.utils.annotationsr   r   r   r   r   ray.rllib.utils.debugr   ray.rllib.utils.frameworkr   ray.rllib.utils.metricsr   r   $ray.rllib.utils.metrics.learner_infor   "ray.rllib.utils.spaces.space_utilsr   ray.rllib.utils.tf_utilsr   ray.rllib.utils.typingr    r!   r"   r#   ray.util.debugr$   rX   r.   tfv	getLoggerri  r  r&   rF   rl   rk   <module>r     s    				 # # # # # # ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ;      1 1 1 1 1 1 , , , , , , C C C C C C C C C C C C * * * * * * 5 5 5 5 5 5 / / / / / / = = = = = = & & & & & &              , + + + + + 3 3 3 3 3 3        C B B B B B H H H H H H 4 4 4 4 4 4            $ # # # # #}R		8	$	$ h9 h9 h9 h9 h9 h9 h9 h9 h9 h9rl   