
    &`i                        d dl Z d dlZd dlmZmZ d dlmZmZmZm	Z	m
Z
mZmZ d dlZd dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlm Z  d dl!m"Z" d dl#m$Z$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2m3Z3m4Z4m5Z5 d dl6m7Z7  e)            \  Z8Z9Z: e j;        e<          Z=dZ>e$ G d de                      Z?e$ G d d                      Z@ edg d          ZAd ZBd ZCdS )    N)OrderedDict
namedtuple)CallableDictListOptionalTupleTypeUnion)DEPRECATED_VALUEdeprecation_warning)ModelCatalog)ModelV2)TFActionDistribution)Policy)SampleBatch)TFPolicy)ViewRequirement)
force_list)OldAPIStackoverride)	summarize)try_import_tf)'DIFF_NUM_GRAD_UPDATES_VS_SAMPLER_POLICYNUM_GRAD_UPDATES_LIFETIME)get_dummy_batch_for_space)get_placeholder)AlgorithmConfigDictLocalOptimizerModelGradients
TensorType)log_oncetowerc                       e Zd ZdZddddddddded
dej        j        dej        j        dede	e
eee         egef         dee	e
egeeef         f                  d	ee	e
eegeeef         f                  d
ee	e
ej        j        ej        j        egdf                  dee	e
ej        j        ej        j        egef                  dee	eee         geeeef         eeeeee         f         f         f                  dee	e
eeeegeeeee         f         f                  deeedf                  dee         dee	e
gef                  f fdZ ee          deeedf                  defd            Z ee
          dee         fd            Z ee
          	 d#dededefd            Z ee
          d#dedefd            Z ee
          d$dedefd            Z d Z! ee
          	 d%de"ddfd             Z#d!efd"Z$ xZ%S )&DynamicTFPolicya!  A TFPolicy that auto-defines placeholders dynamically at runtime.

    Do not sub-class this class directly (neither should you sub-class
    TFPolicy), but rather use rllib.policy.tf_policy_template.build_tf_policy
    to generate your custom tf (graph-mode or eager) Policy classes.
    N)
stats_fngrad_stats_fnbefore_loss_init
make_modelaction_sampler_fnaction_distribution_fnexisting_inputsexisting_modelget_batch_divisibility_reqobs_include_prev_action_reward	obs_spaceaction_spaceconfigloss_fnr&   r'   r(   r)   r*   r+   r,   ztf1.placeholderr-   r.   c       
             |t           k    rt          dd           | _        | _        | _        d _        | _        | _        | _        d _	        |du _
        d}|	s|
r|st          d          n#t          j        | j        d                   \  }}|rlt          |t                    rV|d          _        t#          d	t%          |                    D ]+}t'           ||         d         ||         d	                    ,n?|r | |||           _        n)t          j        ||| j        d         d
           _                                          |rBd |                                D              _         j        r|t0          j                  _	        nad  j        j                                        D              _         j        r-t6                              t:          j        dgd           _	                                          _         j                              j        j                   t0          j!         j        v rd j        t0          j!                 _"         j
        r4|d         }d} #                     j        |          \   _$         _%        n* j        &                    d          st          j'        |          }i }t0          j(         j        vr!t0          j(        t          j'        |d          i} #                     j        tS          t0          j*        |ifi |          \   _$         _%        n( #                     j        i           \   _$         _%        t6          +                    t:          ,                    dt:          j-                  dd          }t6          +                    ddd          } j$        .                     /                                           d}d}d}i }d _0         j
        s\ 1                                 _2        |	r |	  j         j$        t0          j3                  j         j	         j$        &                    t0          j(                   j$        &                    t0          j4                  | j$        j5        	  	        }t%          |          dk    r|\  }}} _0        nd}g  _0        |\  }}n|
r j$        }	  |
  j        | j         j	        |||j5                  \  }} _0        n# tl          $ r}d|j7        d         v sd|j7        d         v r| |
  j        |t0          j3                  j         j	        |&                    t0          j(                  |&                    t0          j4                  ||j5        	  	        \  }} _0        n|Y d}~nrd}~ww xY wt           j        t:          j8        j9                  r$                      j$                  \  } _0        }n"                      j$                  \  } _0         || j                  } j2        :                    |||          \  }}|||t0          j;        <   |Y||t0          j<        <   t:          =                    t:          >                    |t:          j?                            |t0          j@        <   t6          A                                p4t6          B                    t7          jC        di  j        d                   }t          |          r |           n|pd	}t0          j(         j$        jE        v r j$        t0          j(                 nd} t0          j4         j$        jE        v r j$        t0          j4                 nd}!t                      G                    |||| j$        t0          jH                  j$        t0          j*                 ||||dg  j         j         j0        | |! j	        |d         d         |||            | | |||           t           d!          r jJ                             |           n| _J         j
        s K                    d"           t%           jL                  d	k    st          d#  jL        D                       rwt6          N                    d$t6          jO        %          5   fd&t#           j        &                    d'd	                    D              _P        ddd           n# 1 swxY w Y    Q                                R                    t6          S                                           dS dS )(a  Initializes a DynamicTFPolicy instance.

        Initialization of this class occurs in two phases and defines the
        static graph.

        Phase 1: The model is created and model variables are initialized.

        Phase 2: A fake batch of data is created, sent to the trajectory
        postprocessor, and then used to create placeholders for the loss
        function. The loss and stats functions are initialized with these
        placeholders.

        Args:
            observation_space: Observation space of the policy.
            action_space: Action space of the policy.
            config: Policy-specific configuration data.
            loss_fn: Function that returns a loss tensor for the policy graph.
            stats_fn: Optional callable that - given the policy and batch
                input tensors - returns a dict mapping str to TF ops.
                These ops are fetched from the graph after loss calculations
                and the resulting values can be found in the results dict
                returned by e.g. `Algorithm.train()` or in tensorboard (if TB
                logging is enabled).
            grad_stats_fn: Optional callable that - given the policy, batch
                input tensors, and calculated loss gradient tensors - returns
                a dict mapping str to TF ops. These ops are fetched from the
                graph after loss and gradient calculations and the resulting
                values can be found in the results dict returned by e.g.
                `Algorithm.train()` or in tensorboard (if TB logging is
                enabled).
            before_loss_init: Optional function to run prior to
                loss init that takes the same arguments as __init__.
            make_model: Optional function that returns a ModelV2 object
                given policy, obs_space, action_space, and policy config.
                All policy variables should be created in this function. If not
                specified, a default model will be created.
            action_sampler_fn: A callable returning either a sampled action and
                its log-likelihood or a sampled action, its log-likelihood,
                action distribution inputs and updated state given Policy,
                ModelV2, observation inputs, explore, and is_training.
                Provide `action_sampler_fn` if you would like to have full
                control over the action computation step, including the
                model forward pass, possible sampling from a distribution,
                and exploration logic.
                Note: If `action_sampler_fn` is given, `action_distribution_fn`
                must be None. If both `action_sampler_fn` and
                `action_distribution_fn` are None, RLlib will simply pass
                inputs through `self.model` to get distribution inputs, create
                the distribution object, sample from it, and apply some
                exploration logic to the results.
                The callable takes as inputs: Policy, ModelV2, obs_batch,
                state_batches (optional), seq_lens (optional),
                prev_actions_batch (optional), prev_rewards_batch (optional),
                explore, and is_training.
            action_distribution_fn: A callable returning distribution inputs
                (parameters), a dist-class to generate an action distribution
                object from, and internal-state outputs (or an empty list if
                not applicable).
                Provide `action_distribution_fn` if you would like to only
                customize the model forward pass call. The resulting
                distribution parameters are then used by RLlib to create a
                distribution object, sample from it, and execute any
                exploration logic.
                Note: If `action_distribution_fn` is given, `action_sampler_fn`
                must be None. If both `action_sampler_fn` and
                `action_distribution_fn` are None, RLlib will simply pass
                inputs through `self.model` to get distribution inputs, create
                the distribution object, sample from it, and apply some
                exploration logic to the results.
                The callable takes as inputs: Policy, ModelV2, input_dict,
                explore, timestep, is_training.
            existing_inputs: When copying a policy, this specifies an existing
                dict of placeholders to use instead of defining new ones.
            existing_model: When copying a policy, this specifies an existing
                model to clone and share weights with.
            get_batch_divisibility_req: Optional callable that returns the
                divisibility requirement for sample batches. If None, will
                assume a value of 1.
        r/   T)olderrortfNzT`make_model` is required if `action_sampler_fn` OR `action_distribution_fn` is givenmodelr      )r0   r1   num_outputsmodel_config	frameworkc                 B    g | ]\  }}|                     d           |S )	state_in_)
startswith).0kvs      v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/policy/dynamic_tf_policy.py
<listcomp>z,DynamicTFPolicy.__init__.<locals>.<listcomp>   s=     " " "ak9R9R"" " "    c           	          g | ]J\  }}|                     d           t          |j        t          |j        t
                     |          KS )r>   )space	time_axisname)r?   r   rG   
isinstanceshiftint)r@   rA   vrs      rC   rD   z,DynamicTFPolicy.__init__.<locals>.<listcomp>   si     " " " Ar<<,,"(",RXs";";;  " " "rE   seq_lens)dtypeshaperI   Ftimestep_disable_action_flatteningprev_action )rO   rI   is_exploring)	obs_batchstate_batchesrN   prev_action_batchprev_reward_batchexploreis_training   )
input_dictrX   rN   r[   rQ   r\   zpositional argumentzunexpected keyword argument)action_distributionrQ   r[   tf_session_args)r2   max_seq_len)observation_spacer1   r2   sess	obs_inputaction_inputsampled_actionsampled_action_logpdist_inputs
dist_classlossloss_inputsr8   state_inputsstate_outputsprev_action_inputprev_reward_inputrN   ra   batch_divisibility_reqr[   rQ   _extra_action_fetches)auto_remove_unneeded_view_reqsc              3      K   | ]}d |v V  	dS )gpuNrT   )r@   ds     rC   	<genexpr>z+DynamicTFPolicy.__init__.<locals>.<genexpr>  s&      +M+M1EQJ+M+M+M+M+M+MrE    )reusec                 0    g | ]}t                     S ))policy)TFMultiGPUTowerStack)r@   iselfs     rC   rD   z,DynamicTFPolicy.__init__.<locals>.<listcomp>  s4     3 3 3 -D9993 3 3rE   num_multi_gpu_tower_stacks)Tr   r   rb   r1   r2   r<   _loss_fn	_stats_fn_grad_stats_fn	_seq_lens	_is_tower
ValueErrorr   get_action_distrJ   listr8   rangelensetattrget_model_v2/_update_model_view_requirements_from_init_stateitems_state_inputsr   SEQ_LENSview_requirementstf1placeholderr7   int32_get_default_view_requirementsupdateINFOSused_for_training_get_input_dict_and_dummy_batch_input_dict_dummy_batchgetget_action_placeholderPREV_ACTIONSdictACTIONSplaceholder_with_defaultzerosint64set_training_get_is_training_placeholder
_state_out_create_explorationexplorationCUR_OBSPREV_REWARDSr\   	TypeErrorargskerasModelget_exploration_actionACTION_DIST_INPUTSACTION_LOGPexpcastfloat32ACTION_PROBget_default_sessionSessionConfigProtocallableaccessed_keyssuper__init__OBShasattrrq   !_initialize_loss_from_dummy_batchdevicesanyvariable_scope
AUTO_REUSEmulti_gpu_tower_stacksget_sessionrunglobal_variables_initializer)#r}   r0   r1   r2   r3   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   ri   	logit_dimr|   rQ   r[   	action_phprev_action_phrf   rg   rh   extra_action_fetchesaction_sampler_outputsin_dicteaction_distrc   rp   rn   ro   	__class__s#   `                                 rC   r   zDynamicTFPolicy.__init__7   s
   z *-===$DDQQQQ!*(!+(4
 		 6 		  8   %1$@dk'2% %!J	
  	.$// N+A.
q#n"5"566 N NAD."3A"6q8I!8LMMMM 		#D)\6JJDJJ%2#)%![1  DJ 	<<>>>  	" "-3355" " "D ! G!01E!F" " "Z9??AA" " "D ! !$(4&z "1 " " "&!D!D!F!F%%dj&BCCC 666JOD";#45G > !	R&z2HG262V2V&3 3/Dd// ;??#?@@ U(?MM	!#+43III#0,2U(-3 3&N 88*+-y9LL^LL $%% 889OQSTT$% 3328,,bz 4  H 224.2QQG 	%%d&G&G&I&IJJJ "!~ \	#7799D ! W):):J".{/BC"&"4!^&*&6&:&:;;S&T&T&*&6&:&:;;S&T&T# $ 0 <
* 
* 
*& -..!33 /&+# #'K&(DO:P7N$7$7 * 4T #.G&$
 32  J'.*.*<%)^$+%-(/(;	 	 		'& OO % $ $ $1QVAY>><q	II !7 6 $ $
*1+2E*F.2.@)-29++k>V2W2W29++k>V2W2W(/,3,?
! 
! 
!	 + * $ #$G !0$4 "$*bhn== TMQZZ ,N NJT_6J6J 8<zz$BR7S7S4T_(jdjAA $;;(3hPW <  "'
 "CN !?@*<O !89<>FF+RZ88= = !89
 &&(( 
CKK?DDT[1B%CDD -8 -
 -
 2333&&t,,,,1 	 '4+;+III [566 	 '4+;+III [566 	 	'%&{7)+*=>) 3#!*+///^w6#9- 	 	
 	
 	
4 'T9lFCCC4011 	>&--.BCCCC)=D& ~ 	G22RV2WWW 4<  1$$+M+M+M+M+M(M(M$
 ''#.'AA  3 3 3 3!&t{7SUV'W'W!X!X3 3 3D/               ""3#C#C#E#EFFFFF#	G 	Gs+   10T# #
W-BWW(:d..d25d2returnc                    t          j        | j                  }t          j        | j                  }t	          |          t	          |          k    rt          d| j        | j        |          t          |          D ]a\  }}|j        	                                ||         j        	                                k    r"t          d||j        ||         j                  bg }t          t	          | j                            D ]B}|                    d                    |          |t	          |          |z            f           C|r'|                    t          j        |d         f           t          j        | j        |dt	          |                             t!          d| j        fd| j        fgfdt          | j                                                  D             z   |z             }|                     | j        | j        | j        || j        d	t3          | d	d          fd
t3          | d
d          fg          }||_        |                    t          |                    }	fdt          | j                                                  D             }
t7          j        ||	|
           |j        r4|j                            |                    |||j                              |S )z9Creates a copy of self using existing input placeholders.zTensor list mismatchzTensor shape mismatchzstate_in_{}NrV   rQ   c                 *    g | ]\  }}||         fS rT   rT   r@   r|   rA   existing_inputs_unflatteneds      rC   rD   z(DynamicTFPolicy.copy.<locals>.<listcomp>  s8       Aq /23  rE   target_q_modeltarget_model)r,   r-   c                 *    g | ]\  }}||         fS rT   rT   r   s      rC   rD   z(DynamicTFPolicy.copy.<locals>.<listcomp>  s8     
 
 
1 +A./
 
 
rE   )!treeflatten_loss_input_dict_loss_input_dict_no_rnnr   r   r   	enumeraterP   as_listr   appendformatr   r   unflatten_asr   _is_exploring	_timestepkeysr   rb   r1   r2   r8   getattr_do_loss_initr   _initialize_lossr   _stats_fetchesr   _grads)r}   r,   flat_loss_inputsflat_loss_inputs_no_rnnr|   rB   
rnn_inputsr^   instancelossesrk   r   s              @rC   copyzDynamicTFPolicy.copy  sE     <(=>>"&,t/K"L"L   C$8$888&%"	   566 	 	DAqw  OA$6$<$D$D$F$FFF +Q9K9Q   G 
s4-..// 	 	A!((++#C(?$@$@1$DE     	K{3_R5HIJJJ&*&7(:c"9:::;'
 '
# !d01J3OP   %d&B&G&G&I&IJJ  
 
 

 >>"K&
 "741A4#H#HI~t!D!DE " 
 
 %/!''J(?(?@@
 
 
 
!$">"C"C"E"EFF
 
 

 	!(FK@@@" 	#**''*hoNN   rE   c                 F    | j         r| j                                         S g S N)r8   get_initial_stater}   s    rC   r   z!DynamicTFPolicy.get_initial_state+  s%    : 	://111IrE   r   batchbuffer_indexc                    |                     d           t          | j                  dk    r/| j        d         dk    r|dk    sJ || _        t          |          S |                     |d          t          j        | j                  }| j        r| j        | j	        gz   }ng }fd|D             }fd|D             }| j
        |                             |                                 |||j        	          S )
NTr9   r   /cpu:0F)shufflec                      g | ]
}|         S rT   rT   r@   rA   r^   s     rC   rD   z:DynamicTFPolicy.load_batch_into_buffer.<locals>.<listcomp>H  s    333A*Q-333rE   c                      g | ]
}|         S rT   rT   r   s     rC   rD   z:DynamicTFPolicy.load_batch_into_buffer.<locals>.<listcomp>I  s    :::!
1:::rE   )rc   inputsrl   num_grad_updates)r   r   r   _loaded_single_cpu_batch_get_loss_inputs_dictr   r   r   r   r   r   	load_datar   r   )r}   r   r   	data_keys
state_keysr   rl   r^   s          @rC   load_batch_into_bufferz&DynamicTFPolicy.load_batch_into_buffer2  s&    	4    t|!!dl1o&A&A1$$$$,1D)u:://u/EE
L!=>>	 	+t~.>>JJJ3333333::::z:::*<8BB!!##%"3	 C 
 
 	
rE   c                     t          | j                  dk    r6| j        d         dk    r%|dk    sJ | j        t          | j                  ndS | j        |         j        S )Nr9   r   r   )r   r   r   r   num_tuples_loaded)r}   r   s     rC   "get_num_samples_loaded_into_bufferz2DynamicTFPolicy.get_num_samples_loaded_into_bufferR  ss     t|!!dl1o&A&A1$$$$ 0< D1222 *<8JJrE   offsetc                    t          | j                  dk    r| j        d         dk    r|dk    sJ | j        t          d          | j                            d          }|&| j                            d| j        d                   }|t          | j                  k    r| j        }n| j                            |||z             }|                     |          S | j        |         }|	                    | 
                                |          }| xj        dz  c_        |                    t          | j        t          | j        dz
  |j        pdz
  i           |S )	Nr9   r   r   zPMust call Policy.load_batch_into_buffer() before Policy.learn_on_loaded_batch()!minibatch_sizesgd_minibatch_sizetrain_batch_size)startend)r   r   r   r   r2   r   slicelearn_on_batchr   optimizer   r   r   r   r   )r}   r  r   
batch_sizesliced_batchtower_stackresultss          rC   learn_on_loaded_batchz%DynamicTFPolicy.learn_on_loaded_batch`  s    t|!!dl1o&A&A1$$$$,4 6   )9::J!![__($+6H*I 
 S!>????#<#<BB fz&9  C     &&|4441,?&&t'7'7'9'96BB")4+@7)A-1M1RQRS		
 	
 	
 rE   c                    i }|                                 D ]%\  }}t          j        d|          }|1| j        t	          |                    d                             ||<   N|                    d          rd|t          j        k    ru||v r||         ||<   t          |j
        t                     }|j        r~| j                            d          r|t          j        t          j        fv rd}n,|t          j        t          j        fv r| j        d         rd}nd}t%          |j        |||	          ||<   '|                     d
          }	t          || j                  |	fS )a4  Creates input_dict and dummy_batch for loss initialization.

        Used for managing the Policy's input placeholders and for loss
        initialization.
        Input_dict: Str -> tf.placeholders, dummy_batch: str -> np.arrays.

        Args:
            view_requirements: The view requirements dict.
            existing_inputs (Dict[str, tf.placeholder]): A dict of already
                existing placeholders.

        Returns:
            Tuple[Dict[str, tf.placeholder], Dict[str, np.ndarray]]: The
                input_dict/dummy_batch tuple.
        zstate_in_(\d+)Nr9   
state_out_rR   F_disable_preprocessor_apiT)rG   rI   rH   r       r  )rN   )r   rematchr   rL   groupr?   r   r   rJ   rK   r   r2   r   r   r   r   NEXT_OBSr   rG   '_get_dummy_batch_from_view_requirementsr   )
r}   r   r,   r^   view_colview_reqmorH   r   dummy_batchs
             rC   r   z/DynamicTFPolicy._get_input_dict_and_dummy_batch  s     
"3"9"9";"; (	 (	Hh+X66B~'+'9#bhhqkk:J:J'K
8$$$$\22 "[;;;_,,'6x'@
8$$ !+8>3 ? ??	-  {'CDD '#+#0V J J #( ![_k6J$KKK K(CD L #( #'+:&n%"+ '	, , ,Jx( BBbBQQ:???LLrE   Trr   c                      j         s8t                                                      _          j         d          _                                                             t                                                      j        	                                D ]/\  }}|
                    d          s| j        j        vrd|_        0 j        	                                D ]-\  }}t          t           j                            dd|j                                        dd          |j        j                  t/           j                             j        |<   t3          ||	           j        |<   | j        vrt4                              d
                    |                     t;          t           j                            dd|j                                        dd          |j        j                  d           j        |<   / j        }t4                              d            j                             |                                                                 |          }|j         D ]}| j        vrt3          ||         |	           j        |<   | j        vrXt;          t           j                            dd||         j        dd          ||         j                  d           j        |<   tC          tE           j        fi  j#        d          } j$        rJ j%        |tB          j&        <    j#        '                    tB          j&        |tB          j&                 i            j#        '                    tE          |                     tQ          d          r:t4          )                    d                    tU          |                                +                    |          }	|j        |j        z  |j         z  tY           j-        j        .                                          z  t_          j0         |	fd|	                                D             tB          j&        |v rtB          j&        |tB          j&                 fgng z              d j#        v r j#        d=  j1        r4 j2        '                     1                     | j3                             |r3|j        |j        z  |j        D ]}||j        vr| j-        j        vr|tB          j4        tB          j5        tB          j6        tB          j7        tB          j8        tB          j9        tB          j:        tB          j;        tB          j<        f	vr,| j        v rd j        |         _=        | j#        v r j#        |= t}           j        .                                          D ]}|vr|tB          j4        tB          j5        tB          j6        tB          j7        tB          j8        tB          j9        tB          j:        tB          j;        fvrk| j-        j        vr]||j?        v r.t4          @                    d                    |                     n jA        d          j        |= | j#        v r j#        |= t}           j        .                                          D ]U} j        |         }
|
jB        ?|
jB         j        vr1|
jB        |j        v }t;          |
jC        |           j        |
jB        <   V fd j#        	                                D              _D        d S )Nr   r>   Fg      g      ?r9   )rP   rO   r  )valuerI   z,Adding extra-action-fetch `{}` to view-reqs.)rG   used_for_compute_actionsz0Testing `postprocess_trajectory` w/ dummy batch.T)_is_training	loss_initz1Initializing loss function with dummy input:

{}
c                 &    g | ]\  }}|v 	||fS rT   rT   )r@   rA   rB   all_accessed_keyss      rC   rD   zEDynamicTFPolicy._initialize_loss_from_dummy_batch.<locals>.<listcomp>'  s,    NNN1q<M7M7MaV7M7M7MrE   r\   zSampleBatch key '{}' was deleted manually in postprocessing function! RLlib will automatically remove non-used items from the data stream. Remove the `del` from your postprocessing function.output)rG   r   c                 D    i | ]\  }}|j         v|j        k    ||S rT   )r   r   )r@   rA   rB   r}   s      rC   
<dictcomp>zEDynamicTFPolicy._initialize_loss_from_dummy_batch.<locals>.<dictcomp>  sC     (
 (
 (
1+++T^0C0C q0C0C0CrE   )E_optimizersr   	optimizer
_optimizerr   r   r   r   r   r   r?   r   r   r%  rq   r   gymspacesBoxrP   r   rO   rI   r   r   r   loggerinfor   r   r   postprocess_trajectory
added_keysr   r   r   r   r   r   r   r"   debugr   r   setr8   r   r   r   r   r   r   EPS_IDAGENT_INDEX	UNROLL_IDTERMINATEDS
TRUNCATEDSREWARDSr   T
OBS_EMBEDSr   r   deleted_keyswarningr2   data_colrG   r   )r}   rr   r&   keyr   r$  r"  _train_batchr   rM   r   r)  s   `           @rC   r   z1DynamicTFPolicy._initialize_loss_from_dummy_batch  s     	2)$..*:*:;;D".q1DO 	s??AABBB "399;; 	: 	:MCNN;//:t/===4914::<< 	 	JC%>
#U[%8%8%:%:122%>ekFV    t011	& & &Dc" %4%c$J$J$JDS!$000JQQRUVVWWW.=*..#k1133ABB7#k.	 )   .3/ / /&s+ 'FGGG//k4CSCSCUCUVVV''44) 	 	C$***(7%c*) ) ) % $000.=*..)#.4QRR8)#.4	 )   .3/ / /&s+ "!;;T%:;;
 
 

  	04K,-!((%{;3G'HI   	$$T+%6%6777K   	LLFMMk**    ##K00 %'($% $*.3355667 	 	!NNNN 1 1 3 3NNN ';66 &K4H(IJKK			
 		
 		
 D111%m4  	&&##D+t{CC  
 * J	 + 9K<U U"0 7 7{8884:#???#*#/#-#/#.#+#)##.
  d444HM.s3Ed333 1#6
 D27799:: !7 !7000#*#/#-#/#.#+#)#	
 
 4:#???
 k6667 8>vc{{    X.6 237d333 1#6 D27799:: 	 	+C0K+4+AAA(*{7P(P%:I h:K; ; ;D*2;7(
 (
 (
 (
-3355(
 (
 (
$$$rE   rF  c                 d   |                      | | j        | j        |          }t          |          }| j        r.| j                            |                     | |                     g | _        t          | j        t          j
        j                  s| j                                        | _        |S r   )r   r8   ri   r   r   r   r   _update_opsrJ   r7   r   r   
update_ops)r}   rF  r   s      rC   r   zDynamicTFPolicy._do_loss_init  s    tTZ+NNF##> 	J&&t~~dK'H'HIII$*bhn55 	7#z4466DrE   )r   )r   r   )TN)&__name__
__module____qualname____doc__r   r0  r1  Spacer   r   r   r   r
   r   r   r!   r   r   strr    r   r   r	   typerL   r   r   r   r   r   r  r  r  r   boolr   r   __classcell__)r   s   @rC   r%   r%   .   sk        $  
    BF,0HL'7YiG iG iG:#iG j&iG $	iG
 Wd#78+F
R
iG fk*Dj,AAB
iG  fk>:Dj<QQR
iG #)3:+;=PQSWW
iG( )3:+;=PQ
)iG4 $T*-.*j01*j*d:>NNOQ
5iGF !)*j*Ej$Z(889;!
GiGR "$s,='=">?SiGT !)UiGV %-Xvhm-D$EWiG iG iG iG iG iGV XhFDs4E/E)F$G FH F F F FP Xf4
#3     Xf 
 

 
 
	
 
 
 
> XfK Ks K3 K K K K Xf' 'C '3 ' ' ' 'R<M <M <M| XfDHA
 A
.2A
	A
 A
 A
 A
F	 	 	 	 	 	 	 	 	rE   r%   c                   J    e Zd ZdZ	 	 	 	 	 	 	 	 d	defdZd
dZd Zd Zd Z	dS )r{   a   Optimizer that runs in parallel across multiple local devices.

    TFMultiGPUTowerStack automatically splits up and loads training data
    onto specified local devices (e.g. GPUs) with `load_data()`. During a call
    to `optimize()`, the devices compute gradients over slices of the data in
    parallel. The gradients are then averaged and applied to the shared
    weights.

    The data loaded is pinned in device memory until the next call to
    `load_data`, so you can make multiple passes (possibly in randomized order)
    over the same data once loaded.

    This is similar to tf1.train.SyncReplicasOptimizer, but works within a
    single TensorFlow graph, i.e. implements in-graph replicated training:

    https://www.tensorflow.org/api_docs/python/tf/train/SyncReplicasOptimizer
    Nrz   c	                     |6t          ddd           d| _        || _        || _        || _        || _        n|| _        | j        j        | _        | j        j        | _        |p3|j                            d|j                            dd                    t          | j                  z  | _        t          j        | j        j                  }g }| j        j        r| j        j        | j        j        gz   }| j        j                            d	          }| j        j        | _        t          | j                  d
k    sd| j        d         v sJ ||z   | _        t"                              t"          j        j        t"                                          j                  }	t"                              t0          j        d          | _        t"                              t0          j        d          | _        || _        t"                              t0          j        d          | _        d
| _        d t?          t          | j                            D             }
t          j        | j                  D ]}t0                               d          5  t0          !                    |t          | j                            }ddd           n# 1 swxY w Y   tE          | j                  D ]&\  }|
         #                    |                    'g | _$        tE          tK          | j        |
                    D ]W\  }\  }}| j$        #                    | &                    |||t          t          j        |                                         X| j        j        d         rg }tE          | j                  D ]\  }tO          fd| j$        D                       }|rbg }|D ]\  }}|#                    |           t0          (                    ||          \  }}tE          |          D ]\  \  }}|         |f|<   |#                    |           t"                              t"          j        j        t"                                          j                  | _)        |	D ]}| j)        *                    |           | j)        r2tV          ,                    d-                    | j)                             t"          .                    | j)                  5  t0          /                    d tK          | j        |          D                       | _0        ddd           n# 1 swxY w Y   ntO          d | j$        D                       }|rbg }|D ]\  }}|#                    |           t0          (                    ||          \  }}tE          |          D ]\  \  }}|         |f|<   t"                              t"          j        j        t"                                          j                  | _)        |	D ]}| j)        *                    |           | j)        r2tV          ,                    d-                    | j)                             t"          .                    | j)                  5  | j        d         1                    |          | _0        ddd           n# 1 swxY w Y   d| _2        dS )zInitializes a TFMultiGPUTowerStack instance.

        Args:
            policy: The TFPolicy object that this tower stack
                belongs to.
        NzTFMultiGPUTowerStack(...)z%TFMultiGPUTowerStack(policy=[Policy])T)r5   newr6   r  r
  i?B 	grad_clipr9   rt   r   )scopebatch_indexrU   per_device_batch_sizera   c                     g | ]}g S rT   rT   )r@   rE  s     rC   rD   z1TFMultiGPUTowerStack.__init__.<locals>.<listcomp>  s    DDDarDDDrE   r   %_tf_policy_handles_more_than_one_lossc                 *    g | ]}|j                  S rT   grads)r@   tr|   s     rC   rD   z1TFMultiGPUTowerStack.__init__.<locals>.<listcomp>  s    )K)K)K!'!*)K)K)KrE   z'Update ops to run on apply gradient: {}c                 >    g | ]\  }}|                     |          S rT   )apply_gradients)r@   oas      rC   rD   z1TFMultiGPUTowerStack.__init__.<locals>.<listcomp>   s*    QQQdaQ&&q))QQQrE   c                     g | ]	}|j         
S rT   r]  r@   r_  s     rC   rD   z1TFMultiGPUTowerStack.__init__.<locals>.<listcomp>#  s    %D%D%D!ag%D%D%DrE   )3r   rz   
optimizersr   max_per_device_batch_sizepolicy_copyr-  r2   r   r   r   r   r   r   r   r   rk   r   get_collection	GraphKeys
UPDATE_OPSget_variable_scoperI   r   r7   r   _batch_index_per_device_batch_size_loaded_per_device_batch_size_max_seq_len_loaded_max_seq_lenr   devicesplitr   r   _towerszip_setup_device_average_gradientsclip_by_global_normrH  remover3  r7  r   control_dependenciesr  	_train_opra  r   )r}   r.  r   input_placeholdersr   rg  build_graphgrad_norm_clippingrz   
shared_opsdevice_placeholdersr_  splitsru   tower_irr  placeholdersavgsoptimavgclippedgradrE  varopr|   s                            @rC   r   zTFMultiGPUTowerStack.__init__  s   ( >/;   
 DK'DO"DL-FD**D$*DK48K4KDO;.DL) =$$$fm&7&78JF&S&S  T\"".#D* "&dk.Q!R!RJ{( Q![6$+:O9PP
!%!3!7!7!D!D#{/D4<  1$$a(@(@(@@-
:''M$C,B,B,D,D,I ( 
 


  OOBH=OII '*ooH2 '6 '
 '
# .G*  OOBH=OII#$ DD5T\1B1B+C+CDDDd.// 	9 	9A8$$ 8 8!S%6%6778 8 8 8 8 8 8 8 8 8 8 8 8 8 8!$,// 9 91#A&--fQi88889 /81220
 0
 	 	+G+fl L""V\3t|DV7W7W3X3X     ;EF 8	ID%do66 	! 	!5()K)K)K)Kdl)K)K)KLL% 3 G#& - -at,,,,!#!7!7AS!T!TJGQ*3C.. 3 3;D#")!*c!2AC      #11(0F0F0H0H0M  2    D ! , , ''++++ =DDTEUVV   ))$*:;;  !#QQc$/46P6PQQQ" "              
 %%D%Dt|%D%D%DEEC! /" ) )GD!NN4((((33G=OPP
&/nn / /NA{c%aj#.CFF  #11(0F0F0H0H0M  2    D ! , , ''++++ =DDTEUVV   ))$*:;; I I!%!3!C!CC!H!HI I I I I I I I I I I I I I I !"s6   .KK	K	=U((U,/U,:&[,,[03[0c           
         || _         t          d          rCt                              d                    t          | j        |d                               i }t          | j                  t          |z             k    sJ | j        |f            t                    dk    r;d         }t          |d                   t          d                   z  | _        n|d         }d| _        | j	        | j        z  t          | j
                  z  dk     rUt                              d                    | j	        | j        | j        t          | j
                  z                       dt          |          k     r/t          t          |          t          | j
                            t          d          rSt                              d                    t          |          | j        t          | j
                                       t          | j
                  k     rt          d	          t          | j
                  z  | j        z  | _        t                    dk    rfd
D             fd|D             }t          d                   z  t          |d                   k    s4J t          d                   t          |d                   f            t          | j        |z             D ]
\  }}|||<   t          |d                   }	nGd}	t          | j        |          D ]/\  }}t          |          }
|
||<   |	dk    rt          |
          }	0|                    d | j        D             |           |	| _        |	t          | j
                  z  }|dk    s
J d            || j        z  dk    sJ |S )aV  Bulk loads the specified inputs into device memory.

        The shape of the inputs must conform to the shapes of the input
        placeholders this optimizer was constructed with.

        The data is split equally across all the devices. If the data is not
        evenly divisible by the batch size, excess data will be discarded.

        Args:
            sess: TensorFlow session.
            inputs: List of arrays matching the input placeholders, of shape
                [BATCH_SIZE, ...].
            state_inputs: List of RNN input arrays. These arrays have size
                [BATCH_SIZE / MAX_SEQ_LEN, ...].
            num_grad_updates: The lifetime number of gradient updates that the
                policy having collected the data has already undergone.

        Returns:
            The number of tuples loaded per device.
        r   z-Training on concatenated sample batches:

{}
)r  r   rl   r   r9   zvTarget minibatch size is {}, however the rollout sequence length is {}, hence the minibatch size will be raised to {}.data_slicingzBDivided {} rollout sequences, each of length {}, among {} devices.zMust load at least 1 tuple sequence per device. Try increasing `minibatch_size` or reducing `max_seq_len` to ensure that at least one sequence fits per device.c                 0    g | ]}t          |          S rT   )_make_divisible_by)r@   arrsequences_per_minibatchs     rC   rD   z2TFMultiGPUTowerStack.load_data.<locals>.<listcomp>  s2       EH"3(?@@  rE   c                 P    g | ]"}|d t          d                   z           #S Nr   )r   )r@   r  seq_lenrl   s     rC   rD   z2TFMultiGPUTowerStack.load_data.<locals>.<listcomp>  s5    NNNc:CQ007::;NNNrE   c                     g | ]	}|j         
S rT   )init_opre  s     rC   rD   z2TFMultiGPUTowerStack.load_data.<locals>.<listcomp>  s    222!)222rE   	feed_dictzNo data loaded?)r   r"   r3  r4  r   r   rk   r   rq  rg  r   rB  r  r   ro  ru  r   rt  r  )r}   rc   r   rl   r   r  smallest_arrayphr  truncated_lentruncated_arrsamples_per_devicer  r  s      `        @@rC   r   zTFMultiGPUTowerStack.load_dataB  sL   * !1K   	KKBII,0,<&,,8   
 
 
 	4#$$F\,A(B(BBBBE
BBB |q  )!_N&)nnLO(<(<<G'.D$$#AYN'(D$ *'($,  	 
 #Q&&NN &2,,s4</@/@@ 
 
 
 '(#~!888&8N##S%6%6' '# N## 	KK"&'')A3t|CTCT 	   #S%6%666H   $s4<'8'884;SS 	* |q     LX  L ONNNNvNNNF|A'''1S^^CCCLO$$'F1I	FCCC t/,1FGG $ $C #	"q	NNMMMt/88 7 7C 238O P P -	" A%%$'$6$6M22T\222iHHH!.*c$,.?.??!A%%%'8%%%!D$FF!KKKK!!rE   c                    | j         || j        | j        | j        | j        i}| j        D ].}|                    |j                                                   /d| j	        i}t          | j                  D ]6\  }}|j                                        }||d                    |          <   7|                    ||          S )a  Run a single step of SGD.

        Runs a SGD step over a slice of the preloaded batch with size given by
        self._loaded_per_device_batch_size and offset given by the batch_index
        argument.

        Updates shared model weights based on the averaged per-device
        gradients.

        Args:
            sess: TensorFlow session.
            batch_index: Offset into the preloaded data. This value must be
                between `0` and `tuples_per_device`. The amount of data to
                process is at most `max_per_device_batch_size`.

        Returns:
            The outputs of extra_ops evaluated over the batch.
        trainztower_{}r  )rm  rn  ro  rp  rq  rt  r   
loss_graphextra_compute_grad_feed_dictr{  r   _get_grad_and_stats_fetchesr   r   )r}   rc   rX  r  r#   fetches	tower_numtower_fetchs           rC   r  zTFMultiGPUTowerStack.optimize  s    ( {')Kt7
	
 \ 	N 	NEU-JJLLMMMMDN+ )$, 7 7 	@ 	@Iu*FFHHK4?GJ%%i0011xx9x555rE   c                 $    d | j         D             S )Nc                     g | ]	}|j         
S rT   )r  re  s     rC   rD   z:TFMultiGPUTowerStack.get_device_losses.<locals>.<listcomp>  s    333333rE   )rt  r   s    rC   get_device_lossesz&TFMultiGPUTowerStack.get_device_losses  s    33dl3333rE   c                    |t          |          k    sJ t                              |          5  t                              t
          d| z             5  g }g }t          |          D ]\  }}t                              |ddg           }	|                    |	           ||k     r| j	        }
| j	        }n	| j	        }
d}t          
                    |	| j        |
z  |z  gdgt          |j        dd                    z  z   | j        |
z  |z  gdgt          |j        dd                    z  z             }|                    |j                   |                    |           |                     |          }|                    | j        |j                  }d d d            n# 1 swxY w Y   t'          t          j        d |D              ||          cd d d            S # 1 swxY w Y   d S )NrE  F)	trainablevalidate_shapecollectionsr9   r   r   c                     g | ]	}|j         
S rT   )initializer)r@   r   s     rC   rD   z6TFMultiGPUTowerStack._setup_device.<locals>.<listcomp>  s    OOO5,OOOrE   )r   r7   rr  r   
name_scopeTOWER_SCOPE_NAMEr   Variabler   rp  r  rm  rP   rn  	set_shaperh  	gradientsrf  _losses_Towerr  )r}   r  rr  device_input_placeholdersnum_data_indevice_input_batchesdevice_input_slicesr|   r  current_batchscalegranularitycurrent_slice	graph_objdevice_gradss                  rC   rv  z"TFMultiGPUTowerStack._setup_device  s   c";<<<<<<YYv "	 "	 0=w== @AA W W')$&(#&'@AA > >EAr$'LLeEr %1 % %M )//>>>; $ 1&*&7 $ 1&'$&HH%!.%7+EF cC$5$556 "8EAKOP!dS!""%6%667
% 
%M "++BH555'..}==== ,,-@AA	(224?IDUVV9W W W W W W W W W W W W W W W: OO:NOOOP ="	 "	 "	 "	 "	 "	 "	 "	 "	 "	 "	 "	 "	 "	 "	 "	 "	 "	s6   &G%EF'G%'F+	+G%.F+	/)G%%G),G))NNNNNNNNr   )
rJ  rK  rL  rM  r   r   r   r  r  rv  rT   rE   rC   r{   r{     s         * "&T" T" T" T" T" T"lG" G" G" G"R 6  6  6D4 4 4$ $ $ $ $rE   r{   Tower)r  r^  r  c                     t          |           t          u r| | |z  z
  S | d| j        d         | j        d         |z  z
           S r  )rP  rL   rP   )rc  ns     rC   r  r    sD    Aww#~~1q5yQagaj1n,,--rE   c                 \   g }t          |  D ]}g }|D ]7\  }}|0t                              |d          }|                    |           8|sAt                              d|          }t                              |d          }|d         d         }||f}	|                    |	           |S )aW  Averages gradients across towers.

    Calculate the average gradient for each shared variable across all towers.
    Note that this function provides a synchronization point across all towers.

    Args:
        tower_grads: List of lists of (gradient, variable) tuples. The outer
            list is over individual gradients. The inner list is over the
            gradient calculation for each tower.

    Returns:
       List of pairs of (gradient, variable) where the gradient has been
           averaged across all towers.

    TODO(ekl): We could use NCCL if this becomes a bottleneck.
    Nr   )axisvaluesr9   )ru  r7   expand_dimsr   concatreduce_mean)
tower_gradsaverage_gradsgrad_and_varsr^  grE  
expanded_gr  rB   grad_and_vars
             rC   rw  rw  !  s    $ Mk* + + ! 	) 	)DAq}^^Aq11
 Z((( 	 yyay..~~dA&&
 !Qay\****rE   )Dloggingr  r  r   r   typingr   r   r   r   r	   r
   r   	gymnasiumr0  r   ray._common.deprecationr   r   ray.rllib.models.catalogr   ray.rllib.models.modelv2r   "ray.rllib.models.tf.tf_action_distr   ray.rllib.policy.policyr   ray.rllib.policy.sample_batchr   ray.rllib.policy.tf_policyr   !ray.rllib.policy.view_requirementr   ray.rllib.utilsr   ray.rllib.utils.annotationsr   r   ray.rllib.utils.debugr   ray.rllib.utils.frameworkr   ray.rllib.utils.metricsr   r   "ray.rllib.utils.spaces.space_utilsr   ray.rllib.utils.tf_utilsr   ray.rllib.utils.typingr   r   r    r!   ray.util.debugr"   r   r7   tfv	getLoggerrJ  r3  r  r%   r{   r  r  rw  rT   rE   rC   <module>r     s    				 / / / / / / / / E E E E E E E E E E E E E E E E E E             2 1 1 1 1 1 , , , , , , C C C C C C * * * * * * 5 5 5 5 5 5 / / / / / / = = = = = = & & & & & & = = = = = = = = + + + + + + 3 3 3 3 3 3        I H H H H H 4 4 4 4 4 4            $ # # # # #}R		8	$	$   f f f f fh f f fR { { { { { { { {~ 
G???	@	@. . .. . . . .rE   