
    &`i                     .   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZm	Z	m
Z
mZmZmZmZmZmZ d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlm Z m!Z! d dl"m#Z# d d	l$m%Z% d d
l&m'Z'm(Z( d dl)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z= d dl>m?Z?m@Z@mAZAmBZBmCZCmDZD  e/            \  ZEZF ejG        eH          ZIe* G d de                       ZJe* G d d                      ZK eK            ZLdS )    N)	AnyCallableDictListOptionalSetTupleTypeUnion)ModelCatalog)ModelV2)TorchDistributionWrapper)TorchModelV2)PolicyPolicyState)#pad_batch_to_sequences_of_same_size)SampleBatch)NullContextManager
force_list)OldAPIStackoverride)&ERR_MSG_TORCH_POLICY_CANNOT_SAVE_MODEL)try_import_torch)'DIFF_NUM_GRAD_UPDATES_VS_SAMPLER_POLICYNUM_AGENT_STEPS_TRAINEDNUM_GRAD_UPDATES_LIFETIMELEARNER_STATS_KEY)convert_to_numpy)normalize_action)	with_lock)convert_to_torch_tensor)AlgorithmConfigDictGradInfoDictModelGradientsModelWeightsTensorStructType
TensorTypec                    	    e Zd ZdZdddddddddej        j        dej        j        dedee	         d	ee
eeee         egeeee         f         f                  d
eee                  dee
eee         geeeeee         f         eeeeee         f         f         f                  dee
eeeeegeeee         ee         f         f                  dedee
egef                  f fdZ ee          	 	 dHdeeef         dedee         deeee         eeef         f         fd            Z ee          	 	 	 	 	 	 	 dIdeee         ef         deee                  deee         ef         deee         ef         deeeef                  dee         dee         deeee         eeef         f         fd            Ze ee          	 	 	 	 dJdeee         ef         deee         ef         deee                  deeee         ef                  deeee         ef                  dedefd                        Ze ee          dedeeef         fd                         Z  ee          	 dKd"ed#edefd$            Z! ee          dKd#edefd%            Z" ee          dLd&ed#efd'            Z#e ee          dede$fd(                        Z% ee          d)e$ddfd*            Z&d+edee         fd,Z' ee          de(fd-            Z) ee          d.e(ddfd/            Z* ee          defd0            Z+ ee          defd1            Z, ee          dee         fd2            Z- ee          de.f fd3            Z/ ee          d4e.ddf fd5            Z0d6d7d	edeeef         fd8Z1deee2f         fd9Z3deeef         dee         de	d:edeeef         f
d;Z4d<edeeef         fd=Z5deed7         d7f         fd>Z6 ee          dMd?ed@ee         ddfdA            Z7 ee          dBeddfdC            Z8edD             Z9dMdefdEZ:dFee         deeee         e;f                  fdGZ< xZ=S )NTorchPolicyz0PyTorch specific Policy class to use with RLlib.N   )modellossaction_distribution_classaction_sampler_fnaction_distribution_fnmax_seq_lenget_batch_divisibility_reqobservation_spaceaction_spaceconfigr,   r-   r.   r/   r0   r1   r2   c                <    dx _         |d<   d _        t                                          |||           at	          j        | j        d          j                   \  }}t	          j         j         j	        | j        d          j                   ||} 
                                t          t          t          j                                                            }t                               dt%          |           d	           |d
         sdk    s|st                              d           _         fdt          t)          t+          j                            pd          D              _        fdt          t)          t+          j                            pd          D              _        t3           d          r fd j        D              _         _        nst8          j        j                                        t8          j        j        j         k    rt9          j!                    }t%          |          k     rtE          d| d d          fdtG          |          D              _         j        d          _        fdtG          |          D             }g  _        tG          |          D ]Q\  }}tI          j%                  } j        &                    |'                     j        |                              Rt3           d          r% fdtG           j                  D              _         j        d          _        tQ          j)                     _*         j        +                                 _,        t%           j,                  dk     _-         .                                  j/        0                     j        j/                    1                                 _2         _3        || _4        n. j5        j6        j7        dk    r j5        j6         _4        nd _4        tq           9                                           _:        g  _;        d tG           j        <                                          D             } j:        D ]k}g }tG          |j=                  D ]+\  }}|d         D ]}|&                    ||                    , j;        &                    t}          |                     l j        ?                    dd          }d t          |          D              _@        | _A        | _B        | _C        d _D        |	 _E        t          |
          r |
           n|
pd _G        dS )a8  Initializes a TorchPolicy instance.

        Args:
            observation_space: Observation space of the policy.
            action_space: Action space of the policy.
            config: The Policy's config dict.
            model: PyTorch policy module. Given observations as
                input, this module must return a list of outputs where the
                first item is action logits, and the rest can be any value.
            loss: Callable that returns one or more (a list of) scalar loss
                terms.
            action_distribution_class: Class for a torch action distribution.
            action_sampler_fn: A callable returning either a sampled action,
                its log-likelihood and updated state or a sampled action, its
                log-likelihood, updated state and action distribution inputs
                given Policy, ModelV2, input_dict, state batches (optional),
                explore, and timestep. Provide `action_sampler_fn` if you would
                like to have full control over the action computation step,
                including the model forward pass, possible sampling from a
                distribution, and exploration logic.
                Note: If `action_sampler_fn` is given, `action_distribution_fn`
                must be None. If both `action_sampler_fn` and
                `action_distribution_fn` are None, RLlib will simply pass
                inputs through `self.model` to get distribution inputs, create
                the distribution object, sample from it, and apply some
                exploration logic to the results.
                The callable takes as inputs: Policy, ModelV2, input_dict
                (SampleBatch), state_batches (optional), explore, and timestep.
            action_distribution_fn: A callable returning distribution inputs
                (parameters), a dist-class to generate an action distribution
                object from, and internal-state outputs (or an empty list if
                not applicable).
                Provide `action_distribution_fn` if you would like to only
                customize the model forward pass call. The resulting
                distribution parameters are then used by RLlib to create a
                distribution object, sample from it, and execute any
                exploration logic.
                Note: If `action_distribution_fn` is given, `action_sampler_fn`
                must be None. If both `action_sampler_fn` and
                `action_distribution_fn` are None, RLlib will simply pass
                inputs through `self.model` to get distribution inputs, create
                the distribution object, sample from it, and apply some
                exploration logic to the results.
                The callable takes as inputs: Policy, ModelV2, ModelInputDict,
                explore, timestep, is_training.
            max_seq_len: Max sequence length for LSTM training.
            get_batch_divisibility_req: Optional callable that returns the
                divisibility requirement for sample batches given the Policy.
        torch	frameworkFNr,   )r8   )	obs_spacer4   num_outputsmodel_configr8   zFound z visible cuda devices.
_fake_gpusr   cpuc                     g | ]	}j         
S  device).0_selfs     q/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/policy/torch_policy.py
<listcomp>z(TorchPolicy.__init__.<locals>.<listcomp>   s    VVVADKVVV       c                 H    g | ]}|d k    rnt          j                  S r   )copydeepcopy)rB   ir,   s     rE   rF   z(TorchPolicy.__init__.<locals>.<listcomp>   s?     % % % aT]5%9%9% % %rG   target_modelc                      i | ]
}|j         S r?   )rN   )rB   mrD   s     rE   
<dictcomp>z(TorchPolicy.__init__.<locals>.<dictcomp>   s+     & & &-.At(& & &rG   z7TorchPolicy was not able to find enough GPU IDs! Found z, but num_gpus=.c                 |    g | ]8\  }}|k     t                               d                     |                    9S )zcuda:{})r7   rA   formatrB   rM   id_num_gpuss      rE   rF   z(TorchPolicy.__init__.<locals>.<listcomp>   sH       Asx<< Y--a0011<<rG   c                 &    g | ]\  }}|k     |S r?   r?   rU   s      rE   rF   z(TorchPolicy.__init__.<locals>.<listcomp>   s"    HHH61c1x<<3<<<rG   c                     i | ]=\  }}|t          j        j                                      j        |                   >S r?   )rK   rL   rN   todevices)rB   rM   rP   rD   s      rE   rQ   z(TorchPolicy.__init__.<locals>.<dictcomp>   sP     & & &1 t}T%677::4<?KK& & &rG   zPolicy.lossc                     i | ]\  }}||	S r?   r?   )rB   rM   ps      rE   rQ   z(TorchPolicy.__init__.<locals>.<dictcomp>	  s    KKK1q!KKKrG   paramsnum_multi_gpu_tower_stacksc                     g | ]}g S r?   r?   rB   rC   s     rE   rF   z(TorchPolicy.__init__.<locals>.<listcomp>  s    ???q???rG   )Hr8   _loss_initializedsuper__init__r   get_action_distr5   get_model_v2r3   r4   _get_num_gpus_for_policylistranger7   cudadevice_countloggerinfolenrA   intmathceilr[   model_gpu_towershasattrtarget_modelsr,   ray_privateworker_modeWORKER_MODEget_gpu_ids
ValueError	enumeraterK   rL   appendrZ   	threadingRLock_lockget_initial_state_state_inputs_is_recurrent/_update_model_view_requirements_from_init_stateview_requirementsupdate_create_explorationexplorationunwrapped_model_lossr-   __func____qualname__r   	optimizer_optimizersmulti_gpu_param_groups
parametersparam_groupssetget_loaded_batches
dist_classr/   r0   distributed_world_sizer1   callablebatch_divisibility_req)rD   r3   r4   r5   r,   r-   r.   r/   r0   r1   r2   r   	logit_dimgpu_idsidsrM   rC   
model_copymain_paramsoparam_indicespg_idxpgr]   num_buffersrW   	__class__s   `   `                    @rE   rd   zTorchPolicy.__init__@   s   b 076,!&*L&AAA =$0$@dk'2dn% % %!J	 !-0!.%![1.  E )0,6) 0022uUZ44667788AS\\AAABBB , -	28q===,,u--DKVVVVs49X;N;N7O7O7TST1U1UVVVDL% % % %s49X#6#677<1==% % %D! t^,, & & & &262G& & &" DJJ |"((**cl.A.MMM/++7||h&& ;; ;/7; ; ;  
   '00  DL
 ,q/DKHHHHYw%7%7HHHC$&D!!# M M1!]511
%,,Z]]4<?-K-KLLLLt^,, & & & & )$*? @ @& & &" .q1DJ _&&
!Z99;; !344q8<<>>>%%dj&BCCC3355$ DJJ Y,==+DJJ DJ%dnn&6&677 79#KK	$*2G2G2I2I(J(JKKK! 	C 	CAM'77 9 9
H 9 9A!((Q88889'..s=/A/ABBBB koo&BAFF??E+,>,>???3!2&<# '+#& 2333&&t,,,,1 	###rG   
input_dictexploretimestepreturnc                    t                                           5  |                                                   d           fd                                D             }|rIt                               dgt          |d                   z  t           j        |d         j                  nd }| 	                    ||||          cd d d            S # 1 swxY w Y   d S )NTc                 8    g | ]}d |dd         v |         S )state_inN   r?   )rB   kr   s     rE   rF   z?TorchPolicy.compute_actions_from_input_dict.<locals>.<listcomp>1  s6       "#J!BQB%<O<O
1<O<O<OrG   rH   r   )dtyperA   )
r7   no_grad_lazy_tensor_dictset_trainingkeystensorrn   longrA   _compute_action_helper)rD   r   r   r   kwargsstate_batchesseq_lenss    `     rE   compute_actions_from_input_dictz+TorchPolicy.compute_actions_from_input_dict$  sE    ]]__ 	 	//
;;J##D)))   '1'8'8  M !C#mA.///*(+2       ..M8Wh '	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   B0CCC	obs_batchr   prev_action_batchprev_reward_batch
info_batchc	                     t                                           5  t                               t          |          t           j                  }
                     t          j        |ddi          }|!t          j	        |          |t          j
        <   |!t          j	        |          |t          j        <    fd|pg D             }                     |||
||          cd d d            S # 1 swxY w Y   d S )Nr   is_trainingFc                 :    g | ]}t          |j                  S r?   r"   rA   rB   srD   s     rE   rF   z/TorchPolicy.compute_actions.<locals>.<listcomp>\  3       <='4;77  rG   )r7   r   onesrn   int32r   r   CUR_OBSnpasarrayPREV_ACTIONSPREV_REWARDSr   )rD   r   r   r   r   r   episodesr   r   r   r   r   s   `           rE   compute_actionszTorchPolicy.compute_actionsC  sG    ]]__ 	 	zz#i..zDDH//'!5 J !,79zBS7T7T
;34 ,79zBS7T7T
;34   BOBUSU  M ..M8Wh 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   CC--C14C1Tactionsactions_normalizedc           
      .     j         r j        t          d          t                                          5                       t          j        |t          j        |i          }|||t          j	        <   |||t          j
        <   t                              t          |          t          j                  }	 fd|pg D             } j                            d            j        r	                        j        |||	dd          \  }
}}n# t"          $ r^}d|j        d         v sd	|j        d         v r4                       j        |t          j                 dd
          \  }
}}n|Y d }~n)d }~ww xY w j        }                     |||	          \  }
} ||
 j                  }|t          j                 }|s" j        d         rt+          | j                  }|                    |          }|cd d d            S # 1 swxY w Y   d S )NzfCannot compute log-prob/likelihood w/o an `action_distribution_fn` and a provided `action_sampler_fn`!r   c                 :    g | ]}t          |j                  S r?   r   r   s     rE   rF   z7TorchPolicy.compute_log_likelihoods.<locals>.<listcomp>  r   rG   F)r   )r   r   r   r   r   positional argumentr   unexpected keyword argument)policyr,   r   r   r   normalize_actions)r/   r0   r{   r7   r   r   r   r   ACTIONSr   r   r   rn   r   r   before_compute_actionsr,   	TypeErrorargsr   r5   r    action_space_structlogp)rD   r   r   r   r   r   r   r   r   r   dist_inputsr   	state_outerC   action_distlog_likelihoodss   `                rE   compute_log_likelihoodsz#TorchPolicy.compute_log_likelihoodsc  s     ! 	d&A&I'   ]]__ =	# =	#//$i1DgN J !,7H
;34 ,7H
;34zz#i..zDDH   BOBUSU  M
 33E3BBB * !Q 9=9T9T
#-&3!) %$) :U : :6KYY !      -::8AF1IEE595P5P#'"&*&01D&E$)(- 6Q 6 62Z   23 " "_
!%Jx!P!PQ$*[$*==K !!45G% N$+6I*J N*7D4LMM)..w77O"{=	# =	# =	# =	# =	# =	# =	# =	# =	# =	# =	# =	# =	# =	# =	# =	# =	# =	#s>   B5H
.%DH

E<AE72H
7E<<BH

HHpostprocessed_batchc                    | j         r| j                                          i }| j                            | ||           |                     |          \  }}|                     t                     | xj        dz  c_        | j         r| j                                         |d<   |	                    d|t          |j        t          | j        t          | j        dz
  |j        pdz
  i           |S )Nr   train_batchresultrH   r,   custom_metricsr   )r,   train	callbackson_learn_on_batchcompute_gradientsapply_gradients_directStepOptimizerSingletonnum_grad_updatesmetricsr   r   countr   r   )rD   r   learn_statsgradsfetchess        rE   learn_on_batchzTorchPolicy.learn_on_batch  s    : 	J((%8 	) 	
 	
 	
 //0CDDw 	:;;;": 	4#z1133GG +')<)B)4+@7)*;@qB
	
 	
 	
 rG   r   batchbuffer_indexc                 R    |                     d           t           j                  dk    rp j        d         j        dk    rZ|dk    sJ t	          | j        d j         j                                        |           |g j	        d<   t          |          S |
                    t           j                            }|D ]%}t	          | j        d j         j                   & fdt          |          D             }| j	        |<   t          |d                   S )	NTrH   r   r=   Fr   r1   shuffler   r   )
num_slicesc                 V    g | ]%\  }}|                     j        |                   &S r?   )	to_devicer[   )rB   rM   slicerD   s      rE   rF   z6TorchPolicy.load_batch_into_buffer.<locals>.<listcomp>  s/    UUUxq%%//$,q/22UUUrG   )r   rn   r[   typer   r1   r   r   r   r   
timeslicesr|   )rD   r   r   slicesr   s   `    rE   load_batch_into_bufferz"TorchPolicy.load_batch_into_buffer  s^    	4    t|!!dl1o&:e&C&C1$$$$/ ,'+'B"&"8    ""5)))',gD #u:: !!S->->!??
  	 	E/ ,'+'B"&"8     VUUU9VCTCTUUU-3\* 6!9~~rG   c                     t          | j                  dk    r| j        d         dk    r|dk    sJ t          d | j        |         D                       S )NrH   r   z/cpu:0c              3   4   K   | ]}t          |          V  d S Nrn   )rB   bs     rE   	<genexpr>zATorchPolicy.get_num_samples_loaded_into_buffer.<locals>.<genexpr>  s(      FFa3q66FFFFFFrG   )rn   r[   sumr   )rD   r   s     rE   "get_num_samples_loaded_into_bufferz.TorchPolicy.get_num_samples_loaded_into_buffer  s]    t|!!dl1o&A&A1$$$$FF4#7#EFFFFFFrG   offsetc                 L     j         |         st          d           j                            d          & j                            d j        d                   t	           j                  z   j        r j        D ]}|                                 t	           j                  dk    r j        d         j        dk    rs|dk    sJ t	           j         d         d                   k    r j         d         d         }n j         d         d         z            } 	                    |          S t	           j                  dk    rV j
                                        } j        d          j
        u sJ  j        dd          D ]}|                    |           t          d  j         |         D                       k    r j         |         }nfd	 j         |         D             }i }t          |          D ].\  }i }	 j                             ||	
           d|	i|d <   /                     |          }
g }t%          t	          |
d         d                             D ]|
d         d                  U|                    t(                              t(                               fd|
D                       d                     k|                    d            t           j
                                                  D ]\  }|         |_                             t4                      xj        dz  c_        t          t9           j        |                    D ]y\  \  }}|d                              t<                               |          d|                                 tB           j        tD           j        dz
  |j        pdz
  i           z|                     #                                           |S )NzPMust call Policy.load_batch_into_buffer() before Policy.learn_on_loaded_batch()!minibatch_sizesgd_minibatch_sizetrain_batch_sizerH   r   r=   c              3   4   K   | ]}t          |          V  d S r  r  rB   r   s     rE   r  z4TorchPolicy.learn_on_loaded_batch.<locals>.<genexpr>B  s(      #W#WqCFF#W#W#W#W#W#WrG   c                 *    g | ]}|z            S r?   r?   )rB   r  device_batch_sizer
  s     rE   rF   z5TorchPolicy.learn_on_loaded_batch.<locals>.<listcomp>E  s9        &6$5556  rG   r   r   tower_c                 \    g | ](}|d                                        j                  )S rJ   rZ   rA   )rB   trM   rD   s     rE   rF   z5TorchPolicy.learn_on_loaded_batch.<locals>.<listcomp>\  s/    $T$T$TQqT!WZZ%<%<$T$T$TrG   )dimr,   )$r   r{   r5   r   rn   r[   rr   r   r   r   r,   
state_dictload_state_dictr  r|   r   r   _multi_gpu_parallel_grad_calcri   r}   r7   meanstackr   gradr   r   r   zipr   r   extra_grad_infor   r   r   extra_compute_grad_fetches)rD   r
  r   r  r   r  towerdevice_batchesbatch_fetchesr   tower_outputs	all_gradsr]   r,   r  rM   s   ``            @@rE   learn_on_loaded_batchz!TorchPolicy.learn_on_loaded_batch  s   #L1 	2   !KOO,<==$ $$./! ! 	c$,///   	*  				 t|!!dl1o&:e&C&C1$$$$ C(<Q(?(B$C$CCC,Q/2,Q/26FEV<V3VW&&u---t|q  ..00J(+tz9999.qrr2 2 2%%j1111#W#WD4H4V#W#W#W W WWW!1,?NN    -l;  N !.11 	M 	MHAuNN,,~ -    ,<^*LM,1,,'' ::>JJ 	s=+A.//00 		' 		'AQ"1%1  JJ$T$T$T$T$Tm$T$T$TUU         &&&&dj335566 	" 	"DAqq\AFF:;;;"!*3t/Dn+U+U!V!V 	 	A~u,1,,'..%t';';E'B'BU]]__-t/D <-1U5K5PqQ	    	T<<>>???rG   c                 "   t          | j                  dk    sJ |j        s#t          || j        d| j        | j                   |                    d           |                     || j        d                    | 	                    |g          }|d         \  }}|dxx         t          | j
                  z  cc<   |                    |                     |                     |                                 }|t          |fi t          |ifS )NrH   Fr   Tr   r@   allreduce_latency)rn   r[   zero_paddedr   r1   r   r   r   r   r  r   r   r  r   dictr   )rD   r   r$  r%  	grad_infor   s         rE   r   zTorchPolicy.compute_gradients{  s6    4<  A%%%% #. 	/) ,'+'B"&"8    	((...24<?KKK ::<O;PQQ,Q/	9%&&&#d.>*?*??&&&--.ABBCCC1133$wII+<i*HIIIIrG   	gradientsc                 .   |t           k    r0t          | j                  D ]\  }}|                                 d S t	          | j                  dk    sJ t          || j                                                  D ]x\  }}|qt          	                    |          r |
                    | j                  |_        At                              |          
                    | j                  |_        y| j        d                                          d S )NrH   r   )r   r|   r   steprn   r  r,   r   r7   	is_tensorrZ   rA   r  
from_numpy)rD   r,  rM   optgr]   s         rE   r   zTorchPolicy.apply_gradients  s   555#D$455  3



  t'((A----Itz'<'<'>'>?? E E1=q)) E!"dk!2!2!&!1!1!!4!4!7!7!D!DQ$$&&&&&rG   
stats_namec                 
    g } j         D ]A}||j        v r6|                    t          j         fd|j        |                              Bt          |          dk    s#J d| dt           j                    d            |S )a  Returns list of per-tower stats, copied to this Policy's device.

        Args:
            stats_name: The name of the stats to average over (this str
                must exist as a key inside each tower's `tower_stats` dict).

        Returns:
            The list of stats tensor (structs) of all towers, copied to this
            Policy's device.

        Raises:
            AssertionError: If the `stats_name` cannot be found in any one
            of the tower's `tower_stats` dicts.
        c                 8    |                      j                  S r  r  )r   rD   s    rE   <lambda>z-TorchPolicy.get_tower_stats.<locals>.<lambda>  s    !$$t{"3"3 rG   r   zStats `z+` not found in any of the towers (you have zV towers in total)! Make sure you call the loss function on at least one of the towers.)rr   tower_statsr}   treemap_structurern   )rD   r3  datar!  s   `   rE   get_tower_statszTorchPolicy.get_tower_stats  s     * 	 	EU...&3333U5Fz5R   
 4yy1}}}Mj M M4())M M M }}
 rG   c                 l    d | j                                                                         D             S )Nc                     i | ]>\  }}||                                                                                                 ?S r?   )r=   detachnumpy)rB   r   vs      rE   rQ   z+TorchPolicy.get_weights.<locals>.<dictcomp>  s<    XXX115577>>##))++XXXrG   )r,   r  itemsrD   s    rE   get_weightszTorchPolicy.get_weights  s0    XX
8M8M8O8O8U8U8W8WXXXXrG   weightsc                 f    t          || j                  }| j                            |           d S Nr@   )r"   rA   r,   r  )rD   rD  s     rE   set_weightszTorchPolicy.set_weights  s2    )'$+FFF
""7+++++rG   c                     | j         S r  )r   rB  s    rE   is_recurrentzTorchPolicy.is_recurrent  s    !!rG   c                 N    t          | j                                                  S r  )rn   r,   r   rB  s    rE   num_state_tensorszTorchPolicy.num_state_tensors  s    4://11222rG   c                 H    d | j                                         D             S )Nc                 ~    g | ]:}|                                                                                                 ;S r?   )r>  r=   r?  r  s     rE   rF   z1TorchPolicy.get_initial_state.<locals>.<listcomp>  s6    QQQQ

  &&((QQQrG   )r,   r   rB  s    rE   r   zTorchPolicy.get_initial_state  s$    QQ$*2N2N2P2PQQQQrG   c                 D   t                                                      }g |d<   t          | j                  D ]A\  }}t	          |                                          }|d                             |           B| j        r| j                                        |d<   |S )N_optimizer_variables_exploration_state)rc   	get_stater|   r   r   r  r}   r   )rD   staterM   r   optim_state_dictr   s        rE   rQ  zTorchPolicy.get_state  s    !!##(*$%d.// 	C 	CDAq/??()001ABBBB 	G +/*:*D*D*F*FE&'rG   rR  c                    |                     dd           }|rt          |          t          | j                  k    sJ t          | j        |          D ]C\  }}d|d         i}t	          |d         | j                  |d<   |                    |           Dt          | d          r%d|v r!| j        	                    |d                    |d         | _
        t                      	                    |           d S )	NrO  r   rR  r@   r   rP  )rR  global_timestep)r   rn   r   r  r"   rA   r  rs   r   	set_staterU  rc   )rD   rR  optimizer_varsr   r   rS  r   s         rE   rV  zTorchPolicy.set_state  s$    #94@@ 
	4~&&#d.>*?*?????D,n== 4 41 %3An4E#F ,CgJt{- - - ) !!"233334'' 	J,@E,I,I&&U3G-H&III  %%67 	%     rG   r   ztorch.optim.Optimizerc                     i S )a  Called after each optimizer.zero_grad() + loss.backward() call.

        Called for each self._optimizers/loss-value pair.
        Allows for gradient processing before optimizer.step() is called.
        E.g. for gradient clipping.

        Args:
            optimizer: A torch optimizer object.
            loss: The loss tensor associated with the optimizer.

        Returns:
            An dict with information on the gradient processing step.
        r?   )rD   r   r-   s      rE   extra_grad_processzTorchPolicy.extra_grad_process  s	      	rG   c                     t           i iS )zExtra values to fetch and return from compute_gradients().

        Returns:
            Extra fetch dict to be added to the fetch dict of the
            `compute_gradients` call.
        r   rB  s    rE   r   z&TorchPolicy.extra_compute_grad_fetches  s     "2&&rG   r   c                     i S )a  Returns dict of extra info to include in experience batch.

        Args:
            input_dict: Dict of model input tensors.
            state_batches: List of state tensors.
            model: Reference to the model object.
            action_dist: Torch action dist object
                to get log-probs (e.g. for already sampled actions).

        Returns:
            Extra outputs to return in a `compute_actions_from_input_dict()`
            call (3rd return value).
        r?   )rD   r   r   r,   r   s        rE   extra_action_outzTorchPolicy.extra_action_out   s	    ( 	rG   r   c                     i S )zReturn dict of extra grad info.

        Args:
            train_batch: The training batch for which to produce
                extra grad info for.

        Returns:
            The info dict carrying grad info per str key.
        r?   )rD   r   s     rE   r  zTorchPolicy.extra_grad_info6  s	     	rG   c                 `   t          | d          rEt          j                            | j                                        | j        d                   g}n7t          j                            | j                                                  g}| j        r| j                            |          }|S )zCustom the local PyTorch optimizer(s) to use.

        Returns:
            The local PyTorch optimizer(s) to use for this Policy.
        r5   lr)r_  )	rs   r7   optimAdamr,   r   r5   r   get_exploration_optimizer)rD   
optimizerss     rE   r   zTorchPolicy.optimizerB  s     4"" 	E  !6!6!8!8T[=N OOJJ  +**4:+@+@+B+BCCDJ 	P)CCJOOJrG   
export_dironnxc                      t          j        |d           |r                      j                   d j        vr1t	          j        dg          x j        d<    j        t          j        <    j        t          j                 }g }d}d                    |           j        v rT|	                     j        d                    |                              |dz  }d                    |           j        v T fd j        
                                D             }t           j                            |d	          }t          j                             j        |||f|d|dt#          |
                                          d
t          j        gz   ddgd t#          |
                                          d
t          j        gz   D             	  	         dS t           j                            |d          }	 t                               j        |           dS # t&          $ rV t           j                            |          rt          j        |           t,                              t0                     Y dS w xY w)aP  Exports the Policy's Model to local directory for serving.

        Creates a TorchScript model and saves it.

        Args:
            export_dir: Local writable directory or filename.
            onnx: If given, will export model in ONNX format. The
                value of this parameter set the ONNX OpSet version to use.
        T)exist_ok
state_in_0g      ?r   zstate_in_{}rH   c                 8    i | ]}|d k    |j         |         S )r   )_dummy_batch)rB   r   rD   s     rE   rQ   z,TorchPolicy.export_model.<locals>.<dictcomp>p  s8       %% 4$Q'%%%rG   z
model.onnx	state_insoutput
state_outsc                     i | ]}|d diS )r   
batch_sizer?   )rB   r   s     rE   rQ   z,TorchPolicy.export_model.<locals>.<dictcomp>  s/        <(  rG   )export_paramsopset_versiondo_constant_foldinginput_namesoutput_namesdynamic_axeszmodel.pt)fN)osmakedirsr   rj  r   arrayr   SEQ_LENSrT   r}   r   pathjoinr7   re  exportr,   rh   save	Exceptionexistsremoverl   warningr   )	rD   rd  re  r   rk  rM   dummy_inputs	file_namefilenames	   `        rE   export_modelzTorchPolicy.export_modelT  s    	J.... /	G""4#4555 4#444 HcUOO$!,/$2C(3 ()=>HIA&&q))T->>>  !2=3G3G3J3J!KLLLQ  &&q))T->>>   *//11  L Z>>IJ
y(3""$( !2!2!4!455 456&5 !,"3"3"5"566"K$89:        & w||J
;;HG

4:
22222 G G G7>>(++ (Ih'''EFFFFFFGs   :!H AI=<I=import_filec                 6    | j                             |          S )z!Imports weights into torch model.)r,   import_from_h5)rD   r  s     rE   import_model_from_h5z TorchPolicy.import_model_from_h5  s     z((555rG   c           
         ||n| j         d         }||n| j        }|duo|g k    | _        | j        r| j                                         | j        rHdx}}|                     | | j        ||||          }t          |          dk    r	|\  }	}
}}ng|\  }	}
}n_| j                            ||           | j	        r	 | 	                    | | j        |||||d          \  }}}n# t          $ r_}d|j        d         v sd	|j        d         v r5| 	                    | | j        |t          j                 ||d
          \  }}}n|Y d}~n)d}~ww xY w| j        }|                     |||          \  }}t          |t           j                  s<t%          |t&                    s't)          d                    |j                             ||| j                  }| j                            |||          \  }	}
|	|t          j        <   |                     ||| j        |          }|||t          j        <   |
Ht6                              |
                                          |t          j        <   |
|t          j        <   | xj        t          |t          j                           z  c_        tA          |	||f          S )zShared forward pass logic (w/ and w/o trajectory view API).

        Returns:
            A tuple consisting of a) actions, b) state_out, c) extra_fetches.
        Nr   )r   r      F)r   r   r   r   r   r   r   r   r   )r   r   r   z`dist_class` ({}) not a TorchDistributionWrapper subclass! Make sure your `action_distribution_fn` or `make_model_and_action_dist` return a correct distribution class.)action_distributionr   r   )!r5   rU  r   r,   evalr/   rn   r   r   r0   r   r   r   r   r   
isinstance	functoolspartial
issubclassr   r{   rT   __name__get_exploration_actionr   r\  ACTION_DIST_INPUTSr7   expfloatACTION_PROBACTION_LOGPr   )rD   r   r   r   r   r   r   r   action_sampler_outputsr   r   r   r   r   extra_fetchess                  rE   r   z"TorchPolicy._compute_action_helper  sQ    %0''dk)6L'3889M*$6N=B;N : 	JOO! G	(,,K+%)%;%;
! &< & &" )**a//8N5{II+A(yy 33Gh3WWW* %Y 9=9T9T
#-&3!) '!)$) :U 	: 	:6KYY !      -::8AF1IEE !77  J&{':;$+%-(- 8  	'&%I   &IIII ( "_
)-Jx)X)X&Y :y'899	j*BCC	 !* +1&1D*E*E	   %*[$*==K !,CC$/(G D  MGT +2
;&' --tz;
 

 "<GM+89 5:YYtzz||5L5LM+1259M+12 	J{/B$C D DD)] CDDDs   ?&C& &
E0AE

Ec                     t          |t                    st          |          }|                    t          j        t
          |p| j                             |S rF  )r  r   set_get_interceptorr  r  r"   rA   )rD   r   rA   s      rE   r   zTorchPolicy._lazy_tensor_dict  s`    -{;; 	C"-.A"B"B//5f>STTT	
 	
 	
 #"rG   sample_batchesc                     t           j                  t          |          k    sJ t          j                    i t                                           fdt           j                  dk    s j        d         rt          t           j        | j                            D ]Y\  }\  }}} ||||           t                    dz
           }t          |d         t                    r|d         |d         Znffdt          t           j        | j                            D             }|D ]}|                                 |D ]}|                                 g }	t          t          |                    D ]O}|         }
t          |
d         t                    r|
d         |
d         |	                    |                    P|	S )ah  Performs a parallelized loss and gradient calculation over the batch.

        Splits up the given train batch into n shards (n=number of this
        Policy's devices) and passes each data shard (in parallel) through
        the loss function using the individual devices' models
        (self.model_gpu_towers). Then returns each tower's outputs.

        Args:
            sample_batches: A list of SampleBatch shards to
                calculate loss and gradients for.

        Returns:
            A list (one item per device) of 2-tuples, each with 1) gradient
            list and 2) grad info dict.
        c                 P   t                                          	 |j        dk    rt                      nt           j                            |          5  t                              |j        |                    }|	                    ||          }t          |          t          j                  k    sJ ddi}t          |                                          }d t          t          |                    D             }t          j                  D ]\  }}	j        |         }
t          |          D ].\  }}||
v r%|j        |j        j                                         /||                             d           |                                        |	||                              g }t          |          D ]4\  }}||
v r+|j        |                    |j                   |j        ||<   5j        rt3          j                    }t           j                                        r;|D ]7}t           j                            |t           j        j        j                   8n5t           j                            |t           j        j        j                   |	j         D ])}|d         D ]}|j        |xj        j        z  c_        *|dxx         t3          j                    |z
  z  cc<   	 d d d            n# 1 swxY w Y   5  ||f| <   d d d            d S # 1 swxY w Y   d S # tB          $ rb}d	d l"}5  tG          d
|  d| d| d|$                                 d	          |f| <   d d d            n# 1 swxY w Y   Y d }~d S Y d }~d S d }~ww xY w)Nr=   r(  g        c                     g | ]}d S r  r?   ra   s     rE   rF   zNTorchPolicy._multi_gpu_parallel_grad_calc.<locals>._worker.<locals>.<listcomp>:  s     F F F! F F FrG   T)retain_graph)opr^   r   zError In tower z on device z2 during multi GPU parallel gradient calculation:: z
Traceback: 

)%r7   set_grad_enabledr   r   rj   rA   r   r   r   custom_lossrn   r   rh   r   ri   r|   r   r  r:  zero_backwardr   rY  r}   r   timeis_availabledistributed
all_reduceReduceOpSUMall_reduce_coalescedr   r  	tracebackr{   
format_exc)	shard_idxr,   sample_batchrA   loss_outr+  r   r%  opt_idxr1  r   	param_idxparamr   startr2  param_groupr]   r   r  grad_enabledlockresultsrD   s                       rE   _workerz:TorchPolicy._multi_gpu_parallel_grad_calc.<locals>._worker&  s_   ""<000N-3[E-A-A')))uzGXGXH H ;R ;R  *

4NN   H  %00<HHHx==C0@,A,AAAAA "5c :I!%e&6&6&8&8!9!9J F FuS__/E/E F F FI(1$2B(C(C )R )R )-(CG(L09*0E0E 8 8,Iu(M99ej>T %
 5 5 7 7 7 )222EEE!(( 33C'9JKK   !# 1:*0E0E B B,Iu(M99#(:#9$)LL$<$<$<7<z	) 46 R$(IKKE$z6688 
" */ !& !&A$)$5$@$@()e.?.H.L %A %& %& %& %&!&
 !& 1 F F$)e.?.H.L !G !" !" !" 03/? N N)4X)> !N !NA'(v'9()$2M(M!N &&9:::dikkE>QQ:::S)R%;R ;R ;R ;R ;R ;R ;R ;R ;R ;R ;R ;R ;R ;R ;Rz  @ @*3Y)?GI&@ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @         ":i : :%: : "#: :
  )3355: : :  
*GI&                          s   8L9 J(L
>L9 
LL9 LL9 L,L9 ,L00L9 3L04L9 9
N%N 	4N	=N 	N	N N	N  N%rH   r<   r   c           	      R    g | ]#\  }\  }}}t          j        ||||f           $S ))targetr   )r~   Thread)rB   r  r,   r  rA   r  s        rE   rF   z=TorchPolicy._multi_gpu_parallel_grad_calc.<locals>.<listcomp>  sV        =I<|V  ")UL&)Q    rG   )rn   rr   r~   Lockr7   is_grad_enabledr[   r5   r|   r  r  r{   r  r|  ri   r  r}   )rD   r  r  r,   r  rA   last_resultthreadsthreadoutputsrl  r  r  r  r  s   `          @@@@rE   r  z)TorchPolicy._multi_gpu_parallel_grad_calc  s9   $ 4())S-@-@@@@@~,,..P	 P	 P	 P	 P	 P	 P	 P	h t|!!T[%>!<ED)>4<HH= = = =8	8E< 	5,???%c'llQ&67k!nj99 =%a.k!n<==    AJ-~t|LLA A	  G "  !   s>2233 	/ 	/IY'F&)Y// /QiVAY.NN79-....rG   )NN)NNNNNNN)NNNTrJ   )r   r   r  )>r  
__module__r   __doc__gymspacesSpacer#   r   r   r   r   r   r
   r   r   r   r(   r   r	   ro   rd   r   r   strboolr   r'   rh   r   r!   r   r   r  r	  r&  r%   r   r   r;  r&   rC  rG  rI  rK  r   r   rQ  rV  rY  r   r   r\  r  r   r  r  r   r   r$   r  __classcell__r   s   @rE   r*   r*   <   s	       :: )- NR  HL=b
 b
 b
:+b
 j&b
 $	b
 %b
 $'?"@+Nj$z"2235
b
 $,D1I,J#Kb
 $T*-.*j$z2BBC*j*d:>NNOQ
b
. !)*j*Ej$'?"@$zBRRSU!
/b
: ;b
< %-Xvhm-D$E=b
 b
 b
 b
 b
 b
H Xf "&	 j)  3-	 
z4
+T#z/-BB	C   < Xf 59MQMQ04"&"& ./1AAB  Z 01 !&6!79I!IJ	
 !&6!79I!IJ T#t)_- $ 3- 
j!14Z3HH	I   > Xf
 59  #'R# R#t,-/??@R# ./1AABR#  Z 01	R#
 $$'(*::;
R# $$'(*::;
R# !R# 
R# R# R#  YR#h Xf$+ $$sJBW $ $ $  Y$L Xf / // / 
	/ / / /b XfG Gs G3 G G G G
 Xf_ _C _3 _ _ _ _B XfJ[ J^ J J J  YJ8 Xf' 'D ' ' ' ' # $7G2H    < XfY\ Y Y Y Y Xf,< ,D , , , , Xf"d " " " " Xf33 3 3 3 3 XfR4
#3 R R R R Xf;       Xf!{ !t ! ! ! ! ! !008B	c:o	   $'DcN ' ' ' 'j) J' 	
 . 
c:o	   ,
; 
4Z;P 
 
 
 
	t+,.EE	F   $ Xf;G ;Gs ;G(3- ;G4 ;G ;G ;G ;Gz Xf6 6 6 6 6 6 lE lE YlE\# #[ # # # #K";/K	eD$l23	4K K K K K K K KrG   r*   c                   2     e Zd ZdZdZ fdZd Zd Z xZS )DirectStepOptimizerzsTypesafe method for indicating `apply_gradients` can directly step the
    optimizers with in-place gradients.
    Nc                     t           j        +t                                          |           t           _        t           j        S r  )r  	_instancerc   __new__)clsr   s    rE   r  zDirectStepOptimizer.__new__  s.    (0,1GGOOC,@,@)",,rG   c                 >    t          |           t          |          u S r  )r   )rD   others     rE   __eq__zDirectStepOptimizer.__eq__  s    DzzT%[[((rG   c                     dS )Nr  r?   rB  s    rE   __repr__zDirectStepOptimizer.__repr__  s    $$rG   )	r  r  r   r  r  r  r  r  r  r  s   @rE   r  r    sg          I- - - - -
) ) )% % % % % % %rG   r  )MrK   r  loggingrp   rw  r~   r  typingr   r   r   r   r   r   r	   r
   r   	gymnasiumr  r?  r   r8  ru   ray.rllib.models.catalogr   ray.rllib.models.modelv2r   (ray.rllib.models.torch.torch_action_distr   $ray.rllib.models.torch.torch_modelv2r   ray.rllib.policy.policyr   r   ray.rllib.policy.rnn_sequencingr   ray.rllib.policy.sample_batchr   ray.rllib.utilsr   r   ray.rllib.utils.annotationsr   r   ray.rllib.utils.errorr   ray.rllib.utils.frameworkr   ray.rllib.utils.metricsr   r   r   $ray.rllib.utils.metrics.learner_infor   ray.rllib.utils.numpyr   "ray.rllib.utils.spaces.space_utilsr    ray.rllib.utils.threadingr!   ray.rllib.utils.torch_utilsr"   ray.rllib.utils.typingr#   r$   r%   r&   r'   r(   r7   nn	getLoggerr  rl   r*   r  r   r?   rG   rE   <module>r     sK          				     
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
          



 1 1 1 1 1 1 , , , , , , M M M M M M = = = = = = 7 7 7 7 7 7 7 7 O O O O O O 5 5 5 5 5 5 : : : : : : : : = = = = = = = = H H H H H H 6 6 6 6 6 6         
 C B B B B B 2 2 2 2 2 2 ? ? ? ? ? ? / / / / / / ? ? ? ? ? ?                	r		8	$	$ ] ] ] ] ]& ] ] ]@# % % % % % % % %& !4 3 5 5   rG   