
    &`iN                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZm	Z	m
Z
mZmZmZmZmZ d dlZd dlZd dlZd dlmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dl m!Z! d d	l"m#Z# d d
l$m%Z% d dl&m'Z' d dl(m)Z)m*Z* d dl+m,Z,m-Z-m.Z.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6m7Z7m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z> d dl?m@Z@ d dlAmBZBmCZC d dlDmEZEmFZFmGZGmHZHmIZImJZJmKZK  e4            \  ZLZM ejN        eO          ZPe, G d de!                      ZQdS )    N)AnyDictListOptionalSetTupleTypeUnion)version)ModelCatalog)ModelV2)TorchDistributionWrapper)TorchModelV2)Policy)#pad_batch_to_sequences_of_same_size)SampleBatch)_directStepOptimizerSingleton)NullContextManager
force_list)OldAPIStackOverrideToImplementCustomLogic5OverrideToImplementCustomLogic_CallToSuperRecommendedis_overriddenoverride)&ERR_MSG_TORCH_POLICY_CANNOT_SAVE_MODEL)try_import_torch)'DIFF_NUM_GRAD_UPDATES_VS_SAMPLER_POLICYNUM_AGENT_STEPS_TRAINEDNUM_GRAD_UPDATES_LIFETIMELEARNER_STATS_KEY)convert_to_numpy)normalize_action)	with_lock)TORCH_COMPILE_REQUIRED_VERSIONconvert_to_torch_tensor)AlgorithmConfigDictGradInfoDictModelGradientsModelWeightsPolicyStateTensorStructType
TensorTypec                   	    e Zd ZdZdddej        j        dej        j        dedef fdZ	d	 Z
e ee          d
edee         dedeeee         f         fd                        Zed
edededeeeeee         f         fd            Zed
edededeeeee         f         fd            Zedefd            Zedeeee         f         fd            Zedefd            Zededeeef         fd            Ze dddedeeef         fd            Z!e deee"f         fd            Z#e deeef         dee         d
e$dedeeef         f
d            Z% ee          e 	 	 dQd ed!e&ee"ef                  defd"                        Z'edeed         df         fd#            Z(d$ Z) ee          	 	 dQdeeef         d%e*d&e&e         deeee         eeef         f         fd'            Z+ ee          	 	 	 	 	 	 	 dRdeee,         e,f         de&ee                  d(eee,         e,f         d)eee,         e,f         d*e&eee-f                  d%e&e*         d&e&e         dee,ee         eeef         f         fd+            Z.e/ ee          	 	 	 	 	 dSd-eee,         e,f         deee,         e,f         de&ee                  d(e&eee,         e,f                  d)e&eee,         e,f                  d.e*d/e*defd0                        Z0e/ ee          d1edeeef         fd2                        Z1 ee          	 dTd4ed5edefd6            Z2 ee          dTd5edefd7            Z3 ee          dUd8ed5efd9            Z4e/ ee          d1ede5fd:                        Z6 ee          d;e5ddfd<            Z7d=edee,         fd>Z8 ee          de9fd?            Z: ee          d@e9ddfdA            Z; ee          de*fdB            Z< ee          defdC            Z= ee          dee         fdD            Z> ee          e de?f fdE                        Z@ ee          e dFe?ddf fdG                        ZA ee          dVdHedIe&e         ddfdJ            ZB ee          dKeddfdL            ZCe/dM             ZDdVd1efdNZEdOee         deeee         eFf                  fdPZG xZHS )WTorchPolicyV2z0PyTorch specific Policy class to use with RLlib.   )max_seq_lenobservation_spaceaction_spaceconfigr1   c                
    dx _         |d<   d _        t                                          |||                                            \  }                                 t          t          t          j	        
                                                    }t                              dt          |           d           |d         sdk    s|st                              d           _         fd	t          t          t!          j                            pd
          D              _        fdt          t          t!          j                            pd
          D              _        t)           d          r fd j        D              _         _        nst.          j        j                                        t.          j        j        j        k    rt/          j                    }t          |          k     rt;          d| d d          fdt=          |          D              _         j        d          _        fdt=          |          D             }g  _        t=          |          D ]Q\  }}	t?          j                   }
 j        !                    |
"                     j        |                              Rt)           d          r% fdt=           j                  D              _         j        d          _        | _#         _$        tK          j&                     _'         j        (                                 _)        t          tU          j+         j)                            dk     _,         -                                  j.        /                     j        j.                    0                                 _1        te           3                                           _4        d _5        g  _6        d t=           j        7                                          D             } j4        D ]k}g }t=          |j8                  D ]+\  }}|d         D ]}|!                    ||                    , j6        !                    ts          |                     l j:        ;                    dd
          }d t          |          D              _<        d _=         >                                 _?        | _@        i  _A        t)           j        d          s j        D ]i  jA        <   dS dS )a  Initializes a TorchPolicy instance.

        Args:
            observation_space: Observation space of the policy.
            action_space: Action space of the policy.
            config: The Policy's config dict.
            max_seq_len: Max sequence length for LSTM training.
        torch	frameworkFzFound z visible cuda devices.
_fake_gpusr   cpuc                     g | ]	}j         
S  device).0_selfs     t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/policy/torch_policy_v2.py
<listcomp>z*TorchPolicyV2.__init__.<locals>.<listcomp>q   s    VVVADKVVV       c                 H    g | ]}|d k    rnt          j                  S r   )copydeepcopy)r>   imodels     rA   rB   z*TorchPolicyV2.__init__.<locals>.<listcomp>r   s?     % % % aT]5%9%9% % %rC   target_modelc                      i | ]
}|j         S r;   )rK   )r>   mr@   s     rA   
<dictcomp>z*TorchPolicyV2.__init__.<locals>.<dictcomp>w   s+     & & &-.At(& & &rC   z7TorchPolicy was not able to find enough GPU IDs! Found z, but num_gpus=.c                 |    g | ]8\  }}|k     t                               d                     |                    9S )zcuda:{})r6   r=   formatr>   rI   id_num_gpuss      rA   rB   z*TorchPolicyV2.__init__.<locals>.<listcomp>   sH       Asx<< Y--a0011<<rC   c                 &    g | ]\  }}|k     |S r;   r;   rR   s      rA   rB   z*TorchPolicyV2.__init__.<locals>.<listcomp>   s"    HHH61c1x<<3<<<rC   c                     i | ]=\  }}|t          j        j                                      j        |                   >S r;   )rG   rH   rK   todevices)r>   rI   rM   r@   s      rA   rN   z*TorchPolicyV2.__init__.<locals>.<dictcomp>   sP     & & &1 t}T%677::4<?KK& & &rC   Nc                     i | ]\  }}||	S r;   r;   )r>   rI   ps      rA   rN   z*TorchPolicyV2.__init__.<locals>.<dictcomp>   s    KKK1q!KKKrC   paramsnum_multi_gpu_tower_stacksc                     g | ]}g S r;   r;   r>   r?   s     rA   rB   z*TorchPolicyV2.__init__.<locals>.<listcomp>   s    ???q???rC   tower_stats)Br7   _loss_initializedsuper__init___init_model_and_dist_class_get_num_gpus_for_policylistranger6   cudadevice_countloggerinfolenr=   intmathceilrX   model_gpu_towershasattrtarget_modelsrJ   ray_privateworker_modeWORKER_MODEget_gpu_ids
ValueError	enumeraterG   rH   appendrW   
dist_classunwrapped_model	threadingRLock_lockget_initial_state_state_inputstreeflatten_is_recurrent/_update_model_view_requirements_from_init_stateview_requirementsupdate_create_explorationexplorationr   	optimizer_optimizers_lossmulti_gpu_param_groups
parametersparam_groupssetr4   get_loaded_batchesdistributed_world_sizeget_batch_divisibility_reqbatch_divisibility_reqr1   r_   )r@   r2   r3   r4   r1   r{   gpu_idsidsrI   r?   
model_copymain_paramsoparam_indicespg_idxpgrZ   num_buffersrJ   rT   	__class__s   `                 @@rA   rb   zTorchPolicyV2.__init__B   sn     076,!&*L&AAA !;;==z 0022uUZ44667788AS\\AAABBB , -	28q===,,u--DKVVVVs49X;N;N7O7O7TST1U1UVVVDL% % % %s49X#6#677<1==% % %D! t^,, & & & &262G& & &" DJJ |"((**cl.A.MMM/++7||h&& ;; ;/7; ; ;  
   '00  DL
 ,q/DKHHHHYw%7%7HHHC$&D!!# M M1!]511
%,,Z]]4<?-K-KLLLLt^,, & & & & )$*? @ @& & &" .q1DJ$$ _&&
!Z99;; d.@!A!ABBQF<<>>>%%dj&BCCC3355%dnn&6&677 

 79#KK	$*2G2G2I2I(J(JKKK! 	C 	CAM'77 9 9
H 9 9A!((Q88889'..s=/A/ABBBB koo&BAFF??E+,>,>??? '+#&*&E&E&G&G#& tz=11 	-. - -*, ''	- 	-- -rC   c                     | j         S N)r`   r@   s    rA   loss_initializedzTorchPolicyV2.loss_initialized   s    %%rC   rJ   r{   train_batchreturnc                     t           )a  Constructs the loss function.

        Args:
            model: The Model to calculate the loss for.
            dist_class: The action distr. class.
            train_batch: The training data.

        Returns:
            Loss tensor given the input batch.
        )NotImplementedError)r@   rJ   r{   r   s       rA   losszTorchPolicyV2.loss   s
    $ "!rC   	obs_batchstate_batchesc                    dS )ae  Custom function for sampling new actions given policy.

        Args:
            model: Underlying model.
            obs_batch: Observation tensor batch.
            state_batches: Action sampling state batch.

        Returns:
            Sampled action
            Log-likelihood
            Action distribution inputs
            Updated state
        )NNNNr;   r@   rJ   r   r   kwargss        rA   action_sampler_fnzTorchPolicyV2.action_sampler_fn   s
    , &%rC   c                    dS )aC  Action distribution function for this Policy.

        Args:
            model: Underlying model.
            obs_batch: Observation tensor batch.
            state_batches: Action sampling state batch.

        Returns:
            Distribution input.
            ActionDistribution class.
            State outs.
        )NNNr;   r   s        rA   action_distribution_fnz$TorchPolicyV2.action_distribution_fn  s
    *  rC   c                     dS )zCreate model.

        Note: only one of make_model or make_model_and_action_dist
        can be overridden.

        Returns:
            ModelV2 model.
        Nr;   r   s    rA   
make_modelzTorchPolicyV2.make_model  s	     trC   c                     dS )zCreate model and action distribution function.

        Returns:
            ModelV2 model.
            ActionDistribution class.
        NNr;   r   s    rA   make_model_and_action_distz(TorchPolicyV2.make_model_and_action_dist(  s	     zrC   c                     dS )zrGet batch divisibility request.

        Returns:
            Size N. A sample batch must be of size K*N.
        rD   r;   r   s    rA   r   z(TorchPolicyV2.get_batch_divisibility_req4  s	     qrC   c                     i S )zStats function. Returns a dict of statistics.

        Args:
            train_batch: The SampleBatch (already) used for training.

        Returns:
            The stats dict.
        r;   )r@   r   s     rA   stats_fnzTorchPolicyV2.stats_fn>  s	     	rC   r   ztorch.optim.Optimizerr   c                     i S )a  Called after each optimizer.zero_grad() + loss.backward() call.

        Called for each self._optimizers/loss-value pair.
        Allows for gradient processing before optimizer.step() is called.
        E.g. for gradient clipping.

        Args:
            optimizer: A torch optimizer object.
            loss: The loss tensor associated with the optimizer.

        Returns:
            An dict with information on the gradient processing step.
        r;   )r@   r   r   s      rA   extra_grad_processz TorchPolicyV2.extra_grad_processJ  s	    " 	rC   c                     t           i iS )zExtra values to fetch and return from compute_gradients().

        Returns:
            Extra fetch dict to be added to the fetch dict of the
            `compute_gradients` call.
        r    r   s    rA   extra_compute_grad_fetchesz(TorchPolicyV2.extra_compute_grad_fetches]  s     "2&&rC   
input_dictaction_distc                     i S )a  Returns dict of extra info to include in experience batch.

        Args:
            input_dict: Dict of model input tensors.
            state_batches: List of state tensors.
            model: Reference to the model object.
            action_dist: Torch action dist object
                to get log-probs (e.g. for already sampled actions).

        Returns:
            Extra outputs to return in a `compute_actions_from_input_dict()`
            call (3rd return value).
        r;   )r@   r   r   rJ   r   s        rA   extra_action_outzTorchPolicyV2.extra_action_outg  s	    * 	rC   Nsample_batchother_agent_batchesc                     |S )aW  Postprocesses a trajectory and returns the processed trajectory.

        The trajectory contains only data from one episode and from one agent.
        - If  `config.batch_mode=truncate_episodes` (default), sample_batch may
        contain a truncated (at-the-end) episode, in case the
        `config.rollout_fragment_length` was reached by the sampler.
        - If `config.batch_mode=complete_episodes`, sample_batch will contain
        exactly one episode (no matter how long).
        New columns can be added to sample_batch and existing ones may be altered.

        Args:
            sample_batch: The SampleBatch to postprocess.
            other_agent_batches (Optional[Dict[PolicyID, SampleBatch]]): Optional
                dict of AgentIDs mapping to other agents' trajectory data (from the
                same episode). NOTE: The other agents use the same policy.
            episode (Optional[Episode]): Optional multi-agent episode
                object in which the agents operated.

        Returns:
            SampleBatch: The postprocessed, modified SampleBatch (or a new one).
        r;   )r@   r   r   episodes       rA   postprocess_trajectoryz$TorchPolicyV2.postprocess_trajectory~  s
    : rC   c                 `   t          | d          rEt          j                            | j                                        | j        d                   g}n7t          j                            | j                                                  g}| j        r| j                            |          }|S )zCustom the local PyTorch optimizer(s) to use.

        Returns:
            The local PyTorch optimizer(s) to use for this Policy.
        r4   lr)r   )	rp   r6   optimAdamrJ   r   r4   r   get_exploration_optimizer)r@   
optimizerss     rA   r   zTorchPolicyV2.optimizer  s     4"" 	E  !6!6!8!8T[=N OOJJ  +**4:+@+@+B+BCCDJ 	P)CCJOOJrC   c           
         t          | j                  r#t          | j                  rt          d          t          | j                  rD|                                 }t	          j        | j        | j        d         | j                  \  }}nt          | j                  r|                                 \  }}nbt	          j        | j        | j        d         | j                  \  }}t	          j	        | j
        | j        || j        d         | j                  }| j                            d          rt          6t          j        t          j                  t           k     rt          d          | j                            d          rdnd	}t                              || j                            d
| dd          d| j                            d
| d                    }||fS )NzGOnly one of make_model or make_model_and_action_dist can be overridden.rJ   )r7   )	obs_spacer3   num_outputsmodel_configr7   torch_compile_learnerz3`torch.compile` is not supported for torch < 2.0.0!worker_indexlearnerrt   torch_compile__dynamo_backendinductorF_dynamo_mode)backenddynamicmode)r   r   r   rx   r   get_action_distr3   r4   r7   get_model_v2r2   r   r6   r   parse__version__r%   compile)r@   rJ   r{   r?   	logit_dimlws         rA   rc   z(TorchPolicyV2._init_model_and_dist_class  s   )) 	m+/
 /
 	 %  
 )) 	OO%%E(8!4;w#74>  MJ 4:;; 	 $ ? ? A AE::$0$@!4;w#74>% % %!J	 !-0!.%![1.  E ;??233 	!M%"3447UUU !VWWW"koon==K8BMM8R888*  [__%Fb%F%F%FGG "  E j  rC   exploretimestepc                    d }t                                           5  |                                                   d           fd                                D             }|rIt                               dgt          |d                   z  t           j        |d         j                  }| 	                    ||||          cd d d            S # 1 swxY w Y   d S )NTc                 8    g | ]}d |dd         v |         S )state_inN   r;   )r>   kr   s     rA   rB   zATorchPolicyV2.compute_actions_from_input_dict.<locals>.<listcomp>  s6       "#J!BQB%<O<O
1<O<O<OrC   rD   r   )dtyper=   )
r6   no_grad_lazy_tensor_dictset_trainingkeystensorrk   longr=   _compute_action_helper)r@   r   r   r   r   seq_lensr   s    `     rA   compute_actions_from_input_dictz-TorchPolicyV2.compute_actions_from_input_dict  s>    ]]__ 	 	//
;;J##D)))   '1'8'8  M   <<C#mA.///*(+2 (   ..M8Wh !	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   B.CCCprev_action_batchprev_reward_batch
info_batchc	                     t                                           5  t                               t          |          t           j                  }
                     t          j        |ddi          }|!t          j	        |          |t          j
        <   |!t          j	        |          |t          j        <    fd|pg D             }                     |||
||          cd d d            S # 1 swxY w Y   d S )Nr   is_trainingFc                 :    g | ]}t          |j                  S r;   r&   r=   r>   sr@   s     rA   rB   z1TorchPolicyV2.compute_actions.<locals>.<listcomp>  3       <='4;77  rC   )r6   r   onesrk   int32r   r   CUR_OBSnpasarrayPREV_ACTIONSPREV_REWARDSr   )r@   r   r   r   r   r   episodesr   r   r   r   r   s   `           rA   compute_actionszTorchPolicyV2.compute_actions  sG    ]]__ 	 	zz#i..zDDH//'!5 J !,79zBS7T7T
;34 ,79zBS7T7T
;34   BOBUSU  M ..M8Wh 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   CC--C14C1Tactionsactions_normalizedin_trainingc           	          t           j                  r#t           j                  st          d          t                                          5                       t          j        |t          j	        |i          }|||t          j
        <   |||t          j        <   t                              t          |          t          j                  }	 fd|pg D             } j        r j                            d           t           j                  r6                      j        |||	dd          \  }
}} ||
 j                  }n2 j        }                     |||	          \  }
} ||
 j                  }|t          j	                 }|s" j        d         rt)          | j                  }|                    |          }|cd d d            S # 1 swxY w Y   d S )NzfCannot compute log-prob/likelihood w/o an `action_distribution_fn` and a provided `action_sampler_fn`!r   c                 :    g | ]}t          |j                  S r;   r   r   s     rA   rB   z9TorchPolicyV2.compute_log_likelihoods.<locals>.<listcomp>A  r   rC   F)r   )r   r   r   r   r   normalize_actions)r   r   r   rx   r6   r   r   r   r   ACTIONSr  r  r   rk   r   r   before_compute_actionsrJ   r{   r4   r#   action_space_structlogp)r@   r  r   r   r   r   r  r  r   r   dist_inputsr{   	state_outr   r?   log_likelihoodss   `               rA   compute_log_likelihoodsz%TorchPolicyV2.compute_log_likelihoods  sg   " /00 	':
 :
 	 '   ]]__ *	# *	#//$i1DgN J !,7H
;34 ,7H
;34zz#i..zDDH   BOBUSU  M  G 777FFF T899 B595P5PJ("/%! % 6Q 6 62Z )jdjAA "_
!%Jx!P!PQ(jdjAA !!45G% N$+6I*J N*7D4LMM)..w77O"U*	# *	# *	# *	# *	# *	# *	# *	# *	# *	# *	# *	# *	# *	# *	# *	# *	# *	#s   E=GG #G postprocessed_batchc                    | j         r| j                                          i }| j                            | ||           |                     |          \  }}|                     t                     | xj        dz  c_        | j         r2t          | j         d          r| j         	                                |d<   ni |d<   |
                    d|t          |j        t          | j        t          | j        dz
  |j        pdz
  i           |S )Npolicyr   resultrD   metricsrJ   custom_metricsr   )rJ   train	callbackson_learn_on_batchcompute_gradientsapply_gradientsr   num_grad_updatesrp   r  r   r   countr   r   )r@   r  learn_statsgradsfetchess        rA   learn_on_batchzTorchPolicyV2.learn_on_batchd  s/   
 : 	J((%8 	) 	
 	
 	
 //0CDDw 	:;;;": 	"'$*i88 	"#z1133GG!GG +')<)B)4+@7)*;@qB
	
 	
 	
 rC   r   batchbuffer_indexc           
      Z    |                     d           t           j                  dk    rr j        d         j        dk    r\|dk    sJ t	          | j        d j         j        dd                                |           |g j	        d<   t          |          S |
                    t           j                            }|D ]'}t	          | j        d j         j        dd           ( fd	t          |          D             }| j	        |<   t          |d                   S )
NTrD   r   r9   Fzeror&  r1   shuffler   r   _enable_new_api_stackpadding)
num_slicesc                 V    g | ]%\  }}|                     j        |                   &S r;   )	to_devicerX   )r>   rI   slicer@   s      rA   rB   z8TorchPolicyV2.load_batch_into_buffer.<locals>.<listcomp>  s/    UUUxq%%//$,q/22UUUrC   )r   rk   rX   typer   r1   r   r   r   r   
timeslicesry   )r@   r&  r'  slicesr1  s   `    rA   load_batch_into_bufferz$TorchPolicyV2.load_batch_into_buffer  sj    	4    t|!!dl1o&:e&C&C1$$$$/ ,'+'B"&"8&+    ""5)))',gD #u:: !!S->->!??
  		 		E/ ,'+'B"&"8&+     VUUU9VCTCTUUU-3\* 6!9~~rC   c                     t          | j                  dk    r| j        d         dk    r|dk    sJ t          d | j        |         D                       S )NrD   r   z/cpu:0c              3   4   K   | ]}t          |          V  d S r   rk   )r>   bs     rA   	<genexpr>zCTorchPolicyV2.get_num_samples_loaded_into_buffer.<locals>.<genexpr>  s(      FFa3q66FFFFFFrC   )rk   rX   sumr   )r@   r'  s     rA   "get_num_samples_loaded_into_bufferz0TorchPolicyV2.get_num_samples_loaded_into_buffer  s]    t|!!dl1o&A&A1$$$$FF4#7#EFFFFFFrC   offsetc                 L     j         |         st          d           j                            d          & j                            d j        d                   t	           j                  z   j        r j        D ]}|                                 t	           j                  dk    r j        d         j        dk    rs|dk    sJ t	           j         d         d                   k    r j         d         d         }n j         d         d         z            } 	                    |          S t	           j                  dk    rV j
                                        } j        d          j
        u sJ  j        dd          D ]}|                    |           t          d  j         |         D                       k    r j         |         }nfd	 j         |         D             }i }t          |          D ].\  }i }	 j                             ||	
           d|	i|d <   /                     |          }
g }t%          t	          |
d         d                             D ]|
d         d                  U|                    t(                              t(                               fd|
D                       d                     k|                    d            t           j
                                                  D ]\  }|         |_                             t4                      xj        dz  c_        t          t9           j        |                    D ]y\  \  }}|d                              t<                               |          d|                                 tB           j        tD           j        dz
  |j        pdz
  i           z|                     #                                           |S )NzPMust call Policy.load_batch_into_buffer() before Policy.learn_on_loaded_batch()!minibatch_sizesgd_minibatch_sizetrain_batch_sizerD   r   r9   c              3   4   K   | ]}t          |          V  d S r   r8  r>   r   s     rA   r:  z6TorchPolicyV2.learn_on_loaded_batch.<locals>.<genexpr>  s(      #W#WqCFF#W#W#W#W#W#WrC   c                 *    g | ]}|z            S r;   r;   )r>   r9  device_batch_sizer=  s     rA   rB   z7TorchPolicyV2.learn_on_loaded_batch.<locals>.<listcomp>  s9        &6$5556  rC   r  r  tower_c                 \    g | ](}|d                                        j                  )S rF   rW   r=   )r>   trI   r@   s     rA   rB   z7TorchPolicyV2.learn_on_loaded_batch.<locals>.<listcomp>  s/    $T$T$TQqT!WZZ%<%<$T$T$TrC   )dimrJ   )$r   rx   r4   r   rk   rX   ro   r  r2  r%  rJ   
state_dictload_state_dictr;  ry   r  r  _multi_gpu_parallel_grad_calcrf   rz   r6   meanstackr   gradr  r   r   zipr   r!   r   r  r   r   r   )r@   r=  r'  rI  r&  rK  towerdevice_batchesbatch_fetchesr  tower_outputs	all_gradsrZ   rJ   rE  rI   s   ``            @@rA   learn_on_loaded_batchz#TorchPolicyV2.learn_on_loaded_batch  s   #L1 	2   !KOO,<==$ $$./! ! 	c$,///   	*  				 t|!!dl1o&:e&C&C1$$$$ C(<Q(?(B$C$CCC,Q/2,Q/26FEV<V3VW&&u---t|q  ..00J(+tz9999.qrr2 2 2%%j1111#W#WD4H4V#W#W#W W WWW!1,?NN    -l;  N !.11 	M 	MHAuNN,,~ -    ,<^*LM,1,,'' ::>JJ 	s=+A.//00 		' 		'AQ"1%1  JJ$T$T$T$T$Tm$T$T$TUU         &&&&dj335566 	" 	"DAqq\AFF:;;;"!*3t/Dn+U+U!V!V 	 	A~u,1,,'..%t}}U';';U]]__-t/D <-1U5K5PqQ	    	T<<>>???rC   c           	      &   t          | j                  dk    sJ |j        s%t          || j        d| j        | j        dd           |                    d           |                     || j        d                    | 	                    |g          }|d         \  }}|dxx         t          | j
                  z  cc<   |                    |                     |                     |                                 }|t          |fi t          |ifS )	NrD   Fr)  r*  Tr   r<   allreduce_latency)rk   rX   zero_paddedr   r1   r   r   r   r   rM  r   r   r   r   dictr!   )r@   r  rU  rV  	grad_infor$  s         rA   r  zTorchPolicyV2.compute_gradients-  s:    4<  A%%%% #. 		/) ,'+'B"&"8&+    	((...24<?KKK ::<O;PQQ,Q/	9%&&&#d.>*?*??&&&':;;<<<1133$wII+<i*HIIIIrC   	gradientsc                 .   |t           k    r0t          | j                  D ]\  }}|                                 d S t	          | j                  dk    sJ t          || j                                                  D ]x\  }}|qt          	                    |          r |
                    | j                  |_        At                              |          
                    | j                  |_        y| j        d                                          d S )NrD   r   )r   ry   r   steprk   rQ  rJ   r   r6   	is_tensorrW   r=   rP  
from_numpy)r@   r]  rI   optgrZ   s         rA   r  zTorchPolicyV2.apply_gradientsN  s   555#D$455  3



  t'((A----Itz'<'<'>'>?? E E1=q)) E!"dk!2!2!&!1!1!!4!4!7!7!D!DQ$$&&&&&rC   
stats_namec                 .    g } j         D ]S} j        r j        |         }n|j        }||v r1|                    t          j         fd||                              Tt          |          dk    s#J d| dt           j                    d            |S )a  Returns list of per-tower stats, copied to this Policy's device.

        Args:
            stats_name: The name of the stats to average over (this str
                must exist as a key inside each tower's `tower_stats` dict).

        Returns:
            The list of stats tensor (structs) of all towers, copied to this
            Policy's device.

        Raises:
            AssertionError: If the `stats_name` cannot be found in any one
                of the tower's `tower_stats` dicts.
        c                 8    |                      j                  S r   rH  )r   r@   s    rA   <lambda>z/TorchPolicyV2.get_tower_stats.<locals>.<lambda>x  s    !$$t{"3"3 rC   r   zStats `z+` not found in any of the towers (you have zV towers in total)! Make sure you call the loss function on at least one of the towers.)ro   r_   rz   r   map_structurerk   )r@   rd  datarJ   r_   s   `    rA   get_tower_statszTorchPolicyV2.get_tower_stats_  s     * 	 	E 0".u5#/[((&3333[5L    4yy1}}}Mj M M4())M M M }}
 rC   c                 l    d | j                                                                         D             S )Nc                     i | ]>\  }}||                                                                                                 ?S r;   )r9   detachnumpy)r>   r   vs      rA   rN   z-TorchPolicyV2.get_weights.<locals>.<dictcomp>  s<    XXX115577>>##))++XXXrC   )rJ   rK  itemsr   s    rA   get_weightszTorchPolicyV2.get_weights  s0    XX
8M8M8O8O8U8U8W8WXXXXrC   weightsc                 f    t          || j                  }| j                            |           d S Nr<   )r&   r=   rJ   rL  )r@   rr  s     rA   set_weightszTorchPolicyV2.set_weights  s2    )'$+FFF
""7+++++rC   c                     | j         S r   )r   r   s    rA   is_recurrentzTorchPolicyV2.is_recurrent  s    !!rC   c                 N    t          | j                                                  S r   )rk   rJ   r   r   s    rA   num_state_tensorszTorchPolicyV2.num_state_tensors  s    4://11222rC   c                 H    d | j                                         D             S )Nc                 ~    g | ]:}|                                                                                                 ;S r;   )rm  r9   rn  rC  s     rA   rB   z3TorchPolicyV2.get_initial_state.<locals>.<listcomp>  s6    QQQQ

  &&((QQQrC   )rJ   r   r   s    rA   r   zTorchPolicyV2.get_initial_state  s$    QQ$*2N2N2P2PQQQQrC   c                 D   t                                                      }g |d<   t          | j                  D ]A\  }}t	          |                                          }|d                             |           B| j        r| j                                        |d<   |S )N_optimizer_variables_exploration_state)ra   	get_statery   r   r"   rK  rz   r   )r@   staterI   r   optim_state_dictr   s        rA   r  zTorchPolicyV2.get_state  s     !!##(*$%d.// 	C 	CDAq/??()001ABBBB 	G +/*:*D*D*F*FE&'rC   r  c                    |                     dd           }|rt          |          t          | j                  k    sJ t          | j        |          D ]C\  }}d|d         i}t	          |d         | j                  |d<   |                    |           Dt          | d          r%d|v r!| j        	                    |d                    |d         | _
        t                      	                    |           d S )	Nr}  r   r  r<   r   r~  )r  global_timestep)r   rk   r   rQ  r&   r=   rL  rp   r   	set_stater  ra   )r@   r  optimizer_varsr   r   r  r   s         rA   r  zTorchPolicyV2.set_state  s$    #94@@ 
	4~&&#d.>*?*?????D,n== 4 41 %3An4E#F ,CgJt{- - - ) !!"233334'' 	J,@E,I,I&&U3G-H&III  %%67 	%     rC   
export_dironnxc                      t          j        |d           |r                      j                   d j        vr1t	          j        dg          x j        d<    j        t          j        <    j        t          j                 }g }d}d                    |           j        v rT|	                     j        d                    |                              |dz  }d                    |           j        v T fd j        
                                D             }t           j                            |d	          }t          j                             j        |||f|d|dt#          |
                                          d
t          j        gz   ddgd t#          |
                                          d
t          j        gz   D             	  	         dS t           j                            |d          }	 t                               j        |           dS # t&          $ rV t           j                            |          rt          j        |           t,                              t0                     Y dS w xY w)aP  Exports the Policy's Model to local directory for serving.

        Creates a TorchScript model and saves it.

        Args:
            export_dir: Local writable directory or filename.
            onnx: If given, will export model in ONNX format. The
                value of this parameter set the ONNX OpSet version to use.
        T)exist_ok
state_in_0g      ?r   zstate_in_{}rD   c                 8    i | ]}|d k    |j         |         S )r   )_dummy_batch)r>   r   r@   s     rA   rN   z.TorchPolicyV2.export_model.<locals>.<dictcomp>  s8       %% 4$Q'%%%rC   z
model.onnx	state_insoutput
state_outsc                     i | ]}|d diS )r   
batch_sizer;   )r>   r   s     rA   rN   z.TorchPolicyV2.export_model.<locals>.<dictcomp>  s/        <(  rC   )export_paramsopset_versiondo_constant_foldinginput_namesoutput_namesdynamic_axeszmodel.pt)fN)osmakedirsr   r  r   arrayr   SEQ_LENSrQ   rz   r   pathjoinr6   r  exportrJ   re   save	Exceptionexistsremoveri   warningr   )	r@   r  r  r   r  rI   dummy_inputs	file_namefilenames	   `        rA   export_modelzTorchPolicyV2.export_model  s    	J.... /	G""4#4555 4#444 HcUOO$!,/$2C(3 ()=>HIA&&q))T->>>  !2=3G3G3J3J!KLLLQ  &&q))T->>>   *//11  L Z>>IJ
y(3""$( !2!2!4!455 456&5 !,"3"3"5"566"K$89:        & w||J
;;HG

4:
22222 G G G7>>(++ (Ih'''EFFFFFFGs   :!H AI=<I=import_filec                 6    | j                             |          S )z!Imports weights into torch model.)rJ   import_from_h5)r@   r  s     rA   import_model_from_h5z"TorchPolicyV2.import_model_from_h5  s     z((555rC   c           	      j   ||n| j         d         }||n| j        }| j        r| j                                         dx}x}}t	          | j                  r'd}	|                     | j        ||||          \  }
}}}n| j                            ||           t	          | j                  r&|                     | j        |||||d          \  }}}n!| j	        }|                     |||          \  }}t          |t          j                  s<t          |t                    s't          d                    |j                             ||| j                  }	| j                            |	||          \  }
}||                     ||| j        |	          }|||t(          j        <   |Ht,                              |                                          |t(          j        <   ||t(          j        <   | xj        t7          |t(          j                           z  c_        t;          |
||f          S )	a-  Shared forward pass logic (w/ and w/o trajectory view API).

        Returns:
            A tuple consisting of a) actions, b) state_out, c) extra_fetches.
            The input_dict is modified in-place to include a numpy copy of the computed
            actions under `SampleBatch.ACTIONS`.
        Nr   )r   r   r   r   )r   r   F)r   r   r   r   r   r   z`dist_class` ({}) not a TorchDistributionWrapper subclass! Make sure your `action_distribution_fn` or `make_model_and_action_dist` return a correct distribution class.)action_distributionr   r   )r4   r  rJ   evalr   r   r   r  r   r{   
isinstance	functoolspartial
issubclassr   rx   rQ   __name__get_exploration_actionr   r   ACTION_DIST_INPUTSr6   expfloatACTION_PROBACTION_LOGPrk   r   r"   )r@   r   r   r   r   r   extra_fetchesr  r  r   r  r  r{   s                rA   r   z$TorchPolicyV2._compute_action_helper  sy    %0''dk)6L'3889M : 	JOO-111d/00 )	K484J4J
$+! 5K 5 51GT;		 33Gh3WWWT899 Y595P5PJ("/%#% % 6Q 6 62Z "_
)-Jx)X)X&Y :y'899	j*BCC	 !* +1&1D*E*E	   %*[$*==K !,CC$/(G D  MGT
   11M4:{ M
 "<GM+89 5:YYtzz||5L5LM+1259M+12 	J{/B$C D DD)] CDDDrC   c                     t          |t                    st          |          }|                    t          j        t
          |p| j                             |S rt  )r  r   set_get_interceptorr  r  r&   r=   )r@   r  r=   s      rA   r   zTorchPolicyV2._lazy_tensor_dictY  s`    -{;; 	C"-.A"B"B//5f>STTT	
 	
 	
 #"rC   sample_batchesc                     t           j                  t          |          k    sJ t          j                    i t                                           fdt           j                  dk    s j        d         rt          t           j        | j                            D ]Y\  }\  }}} ||||           t                    dz
           }t          |d         t                    r|d         |d         Znffdt          t           j        | j                            D             }|D ]}|                                 |D ]}|                                 g }	t          t          |                    D ]O}|         }
t          |
d         t                    r|
d         |
d         |	                    |                    P|	S )ah  Performs a parallelized loss and gradient calculation over the batch.

        Splits up the given train batch into n shards (n=number of this
        Policy's devices) and passes each data shard (in parallel) through
        the loss function using the individual devices' models
        (self.model_gpu_towers). Then returns each tower's outputs.

        Args:
            sample_batches: A list of SampleBatch shards to
                calculate loss and gradients for.

        Returns:
            A list (one item per device) of 2-tuples, each with 1) gradient
            list and 2) grad info dict.
        c           	         t                                          	 |j        dk    rt                      nt           j                            |          5  t                              |j        |                    }t          |d          r|
                    ||          }t          |          t          j                  k    sJ ddi}t          |                                          }d t          t          |                    D             }t!          j                  D ]\  }}	j        |         }
t!          |          D ].\  }}||
v r%|j        |j        j                                         /||                             d           |                                        |	||                              g }t!          |          D ]4\  }}||
v r+|j        |                    |j                   |j        ||<   5j        rt5          j                    }t           j                                        r;|D ]7}t           j                            |t           j        j        j                   8n5t           j                             |t           j        j        j                   |	j!        D ])}|d	         D ]}|j        |xj        j        z  c_        *|dxx         t5          j                    |z
  z  cc<   	 d d d            n# 1 swxY w Y   5  ||f| <   d d d            d S # 1 swxY w Y   d S # tD          $ r}d
d l#}5  tI          |j%        d
         dz   |&                                z   dz   d'                    | |          z             |f| <   d d d            n# 1 swxY w Y   Y d }~d S Y d }~d S d }~ww xY w)Nr9   custom_lossrY  g        c                     g | ]}d S r   r;   r^   s     rA   rB   zPTorchPolicyV2._multi_gpu_parallel_grad_calc.<locals>._worker.<locals>.<listcomp>  s     F F F! F F FrC   T)retain_graph)opr[   r   z
 traceback
zIn tower {} on device {})(r6   set_grad_enabledr2  r   rg   r=   r   r   r{   rp   r  rk   r   re   r   rf   ry   r   rP  ri  zero_backwardr   r   rz   r   timeis_availabledistributed
all_reduceReduceOpSUMall_reduce_coalescedr   r  	tracebackrx   args
format_excrQ   )	shard_idxrJ   r   r=   loss_outr\  r   rV  opt_idxrb  r   	param_idxparamr#  startrc  param_grouprZ   er  grad_enabledlockresultsr@   s                       rA   _workerz<TorchPolicyV2._multi_gpu_parallel_grad_calc.<locals>._workerx  sm   ""<000N-3[E-A-A')))uzGXGXH H <R <R  *		%,GG   H um44 M#(#4#4X|#L#Lx==C0@,A,AAAAA "5c :I!%e&6&6&8&8!9!9J F FuS__/E/E F F FI(1$2B(C(C )R )R )-(CG(L09*0E0E 8 8,Iu(M99ej>T %
 5 5 7 7 7 )222EEE!(( 33C'9JKK   !# 1:*0E0E B B,Iu(M99#(:#9$)LL$<$<$<7<z	) 46 R$(IKKE$z6688 
" */ !& !&A$)$5$@$@()e.?.H.L %A %& %& %& %&!&
 !& 1 F F$)e.?.H.L !G !" !" !" 03/? N N)4X)> !N !NA'(v'9()$2M(M!N &&9:::dikkE>QQ:::S)R'<R <R <R <R <R <R <R <R <R <R <R <R <R <R <R|  @ @*3Y)?GI&@ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @        
 
"F1I,-'22445 ## 9??	6RR	S  	*GI&
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
s   8M J7LM LM  L!M &L;.M ;L??M L?M 
OOAN6*O6N:	:O=N:	>OOrD   r8   r   c           	      R    g | ]#\  }\  }}}t          j        ||||f           $S ))targetr  )r}   Thread)r>   r  rJ   r   r=   r  s        rA   rB   z?TorchPolicyV2._multi_gpu_parallel_grad_calc.<locals>.<listcomp>  sV        =I<|V  ")UL&)Q    rC   )rk   ro   r}   Lockr6   is_grad_enabledrX   r4   ry   rQ  r  rx   r  r  rf   r  rz   )r@   r  r  rJ   r   r=   last_resultthreadsthreadoutputsr  r  r  r  r  s   `          @@@@rA   rM  z+TorchPolicyV2._multi_gpu_parallel_grad_calca  s9   $ 4())S-@-@@@@@~,,..P	 P	 P	 P	 P	 P	 P	 P	h t|!!T[%>!<ED)>4<HH= = = =8	8E< 	5,???%c'llQ&67k!nj99 =%a.k!n<==    AJ-~t|LLA A	  G "  !   s>2233 	/ 	/IY'F&)Y// /QiVAY.NN79-....rC   r   )NNNNNNN)NNNTTrF   )r   r   r   )Ir  
__module____qualname____doc__gymspacesSpacer'   rl   rb   r   r   r   r   r   r	   r   r   r
   r-   r   r   r   r   r2  r   r   r   r   r   strr   r   r   r   r   r   r   r   r   r   rc   boolr   r,   re   r  r$   r  r%  r5  r<  rW  r)   r  r  rj  r*   rq  ru  rw  ry  r   r+   r  r  r  r  r   r   r(   rM  __classcell__)r   s   @rA   r/   r/   >   s5       :: R- R- R-:+R- j&R- $	R- R- R- R- R- R- R-h& & & $Xf"" 12" !	"
 
z4
++	," " "  $#"$ $&& 	&
 "& 
z:z4
3CC	D& & & $#&. $   	 
 "  
z4j!11	2      $# , $	G 	 	 	 $#	 $		w566	7	 	 	 $#	 $C    $# $	K 	Dj4I 	 	 	 $#	 ;08B	c:o	   ;:$ ;'DcN ' ' ' ;:' ;j) J' 	
 . 
c:o	   ;:, Xf: AE	 ! &d3+;&<=
 
   ;: : $	t+,.EE	F   $#$-! -! -!^ Xf "&	 j)  3-	 
z4
+T#z/-BB	C   : Xf 59MQMQ04"&"& ./1AAB  Z 01 !&6!79I!IJ	
 !&6!79I!IJ T#t)_- $ 3- 
j!14Z3HH	I   @ Xf
 59  #' B# B#t,-/??@B# ./1AABB#  Z 01	B#
 $$'(*::;
B# $$'(*::;
B# !B# B# 
B# B# B#  YB#H Xf&+ &$sJBW & & &  Y&P Xf 3 33 3 
	3 3 3 3j XfG Gs G3 G G G G
 Xf` `C `3 ` ` ` `D XfJ[ J^ J J J  YJ> Xf' 'D ' ' ' ' "# "$7G2H " " " "H XfY\ Y Y Y Y Xf,< ,D , , , , Xf"d " " " " Xf33 3 3 3 3 XfR4
#3 R R R R Xf:;      ;:  Xf:!{ !t ! ! ! ! ! ;: !0 Xf<G <Gs <G(3- <G4 <G <G <G <G| Xf6 6 6 6 6 6 OE OE YOEb# #[ # # # #K";/K	eD$l23	4K K K K K K K KrC   r/   )RrG   r  loggingrm   r  r}   r  typingr   r   r   r   r   r   r	   r
   	gymnasiumr  rn  r   r   	packagingr   rr   ray.rllib.models.catalogr   ray.rllib.models.modelv2r   (ray.rllib.models.torch.torch_action_distr   $ray.rllib.models.torch.torch_modelv2r   ray.rllib.policy.policyr   ray.rllib.policy.rnn_sequencingr   ray.rllib.policy.sample_batchr   ray.rllib.policy.torch_policyr   ray.rllib.utilsr   r   ray.rllib.utils.annotationsr   r   r   r   r   ray.rllib.utils.errorr   ray.rllib.utils.frameworkr   ray.rllib.utils.metricsr   r   r   $ray.rllib.utils.metrics.learner_infor!   ray.rllib.utils.numpyr"   "ray.rllib.utils.spaces.space_utilsr#   ray.rllib.utils.threadingr$   ray.rllib.utils.torch_utilsr%   r&   ray.rllib.utils.typingr'   r(   r)   r*   r+   r,   r-   r6   nn	getLoggerr  ri   r/   r;   rC   rA   <module>r
     sF          				      E E E E E E E E E E E E E E E E E E E E                



 1 1 1 1 1 1 , , , , , , M M M M M M = = = = = = * * * * * * O O O O O O 5 5 5 5 5 5 G G G G G G : : : : : : : :              I H H H H H 6 6 6 6 6 6         
 C B B B B B 2 2 2 2 2 2 ? ? ? ? ? ? / / / / / /                         	r		8	$	$ m m m m mF m m m m mrC   