
    &`i                         d dl Z d dlmZ d dlmZmZmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZm Z  d dl!m"Z" d dl#m$Z$m%Z%  G d de          Z&e&Z'dS )    N)defaultdict)AnyDictOptional)
APPOConfig)CircularBuffer)IMPALALearner)Learner)update_target_networkTargetNetworkAPIValueFunctionAPI)MultiRLModuleSpec)RLModuleSpec)override)LambdaDefaultDict)LAST_TARGET_UPDATE_TSNUM_ENV_STEPS_TRAINED_LIFETIMENUM_MODULE_STEPS_TRAINEDNUM_TARGET_UPDATES)	Scheduler)ModuleIDShouldModuleBeUpdatedFnc                       e Zd ZdZ ee           fd            Z ee          ddddede	de
e         de
e         d	ef
 fd
            Z ee          ded	ef fd            Z ee          deeef         d	df fd            Ze ee          d	ee         fd                        Zej        deded	dfd            Z xZS )APPOLearnerzAdds KL coeff updates via `after_gradient_based_update()` to IMPALA logic.

    Framework-specific subclasses must override `_update_module_kl_coeff()`.
    c                 4    t          t                     _        t           j        j         j        j                   _        t                      	                                  j
                            d            t           fd           _        d S )N)num_batchesiterations_per_batchc                 X    t          |t                    r|                                nd S N)
isinstancer   make_target_networks)midmods     z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_learner.py<lambda>z#APPOLearner.build.<locals>.<lambda>,   s.    c#344((***     c                 h                         j                            |           j                  S r    )_get_tensor_variableconfigget_config_for_modulekl_coeff)	module_idselfs    r%   r&   z#APPOLearner.build.<locals>.<lambda>8   s-    d7711)<<E  r'   )r   int_last_update_ts_by_midr   r*   circular_buffer_num_batches$circular_buffer_iterations_per_batch_learner_thread_in_queuesuperbuildmoduleforeach_moduler   curr_kl_coeffs_per_module)r.   	__class__s   `r%   r5   zAPPOLearner.build   s    &1#&6&6#(6?!%!Q)
 )
 )
%
 	 	"" 	
 	
 	
    
 
 	&&&r'   N)config_overridesnew_should_module_be_updatedr-   module_specr:   r;   returnc                   t                                          ||||          }t          | j        |                                         t
                    r1| j        |                                                                          |S )N)r-   r<   r:   r;   )r4   
add_moduler!   r6   	unwrappedr   r"   )r.   r-   r<   r:   r;   	marl_specr9   s         r%   r?   zAPPOLearner.add_module=   s     GG&&#-)E	 ' 
 
	 dk),6688:JKK 	FK	",,..CCEEEr'   c                 ~    t                                          |          }| j                            |           |S r    )r4   remove_moduler8   pop)r.   r-   rA   r9   s      r%   rC   zAPPOLearner.remove_moduleQ   s7    GG)))44	&**9555r'   	timestepsc                f   t                                          |           |                    t          d          }| j        j                                        D ]R\  }}| j                            |          }t          |
                                t                    r|| j        |         z
  |j        |j        z  |j        z  |j        z  k    r|
                                                                D ]\  }}t%          |||j                   | j                            |t,          fdd           || j        |<   | j                            |t.          f|d           |j        r>| j                            |t4          fd          dk    r|                     ||	           Td
S )zUpdates the target Q Networks.)rE   r   )main_net
target_nettau   lifetime_sum)reducemax)default)r-   r*   N)r4   after_gradient_based_updategetr   r6   _rl_modulesitemsr*   r+   r!   r@   r   r0   target_network_update_freqr1   r2   train_batch_size_per_learnerget_target_network_pairsr   rI   metrics	log_valuer   r   use_kl_losspeekr   _update_module_kl_coeff)	r.   rE   curr_timestepr-   r6   r*   rG   rH   r9   s	           r%   rO   z'APPOLearner.after_gradient_based_updateW   s    	++i+@@@ "&DaHH!%!8!>!>!@!@ $	Q $	QIv[66yAAF&**,,.>??  ;I FF589AB 9:  %%''@@BB  )!)#-"J     && 23Q~ '    :G+I6&& 56e '   
 "QL%%y2J&KUV%WW  ,,y,PPPI$	Q $	Qr'   c                     t           t          gS r    r   )clss    r%   rl_module_required_apisz#APPOLearner.rl_module_required_apis   s    
 !"233r'   r*   c                     dS )a  Dynamically update the KL loss coefficients of each module.

        The update is completed using the mean KL divergence between the action
        distributions current policy and old policy of each module. That action
        distribution is computed during the most recent update/call to `compute_loss`.

        Args:
            module_id: The module whose KL loss coefficient to update.
            config: The AlgorithmConfig specific to the given `module_id`.
        N )r.   r-   r*   s      r%   rZ   z#APPOLearner._update_module_kl_coeff   s      r'   )__name__
__module____qualname____doc__r   r	   r5   r
   r   r   r   r   r   r   r?   strrC   r   rO   classmethodlisttyper^   abcabstractmethodr   rZ   __classcell__)r9   s   @r%   r   r      s        
 Xm
 
 
 
 
: Xg ,0JN    "	
 #4. '//F&G 
     & Xms /@      
 Xg+QS#X +Q4 +Q +Q +Q +Q +Q +QZ Xg4T
 4 4 4  [4
 	
 
: 
RV 
 
 
 
 
 
 
 
r'   r   )(ri   collectionsr   typingr   r   r   ray.rllib.algorithms.appo.appor   ray.rllib.algorithms.appo.utilsr   *ray.rllib.algorithms.impala.impala_learnerr	   ray.rllib.core.learner.learnerr
   ray.rllib.core.learner.utilsr   ray.rllib.core.rl_module.apisr   r   (ray.rllib.core.rl_module.multi_rl_moduler   "ray.rllib.core.rl_module.rl_moduler   ray.rllib.utils.annotationsr   "ray.rllib.utils.lambda_defaultdictr   ray.rllib.utils.metricsr   r   r   r   #ray.rllib.utils.schedules.schedulerr   ray.rllib.utils.typingr   r   r   AppoLearnerr`   r'   r%   <module>r|      s   



 # # # # # # & & & & & & & & & & 5 5 5 5 5 5 : : : : : : D D D D D D 2 2 2 2 2 2 > > > > > > L L L L L L L L F F F F F F ; ; ; ; ; ; 0 0 0 0 0 0 @ @ @ @ @ @            : 9 9 9 9 9 D D D D D D D D~ ~ ~ ~ ~- ~ ~ ~B r'   