
    &`i%                        d Z ddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZmZ dd	lmZ  ed
dd          Ze                    d           e                    dedd           e                    ddd           edk    rjddlmZ e                                Zd Zd Zd Zej        dk    r ej        dd            n ej        de            eej                   !                                "                    ddd d!d"d#$          #                    ej$        rdned%&          %                    ej$        rdned'd(ej&        pdz  d)d*+          '                     ed#g d,d-d.g/          0          Z(ej         d1k    re(%                    d2d3d4d5d6d78           ej        dk    r0e()                    d9  e*ej                  D             d: ;            ee(e           dS dS )<a>  Example using 2 ConnectorV2 for observation frame-stacking in Atari environments.

An RLlib Algorithm has 3 distinct connector pipelines:
- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing
a batch for an RLModule to compute actions (`forward_inference()` or
`forward_exploration()`).
- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting
it into an action readable by the environment.
- A learner connector pipeline on a Learner taking a list of episodes and producing
a batch for an RLModule to perform the training forward pass (`forward_train()`).

Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib
adds/prepends to these pipelines in order to perform the most basic functionalities.
For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any
env-to-module pipeline to make sure the batch for computing actions contains - at the
minimum - the most recent observation.

On top of these default ConnectorV2 pieces, users can define their own ConnectorV2
pieces (or use the ones available already in RLlib) and add them to one of the 3
different pipelines described above, as required.

This example:
    - shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the
    env-to-module pipeline.
    - shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the
    learner connector pipeline.
    - demonstrates that using these two pieces (rather than performing framestacking
    already inside the environment using a gymnasium wrapper) increases overall
    performance by about 5%.


How to run this script
----------------------
`python [script file name].py --num-frames=4 --env=ALE/Pong-v5`

Use the `--num-frames` option to define the number of observations to framestack.
If you don't want to use Connectors to perform the framestacking, set the
`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a
gymnasium observation wrapper. In this case though, be aware that the tensors being
sent through the network are `--num-frames` x larger than if you use the Connector
setup.

For debugging, use the following additional command line options
`--no-tune --num-env-runners=0`
which should allow you to set breakpoints anywhere in the RLlib code and
have the execution stop there for inspection and debugging.

For logging to your WandB account, use:
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
--wandb-run-name=[optional: WandB run name (within the defined project)]`


Results to expect
-----------------

With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module
and learner connector pipelines), you should see something like this using:
`--env ALE/Pong-v5 --num-learners=4 --num-gpus-per-learner=1 --num-env-runners=95`
+---------------------------+------------+--------+------------------+...
| Trial name                | status     |   iter |   total time (s) |
|                           |            |        |                  |
|---------------------------+------------+--------+------------------+...
| PPO_atari-env_2fc4a_00000 | TERMINATED |    200 |          335.837 |
+---------------------------+------------+--------+------------------+...

Note that the time to run these 200 iterations is about ~5% faster than when
performing framestacking already inside the environment (using a
`gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic
needed (sending back 4x[obs] batches instead of 1x[obs] to the learners).

Thus, with the `--use-gym-wrapper-framestacking` option (all other options being equal),
the output looks like this:
+---------------------------+------------+--------+------------------+...
| Trial name                | status     |   iter |   total time (s) |
|                           |            |        |                  |
|---------------------------+------------+--------+------------------+...
| PPO_atari-env_2fc4a_00000 | TERMINATED |    200 |          351.505 |
+---------------------------+------------+--------+------------------+...
    N)FrameStackingEnvToModule)FrameStackingLearner)DefaultModelConfig)wrap_atari_for_new_api_stack)make_multi_agent)add_rllib_example_script_args#run_rllib_example_script_experiment)get_trainable_clsi@KL g      4@   )default_timestepsdefault_rewarddefault_iterszale_py:ALE/Pong-v5)envz--num-frames   z*The number of observation frames to stack.)typedefaulthelpz--use-gym-wrapper-framestacking
store_truezWhether to use RLlib's Atari wrapper's framestacking capabilities (as opposed to doing it via a specific ConenctorV2 pipeline).)actionr   __main__)tunec                 T    t          t          j        t          j        dk              S Nr   )
num_framesmulti_agent)r   argsr   
num_agents)r   spacesdevices      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/examples/connectors/frame_stacking.py_make_env_to_module_connectorr!   y   s*     (!+
 
 
 	
    c                 T    t          t          j        t          j        dk              S r   )r   r   r   r   )input_observation_spaceinput_action_spaces     r    _make_learner_connectorr&      s(    #!+
 
 
 	
r"   c                     t          t          j        t          j        fi | ddit          j        rt          j        nd           S )Nrender_mode	rgb_array)
framestack)r   gymmaker   r   use_gym_wrapper_framestackingr   cfgs    r    _env_creatorr0      sM    +HTXEEE(DEE $(#EO4
 
 
 	
r"   z	atari-envc                 j     t          t                    t          | fi dt          j        i          S )Nr   )r   r0   dictr   r   r.   s    r    <lambda>r3      s6    6(66S<<\4?;<<  r"      Fg        )	frameskipfull_action_spacerepeat_action_probabilityT)
env_configclip_rewards   )env_to_module_connectornum_envs_per_env_runnerg{Gz?ga2U0*#?g      Y@global_norm)learner_connectorentropy_coefflr	grad_clipgrad_clip_by))   r   r:   )    r   r:   )@   r   r:   )   r   r:   relu   )vf_share_layersconv_filtersconv_activationhead_fcnet_hiddens)model_configPPO
   rE   gffffff?g      ?g?g      $@)
num_epochsminibatch_sizelambda_kl_coeff
clip_paramvf_clip_paramc                     h | ]}d | S )p ).0is     r    	<setcomp>r[      s    >>>!g!gg>>>r"   c                     d|  S )NrW   rX   )aidakws      r    r3   r3      s    III r"   )policiespolicy_mapping_fn)+__doc__	gymnasiumr+   1ray.rllib.connectors.env_to_module.frame_stackingr   +ray.rllib.connectors.learner.frame_stackingr   -ray.rllib.core.rl_module.default_model_configr   %ray.rllib.env.wrappers.atari_wrappersr   +ray.rllib.examples.envs.classes.multi_agentr   ray.rllib.examples.utilsr   r	   ray.tune.registryr
   parserset_defaultsadd_argumentint__name__rayr   
parse_argsr   r!   r&   r0   r   register_envalgoget_default_configenvironmentenv_runnersr-   trainingnum_learners	rl_modulebase_configr   rangerX   r"   r    <module>r|      s  N N^     V V V V V V L L L L L L L L L L L L N N N N N N H H H H H H        0 / / / / / 
'	&d#
 
 
          		5	       %
@     zD
 
 

 
 

 
 
  	
 	
 	
 	
 	+|444 	$)$$					 %*-0	   
 	

 	

 
 532$% 
 

 

 
 :W@W$+0q1& 
 


 


 
++ $NNN &$'5	   
 

 

C X yE 	 	
 	
 	
 >>uuT_'='=>>>== 	  	
 	
 	
 ('T:::::a r"   