
    &`i                        d dl Z d dlZd dlmZ d dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZmZmZmZ d dlmZ d d	lmZ d d
lmZ  e            \  ZZZ e j        e          Zeddefd            Zedefd            Z dS )    N)Dict)deprecation_warning)DEFAULT_POLICY_ID)OldAPIStack)try_import_tf)LEARN_ON_BATCH_TIMERLOAD_BATCH_TIMERNUM_AGENT_STEPS_TRAINEDNUM_ENV_STEPS_TRAINED)LearnerInfoBuilder)do_minibatch_sgd)log_oncereturnc           	      n   | j         }| j        }|j        |                    d|                    dd                    }|                    d          }||                    dd          }| j        t
                   }|5  |dk    s|dk    r6t          |fd|p                    |          D             ||g           }n                    |          }ddd           n# 1 swxY w Y   |	                    |j
                   | j        t          xx         |j
        z  cc<   | j        t          xx         |                                z  cc<   | j        rXi |t                    d	<   | j                                        D ].\  }	}
|
                    |          |t                    d	         |	<   /|S )
aV  Function that improves the all policies in `train_batch` on the local worker.

    .. testcode::
        :skipif: True

        from ray.rllib.execution.rollout_ops import synchronous_parallel_sample
        algo = [...]
        train_batch = synchronous_parallel_sample(algo.env_runner_group)
        # This trains the policy on one batch.
        print(train_one_step(algo, train_batch)))

    .. testoutput::

        {"default_policy": ...}

    Updates the NUM_ENV_STEPS_TRAINED and NUM_AGENT_STEPS_TRAINED counters as well as
    the LEARN_ON_BATCH_TIMER timer of the `algorithm` object.
    
num_epochsnum_sgd_iter   minibatch_sizeNsgd_minibatch_sizer   c                 <    i | ]}|                     |          S  )
get_policy).0pidlocal_workers     q/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/execution/train_ops.py
<dictcomp>z"train_one_step.<locals>.<dictcomp>=   s9        0055      off_policy_estimation)configenv_runner_grouplocal_env_runnerget_timersr   r   get_policies_to_trainlearn_on_batchpush_units_processedcount	_countersr   r
   agent_stepsreward_estimatorsr   itemstrain)	algorithmtrain_batchpolicies_to_trainr    workersr   r   learn_timerinfoname	estimatorr   s              @r   train_one_stepr6      s7   ( F(G+L::lFJJ~q,I,IJJLZZ 011N$8!<<#$89K	 < < !~11#   0  G#99+FF  
  DD  ..{;;D%< < < < < < < < < < < < < < <( $$[%6777-...+2CC.../000K4K4K4M4MM000" ;= 78(:@@BB 	 	OD)EN__F FD"#$;<TBB Ks   AC&&C*-C*c           	      ^   t          d          rt          d           | j        }| j        }|j        }|                    d|                    dd                    }|                    d          }||d	         }t          t          j        |d
         pd                    }||z  }||z  }	|	|z  dk    sJ |	|k    s
J d            |	                                }| j
        t                   }
|
5  i }|j                                        D ]\\  }}|j        |                    ||          s#|                                 |j        |                             |d          ||<   ]	 ddd           n# 1 swxY w Y   | j
        t$                   }|5  t'          |          }|                                D ]\  }}|j        |         }t)          dt          |          t          |          z            }t*                              d                    |                     t1          |          D ]i}t2          j                            |          }t1          |          D ]8}|                    ||         |z  d          }|                    ||           9j|                                }ddd           n# 1 swxY w Y   |
                    |j                    |                    |j                    | j!        tD          xx         |j         z  cc<   | j!        tF          xx         |$                                z  cc<   | j%        rXi |tL                   d<   | j%                                        D ].\  }}|'                    |          |tL                   d         |<   /|S )a  Multi-GPU version of train_one_step.

    Uses the policies' `load_batch_into_buffer` and `learn_on_loaded_batch` methods
    to be more efficient wrt CPU/GPU data transfers. For example, when doing multiple
    passes through a train batch (e.g. for PPO) using `config.num_sgd_iter`, the
    actual train batch is only split once and loaded once into the GPU(s).

    .. testcode::
        :skipif: True

        from ray.rllib.execution.rollout_ops import synchronous_parallel_sample
        algo = [...]
        train_batch = synchronous_parallel_sample(algo.env_runner_group)
        # This trains the policy on one batch.
        print(multi_gpu_train_one_step(algo, train_batch)))

    .. testoutput::

        {"default_policy": ...}

    Updates the NUM_ENV_STEPS_TRAINED and NUM_AGENT_STEPS_TRAINED counters as well as
    the LOAD_BATCH_TIMER and LEARN_ON_BATCH_TIMER timers of the Algorithm instance.
    ,mulit_gpu_train_one_step_deprecation_warningz6ray.rllib.execution.train_ops.multi_gpu_train_one_step)oldr   r   r   r   Ntrain_batch_sizenum_gpusr   zBatch size too small!)buffer_index)num_devicesz== sgd epochs for {} ==r   )(r   r   r    r!   r"   r#   intmathceilas_multi_agentr$   r	   policy_batchesr,   is_policy_to_traindecompress_if_needed
policy_mapload_batch_into_bufferr   r   maxloggerdebugformatrangenprandompermutationlearn_on_loaded_batchadd_learn_on_batch_resultsfinalizer'   r(   r)   r   r
   r*   r+   r   r-   )r.   r/   r    r1   r   r   r   r=   per_device_batch_size
batch_size
load_timernum_loaded_samples	policy_idbatchr2   learner_info_buildersamples_per_devicepolicynum_batches_rN   batch_indexresultslearner_infor4   r5   s                             r   multi_gpu_train_one_stepr`   X   s   2 >?? 
I	
 	
 	
 	
 F(G+L::lFJJ~q,I,IJJLZZ 011N 23 diz 2 7a8899K +k9&4J#q(((($$$&=$$$ ,,..K "#34J	 < < + : @ @ B B 	< 	<Iu /;$77	;OO <  &&(((
 -9,C-$$U$;; y))	<< < < < < < < < < < < < < < <* #$89K	 7 7  2kJJJ-?-E-E-G-G 	X 	X)I)!,Y7Fa%7!8!8C@U<V<V!VWWKLL299)DDEEE<(( 
X 
X i33K@@#(#5#5 X XK %::#K03HHWX ;  G )CCGYWWWWX
X ,446637 7 7 7 7 7 7 7 7 7 7 7 7 7 76 ##K$5666$$[%6777 -...+2CC.../000K4K4K4M4MM000" -CE&'(?@(:@@BB 	- 	-OD) ,, *+,CD  s&   4A9E::E>E>DKKK)N)!loggingr?   typingr   numpyrL   ray._common.deprecationr   ray.rllib.policy.sample_batchr   ray.rllib.utils.annotationsr   ray.rllib.utils.frameworkr   ray.rllib.utils.metricsr   r	   r
   r   $ray.rllib.utils.metrics.learner_infor   ray.rllib.utils.sgdr   ray.utilr   tf1tftfv	getLogger__name__rH   r6   r`   r   r   r   <module>rq      sz               7 7 7 7 7 7 ; ; ; ; ; ; 3 3 3 3 3 3 3 3 3 3 3 3            D C C C C C 0 0 0 0 0 0      }R		8	$	$ : :d : : : :z t t t t t t tr   