
    Pi                        d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	m
Z
mZmZmZmZ d dlmZ d dlZd dlmZmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZmZmZmZm Z m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+m,Z, d dl-m.Z.m/Z/m0Z0 d dl1m1Z1  e!j2        d          Z3dej4        j        _5         G d de)          Z6ej7        deddfd            Z8e9dk    r ej:         e8                       dS dS )    N)partial)chain)AnyDictListOptionalTupleUnion)warn)
DictConfig
ListConfig)nn)	Optimizer)
DataLoaderDistributedSampler)config
generationmodulesrlhftrainingutils)padded_collate)ConcatDataset)local_kv_cache)FTRecipeInterface)PPOStats
Trajectory)disable_dropoutDummyProfilerPROFILER_KEY)tqdmDEBUG   c                   (   e Zd ZdZdeddfdZdeddfdZ	 d8dee         dee	j
        j        ef         fdZd9d	Zdeddfd
Zdededededeej        ej        ej        ej        f         f
dZdededededeeef         deeef         deeef         deeef         deej        ej        ej        f         fdZ	 	 d:dededeeeef                  defdZdeded edeeef         fd!Z 	 d;d"ed#eddfd$Z!d%eeef         ddfd&Z"d'e	j#        de$fd(Z%d'e	j#        de$fd)Z&d9d*Z'd+e$d,e	j#        d-e	j#        d.ede(f
d/Z)d+e$d0e(d1e	j#        d2e	j#        d3e	j#        d4e	j#        ddfd5Z*d+e$d0e(d,e	j#        d-e	j#        d1e	j#        d2e	j#        ddfd6Z+d9d7Z,dS )<!PPOFullFinetuneRecipeSingleDevicea  
    Full finetuning recipe for RLHF with PPO for dense transformer-based LLMs such as LLama2. This recipe is optimized
    for single GPU training. Training on CPU is not supported.

    This implementation is based on "Learning to summarize from human feedback" (https://arxiv.org/abs/2009.01325) and
    "Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback" (https://arxiv.org/abs/2204.05862>).

    Features:
        - Activation Checkpointing. This can be controlled using the ``activation_checkpointing``
            flag. Activation checkpointing helps reduce the memory footprint since we no longer keep
            activations in memory and instead recompute them during the backward pass. This is especially
            helpful for larger batch sizes when you're memory constrained. But these savings in memory
            come at the cost of training performance. In most cases training can slow-down quite a bit as
            a result of this activation recomputation.

        - Precision. Full fp32 and bf16 training are supported. Precision is controlled using the ``dtype``
            flag. When ``dtype=bf16``, all activations, gradients and optimizer states are in bfloat16. In
            most cases this should halve the memory footprint of full precision (fp32) training, without
            loss in model quality (will depend on the model, training data and other settings). For
            GPUs which do not support bfloat16, we fall back to fp32. Mixed precision training and fp16
            precision are currently not supported.

        - Adjusting batch sizes when memory constrained. This recipe uses three different batch sizes:
            - ``batch_size`` controls the total number of samples which are sampled from the dataset for a single trajectory.
            - ``forward_batch_size`` controls the mini-batch size for trajectory generation. Since gradients are disabled
                during trajectory generation, memory consumption is lower and this can be higher than ``ppo_batch_size``.
            - ``ppo_batch_size`` controls the number of samples used for a single optimization step during PPO optimization.
                Since we're optimizing two models at once, adjusting this parameter can have a big impact during training.

        - Gradient Accumulation. You can simulate larger ``ppo_batch_size`` sizes by accumulating gradients. This is
            controlled using the ``gradient_accumulation_steps`` flag.

            For example: with ``ppo_batch_size``=32 and ``gradient_accumulation_steps``=16, each backward pass during
            PPO optimization uses a 'micro batch size' of 2.

            Gradient accumulation is especially useful when you are memory constrained. In this case,
            accumulating gradients might give you better training speed than enabling activation
            checkpointing.

        - Optimizer in Backward. Fusing the optimizer step into the backward pass helps reduce the memory
            footprint associated with gradients. This can be especially helpful when you are memory
            constrained. Note that users can only use ONE of gradient accumulation or optimizer in backward.
            These features currently do not work together. For more details on optimizer in backward, please
            see this tutorial: https://pytorch.org/tutorials/intermediate/optimizer_step_in_backward_tutorial.html

            This paramater can provide significant performance gains, since there the number of optimization steps
            scales with ``ppo_epochs`` and ``batch_size``. Depending on the maximum sequence length sampled from the dataset,
            we've found that setting ``ppo_batch_size`` to the highest you can fit in memory, and `optimizer_in_bwd=True` to
            provide significant memory savings.

        - Lower precision optimizers. This recipe supports lower-precision optimizers from the bitsandbytes
            library (https://huggingface.co/docs/bitsandbytes/main/en/index). We've tested the recipe with
            8-bit AdamW and Paged AdamW. These optimizers are especially helpful when you are memory constrained
            since they help reduce the memory footprint associated with the optimizer states.

        - Checkpointing. Model weights are checkpointed both at the end of each epoch, and at the end of
            training. Optimizer State and recipe state (seed, total_epochs, number of epochs run etc) are
            only saved at the end of a given epoch and used in case of resuming training.

            Resuming training is controlled by the ``resume_from_checkpoint`` flag. Mid-epoch checkpointing is
            currently not supported.

            For more details on the checkpointer, please take a look at
            our checkpointer deepdive (https://pytorch.org/torchtune/main/deep_dives/checkpointer.html).

        - Logging. Terminal, Disk, WandB and TensorBoard are all supported.

    Args:
        cfg (DictConfig): OmegaConf object parsed from yaml file

    Raises:
        RuntimeError: If ``dtype`` is set to fp16.
    cfgreturnNc                    t          j        |j                  | _        t	          j        |j        | j                  | _        | j        t          j	        k    rt          d          |j        | _        |                    dd          | _        |                    dd          | _        | j        r1| j        j        dk    r!t"                              d           d| _        t	          j        |j        |                    d	d           
          | _        t          j        | j                                      | j                  | _        d| _        d| _        d| _        d| _        d| _        |j        | _        |j        | _         d S )NdevicezVfull fp16 training is not supported with this recipe. Please use bf16 or fp32 instead.log_every_n_steps   log_peak_memory_statsFcudazplog_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False.cudnn_deterministic_mode)seed
debug_moder   )!r   
get_devicer*   _devicer   	get_dtypedtype_dtypetorchfloat16RuntimeError
output_dir_output_dirget_log_every_n_steps_log_peak_memory_statstypeloginfoset_seedr0   	Generatormanual_seed_rng_total_steps
_steps_run_total_epochs_epochs_runglobal_stepresume_from_checkpoint_resume_from_checkpointgradient_accumulation_steps_gradient_accumulation_steps)selfr&   s     {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/recipes/ppo_full_finetune_single_device.py__init__z*PPOFullFinetuneRecipeSingleDevice.__init__o   so   'sz:::(4<HHH ;%-''h  
 >"%''*=q"A"A&)gg.Eu&M&M#& 	04<+<+F+FHH C   +0D' %cgg.H$&O&O
 
 
	 ODL11==diHH	 (+'A$,/,K)))    c                     t          j        |j                   _         j                            |                                |j        |j        |j        |j	                  \   _
        } _        } j
                                        }|                                } j                                        }|                                }|j         _        |j         _                             |j        |j        |j         j        |t(          j                 |t(          j                 |t(          j                 |t(          j                           \   _         _         _         _        t          j        |j                   _        t8                              d                                |j        |j         j         r|t(          j!                 nd           _"        t          j        |j#                   _$        t8                              d            %                    |j&        |j'        |j(                  \   _)         _*         +                    |            ,                    |            fd _-         j         r .                    |            j/         j0        z   j(         j1        z  z   _2         3                    |4                    tj          d                     _6        dS )z
        Sets up the recipe state correctly. This includes setting recipe attributes based
        on the ``resume_from_checkpoint`` flag.
        )	cfg_modelcfg_reward_value_modelenable_activation_checkpointingcompile_modelpolicy_state_dictref_policy_state_dictvalue_model_state_dictreward_model_state_dictz#Tokenizer is initialized from file.N)cfg_optimizeroptimizer_in_bwdopt_state_dictzLoss is initialized.)cfg_datasetshuffle
batch_sizec                 ~    | r(t          j        j        j        j        |          nt          j                    S )N)ra   r5   r*   decoder_max_seq_len)r   _policy_model_forward_batch_sizer6   r3   
contextlibnullcontext)enable_kv_cacherc   rO   s     rP   <lambda>z9PPOFullFinetuneRecipeSingleDevice.setup.<locals>.<lambda>   sO     *N"3k|$7    ')) rR   )7r   instantiatemetric_logger_metric_logger
log_config_setup_checkpointerscheckpointerref_policy_checkpointervalue_checkpointerreward_checkpointer_policy_checkpointer_value_checkpointerload_checkpointcompiler]   _optimizer_in_bwd_setup_modelspolicy_modelreward_and_value_modelrV   r   	MODEL_KEYrd   _value_model_reward_model_ref_policy_model	tokenizer
_tokenizerr@   rA   _setup_optimizer	optimizerrL   OPT_KEY
_optimizerloss_loss_fn_setup_datadatasetr`   ra   _sampler_dataloader_setup_training_parameters_setup_training_hyperparameterscache_ctx_manager_update_recipe_staterG   _ppo_epochs_ppo_batch_sizerJ   _setup_profilerr<   r    	_profiler)rO   r&   rp   rr   policy_model_checkpoint_dictrY   value_model_checkpoint_dictr[   s   `       rP   setupz'PPOFullFinetuneRecipeSingleDevice.setup   s   
 %01BCC 	&&s+++ %%'"#	
 
	
%#$ (,'@'P'P'R'R$ 7 G G I I '+&>&N&N&P&P#"5"E"E"G"G {!$!5 &#&#=,/,O,:8;MN"78J"K#>x?Q#R$;H<N$O  	
 	
	
" !,S];;6777 //- 1 /,X-=>> 0 
 
 *3844'((( +/*:*:K~ +; +
 +
't' 	'',,,,,S111
"
 
"
 
"
 
"
 ' 	D%%&BCCC O$"668 	 --cgglD.I.IJJrR   cfg_profilerc                    |t          ddi          }|                    dd          d|d<   n#|                    d          dk    s
J d            t          j        |          \  }}t                              d|            |                    dd          | _        |d         r'|d	         | _        |d
         | _        |d         | _	        |S )a  
        Parses the `profiler` section of top-level `cfg` and sets up profiler

        Args:
            cfg_profiler (Optional[DictConfig]): ``profiler`` section of the top-level ``cfg`` (the main config passed to
                `recipe.main`). Default None.

        Returns:
            profiler: Union[torch.profiler.profile, DummyProfiler] - DummyProfiler is a nullcontext with no-op methods
            for `start`, `stop`, and `step` that can be used in place of `torch.profiler.profile` if profiler is not enabled such
            that the instrumented training loop does not need to be changed profiling is disabled.

        The profiler config can be provided in configs under the `profiler` key with the following layout:

        .. code-block:: yaml
            profiler:
                enabled: bool

                #Output directory of trace artifacts
                output_dir: str

            #`torch.profiler.ProfilerActivity` types to trace
            cpu: bool
            cuda: bool

                #Trace options
                profile_memory: bool
                with_stack: bool
                record_shapes: bool
                with_flops: bool

            # `torch.profiler.schedule` options:
            # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
            wait_steps: int
            warmup_steps: int
            active_steps: int
            num_cycles: int
        NenabledF_component_z'torchtune.training.setup_torch_profilerzdOnly torch profiler supported currently: component must be `torchtune.training.setup_torch_profiler`z& Profiler config after instantiation: profile_memory
wait_stepswarmup_stepsactive_steps)
r   r<   r   rj   r@   rA   profiler_profile_memoryprofiler_wait_stepsprofiler_warmup_stepsprofiler_active_steps)rO   r   profilerprofiler_cfgs       rP   r   z1PPOFullFinetuneRecipeSingleDevice._setup_profiler  s    V %y%&899L M4008*SL''   //<= = =u= = = "(!3L!A!A,H,HHIII'3'7'78H%'P'P$	" 	F'3L'AD$)5n)ED&)5n)ED&rR   c                 4   |j         | _        |j        | _        |j        | _        |j        | _        |j        | _	        |j
        | _        |j        | _        |j        | _        |j        | _        |j        | _        |                    dd          r6|j        }| j        j        |vr t1          d| j        j         d| d           n3t3          | j        d          st1          d           g }n| j        j        }t7          j        || j                  | _        d	S )
z
        Sets up the training hyperparameters for the recipe. This includes the GAE hyperparameters,
        generation hyperparameters, reward masking hyperparameters, and stop token ids.
        stop_token_idsFztokenizer eos_id (z) is not in stop_token_ids (z().This may lead to unexpected behaviour.stop_tokenszkNo stop tokens defined in tokenizer, and no stop_token_ids provided. This may lead to unexpected behaviour.r)   N)kl_coeff	_kl_coeffgamma_gammalmbda_lmbdawhiten_rewards_whiten_rewardstemperature_temperaturetop_k_top_kmax_generated_tokens_max_generated_tokensmin_response_length_min_response_lengthpenalise_no_eos_penalise_no_eosreward_penalty_reward_penaltyr<   r   r   eos_idr   hasattrr   r7   tensorr3   _stop_token_ids)rO   r&   r   s      rP   r   zAPPOFullFinetuneRecipeSingleDevice._setup_training_hyperparametersJ  s9    ii"1  Oi%(%=" %($;! # 3"1 77#U++ 	= /N%^;;=)? = =]k = = =  
 4?M:: = B   "$!%!<$|N4<PPPrR   c           	         |j         | _         |j        | _        |j        | _        |j        | _        |j        | _        |j        | j        z  | _	        |j
        | _
        | j         | j        z  dk    r t          d| j          d| j         d          | j         | j        z  dk    r t          d| j          d| j         d          | j        | j        z  dk    r t          d| j         d| j         d          | j        dk    r| j        rt          d	          |j        | j         z  | _        t!          dt#          | j                            }t'          j        | j        |z            | _        | j        dk    r t          d
|j         d| j          d          | j        t#          | j                  k     r5t-          d| j         dt#          | j                   d| j         d           | j        |k    r,| j        |z  dk    rt-          d|j         d| d| d           t.                              d| j         d| j                    dS )a  
        Validates and sets up parameters for used during training and for tracking training state,
        batch sizes for model forward passes during trajectory generation, PPO minibatches, and
        PPO microbatches for gradient accumulation.

        Raises
            - ValueError if:
                - batch_size is not divisible by forward_batch_size
                - batch_size is not divisible by ppo_batch_size
                - ppo_batch_size is not divisible by gradient_accumulation_steps
                - num_steps is less than batch_size
                - gradient_accumulation_steps > 1 and optimizer_in_bwd is True
        r   zbatch_size (z3) must be exactly divisible by forward_batch_size (z).z/) must be exactly divisible by ppo_batch_size (zppo_batch_size (z<) must be exactly divisible by gradient_accumulation_steps (r,   zGradient accumulation is not supported with optimizer in bwd.Please set gradient_accumulation_steps=1, or optimizer_in_bwd=False.z
num_steps z% must be greater than the batch size .zThere are fewer total steps (z2, (num_steps//batch_size) than there are batches (z,) in the dataset. Training will stop after (z/) steps without saving intermediate checkpointsznum_steps (zD) is not exactly divisible by the number of batches in the dataset (z5). Intermediate checkpoints will only be saved every z steps.zTotal steps to run: z, Total epochs to run: N)ra   forward_batch_sizere   
ppo_epochsr   ppo_batch_sizer   rM   rN   _ppo_backward_batch_sizerh   
ValueErrorrw   r9   	num_stepsrF   maxlenr   mathceilrH   r   r@   rA   )rO   r&   batches_per_epochs      rP   r   z<PPOFullFinetuneRecipeSingleDevice._setup_training_parametersr  s8    .#&#9 >"1,/,K)$"CC 	%  #2?T55::Dt D D'+'?D D D   ?T11Q66<t < <#'#7< < <   $"CCqHHY4#7 Y Y373TY Y Y  
 ,q00T5K0W  
  MT_<s4#$$
 
 "Yt'8;L'LMM!!cS]ccQUQ`ccc   s4#34444p0A p p+.t/?+@+@p p-1->p p p  
  111 11Q66`cm ` `9J` `EV` ` `  
 	a4#4aaTM_aa	
 	
 	
 	
 	
rR   
policy_cfgref_policy_cfg	value_cfg
reward_cfgc                 ~   | j         sT|j        |j        k    sJ d|j         d|j                     |j        |j        k    sJ d|j         d|j                     t          j        || j                   }t          j        |d          }t          j        |d          }t          j        |d          }||||fS )z
        Sets up checkpointers for policy, reference policy, value, and reward models.
        Only the policy checkpoint handles recipe state for resuming from checkpoints.
        zrPolicy and reference policy should be loaded from the same checkpoint directoriesat the start of training. Found: z andzlPolicy and reference policy should be loaded from the same checkpoint filesat the start of training. Found: )should_load_recipe_stateF)rL   checkpoint_dircheckpoint_filesr   rj   )	rO   r   r   r   r   policy_checkpointerrp   rq   rr   s	            rP   rn   z6PPOFullFinetuneRecipeSingleDevice._setup_checkpointers  s6   " + 
	,0MMMM34>4M3 3!03 3 NMM
 ..2QQQQ54>4O5 5!25 5 RQQ %0%)%A
 
 

 #)"4%*#
 #
 #

 $/%*
 
 

 %0%*
 
 
  #	
 	
rR   rT   rU   rV   rW   rX   rY   rZ   r[   c	                    t          j        | j                  5  | j        5  t	          j        |          }	t	          j        |          }
t	          j        |          }t	          j        |          }ddd           n# 1 swxY w Y   ddd           n# 1 swxY w Y   |rPt          j        |	           t          j        |
           t          j        |           t          j        |           |rBt          j        |	t          j	        h           t          j        |t          j	        h           |	
                    |           |

                    |           t          j        ||                                           |
                    |           t          j        ||                                           |
                    |           t          j        |                                | j                   t          j        |                                | j                   t          j        |                                | j                   t          j        |
                                | j                   t                              d| j         d           t!          |	           t!          |           |                                 |
                                 |                                D ]	}d|_        
|
                                D ]	}d|_        
| j        j        dk    r.t          j        | j                  }t          j        |           |	|||
fS )	zb
        Sets up the policy model, reference policy model, reward model, and value model.
        N)auto_wrap_policy)r5   z&Models are initialized with precision r   Fr.   r)   )r   set_default_dtyper6   r3   r   rj   rW   set_activation_checkpointingr   TransformerSelfAttentionLayerload_state_dict update_state_dict_for_classifiernamed_parametersvalidate_expected_param_dtyper@   rA   r   eval
parametersrequires_gradr?   get_memory_statslog_memory_stats)rO   rT   rU   rV   rW   rX   rY   rZ   r[   ry   ref_policy_modelreward_modelvalue_modelpmemory_statss                  rP   rx   z/PPOFullFinetuneRecipeSingleDevice._setup_models  s*    '44 	E 	Edl 	E 	E!-i88L%1)<<!-.DEEL ,-CDDK		E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E  	1"<000"#3444";///"<000* 	10U/V    1w/T.U    	$$%6777(()>??? 	1#\%B%B%D%D	
 	
 	
 	$$%<=== 	1"K$@$@$B$B	
 	
 	
 	##$:;;; 	.((**$+	
 	
 	
 	
 	.))++4;	
 	
 	
 	
 	.((**$+	
 	
 	
 	
 	.--//t{	
 	
 	
 	
 	H$+HHHIII 	%%%$$$ 	((** 	$ 	$A#AOO!,,.. 	$ 	$A#AOO<&&#4DLIIIL%l333[,8HHHs5   BAA?3B?B	BB	BBBFr\   r]   r^   c                 J   |rfdt          | j                                        | j                                                  D             }t	          j        | j        |           t	          j        | j        |           t	          j        | j        |          | _        t	          j        | j        |          | _        |>	 | j                            |           n"# t          $ r}t          d          |d }~ww xY wt                              d           d S t          j        t          | j                                        | j                                                            }|r|                    |           t                              d           |S )Nc                 >    i | ]}|t          j        |g          S  )r   rj   ).0r   r\   s     rP   
<dictcomp>zFPPOFullFinetuneRecipeSingleDevice._setup_optimizer.<locals>.<dictcomp>X  s:        6%maS99  rR   )model
optim_dictzzFailed loading in-backward optimizer checkpoints.Please make sure run being restored from was using in-backward optimizer.z"In-backward optimizers are set up.zOptimizer is initialized.)r   rd   r   r|   r   register_optim_in_bwd_hookscreate_optim_in_bwd_wrapper_optim_ckpt_wrapperr   BaseExceptionr9   r@   rA   r   rj   )rO   r\   r]   r^   r   er   s    `     rP   r   z2PPOFullFinetuneRecipeSingleDevice._setup_optimizerO  s     ,	   &1133T5F5Q5Q5S5S   J 0(Z    0'J    (0'K(Z( ( (D$ (0'K'J( ( (D$ ),<<^LLLL$   &d  
 HH9:::4*d(3355t7H7S7S7U7UVV I  :)).999HH0111s   C# #
D-C==Dr_   r`   ra   c                 >    t          |t                    r fd|D             }t          |          }nt          j        | j                  }t          |dd|d          }t          |||dt          t          dd	d
g j        j
                            }||fS )z6
        All data related setup happens here.
        c                 F    g | ]}t          j        |j                   S )r   )r   rj   r   )r   single_cfg_datasetrO   s     rP   
<listcomp>zAPPOFullFinetuneRecipeSingleDevice._setup_data.<locals>.<listcomp>  s<       & "#5QQQ  rR   )datasetsr   r,   r   )num_replicasrankr`   r0   Tlefttokenslabels)pad_directionkeys_to_padpadding_idx)r   samplerra   	drop_last
collate_fn)
isinstancer   r   r   rj   r   r   r   r   r   pad_id)rO   r_   r`   ra   r   dsr  
dataloaders   `       rP   r   z-PPOFullFinetuneRecipeSingleDevice._setup_data  s     k:.. 	L   *5  H 111BB#K4?KKKB$
 
 
  !$%x0 O2	  
 
 

 
""rR   epochis_intermediate_checkpointc                    t           j        | j                                        i}t           j        | j                                        i}|r|                    t           j        | j        t           j        | j	        t           j
        | j        t           j        | j        t           j        | j        t           j        | j                                        i           | j        s'| j                                        |t           j        <   n&| j                                        |t           j        <   | j                            |||           | j                            ||d           dS )z
        Save state dict to file. The recipe save_checkpoint method is responsible for
        correctly creating the checkpoint dict and passing to the checkpointer.
        )r	  intermediate_checkpointFN)r   r{   rd   
state_dictr|   updateSEED_KEYr0   
EPOCHS_KEYrI   TOTAL_EPOCHS_KEYrH   MAX_STEPS_KEYrF   	STEPS_KEYrG   RNG_KEYrE   	get_staterw   r   r   r   rs   save_checkpointrt   )rO   r	  r
  policy_ckpt_dictvalue_ckpt_dicts        rP   r  z1PPOFullFinetuneRecipeSingleDevice.save_checkpoint  s_    %.0B0M0M0O0OP#-t/@/K/K/M/MN & 	:##%ty')9-t/A*D,=&$di&9&9&;&;	 	 	 ) :59_5O5O5Q5Q !122 ,7799 !$ 	!11$> 	2 	
 	
 	
 	 00$) 	1 	
 	
 	
 	
 	
rR   	ckpt_dictc                 p   	 | j         |t          j                 k    s6| j        |t          j                 k    s| j        |t          j                 k    rt          d           t          j        |t          j                           | _         | j	        
                    |t          j                            |t          j                 | _        |t          j                 | _        |t          j                 | _        |t          j                 | _        dS # t           $ r}t            |d          d}~ww xY w)z;
        Updates the recipe state from checkpoint.
        zzConfigured value for seed, total_steps, or total_epochs
                    does not match the value stored in checkpoint.)message)r0   zCheckpoint does not contain the required keys needed for updating recipe state.Are you sure you passed in the right recipe checkpoint?N)r0   r   r  rF   r  rH   r  r   rB   rE   	set_stater  r  rG   r  rI   KeyError)rO   r  r   s      rP   r   z6PPOFullFinetuneRecipeSingleDevice._update_recipe_state  s"   		Yx'8999$	(2H(III%83L)MMMF    !)y9J/KLLLDII	(*: ;<<<'(:;DO )(*@ AD!*8+D!ED()<=D 	 	 	J! ! 	s   DD 
D5D00D5	input_idsc                    |j         \  }}|                     | j        || j        z             5  t	          j        | j        || j        | j        | j        | j	        j
        | j                  \  }}ddd           n# 1 swxY w Y   |dd|df                                         }|| j	        j
        k    }t	          j        |          }t	          j        |          }	~t          j        ||| j                  }
~|                     ||	|          }t          j        ||          }t          j        ||| j                  }~|                     ||	|          }t          j        ||                              d          }t          j        || j        | j	        j
                  \  }}|                     t1          j        ||gd          |	|          }~t5          j        |          }|                    d||z   ddddf                                       d          }| j        s| j        r+t          j        ||| j        | j                  }| j         ||<   d	|
|<   d	||<   t1          j!        |d
k    || j        dz
  k     z  |dz   |          }|                                }|"                    d|#                    d          d          }d||<   tI          ||
||||	|||||          S )a4  
        Generates a trajectory given the current policy and value models, the reference policy model, the reward model,
        and batch of inputs. This is done over the following steps:

        1: Generate responses, and logits corresponding to the responses using the current policy,
            generating (query, response) pairs.
        2. Estimate logprobs of the generated responses using the current policy.
        3. Estimate values from the generated responses using the current value function.
        4. Replace any tokens in the response after the first stop token (usually EOS token) with padding,
            producting truncated responses.
        5. Run the reward model on the (query, truncated-response) pairs.
        6. Mask out all the invalid values in the trajectory due to padding tokens.

        Args:
            input_ids (torch.Tensor): tensor of input token IDs with shape [b, seq_length]

        Returns:
            Trajectory: An instance of :class:`~torchtune.rlhf.Trajectory` comprising
                the current trajectory.
        )rc   )r   promptr   r   r   r  rngN	input_posmaskr,   )dim)r%        ?r   F        )query_responseslogprobsref_logprobsvaluesmasksposition_idsresponse_padding_masksvalue_padding_masksvalue_seq_idxsscoresseq_lens)%shaper   rh   r   r   generaterd   r   r   r   r  rE   clone!get_causal_mask_from_padding_mask"get_position_ids_from_padding_maskr   logits_to_logprobsr~   truncate_sequence_for_logprobsr|   squeeze%truncate_sequence_at_first_stop_tokenr   r}   r7   catr   get_unmasked_sequence_lengthsgatherr   r   get_reward_penalty_maskr   wherescatter_	unsqueezer   )rO   r  _context_lengthr*  logits	responsesquery_response_padding_masksr.  r/  r+  
ref_logitsr,  r-  r0  r3  r4  reward_penalty_maskr2  r1  s                       rP   generate_trajectoryz5PPOFullFinetuneRecipeSingleDevice.generate_trajectory  s   * &O>##  .1K K $ 
 
 	 	 '1&9( %)%? -k-I' ' '#OV		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 $AAA~$67==??	'6$/:P'P$ <(
 
 "D(
 
 ) *69d>OPP ++|% , 
 

 8^TT
.z9dFWXX ""?lQV"WW4V^LLTTUWXX -1,Vt+T_-C-
 -
)	
 ##Iy),!444" $ 
 
  9:PQQq8n#<aaatm"LMMUU
 
   	?D$= 	?"&">&%)	# # +/*>F&' ,/'(/2+, \h)Ca)GGHqL
 

 5::<<1::~''++U
 
 '*"#+%%#9 3)
 
 
 	
s   AA==BBc                 ^   g }t          j                    5  t          d| j        | j                  D ]<}|||| j        z            }|                    |                     |                     =	 ddd           n# 1 swxY w Y   t          t          t           j	        t          |            S )a  
        Generates a self.batch_size batch of trajectories using self._forward_batch_size batch sizes.
        See generate_trajectory for more details.

        Args:
            input_ids (torch.Tensor): tensor of input token IDs with shape [b, seq_length]

        Returns:
            Trajectory: An instance of :class:`~torchtune.rlhf.Trajectory`, comprising
                the current trajectory.
        r   N)r7   no_gradrangera   re   appendrL  r   mapr>  zip)rO   r  trajectoriesbatch_startbatch_input_idss        rP   generate_trajectory_batchedz=PPOFullFinetuneRecipeSingleDevice.generate_trajectory_batchedq  s    *,]__ 	O 	O$Q9QRR O O"++0H"HH# ##D$<$<_$M$MNNNNO	O 	O 	O 	O 	O 	O 	O 	O 	O 	O 	O 	O 	O 	O 	O 3uy#|*<==>>s   AA<<B B c                 n
   | j         rt                              d           | j        s| j                                         d}| j                                         t          | j	        | j
                  }t          | j        | j                  D ]}| j                            |           t!          | j                  D ]\  }}|dk    rM| j        rF|| j        | j        z   k    r3| j        j        dk    r#t.          j        j                                         |d                             | j                  }|j        \  }}|                                }t=          j                    }	|                      |          }
t=          j                    |	z
  }tC          j"        |
j#        |
j$        |
j%        | j&        |
j'                  \  }}}tC          j(        |
j)        || j*        | j+        |
j,                   \  }}t=          j                    }g }t          | j-                  D ]q}t/          j.        | j/        | j                  }t          d| j/        | j0                  D ]1}|||| j0        z            }g }t          d| j0        | j1                  D ]}}|||| j1        z            }te          tg          ti          t.          j5        d|	          |
           }|6                    | 7                    |||         ||         |                     ~~|6                    tq          tg          tr          tu          |                       | j        s4| j        ;                                 | j                            d
           | xj<        dz  c_<        3st=          j                    |z
  }| xj
        dz  c_
        | j
        | j=        z  dk    rF| >                    |
tq          tg          t.          j?        tu          |            ||||z  ||z             | @                    |
|||||           |A                    d           |dk    rW| j        rP|| j        | j        z   | jB        z   k    r5| j        j        dk    r%t.          j        j                            d           | j        ;                                 | j
        | j	        k    rd
} n| xj        dz  c_        | C                    ||            |r| j        D                                  dS | j        D                                 dS )z 
        The core training loop.zoNOTE: torch.compile is enabled and model is compiled in first forward.Expect a relatively slow first iteration.F)totalinitialr   r.   r   )r.  r)   )r&  indexT)set_to_noner,   N)r   )r
  )Erv   r@   rA   rw   r   	zero_gradr   startr!   rF   rG   rO  rI   rH   r   	set_epoch	enumerater   r   r   r   r3   r?   r7   r.   memory_record_memory_historytor5  numeltimeperf_counterrV  r   get_rewards_ppor3  r+  r,  r   r2  estimate_advantagesr-  r   r   r0  r   randpermra   r   r   r   rQ  r   index_selectrP  ppo_stepr   sumrR  steprJ   r=   log_metricsstackcleanup_after_stepr  r   r  stop)rO   training_completedpbar
curr_epochidxbatchrE  rF  
num_tokenst0_traj
trajectory	traj_timerewardskl
kl_rewards
advantagesreturnst0_ppo	ppo_stats
batch_idxsimini_batch_idxsbatch_ppo_statsjbackward_batch_idxsbatch_trajectoryppo_times                              rP   trainz'PPOFullFinetuneRecipeSingleDevice.train  s    < 	HH<  
 % 	(O%%'''"$+T_EEE 0$2DEE C	 C	J M##J///'(899 t t
U !OO4 $t7$:TTTT)V33J%<<>>>h**4<88$)K!>"[[]]
 +--!==eDD
 -//'9	 +/*>%'+N-+ +'Z '+&>%KK%<<' ' '#
G *,,,.	t/00 '. '.A!&!U!U!UJ"1dot7KLL %. %.*4QT=Q9Q5Q*R:<!&t3T5R" " 1 1A 3B !A(E$E E3/ 0:!$$+(-(:,-.A%& %& %&
 %/"" ""	0, ,22 $$4$./B$C$+,?$@$2	!" !"   !1 0!((3sC<Q3R3R)STTT#5 H O00222 O55$5GGG((A-(((K%.L  ,..7 1$?T%<<AA$$" #ek3	?"C"CD""Y."X-   ''	:wJ   A !OO4 $/01011 1 )V33J%<<T<JJJ ##%%%?d&777)-&E 8
 !  ;M7M !    " ##%%% 	rR   rx  r}  r~  rF  c           
      z   |                      |j        |j        |j                  }t	          j        ||          }t	          j        ||j        dd|df         | j                  }d||j        <   ~| 	                    |j        |j        |j                  }t	          j        ||          
                    d          }d||j        <   |                     |j        |||j        |||j         |j                   \  }}	}
}}|| j        z  }|                                 t#          j                    5  d||j        z
                      d          z                                  }ddd           n# 1 swxY w Y   t+          ||	| j        z  |
| j        z  || j        z  || j        z  || j        z            S )	a  
        Perform a single PPO optimisation step over a batch of trajectories and corresponding advantages and returns.

        Args:
            trajectory (Trajectory): a batch of trajectories
            advantages (torch.Tensor): advantages corresponding to the trajectories
            returns (torch.Tensor): returns corresponding the trajectories
            context_length (int): input ids sequence length

        Returns:
            PPOStats: An instance of :class:`~torchtune.rlhf.PPOStats`, a NamedTuple containing:
               - loss (torch.Tensor): The total PPO loss.
               - policy_loss (torch.Tensor): The policy function loss.
               - value_loss (torch.Tensor): The value function loss.
               - ratios (torch.Tensor): The ratio between the current and old policy probabilities.
               - clipfrac (torch.Tensor): The fraction of ratios that were clipped.
               - approx_policy_kls: Average estimated KL divergence between the policy before and after the optimisation step.

        r"  Nr(  r%  r)  )padding_masksr1  g      ?   )rd   r*  r/  r.  r   r;  r:  r   r0  r|   r<  r1  r   r+  r-  rN   backwardr7   rN  powmeanr   )rO   rx  r}  r~  rF  	pi_logitspi_logprobs
phi_valuesr   policy_loss
value_lossratiosclipfracapprox_policy_klss                 rP   rj  z*PPOFullFinetuneRecipeSingleDevice.ppo_step  s2   6 &&& -! ' 
 
	
 7	>RR	-z1!!!^__2DEtGX
 
 :=J56 &&& -! ' 
 

 8
 

'"++ 	 69
:12 ;?--%<<!+!? ? ;H 	;
 	;
7k:vx 	11]__ 	 	{Z%88==a@@@dff 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	
 $;;::T66t88 AA
 
 	
s   73E66E:=E:r  r{  r|  tokens_per_second_trajectorytokens_per_second_lossc                    |j                                         |j                            d                                          |j                                         |                    d                                          z   |                    d                                          |                    d                                          |j                                        |j                                        |j                                        |j                                        |j	                                        |j
                                        |j                                                                        ||d}| j        j        dk    r4| j        r-|                    t#          j        | j                             | j                            || j                   dS )zW
        Log metrics and statistics for the current step to the metric logger.
        r%  r,   )r3  num_stop_tokensrlhf_rewardr{  	kl_rewardr   r  r  r  r  approx_policy_klresponse_lengths$tokens_per_second_per_gpu_trajectorytokens_per_second_per_gpu_ppor.   r)   )rl  N)r3  r  r0  anyrk  r   r  r  r  r  r  r4  floatr3   r?   r>   r  r   r   rl   log_dictrJ   )rO   rx  r  r{  r|  r  r  r  s           rP   rm  z-PPOFullFinetuneRecipeSingleDevice.log_metricso  s    !',,..)@DDRHHLLNN%,1133jnnQ6G6G6L6L6N6NN&&))..""#**//11N''))$05577#.3355!*//11&++-- ) ; @ @ B B * 3 9 9 ; ; @ @ B B4P-C
 
  <&&4+F&OOH5T\JJJKKK$$XD4D$EEEEErR   c                 *    |D ]}~~|D ]}~~~~~~dS )zH
        Cleanup tensors after each PPO step to free up memory.
        Nr   )rO   rx  r  r}  r~  r{  r|  vs           rP   ro  z4PPOFullFinetuneRecipeSingleDevice.cleanup_after_step  sB      	 	A 	 	AJJrR   c                 8    | j                                          d S N)rl   close)rO   kwargss     rP   cleanupz)PPOFullFinetuneRecipeSingleDevice.cleanup  s    !!#####rR   r  )r'   N)FN)F)-__name__
__module____qualname____doc__r   rQ   r   r   r
   r7   r   profiler   r   r   r   r	   r   Checkpointerrn   boolr   strr   r   Modulerx   r   r   intr   r   r   r  r   Tensorr   rL  rV  r  r   rj  rm  ro  r  r   rR   rP   r%   r%   $   s       H HT&LJ &L4 &L &L &L &LPnK nK nK nK nK nKb 48A A$Z0A	u~%}4	5A A A AF&Q &Q &Q &QPH
j H
T H
 H
 H
 H
T6
6
 #6
 	6

 6
 
	
6
 6
 6
 6
pYIYI !+YI *.	YI
 YI  S>YI  $CH~YI !%S#XYI "&c3hYI 
ry")RY.	/YI YI YI YI| "'37	3 3!3 3 !c3h0	3
 
3 3 3 3j##%##04##BE##	!:-	.## ## ## ##L >C'
 '
'
6:'
	'
 '
 '
 '
Rd38n     :@
U\ @
j @
 @
 @
 @
D?U\ ?j ? ? ? ?,U U U UnO
O
 LO
 	O

 O
 
O
 O
 O
 O
bFF F L	F
 LF ',lF !&F 
F F F FB  L	
  L L 
   0$ $ $ $ $ $rR   r%   r&   r'   c                     t          j        d|            t          |           }|                    |            |                                 |                                 dS )z
    Entry point for the recipe.

    Configurable parameters are read in the following order:
        - Parameters specified in config (see available configs through ``tune ls``)
        - Overwritten by arguments from the command-line
    r%   )recipe_namer&   )r&   N)r   rm   r%   r   r  r  )r&   recipes     rP   recipe_mainr    sb     "E3OOOO.3777F
LLSL
LLNNN
NNrR   __main__);rf   r   sysrd  	functoolsr   	itertoolsr   typingr   r   r   r   r	   r
   warningsr   r7   	omegaconfr   r   r   torch.optimr   torch.utils.datar   r   	torchtuner   r   r   r   r   r   torchtune.datar   torchtune.datasetsr   torchtune.modulesr   torchtune.recipe_interfacesr   torchtune.rlhfr   r   torchtune.trainingr   r   r    r!   
get_loggerr@   _dynamocache_size_limitr%   parser  r  exitr   rR   rP   <module>r     si        



              : : : : : : : : : : : : : : : :        , , , , , , , ,       ! ! ! ! ! ! ; ; ; ; ; ; ; ; H H H H H H H H H H H H H H H H ) ) ) ) ) ) , , , , , , , , , , , , 9 9 9 9 9 9 / / / / / / / / K K K K K K K K K K      ew )+  %E$ E$ E$ E$ E$(9 E$ E$ E$P$ Z D     zCH[[]] rR   