
    PiD                     \    d dl mZmZ d dlZd dlmZ d dlmZ  G d dej                  Z	dS )    )OptionalTupleN)rlhfc                       e Zd ZdZ	 	 	 ddededef fdZ	 	 dd	ej        d
ej        dej        dej        dej        dej        deej                 deej                 de	ej        ej        ej        ej        ej        f         fdZ
 xZS )PPOLossa  
    Proximal Policy Optimization (PPO) Loss module.
    This implementation uses the following references:

    https://arxiv.org/abs/1707.06347 eqn. 7

    https://github.com/vwxyzjn/lm-human-preference-details/blob/ccc19538e817e98a60d3253242ac15e2a562cb49/lm_human_preference_details/train_policy_accelerate.py#L719

    https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/ppo2/model.py#L68-L75


    Args:
        epsilon (float): clipping range for PPO update.
        value_clip_range (float): clipping range for value function update.
        value_coeff (float): coefficient for the value function loss contribution.
    皙?皙?epsilonvalue_clip_rangevalue_coeffc                 r    t                                                       || _        || _        || _        d S )N)super__init__r
   r   r   )selfr
   r   r   	__class__s       k/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/rlhf/loss/ppo.pyr   zPPOLoss.__init__    s8     	 0&    Npi_old_logprobspi_logprobs
advantagesphi_old_values
phi_valuesreturnspadding_masksvalue_padding_masksreturnc	                 r   t          j        ||z
            }	t          j        |	d| j        z
  d| j        z             }
| |
z  }| |	z  }||k                        |j                  }||                                nt          j        ||          }t          j	        ||          }||                                nt          j        ||          }t          j        ||| j
        z
  || j
        z             }t          j	        ||z
  dz  ||z
  dz            }|d|                                z  ndt          j        ||          z  }||| j        z  z   }||                                |                                |	                                                                |                                fS )a  

        Forward pass of the PPO loss module.

        Args:
            pi_old_logprobs (torch.Tensor): Log probabilities of the old policy.
            pi_logprobs (torch.Tensor): Log probabilities of the current policy.
            advantages (torch.Tensor): Advantage values.
            phi_old_values (torch.Tensor): Value predictions of the old value function.
            phi_values (torch.Tensor): Value predictions of the current value function.
            returns (torch.Tensor): Return values.
            padding_masks (Optional[torch.Tensor]): Padding token masks of the same shape as ``pi_logprobs``,
                where True indicates the corresponding loss values should participage in policy loss calculation.
            value_padding_masks (Optional[torch.Tensor]): Padding token masks of the same shape as ``pi_logprobs``,
                where True indicates the corresponding loss values should participage in value loss calculation.

        Returns:
            Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: A tuple of five tensors:
                - loss: The total PPO loss.
                - policy_loss: The policy function loss.
                - value_loss: The value function loss.
                - ratios: The ratio between the current and old policy probabilities.
                - clipfrac: The fraction of ratios that were clipped.

        g      ?N   g      ?)torchexpclampr
   todtypemeanr   masked_meanmaximumr   r   detach)r   r   r   r   r   r   r   r   r   ratiosclipped_ratiospolicy_losses_clippedpolicy_losses_unclippedclipfracpolicy_lossvalues_clipped
value_losslosss                     r   forwardzPPOLoss.forward+   s   H ;899VS4<-?t|ASTT!+n <#-+"6),CCGG
 

 $ MMOOO!(M:: 	 m$9;RSS $ !+}== 	 T22T22
 

 ]'!a'.7*Bq)H
 


 #* *//####t'
4GHHH 	 j4+;;<  KKMM  ""OO
 	
r   )r   r	   r   )NN)__name__
__module____qualname____doc__floatr   r   Tensorr   r   r1   __classcell__)r   s   @r   r   r      s2        & "% 		' 	'	'  	' 		' 	' 	' 	' 	' 	'& 156:O
 O
O
 \O
 L	O

 O
 LO
 O
  -O
 &el3O
 
u|U\5<u|S	TO
 O
 O
 O
 O
 O
 O
 O
r   r   )
typingr   r   r   torch.nnnn	torchtuner   Moduler    r   r   <module>r?      s    # " " " " " " "             l
 l
 l
 l
 l
bi l
 l
 l
 l
 l
r   