
    Pi#                        d dl mZmZ d dlZ	 	 ddej        dej        dededej        f
d	Z	 d d
ej        dej        dej        dedeej                 deej        ej        ej        f         fdZ		 d dej        dej        dee         dej        fdZ
	 d!dej        dej        dedej        fdZ	 d"dej        deej                 dedej        fdZ	 d dej        dej        dededeej                 deej        ej        f         fdZdS )#    )OptionalTupleNTpadding_masksseq_lenspenalise_no_eosmin_response_lengthreturnc                     t          j        |                              t                    }|r|                     d           }|
|||k     z  }|S )a0  
    Calculates a mask to penalise scores corresponding to sequences generated during PPO, where True indicates the score
    at the corresponding position should be penalised.
    This function assumes sequences have already been truncated at an EOS, if present, and padded to length,
    e.g. by :func:`torchtune.rlhf.sequence_processing.truncate_sequence_at_first_stop_token`.

    Scores are penalised such that:
    - If ``min_response_length`` is set, scores for sequences with ``length < min_response_length`` are penalised.
    - If ``penalise_no_eos`` is True, scores for sequences with no EOS token are penalised.

    Args:
        padding_masks (torch.Tensor): torch.Tensor where True indicates a padding token in the generated
            sequence, and False otherwise. Shape: ``(b, response_len)``
        seq_lens (torch.Tensor): The length of each generated sequence. Shape: ``(b,)``
        penalise_no_eos (bool, optional): Whether to penalise sequences with no EOS token. Defaults to True.
        min_response_length (int, optional): The minimum length of the response. If set, any responses is shorter
            than this length will be penalised. Defaults to None.
    Returns:
        torch.Tensor: A mask tensor with shape ``(b,)`` where True indicates the corresponding score should be penalised.
    )torch
zeros_liketoboolany)r   r   r   r   reward_penalty_masks        j/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/rlhf/rewards.pyget_reward_penalty_maskr      sd    4  *84477==  5,00444&-@!@AA    scoreslogprobsref_logprobskl_coeffvalid_score_idxsc                     ||z
  }| |z  }|                                 }|>|                    d|                    d          |                     d                     n|dddfxx         | z  cc<   |||fS )a  
    Calculates PPO rewards for the given scores, logprobs, and reference logprobs.

    Args:
        scores (torch.Tensor): Reward model scores, shape ``(b,)``.
        logprobs (torch.Tensor): Policy logprobs, shape ``(b, response_len)``.
        ref_logprobs (torch.Tensor): Reference base model logprobs, shape ``(b, response_len)``.
        kl_coeff (float): KL reward contribution coefficient.
        valid_score_idxs (Optional[torch.Tensor]): A tensor of indexes for valid (non-padded) token predictions.
            This is useful when calculating rewards for padded sequences, as scores and value estimates are defined
            for the last valid predicted token. Shape: ``(b,)``. Default None.

    Returns:
        Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: A tuple of tensors with shape ``(b, response_len)`` each:
            - total_reward: total reward combining per-token kl rewards and reward model score.
            - kl: kl divergence between policy and reference policy logprobs.
            - kl_reward: kl divergence scaled by ``kl_coeff``.

    Notation used for tensor shapes:
        - b: batch size
        - response_len: model response length
    N   r   )clonescatter_add_	unsqueeze)r   r   r   r   r   kl	kl_rewardtotal_rewards           r   get_rewards_ppor"   1   s    < 
L	 B	BI??$$L
 #!!))"--v/?/?/C/C	
 	
 	
 	
 	QQQUv%Y&&r   xmaskdimc                 h    | |z                       |          |                     |          dz   z  S )a  
    Compute mean of tensor with masked values. Taken from https://github.com/huggingface/trl/blob/main/trl/core.py

    Args:
        x (torch.Tensor): The input tensor.
        mask (torch.Tensor): The bool mask tensor, where True indicates the corresponding value in ``x``
            should participate in the mean calculation.
        dim (Optional[int]): The axis to calculate the mean over. Default None.

    Returns:
        torch.Tensor: The mean tensor.
    )r%   :0yE>)sum)r#   r$   r%   s      r   masked_meanr)   a   s4     H>>c>""dhh3h&7&7$&>??r   centered_valuesunbiasedc                     t          |                     d          |          }|r$|                                dz   }||dz
  z  }||z  }|S )aW  
    Compute variance of mean-centered tensor with masked values. Taken from https://github.com/huggingface/trl/blob/main/trl/core.py
    We use ``centered_values`` to avoid repeated calls to ``masked_mean``.
    Args:
        centered_values (torch.Tensor): The mean-centered tensor e.g. ``x - masked_mean(x)``.
        mask (torch.Tensor): The bool mask tensor, where True indicates the corresponding value in ``x``
            should participate in the mean calculation.
        unbiased (bool): Whether to use the unbiased variance.

    Returns:
        torch.Tensor: The variance tensor.

       r'   r   )r)   powr(   )r*   r$   r+   varmask_sumbessel_corrections         r   
masked_varr2   s   sZ      o))!,,d
3
3C &88::$$15%%Jr   
shift_meanc                     |$t          | |          }t          | |z
  |          }n(|                                 |                                 }}| |z
  t	          j        |dz             z  }|r||z  }|S )a*  
    Whiten (normalises) values, optionally with masked values. Taken from https://github.com/huggingface/trl/blob/main/trl/core.py
    Args:
        x (torch.Tensor): The input tensor.
        mask (Optional[torch.Tensor]): The bool mask tensor with the same shape as ``x``, and where True indicates
            the corresponding value in ``x`` should participate in the mean calculation. Default None.
        shift_mean (bool): Whether to shift normalised values by the mean. Default True.

    Returns:
        torch.Tensor: The whitened tensor.
    Nr'   )r)   r2   meanr/   r   rsqrt)r#   r$   r3   r5   r/   whiteneds         r   whitenr8      sz     1d##T4((FFHHaeeggcDEKd
333H DOr   valuesrewardsgammalmbdamasksc                    d}g }| j         d         }t          t          |                    D ][}||dz
  k     r| dd|dz   f         nd}	|dd|f         ||	z  z   | dd|f         z
  }
|
||z  |z  z   }|                    |           \t	          j        |ddd         d          }|| z   }t          ||          }|d|| <   ||fS )a  
    Estimates the advantages and returns for the PPO algorithm using Generalized Advantage Estimation
    https://arxiv.org/pdf/1506.02438.pdf

    Args:
        values (torch.Tensor): The predicted values for each state. Shape: ``(b, response_len)``
        rewards (torch.Tensor): The rewards received at each time step. Shape: ``(b, response_len)``
        gamma (float): The discount factor.
        lmbda (float): The GAE-Lambda parameter.
        masks (Optional[torch.Tensor]): A bool mask tensor, where True indicates the corresponding value in ``values``
            should participate in the mean calculation. Default None.
    Returns:
        Tuple[torch.Tensor, torch.Tensor]: A tuple containing the estimated advantages and returns.
            - advantages (torch.Tensor): The estimated advantages. Shape: ``(b, response_len)``
            - returns (torch.Tensor): The estimated returns. Shape: ``(b, response_len)``
    Notation:
        - b: batch size
        - response_len: model response length
    r   r   r   Ng        )axis)r$   )shapereversedrangeappendr   stackr8   )r9   r:   r;   r<   r=   last_gae_lamadvantages_reversedresponse_lengthtnext_valuesdelta
advantagesreturnss                r   estimate_advantagesrM      s   6 Ll2&O eO,,-- 	1 	1*+o.A*A*AfQQQAX&&s 1 33fQQQTlB uu}|;;""<0000026Q???J
 6!G 
///J 
E6wr   )TN)N)T)NT)typingr   r   r   Tensorr   intr   floatr"   r)   r2   r8   rM    r   r   <module>rS      sP   # " " " " " " "  !#	" "<"l" " 	"
 \" " " "T 04-' -'L-'l-' ,-' 	-'
 u|,-' 5<u|34-' -' -' -'b ?C@ @|@ <@.6sm@
\@ @ @ @& IM \).AE
\   2 NR |#EL1FJ
\   < %)8 8L8\8 8 	8
 EL!8 5<%&8 8 8 8 8 8r   