
    PiG                     L   d dl mZ d dlZd dlmc mZ  G d dej        j                  Z G d dej        j                  Z	 G d dej        j                  Z
 G d	 d
ej        j                  Z G d dej        j                  Z G d dej        j                  ZdS )    )ListNc                   r     e Zd ZdZddef fdZ	 ddej        dej        dej        d	ed
ej        f
dZ	 xZ
S )ForwardKLLoss  
    The Kullback-Leibler divergence loss for valid indexes.
    Implementation of https://github.com/jongwooko/distillm/blob/17c0f98bc263b1861a02d5df578c84aea652ee65/distillm/losses.py

    Args:
        ignore_index (int):  Specifies a target value that is ignored and does not contribute to the input gradient.
            The loss is divided over non-ignored targets.
            Default: -100.
    ignore_indexc                 V    t                                                       || _        d S Nsuper__init__r   selfr   	__class__s     t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/modules/loss/kd_losses.pyr   zForwardKLLoss.__init__   '    (    Tstudent_logitsteacher_logitslabels	normalizereturnc                    t          j        |dt          j                  }t          j        |          }t          j        |dt          j                  }t          j        ||z  |d          }t          j        |d                              d          }	|| j	        k    
                                }
|s-t          j        |	|
                    d          z  d           S t          j        |
                    d          d          }|dk    rt          j        d|	j                  S t          j        |	|
                    d          z  d           t          j        |
                    d          d          z  S 5  
        Args:
            student_logits (torch.Tensor): logits from student model of shape
                (batch_size*num_tokens, vocab_size).
            teacher_logits (torch.Tensor): logits from teacher model of shape
                (batch_size*num_tokens, vocab_size).
            labels (torch.Tensor): Ground truth labels of shape
                (batch_size, vocab_size).
            normalize (bool): Whether to normalize the loss by the number of unmasked elements.

        Returns:
            torch.Tensor: KL divergence loss of shape (1,).
        )dimdtyper   r           device)Fsoftmaxtorchfloat32isinflog_softmaxmasked_fillsumviewr   inttensorr"   )r   r   r   r   r   teacher_probinf_maskstudent_logprob
prod_probsxmask	sum_maskss               r   forwardzForwardKLLoss.forward   sK   * yRu}MMM;~..-BemTTT&|o'ExQRSS
Ijb)))..r22$++0022 	8Ia$))B--/Q77777Idiimm333	>><AH5555	!diimm+3333ei		"ST6U6U6UUUr   r   T__name__
__module____qualname____doc__r,   r   r%   Tensorboolr5   __classcell__r   s   @r   r   r      s         ) )S ) ) ) ) ) ) !V !V!V !V 	!V
 !V 
!V !V !V !V !V !V !V !Vr   r   c                   r     e Zd ZdZddef fdZ	 ddej        dej        dej        d	ed
ej        f
dZ	 xZ
S )ReverseKLLossr   r   r   c                 V    t                                                       || _        d S r
   r   r   s     r   r   zReverseKLLoss.__init__K   r   r   Tr   r   r   r   r   c                    t          j        |dt          j                  }t          j        |dt          j                  }t          j        |dt          j                  }t          j        |          t          j        |          z  }t          j        ||z  |d          }	|	t          j        ||z  |d          z  }	t          j        |	d                              d          }
|| j	        k    
                                }|s-t          j        |
|                    d          z  d           S t          j        |                    d          d          dk    rt          j        d|
j                  S t          j        |
|                    d          z  d           t          j        |                    d          d          z  S r   )r#   r$   r%   r&   r(   r'   r)   r*   r+   r   r,   r-   r"   )r   r   r   r   r   student_probr0   teacher_logprobr/   r1   r2   r3   s               r   r5   zReverseKLLoss.forwardO   s   * yRu}MMM-BemTTT-BemTTT;~..^1L1LL&|o'ExQRSS
e'(FRSTTT
Ijb)))..r22$++0022 	8Ia$))B--/Q777779TYYr]]***a//<AH5555	!diimm+3333ei		"ST6U6U6UUUr   r6   r7   r8   r@   s   @r   rB   rB   @   s         ) )S ) ) ) ) ) ) %V %V%V %V 	%V
 %V 
%V %V %V %V %V %V %V %Vr   rB   c                   v     e Zd ZdZddedef fdZ	 ddej        d	ej        d
ej        de	dej        f
dZ
 xZS )SymmetricKLLossa  
    The Symmetric Kullback-Leibler divergence loss for valid indexes.
    Implementation of https://github.com/jongwooko/distillm/blob/17c0f98bc263b1861a02d5df578c84aea652ee65/distillm/losses.py

    Args:
        sym_kd_ratio (float): Ratio of symmetric KL divergence loss.
            When set to 1 this loss reduces to forward KL divergence, when set to 0 this loss reduces to reverse kl divergence.
        ignore_index (int):  Specifies a target value that is ignored and does not contribute to the input gradient.
            The loss is divided over non-ignored targets.
            Default: -100.

    Raises:
        ValueError: If sym_kd_ratio is not in the range [0, 1].
          ?r   sym_kd_ratior   c                     t                                                       |dk     s|dk    rt          d          || _        || _        t          |          | _        t          |          | _        d S )Nr    g      ?z(sym_kd_ratio must be in the range [0, 1])	r   r   
ValueErrorr   rJ   r   fklrB   rkl)r   rJ   r   r   s      r   r   zSymmetricKLLoss.__init__   so    #!3!3GHHH(( .. ..r   Tr   r   r   r   r   c                     | j         |                     ||||          z  d| j         z
  |                     ||||          z  z   S )r      )rJ   rM   rN   )r   r   r   r   r   s        r   r5   zSymmetricKLLoss.forward   s[    *  488NFI$
 $
 
""dhhNFI'
 '
 

 	
r   )rI   r   r7   )r9   r:   r;   r<   floatr,   r   r%   r=   r>   r5   r?   r@   s   @r   rH   rH   w   s         / /U / / / / / / / 
 

 
 	

 
 

 
 
 
 
 
 
 
r   rH   c                        e Zd ZdZddedef fdZdeej                 deej                 d	ej        d
ej        fdZ	 xZ
S )ForwardKLWithChunkedOutputLossa  
    Forward KL with chunked outputs that saves memory by only upcasting one chunk at a time.

    Since the model is trained with bf16, before computing KL divergence, we have to upcast
    it to fp32 for better accuracy and stability. When upcasting happens, the memory usage doubles.
    Models like llama3 have large vocabulary size and, therefore, have a large output
    result (bsz, num_tokens, vocab_size). If we chunk on the token level, you can still compute
    the cross entropy normally, but upcasting only one chunk at a time saves considerable memory.

    Args:
        num_output_chunks (int): Number of chunks to chunk the output into. Each chunk has shape
            (batch_size, num_tokens / num_output_chunks, vocab_size).
            Default: 8
        ignore_index (int): Specifies a target value that is ignored and does not contribute to the input gradient.
            The loss is divided over non-ignored targets.
            Default: -100
       r   num_output_chunksr   c                     t                                                       || _        || _        t	          |          | _        d S r
   )r   r   rU   r   r   fkl_lossr   rU   r   r   s      r   r   z'ForwardKLWithChunkedOutputLoss.__init__   <    !2(%l33r   r   r   r   r   c                    d |D             }d |D             }|| j         k                                    }d |                    | j        d          D             }d}t	          |||          D ]"\  }}}||                     |||d          z  }#t          j        |                    d	          d
          }	|	d
k    r!t          j	        d|d
         j
                  S |t          j        |                    d	          d
          z  S )a  
        Args:
            student_logits (List[torch.Tensor]): List of chunked logits from student model of length
                ``self.num_output_chunks``, where each chunk has shape
                (batch_size, num_tokens / num_output_chunks, vocab_size).
            teacher_logits (List[torch.Tensor]): List of chunked logits from teacher model of length
                ``self.num_output_chunks``, where each chunk has shape
                (batch_size, num_tokens / num_output_chunks, vocab_size).
            labels (torch.Tensor): Ground truth labels of shape (batch_size, num_tokens).

        Returns:
            torch.Tensor: KL divergence loss of shape (1,).

        Example:
            >>> loss_fn = ForwardKLWithChunkedOutputLoss()
            >>>
            >>> h = torch.tensor([bsz, num_tokens, dim])
            >>> output_chunks = [model.output(chunk) for chunk in h.chunk(num_chunks, dim=1)]
            >>> teacher_chunks = [teacher_model.output(chunk) for chunk in h.chunk(num_chunks, dim=1)]
            >>> labels = torch.tensor([bsz, num_tokens])
            >>> loss = loss_fn(output_chunks, teacher_chunks, labels)
        c                 `    g | ]+}|                     d |                    d                     ,S r   reshapesize.0teacher_logits_chunks     r   
<listcomp>z:ForwardKLWithChunkedOutputLoss.forward.<locals>.<listcomp>   G     
 
 
$ !((-A-F-Fr-J-JKK
 
 
r   c                 `    g | ]+}|                     d |                    d                     ,S r\   r]   ra   student_logits_chunks     r   rc   z:ForwardKLWithChunkedOutputLoss.forward.<locals>.<listcomp>   rd   r   c                 8    g | ]}|                     d           S r\   r^   ra   target_chunks     r   rc   z:ForwardKLWithChunkedOutputLoss.forward.<locals>.<listcomp>   6     
 
 
   $$
 
 
r   rP   r   r    Fr   r   r   r!   )r   r,   chunkrU   ziprW   r%   r*   r+   r-   r"   )
r   r   r   r   r3   total_fkl_lossstudent_chunkteacher_chunklabel_chunkr4   s
             r   r5   z&ForwardKLWithChunkedOutputLoss.forward   sN   <
 
(6
 
 

 
(6
 
 
 $++0022
 
 &T-C K K
 
 

 9<NF:
 :
 	 	5M=+ dmm}kU ,   NN Idiimm333	>><N1,=,DEEEE	$))B--Q ? ? ???r   rT   r   r9   r:   r;   r<   r,   r   r   r%   r=   r5   r?   r@   s   @r   rS   rS      s         $4 4# 4 4 4 4 4 4 49@U\*9@ U\*9@ 	9@
 
9@ 9@ 9@ 9@ 9@ 9@ 9@ 9@r   rS   c                        e Zd ZdZddedef fdZdeej                 deej                 d	ej        d
ej        fdZ	 xZ
S )ReverseKLWithChunkedOutputLossa  
    Reverse KL with chunked outputs that saves memory by only upcasting one chunk at a time.

    Since the model is trained with bf16, before computing KL divergence, we have to upcast
    it to fp32 for better accuracy and stability. When upcasting happens, the memory usage doubles.
    Models like llama3 have large vocabulary size and, therefore, have a large output
    result (bsz, num_tokens, vocab_size). If we chunk on the token level, you can still compute
    the cross entropy normally, but upcasting only one chunk at a time saves considerable memory.

    Args:
        num_output_chunks (int): Number of chunks to chunk the output into. Each chunk has shape
            (batch_size, num_tokens / num_output_chunks, vocab_size).
            Default: 8
        ignore_index (int): Specifies a target value that is ignored and does not contribute to the input gradient.
            The loss is divided over non-ignored targets.
            Default: -100
    rT   r   rU   r   c                     t                                                       || _        || _        t	          |          | _        d S r
   )r   r   rU   r   rB   rkl_lossrX   s      r   r   z'ReverseKLWithChunkedOutputLoss.__init__  rY   r   r   r   r   r   c                 |   d |D             }d |D             }|| j         k                                    }d |                    | j        d          D             }d}t	          |||          D ]"\  }}}||                     |||d          z  }#|t          j        |                    d	          d
          z  S )a  
        Args:
            student_logits (List[torch.Tensor]): List of chunked logits from student model of length
                ``self.num_output_chunks``, where each chunk has shape
                (batch_size, num_tokens / num_output_chunks, vocab_size).
            teacher_logits (List[torch.Tensor]): List of chunked logits from teacher model of length
                ``self.num_output_chunks``, where each chunk has shape
                (batch_size, num_tokens / num_output_chunks, vocab_size).
            labels (torch.Tensor): Ground truth labels of shape (batch_size, num_tokens).

        Returns:
            torch.Tensor: KL divergence loss of shape (1,).

        Example:
            >>> loss_fn = ReverseKLWithChunkedOutputLoss()
            >>>
            >>> h = torch.tensor([bsz, num_tokens, dim])
            >>> output_chunks = [model.output(chunk) for chunk in h.chunk(num_chunks, dim=1)]
            >>> teacher_chunks = [teacher_model.output(chunk) for chunk in h.chunk(num_chunks, dim=1)]
            >>> labels = torch.tensor([bsz, num_tokens])
            >>> loss = loss_fn(output_chunks, teacher_chunks, labels)
        c                 `    g | ]+}|                     d |                    d                     ,S r\   r]   r`   s     r   rc   z:ReverseKLWithChunkedOutputLoss.forward.<locals>.<listcomp>8  rd   r   c                 `    g | ]+}|                     d |                    d                     ,S r\   r]   rf   s     r   rc   z:ReverseKLWithChunkedOutputLoss.forward.<locals>.<listcomp><  rd   r   c                 8    g | ]}|                     d           S r\   ri   rj   s     r   rc   z:ReverseKLWithChunkedOutputLoss.forward.<locals>.<listcomp>B  rl   r   rP   r   r    Frm   r   r   )	r   r,   rn   rU   ro   ry   r%   r*   r+   )	r   r   r   r   r3   total_rkl_lossrq   rr   rs   s	            r   r5   z&ReverseKLWithChunkedOutputLoss.forward  s
   <
 
(6
 
 

 
(6
 
 
 $++0022
 
 &T-C K K
 
 

 9<NF:
 :
 	 	5M=+ dmm}kU ,   NN 	$))B--Q ? ? ???r   rt   ru   r@   s   @r   rw   rw     s         $4 4# 4 4 4 4 4 4 45@U\*5@ U\*5@ 	5@
 
5@ 5@ 5@ 5@ 5@ 5@ 5@ 5@r   rw   c                        e Zd ZdZ	 	 	 ddededef fdZd	eej	                 d
eej	                 dej	        dej	        fdZ
 xZS ) SymmetricKLWithChunkedOutputLossa  
    Symmetric KL with chunked outputs that saves memory by only upcasting one chunk at a time.

    Since the model is trained with bf16, before computing KL divergence, we have to upcast
    it to fp32 for better accuracy and stability. When upcasting happens, the memory usage doubles.
    Models like llama3 have large vocabulary size and, therefore, have a large output
    result (bsz, num_tokens, vocab_size). If we chunk on the token level, you can still compute
    the cross entropy normally, but upcasting only one chunk at a time saves considerable memory.

    Args:
        num_output_chunks (int): Number of chunks to chunk the output into. Each chunk has shape
            (batch_size, num_tokens / num_output_chunks, vocab_size).
            Default: 8
        sym_kd_ratio (float): Ratio of symmetric KL divergence loss.
            When set to 1 this loss reduces to forward KL divergence, when set to 0 this loss reduces to reverse kl divergence.
            Default: 0.5
        ignore_index (int): Specifies a target value that is ignored and does not contribute to the input gradient.
            The loss is divided over non-ignored targets.
            Default: -100
    rT   rI   r   rU   rJ   r   c                     t                                                       || _        || _        || _        t          | j        | j                  | _        d S )N)rJ   r   )r   r   rU   rJ   r   rH   sym_kl_loss)r   rU   rJ   r   r   s       r   r   z)SymmetricKLWithChunkedOutputLoss.__init__h  s[     	!2((**9J
 
 
r   r   r   r   r   c                 |   d |D             }d |D             }|| j         k                                    }d |                    | j        d          D             }d}t	          |||          D ]"\  }}}||                     |||d          z  }#|t          j        |                    d	          d
          z  S )a  
        Args:
            student_logits (List[torch.Tensor]): List of chunked logits from student model of length
                ``self.num_output_chunks``, where each chunk has shape
                (batch_size, num_tokens / num_output_chunks, vocab_size).
            teacher_logits (List[torch.Tensor]): List of chunked logits from teacher model of length
                ``self.num_output_chunks``, where each chunk has shape
                (batch_size, num_tokens / num_output_chunks, vocab_size).
            labels (torch.Tensor): Ground truth labels of shape (batch_size, num_tokens).

        Returns:
            torch.Tensor: KL divergence loss of shape (1,).

        Example:
            >>> loss_fn = SymmetricKLWithChunkedOutputLoss()
            >>>
            >>> h = torch.tensor([bsz, num_tokens, dim])
            >>> output_chunks = [model.output(chunk) for chunk in h.chunk(num_chunks, dim=1)]
            >>> teacher_chunks = [teacher_model.output(chunk) for chunk in h.chunk(num_chunks, dim=1)]
            >>> labels = torch.tensor([bsz, num_tokens])
            >>> loss = loss_fn(output_chunks, teacher_chunks, labels)
        c                 `    g | ]+}|                     d |                    d                     ,S r\   r]   r`   s     r   rc   z<SymmetricKLWithChunkedOutputLoss.forward.<locals>.<listcomp>  rd   r   c                 `    g | ]+}|                     d |                    d                     ,S r\   r]   rf   s     r   rc   z<SymmetricKLWithChunkedOutputLoss.forward.<locals>.<listcomp>  rd   r   c                 8    g | ]}|                     d           S r\   ri   rj   s     r   rc   z<SymmetricKLWithChunkedOutputLoss.forward.<locals>.<listcomp>  rl   r   rP   r   r    Frm   r   r   )	r   r,   rn   rU   ro   r   r%   r*   r+   )	r   r   r   r   r3   total_sym_kl_lossrq   rr   rs   s	            r   r5   z(SymmetricKLWithChunkedOutputLoss.forwardv  s   <
 
(6
 
 

 
(6
 
 
 $++0022
 
 &T-C K K
 
 

  9<NF:
 :
 	 	5M=+ !1!1}kU "2 " "  !59TYYr]]#B#B#BBBr   )rT   rI   r   )r9   r:   r;   r<   r,   rQ   r   r   r%   r=   r5   r?   r@   s   @r   r   r   R  s         . "#! 	
 

 
 	
 
 
 
 
 
5CU\*5C U\*5C 	5C
 
5C 5C 5C 5C 5C 5C 5C 5Cr   r   )typingr   r%   torch.nn.functionalnn
functionalr#   Moduler   rB   rH   rS   rw   r    r   r   <module>r      s                  0V 0V 0V 0V 0VEHO 0V 0V 0Vf4V 4V 4V 4V 4VEHO 4V 4V 4Vn2
 2
 2
 2
 2
eho 2
 2
 2
jR@ R@ R@ R@ R@UX_ R@ R@ R@jN@ N@ N@ N@ N@UX_ N@ N@ N@bYC YC YC YC YCux YC YC YC YC YCr   