
    PiF-                     v   d dl Z d dlmZmZmZmZ d dlZd dlZd dl	m
Z
 d dlmZ  e
j        d          Z	 ddej        ded	ed
ej        fdZ	 ddej        ded	ed
ej        fdZ	 ddej        ded	ed
ej        fdZ	 ddej        ded	ed
ej        fdZ	 ddej        ded	ed
ej        fdZ	 ddej        ded	ed
ej        fdZdefdedeeej        f         dej        dej        j        d	edeej        eegej        f         d
ej        fdZ G d d          Z G d de          Z G d de          ZdS )    N)CallableDictListOptional)utils)TransformerDecoderDEBUG      ?	layer_idsn_layerse_scalereturnc                     t          j        t          |           | j                  }|t          j        | |dz
  k     |d          z  }|t          j        |          z  S )N)device   r
   )torchoneslenr   wheresumr   r   r   loss_scaless       u/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/modules/early_exit_loss.pyuniform_loss_scaler      sW     *S^^I4DEEEKI1,Dgs S SSK;////    c                     t          j        | dz             }|t          j        | |dz
  k     |d          z  }|t          j        |          z  S Nr   r
   )r   Tensorr   r   r   s       r   linear_l_loss_scaler      sM     ,y1}--KI1,Dgs S SSK;////r   c                     t          j        | dz   d          }|t          j        | |dz
  k     |d          z  }|t          j        |          z  S )Nr   r   )dimr
   )r   cumsumr   r   r   s       r   sum_l_loss_scaler#   #   sR     ,y1}!444KI1,Dgs S SSK;////r   c                     t          j        | dz             }|t          j        | |dz
  k     |d          z  }|t          j        |          z  S r   )r   sqrtr   r   r   s       r   sqrt_l_loss_scaler&   +   sM     *Y]++KI1,Dgs S SSK;////r   c                     d| dz   z  }|t          j        | |dz
  k     |d          z  }|t          j        |          z  S )Nr
   r   )r   r   r   r   s       r   inv_l_loss_scaler(   3   sH     Q'KI1,Dgs S SSK;////r   c                     t          j        t          j        | dz                       }|t          j        | |dz
  k     |d          z  }|t          j        |          z  S r   )r   
reciprocalr%   r   r   r   s       r   inv_sqrt_l_loss_scaler+   ;   sX     "5:i!m#<#<==KI1,Dgs S SSK;////r   modelhidden_states_dictlabelsloss_fnloss_scale_fnc                    t          j        |          }d|_        t          |          }t	          |                                          }t	          |                                          }	t          j        |          }
| 	                    |
          }|
                    d|                    d                    }|                                }|                    |d          
                    d          } |||          }|                    |d          }||j        k                                    }|                                                    d          |z  } |t          j        |	                              |          t          | j                  |          }t          j        ||z            S )a  
    Compute the early exit loss for a given model and outputs of intermediate layers.
    This function takes in a model, a dictionary of hidden states, labels, a loss function,
    and optional parameters for scaling the loss. It computes the early exit loss by
    iterating over the hidden states, computing the logits and losses at each layer,
    and then scaling and summing these losses.

    Args:
        model (TransformerDecoder): The model to compute the early exit loss for.
        hidden_states_dict (Dict[int, torch.Tensor]): A dictionary of hidden states,
            where each key is a layer index and each value is a tensor of shape [b, s, d].
        labels (torch.Tensor): The labels for the input data.
        loss_fn (torch.nn.Module): The loss function to use (should be the same as the standard loss function for last layer).
        e_scale (float): A scaling factor for the early exit losses. Defaults to 1.0.
        loss_scale_fn (Callable[[torch.Tensor, int, float], torch.Tensor]): A function to determine scale of each
            layer's loss. Defaults to uniform_loss_scale.
    Returns:
        torch.Tensor: The computed early exit loss.
    noner   )copydeepcopy	reductionr   tuplevalueskeysr   stackunembedreshapesize
contiguousrepeatviewignore_indexr   floatr   tolayers)r,   r-   r.   r/   r   r0   batch_loss_fnehidden_stateshidden_layer_idshidden_states_stackedlogits_earlylabels_repeatedlosses_early
s_unpaddedlosses_scaless                   r   early_exit_lossrO   C   s   : M'**M$MA,335566M/446677!K66==!677L''L,=,=b,A,ABBL**,,LmmAq))11"55O =??L$$Q++LG005577J%%''++B//*<L!M%&&)),77EL M 9]\1222r   c                   l    e Zd ZdZ	 	 	 ddee         dededee         d	ef
d
ZddZ	de
j        fdZdS )EarlyExitCurriculuma  
    A curriculum for early exit loss training, which controls which layers to use their hidden states
    during training.

    Args:
        do_output_hidden_states (List[bool]): A list indicating whether each layer's hidden state
            should be output to calculate their losses.
        max_steps (int): The maximum number of steps in the curriculum.
        train_last_layer (bool, optional): Whether to always calculate loss for the last layer. Defaults to True.
        last_step (Optional[int]): The last step the curriculum stopped at in a previous run.
            This is used when resuming training.
        verbose (bool, optional): Whether to print verbose logs. Defaults to False.
    TNFdo_output_hidden_states	max_stepstrain_last_layer	last_stepverbosec                 b    || _         || _        || _        || _        || _        |dn|| _        d S )Nr   )_init_do_output_hidden_states_do_output_hidden_statesrT   rV   rS   _step)selfrR   rS   rT   rU   rV   s         r   __init__zEarlyExitCurriculum.__init__   s?     .E*(?% 0"#+QQ


r   r   c                     dS )zr
        Perform a step in the curriculum. Should be called at the end of each iteration during training.
        N r[   s    r   stepzEarlyExitCurriculum.step   s	     	r   c                 P    t          j        | j                  }| j        rd|d<   |S )z
        Get the current output hidden states.
        Returns:
            np.ndarray: A list indicating whether we should calculate loss for each layer.
        Tr3   )npr4   rY   rT   )r[   rR   s     r   getzEarlyExitCurriculum.get   s3     #%'$*G"H"H  	/*.#B'&&r   TNF)r   N)__name__
__module____qualname____doc__r   boolintr   r\   r`   rb   ndarrayrc   r^   r   r   rQ   rQ      s         $ "&#'; ;!%d; ; 	;
 C=; ; ; ; ;   
'RZ 
' 
' 
' 
' 
' 
'r   rQ   c                   ^     e Zd ZdZ	 	 	 ddee         dededee         d	ef
 fd
Zd Z	 xZ
S )RotationalEarlyExitCurriculuma  
    A rotational early exit curriculum, which rotates the layer enablement one step forward
    at each step.

    Args:
        do_output_hidden_states (List[bool]): A list indicating whether each layer's hidden state
            should be output to calculate their losses.
        max_steps (int): The maximum number of steps in the curriculum.
        train_last_layer (bool, optional): Whether to always calculate loss for the last layer. Defaults to True.
        last_step (Optional[int]): The last step the curriculum stopped at in a previous run.
            This is used when resuming training.
        verbose (bool, optional): Whether to print verbose logs. Defaults to False.
    TNFrR   rS   rT   rU   rV   c                     t                                          |||||           t          j        | j                  | _        d S )NrR   rS   rT   rU   rV   )superr\   rb   r4   rY    _initial_do_output_hidden_states)r[   rR   rS   rT   rU   rV   	__class__s         r   r\   z&RotationalEarlyExitCurriculum.__init__   sR     	$;- 	 	
 	
 	
 138U0V0V---r   c                     t          j        | j        d          | _        | xj        dz  c_        | j        r%t
                              d| j         d           dS dS )z
        Rotate the layer enablement one step forward.
        This method updates the `do_output_hidden_states` attribute by rotating it one position to the right.
        r   )Updated self._do_output_hidden_states to .N)rb   rollrY   rZ   rV   loginfor_   s    r   r`   z"RotationalEarlyExitCurriculum.step   so     )+0Mq(Q(Q%

a

< 	HH\D<Y\\\    	 	r   rd   )re   rf   rg   rh   r   ri   rj   r   r\   r`   __classcell__rr   s   @r   rm   rm      s         $ "&#'W W!%dW W 	W
 C=W W W W W W W"      r   rm   c                   d     e Zd ZdZ	 	 	 	 ddee         deded	ee         d
edef fdZ	d Z
 xZS )GradualEarlyExitCurriculuma  
    A gradual early exit curriculum, which gradually enables more layers (starting from the last layer) as training progresses.

    Args:
        do_output_hidden_states (List[bool]): A list indicating whether each layer's hidden state
            should be output to calculate their losses.
        max_steps (int): The maximum number of steps in the curriculum.
        train_last_layer (bool): Whether to always calculate loss for the last layer. Defaults to True.
        last_step (Optional[int]): The last step the curriculum stopped at in a previous run.
            This is used when resuming training.
        fraction_scale (float): A scaling factor to determine at which fraction
            of steps, all the layers will be enabled. At `steps = max_steps * fraction_scale`, all the layers will be
            enabled. Defaults to 0.5.
        verbose (bool): Whether to print verbose logs. Defaults to False.
    TN      ?FrR   rS   rT   rU   fraction_scalerV   c                    t                                          |||||           t          j        | j                  | _        d| _        || _        t          t          | j                            D ]}d| j        |<   d S )Nro   r   F)
rp   r\   rb   r4   rY   _final_do_output_hidden_statesrZ   _fraction_scaleranger   )	r[   rR   rS   rT   rU   r~   rV   irr   s	           r   r\   z#GradualEarlyExitCurriculum.__init__   s     	$;- 	 	
 	
 	
 /1gd6S.T.T+
- s4899:: 	5 	5A/4D)!,,	5 	5r   c                    | j         | j        z  }t          | j                  }t	          t          | j                            D ] }|| j        ||z
  z  |z  k    }|| j        |<   !t          j        | j        | j                  | _        | xj         dz  c_         | j	        r%t                              d| j         d           dS dS )z
        Perform a step in the curriculum.
        This method updates the `_do_output_hidden_states` attribute based on the current
            step and the fraction of completed training steps.
        r   rt   ru   N)rZ   rS   r   rY   r   r   rb   logical_andr   rV   rw   rx   )r[   fraction_trainedr   layer_indexshould_trains        r   r`   zGradualEarlyExitCurriculum.step  s      :6t455 T%B!C!CDD 	F 	FK '8k+ABXMN  :FD)+66 )+)4+N)
 )
% 	

a

< 	HH\D<Y\\\    	 	r   )TNr}   F)re   rf   rg   rh   r   ri   rj   r   rB   r\   r`   ry   rz   s   @r   r|   r|      s         ( "&#' #5 5!%d5 5 	5
 C=5 5 5 5 5 5 5 50      r   r|   )r
   )r4   typingr   r   r   r   numpyrb   r   	torchtuner   torchtune.modules.transformerr   
get_loggerrw   r   rj   rB   r   r   r#   r&   r(   r+   nnModulerO   rQ   rm   r|   r^   r   r   <module>r      s7    1 1 1 1 1 1 1 1 1 1 1 1            < < < < < <ew >A0 0|0'*05:0
\0 0 0 0 >A0 0|0'*05:0
\0 0 0 0 >A0 0|0'*05:0
\0 0 0 0 >A0 0|0'*05:0
\0 0 0 0 >A0 0|0'*05:0
\0 0 0 0 >A0 0|0'*05:0
\0 0 0 0  	;3 ;3;3S%,./;3 L;3 X_	;3
 ;3 	sE"EL0;3 \;3 ;3 ;3 ;3~.' .' .' .' .' .' .' .'b, , , , ,$7 , , ,^B B B B B!4 B B B B Br   