
     `i53                        d dl Z d dlZd dlmZ d dlZd dlmZmZ ddlmZ ddl	m
Z
 ddlmZ  e
j        e          Z ed           G d	 d
ej                              Z ed           G d dej                              Z ed           G d dej                              Z ed           G d dej                              Z ed           G d dej                              Z ed           G d dej                              Z G d dej                  Z G d dej                  Z G d dej                  Z G d  d!ej                  Z G d" d#ej                  Z G d$ d%ej                  Z G d& d'e          Z G d( d)ej                  Zi d*ed+ed,d-d.fd/ed0ed1ed2d3ifd4ed5ed6d3ifd7ed8ed9ej        d:ed;ed<ed=ej         d>ed?ej!        d@ej"        eej#        ej$        ej%        edAZ& ee&          Z'dB Z( e(d1          Z) e(d0          Z* e(d*          Z+ e(d/          Z, e(d<          Z- e(dC          Z. e(d;          Z/ e(d:          Z0dS )D    N)OrderedDict)Tensornn   )use_kernel_forward_from_hub)logging)is_torchdynamo_compilingGeluTanhc                   J     e Zd ZdZd	def fdZdedefdZdedefdZ xZ	S )
GELUTanha&  
    A fast C implementation of the tanh approximation of the GeLU activation function. See
    https://huggingface.co/papers/1606.08415.

    This implementation is equivalent to NewGELU and FastGELU but much faster. However, it is not an exact numerical
    match due to rounding errors.
    Fuse_gelu_tanh_pythonc                     t                                                       |r| j        | _        d S t	          j        t          j        j        d          | _        d S )Ntanh)approximate)	super__init___gelu_tanh_pythonact	functoolspartialr   
functionalgelu)selfr   	__class__s     l/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/activations.pyr   zGELUTanh.__init__(   sP     	Q-DHHH ();PPPDHHH    inputreturnc                     |dz  dt          j        t          j        dt          j        z            |dt          j        |d          z  z   z            z   z  S N      ?      ?       @Hm?g      @torchr   mathsqrtpipowr   r   s     r   r   zGELUTanh._gelu_tanh_python/   sP    s{cEJtytw/G/G5S[^c^ghmor^s^sSsKs/t$u$uuvvr   c                 ,    |                      |          S Nr   r+   s     r   forwardzGELUTanh.forward2       xxr   F)
__name__
__module____qualname____doc__boolr   r   r   r/   __classcell__r   s   @r   r   r      s         Q QT Q Q Q Q Q Qwv w& w w w wV         r   r   NewGELUc                   "    e Zd ZdZdedefdZdS )NewGELUActivationz
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
    the Gaussian Error Linear Units paper: https://huggingface.co/papers/1606.08415
    r   r   c                     d|z  dt          j        t          j        dt          j        z            |dt          j        |d          z  z   z            z   z  S r    r%   r+   s     r   r/   zNewGELUActivation.forward=   sP    U{cEJtytw/G/G5S[^c^ghmor^s^sSsKs/t$u$uuvvr   Nr2   r3   r4   r5   r   r/    r   r   r;   r;   6   sH         
wV w w w w w w wr   r;   GeLUc                   J     e Zd ZdZd	def fdZdedefdZdedefdZ xZ	S )
GELUActivationa  
    Original Implementation of the GELU activation function in Google BERT repo when initially created. For
    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
    Also see the Gaussian Error Linear Units paper: https://huggingface.co/papers/1606.08415
    Fuse_gelu_pythonc                     t                                                       |r| j        | _        d S t          j        j        | _        d S r-   )r   r   _gelu_pythonr   r   r   r   )r   rB   r   s     r   r   zGELUActivation.__init__J   s?     	*(DHHH})DHHHr   r   r   c                 f    |dz  dt          j        |t          j        d          z            z   z  S )Nr!   r"   r#   )r&   erfr'   r(   r+   s     r   rD   zGELUActivation._gelu_pythonQ   s-    s{cEIedinn.D$E$EEFFr   c                 ,    |                      |          S r-   r.   r+   s     r   r/   zGELUActivation.forwardT   r0   r   r1   )
r2   r3   r4   r5   r6   r   r   rD   r/   r7   r8   s   @r   rA   rA   A   s         * * * * * * * *G& GV G G G GV         r   rA   SiLUc                   "    e Zd ZdZdedefdZdS )SiLUActivationa  
    See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
    Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
    Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
    Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
    later.
    r   r   c                 @    t           j                            |          S r-   )r   r   silur+   s     r   r/   zSiLUActivation.forwardb   s    }!!%(((r   Nr=   r>   r   r   rJ   rJ   X   s@         )V ) ) ) ) ) ) )r   rJ   FastGELUc                   "    e Zd ZdZdedefdZdS )FastGELUActivationz}
    Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs
    r   r   c                 Z    d|z  dt          j        |dz  dd|z  |z  z   z            z   z  S )Nr!   r"   g3E?r$   )r&   r   r+   s     r   r/   zFastGELUActivation.forwardl   s;    U{cEJu|/CsXX]M]`eMeGe/f$g$gghhr   Nr=   r>   r   r   rO   rO   f   sH         iV i i i i i i ir   rO   	QuickGELUc                   "    e Zd ZdZdedefdZdS )QuickGELUActivationzr
    Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
    r   r   c                 6    |t          j        d|z            z  S )NgZd;?)r&   sigmoidr+   s     r   r/   zQuickGELUActivation.forwardv   s    u}UU]3333r   Nr=   r>   r   r   rS   rS   p   s@         4V 4 4 4 4 4 4 4r   rS   c                   <     e Zd ZdZdedef fdZdedefdZ xZS )ClippedGELUActivationa  
    Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as
    it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to
    https://huggingface.co/papers/2004.09602.

    Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
    initially created.

    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://huggingface.co/papers/1606.08415
    minmaxc                     ||k    rt          d| d| d          t                                                       || _        || _        d S )Nzmin should be < max (got min: z, max: ))
ValueErrorr   r   rX   rY   )r   rX   rY   r   s      r   r   zClippedGELUActivation.__init__   sV    99PcPP#PPPQQQr   xr   c                 \    t          j        t          |          | j        | j                  S r-   )r&   clipr   rX   rY   )r   r]   s     r   r/   zClippedGELUActivation.forward   s     z$q''48TX666r   )	r2   r3   r4   r5   floatr   r   r/   r7   r8   s   @r   rW   rW   z   sw        
 
E       7 7F 7 7 7 7 7 7 7 7r   rW   c                   2     e Zd ZdZ fdZdedefdZ xZS )AccurateGELUActivationz
    Applies GELU approximation that is faster than default and more accurate than QuickGELU. See:
    https://github.com/hendrycks/GELUs

    Implemented along with MEGA (Moving Average Equipped Gated Attention)
    c                     t                                                       t          j        dt          j        z            | _        d S )N   )r   r   r'   r(   r)   precomputed_constantr   r   s    r   r   zAccurateGELUActivation.__init__   s7    $(Ia$'k$:$:!!!r   r   r   c                 ~    d|z  dt          j        | j        |dt          j        |d          z  z   z            z   z  S )Nr!   r   r$      )r&   r   re   r*   r+   s     r   r/   zAccurateGELUActivation.forward   sC    U{a%*T-F%RZ]b]fglno]p]pRpJp-q"r"rrssr   )r2   r3   r4   r5   r   r   r/   r7   r8   s   @r   rb   rb      sn         ; ; ; ; ;tV t t t t t t t t tr   rb   c                   B     e Zd ZdZ fdZdedefdZdedefdZ xZS )MishActivationz
    See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://huggingface.co/papers/1908.08681). Also
    visit the official repository for the paper: https://github.com/digantamisra98/Mish
    c                 t    t                                                       t          j        j        | _        d S r-   )r   r   r   r   mishr   rf   s    r   r   zMishActivation.__init__   s)    =%r   r   r   c                 j    |t          j        t          j                            |                    z  S r-   )r&   r   r   r   softplusr+   s     r   _mish_pythonzMishActivation._mish_python   s'    uz"-"8"8"?"?@@@@r   c                 ,    |                      |          S r-   r.   r+   s     r   r/   zMishActivation.forward   r0   r   )	r2   r3   r4   r5   r   r   ro   r/   r7   r8   s   @r   rj   rj      s         
& & & & &A& AV A A A AV         r   rj   c                   "    e Zd ZdZdedefdZdS )LinearActivationz[
    Applies the linear activation function, i.e. forwarding input directly to output.
    r   r   c                     |S r-   r>   r+   s     r   r/   zLinearActivation.forward   s    r   Nr=   r>   r   r   rr   rr      s@         V       r   rr   c                       e Zd ZdZddZdS )LaplaceActivationz
    Applies elementwise activation based on Laplace function, introduced in MEGA as an attention activation. See
    https://huggingface.co/papers/2209.10655

    Inspired by squared relu, but with bounded range and gradient for better stability
    绹۞? ^/?c                     ||z
                       |t          j        d          z            }ddt          j        |          z   z  S )Nr#   r!   r"   )divr'   r(   r&   rF   )r   r   musigmas       r   r/   zLaplaceActivation.forward   s@      3!788cEIe,,,--r   N)rv   rw   r2   r3   r4   r5   r/   r>   r   r   ru   ru      s2         . . . . . .r   ru   c                       e Zd ZdZd ZdS )ReLUSquaredActivationz`
    Applies the relu^2 activation introduced in https://huggingface.co/papers/2109.08668v2
    c                 l    t           j                            |          }t          j        |          }|S r-   )r   r   relur&   square)r   r   relu_appliedsquareds       r   r/   zReLUSquaredActivation.forward   s+    }))%00,|,,r   Nr|   r>   r   r   r~   r~      s-             r   r~   c                        e Zd Z fdZ xZS )ClassInstantierc                     t                                          |          }t          |t                    r|n|i f\  }} |di |S )Nr>   )r   __getitem__
isinstancetuple)r   keycontentclskwargsr   s        r   r   zClassInstantier.__getitem__   sL    ''%%c**!+GU!;!;Ngg'2Vs}}V}}r   )r2   r3   r4   r   r7   r8   s   @r   r   r      s8                r   r   c                   j     e Zd ZdZddddej        df fd	Zdedefd	Zdedefd
Z	dedefdZ
 xZS )XIELUActivationz
    Applies the xIELU activation function introduced in https://arxiv.org/abs/2411.13010

    If the user has installed the nickjbrowning/XIELU wheel, we import xIELU CUDA
    Otherwise, we emit a single warning and use xIELU Python
    g?r!   gưFc           
         t                                                       t          j        t	          j        t	          j        t	          j        ||                                                  d                    | _	        t          j        t	          j        t	          j        t	          j        ||z
  |                                                  d                    | _
        |                     dt	          j        ||                     |                     dt	          j        ||                     || _        t          | j                                                                                                                                                  | _        t          | j                                                                                                                                                  | _        d | _        	 dd l}t          j        j                                        | _        d}	 ddlm}	  |	| j                  | _        |dz  }n,# t<          $ r}
|d|
 d	z  }| j        | _        Y d }
~
nd }
~
ww xY wt>                               |           d S # t<          $ r3}
t>                               d
tC          |
                     Y d }
~
d S d }
~
ww xY w)N)dtyper   betaepszUsing experimental xIELU CUDA.)allow_in_graphz& Enabled torch._dynamo for xIELU CUDA.z+ Could not enable torch._dynamo for xIELU (z*) - this may result in slower performance.u   CUDA-fused xIELU not available (%s) – falling back to a Python version.
For CUDA xIELU (experimental), `pip install git+https://github.com/nickjbrowning/XIELU`)"r   r   r   	Parameterr&   logexpm1tensor	unsqueezealpha_palpha_nregister_bufferwith_vector_loadsr`   r   detachcpuitem_beta_scalarr   _eps_scalar_xielu_cuda_obj	xielu.opsclassesxieluXIELUtorch._dynamor   _xielu_cuda_xielu_cuda_fn	Exceptionloggerwarning_oncestr)r   alpha_p_initalpha_n_initr   r   r   r   r   msgr   errr   s              r   r   zXIELUActivation.__init__   s    	|EIek%,|[`:a:a:a.b.b$c$c$m$mno$p$pqq|Iek%,|d/B%"P"P"PQQRR\\]^__
 
 	VU\$e%D%D%DEEEUELE$B$B$BCCC!2!$)"2"2"4"4"8"8":":"@"@"B"B"G"G"I"IJJ !2!2!6!6!8!8!>!>!@!@!E!E!G!GHH#	#(=#6#<#<#>#>D 2C7888888&4nT5E&F&F#?? 7 7 7tStttt&*&6######7 $$$$$ 	 	 	jC        	sB   .J*  I" !J* "
J,JJ* JJ* *
K'4(K""K'r]   r   c           
      n   t           j                            | j                  }| j        t           j                            | j                  z   }t          j        |dk    ||z  |z  | j        |z  z   t          j        t          j	        || j
                            |z
  |z  | j        |z  z             S )Nr   )r   r   rn   r   r   r   r&   wherer   rX   r   )r   r]   r   r   s       r   _xielu_pythonzXIELUActivation._xielu_python  s    -((66)bm44T\BBB{EaK!Odi!m+[1dh//0014?$)a-O
 
 	
r   c                 p   |j         }|                                dk     r-|                    d          }|                                dk     -|                                dk    r*|                    dd|                    d                    }||j         k    r!t
                              d||j                    | j                            || j	        
                    |j                  | j        
                    |j                  | j        | j        | j                  }|                    |          S )zDFirewall function to prevent torch.compile from seeing .item() callsrh   r   r   z_Warning: xIELU input tensor expects 3 dimensions but got (shape: %s). Reshaping to (shape: %s).)shapedimr   viewsizer   r   r   r/   r   tor   r   r   r   r   )r   r]   original_shaperesults       r   r   zXIELUActivation._xielu_cuda  s   eeggkkAA eeggkk5577Q;;r1affRjj))AQW$$q  
 %--LOOAG$$LOOAG$$"
 
 {{>***r   r   c                     | j         D|j        r=t                      s|                     |          S t                              d           |                     |          S )Nz:torch._dynamo is compiling, using Python version of xIELU.)r   is_cudar	   r   r   r   r   r+   s     r   r/   zXIELUActivation.forward1  s^    +++-- b**5111##$`aaa!!%(((r   )r2   r3   r4   r5   r&   bfloat16r   r   r   r   r/   r7   r8   s   @r   r   r      s          n) ) ) ) ) )V
v 
& 
 
 
 
+V + + + + +2)V ) ) ) ) ) ) ) ) )r   r   r   gelu_10i
   )rX   rY   	gelu_fastgelu_newgelu_pythonrB   Tgelu_pytorch_tanhgelu_python_tanhr   gelu_accuratelaplace
leaky_relulinearrl   
quick_gelur   relu2relu6rU   )rL   swishr   prelur   c           	          | t           v rt           |          S t          d|  dt          t                                                                )Nz	function z not found in ACT2FN mapping )ACT2FNKeyErrorlistkeys)activation_strings    r   get_activationr   U  sO    F""'((h#4hhSWX^XcXcXeXeSfSfhhiiir   rL   )1r   r'   collectionsr   r&   r   r   integrations.hub_kernelsr   utilsr   utils.import_utilsr	   
get_loggerr2   r   Moduler   r;   rA   rJ   rO   rS   rW   rb   rj   rr   ru   r~   r   r   	LeakyReLUReLUReLU6SigmoidrH   TanhPReLUACT2CLSr   r   r   r   r   r   r   rL   rl   
linear_actr>   r   r   <module>r      sy        # # # # # #          A A A A A A       8 8 8 8 8 8 
	H	%	% Z((    ry   )(. Y''w w w w w	 w w ('w V$$    RY   %$, V$$
) 
) 
) 
) 
)RY 
) 
) %$
) Z((i i i i i i i )(i [))4 4 4 4 4") 4 4 *)47 7 7 7 7BI 7 7 72t t t t tRY t t t     RY   "    ry   
. 
. 
. 
. 
.	 
. 
. 
.    BI       k   [) [) [) [) [)bi [) [) [)|
N%s2'>'>? # !	
 N%6$=>  $:D#AB +   ",  N % BG "  RX!" rz#$ WGX-  0 
	!	!j j j n]++>*%%~fN;''	^L))
~f~f^H%%


r   