
     `iQ                     >   d dl Z d dlZd dlmZmZmZ d dlZd dlZd dlm	Z	 d dl
mZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlmZ ddlm Z m!Z! ddl"m#Z#  e!j$        e%          Z& G d de          Z' G d de          Z( G d de          Z) G d de	j*                  Z+ G d de	j*                  Z, G d de	j*                  Z- G d de	j*                  Z.	 	 	 dCd e	j*        d!ej/        d"ej/        d#ej/        d$eej/                 d%ee0         d&e0d'eej/                 fd(Z1 G d) d*e	j*                  Z2 G d+ d,e	j*                  Z3 G d- d.e          Z4 G d/ d0e	j*                  Z5e  G d1 d2e                      Z6	 	 dDd3e7e8e8f         d4e0d5e8d$eej9                 d6e8d7ej:        fd8Z;e  G d9 d:e6                      Z<dZ= e d;<           G d= d>e6                      Z> e d?<           G d@ dAe6                      Z?g dBZ@dS )E    N)CallableOptionalUnion)nn)CrossEntropyLoss   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputCausalLMOutputSequenceClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringlogging   )	SEWConfigc                   &     e Zd Zd fd	Zd Z xZS )SEWNoLayerNormConvLayerr   c                 Z   t                                                       |dk    r|j        |dz
           nd| _        |j        |         | _        t          j        | j        | j        |j        |         |j        |         |j	                  | _
        t          |j                 | _        d S )Nr   r   kernel_sizestridebias)super__init__conv_dimin_conv_dimout_conv_dimr   Conv1dconv_kernelconv_stride	conv_biasconvr	   feat_extract_activation
activationselfconfiglayer_id	__class__s      x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/sew/modeling_sew.pyr    z SEWNoLayerNormConvLayer.__init__/   s    <DqLL6?8a<88a"OH5I*84%h/!
 
 
	 !!?@    c                 Z    |                      |          }|                     |          }|S N)r(   r*   r,   hidden_statess     r0   forwardzSEWNoLayerNormConvLayer.forward=   s*    		-0066r1   r   __name__
__module____qualname__r    r6   __classcell__r/   s   @r0   r   r   .   sR        A A A A A A      r1   r   c                   &     e Zd Zd fd	Zd Z xZS )SEWLayerNormConvLayerr   c                    t                                                       |dk    r|j        |dz
           nd| _        |j        |         | _        t          j        | j        | j        |j        |         |j        |         |j	                  | _
        t          j        | j        d          | _        t          |j                 | _        d S )Nr   r   r   T)elementwise_affine)r   r    r!   r"   r#   r   r$   r%   r&   r'   r(   	LayerNorm
layer_normr	   r)   r*   r+   s      r0   r    zSEWLayerNormConvLayer.__init__D   s    <DqLL6?8a<88a"OH5I*84%h/!
 
 
	 ,t'8TRRR !?@r1   c                     |                      |          }|                    dd          }|                     |          }|                    dd          }|                     |          }|S )N)r(   	transposerC   r*   r4   s     r0   r6   zSEWLayerNormConvLayer.forwardS   se    		-00%//B7766%//B7766r1   r7   r8   r=   s   @r0   r?   r?   C   sR        A A A A A A      r1   r?   c                   &     e Zd Zd fd	Zd Z xZS )SEWGroupNormConvLayerr   c                    t                                                       |dk    r|j        |dz
           nd| _        |j        |         | _        t          j        | j        | j        |j        |         |j        |         |j	                  | _
        t          |j                 | _        t          j        | j        | j        d          | _        d S )Nr   r   r   T)
num_groupsnum_channelsaffine)r   r    r!   r"   r#   r   r$   r%   r&   r'   r(   r	   r)   r*   	GroupNormrC   r+   s      r0   r    zSEWGroupNormConvLayer.__init___   s    <DqLL6?8a<88a"OH5I*84%h/!
 
 
	 !!?@,$2CRVRclpqqqr1   c                     |                      |          }|                     |          }|                     |          }|S r3   )r(   rC   r*   r4   s     r0   r6   zSEWGroupNormConvLayer.forwardo   s;    		-006666r1   r7   r8   r=   s   @r0   rI   rI   ^   sR        r r r r r r       r1   rI   c                   $     e Zd Z fdZd Z xZS )SEWPositionalConvEmbeddingc                    t                                                       t          j        |j        |j        |j        |j        dz  |j        |j                  | _        t          j	        j
        }t          t          j	        j        d          rt          j	        j        j
        }t                      rdd l}|j                            | j        j        d          5   || j        dd          | _        d d d            n# 1 swxY w Y   t          | j        d          r-| j        j        j        j        }| j        j        j        j        }n| j        j        }| j        j        }|j                            | |           |j                            | |           n || j        dd          | _        t/          |j                  | _        t2          |j                 | _        d S )	N   )r   paddinggroupsr   weight_normr   modifier_rankweight)namedimparametrizations)r   r    r   r$   hidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupssqueeze_factorr(   utilsrV   hasattrr\   r
   	deepspeedzeroGatheredParametersrY   	original0	original1weight_gweight_vregister_external_parameterSEWSamePadLayerrT   r	   r)   r*   )r,   r-   rV   rc   rh   ri   r/   s         r0   r    z#SEWPositionalConvEmbedding.__init__w   s   I62a77(
 
 
	 h*28,m<< 	@(3?K%'' 	E22493CST2UU I I'K	aHHH	I I I I I I I I I I I I I I Ity"455 .95<F95<F9-9-N66tXFFFN66tXFFFF#DIH!DDDDI&v'EFF !?@s    DD	D	c                     |                      |          }|                     |          }|                     |          }|S r3   )r(   rT   r*   r4   s     r0   r6   z"SEWPositionalConvEmbedding.forward   s;    		-00]3366r1   r8   r=   s   @r0   rQ   rQ   v   sM         A  A  A  A  AD      r1   rQ   c                   $     e Zd Z fdZd Z xZS )rk   c                 l    t                                                       |dz  dk    rdnd| _        d S )NrS   r   r   )r   r    num_pad_remove)r,   r^   r/   s     r0   r    zSEWSamePadLayer.__init__   s:    #:Q#>!#C#Caar1   c                 J    | j         dk    r|d d d d d | j          f         }|S Nr   )ro   r4   s     r0   r6   zSEWSamePadLayer.forward   s;    "")!!!QQQ0F43F2F0F*FGMr1   r8   r=   s   @r0   rk   rk      sL        K K K K K      r1   rk   c                   $     e Zd Z fdZd Z xZS )SEWUpsamplingc                     t                                                       t          j        |j        |j        |j        z            | _        t          |j                 | _	        |j        | _        d S r3   )
r   r    r   Linearr]   r`   
projectionr	   r)   r*   r,   r-   r/   s     r0   r    zSEWUpsampling.__init__   sZ    )F$68JVMb8bcc !?@$3r1   c                 0   |                      |          }|                     |          }| j        dk    r`|                                \  }}}|| j        z  }|| j        z  }|                    ||| j        |          }|                    |||          }|S )Nr   )rv   r*   r`   sizereshape)r,   r5   bszsrc_lensrc_embed_dimtgt_lentgt_embed_dims          r0   r6   zSEWUpsampling.forward   s    6666""*7*<*<*>*>'C- 33G)T-@@M)11#w@SUbccM)11#wNNMr1   r8   r=   s   @r0   rs   rs      sG        4 4 4 4 4      r1   rs   c                   .     e Zd ZdZ fdZd Zd Z xZS )SEWFeatureEncoderz.Construct the features from raw audio waveformc                    t                                                       j        dk    r7t          d          gfdt	          j        dz
            D             z   }nDj        dk    r!fdt	          j                  D             }nt          dj         d	          t          j        |          | _	        d
| _
        d| _        d S )Ngroupr   r.   c                 8    g | ]}t          |d z             S )r   r   )r   .0ir-   s     r0   
<listcomp>z.SEWFeatureEncoder.__init__.<locals>.<listcomp>   s>     I I IDE'Q???I I Ir1   r   layerc                 2    g | ]}t          |           S )r   )r?   r   s     r0   r   z.SEWFeatureEncoder.__init__.<locals>.<listcomp>   s'    ttt0!DDDtttr1   z`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)r   r    feat_extract_normrI   rangenum_feat_extract_layers
ValueErrorr   
ModuleListconv_layersgradient_checkpointing_requires_grad)r,   r-   r   r/   s    ` r0   r    zSEWFeatureEncoder.__init__   s   #w..0!DDDE I I I IINvOmpqOqIrIrI I I KK %00tttteTZTrNsNstttKKt1Ittt   =55&+#"r1   c                 P    |                                  D ]	}d|_        
d| _        d S NF)
parametersrequires_gradr   r,   params     r0   _freeze_parametersz$SEWFeatureEncoder._freeze_parameters   s4    __&& 	( 	(E"'E#r1   c                 r    |d d d f         }| j         r| j        rd|_        | j        D ]} ||          }|S )NT)r   trainingr   r   )r,   input_valuesr5   
conv_layers       r0   r6   zSEWFeatureEncoder.forward   s[    $QQQW-  	/4= 	/*.M'* 	6 	6J&J}55MMr1   )r9   r:   r;   __doc__r    r   r6   r<   r=   s   @r0   r   r      s\        88# # # # #"$ $ $

 
 
 
 
 
 
r1   r           modulequerykeyvalueattention_maskscalingdropout	head_maskc                    ||                     d          dz  }t          j        ||                    dd                    |z  }	||	|z   }	t          j                            |	d          }	||	|                    dddd          z  }	t          j                            |	|| j	                  }	t          j        |	|          }
|
                    dd          
                                }
|
|	fS )NrF         rS   r   r[   r   )pr   )ry   torchmatmulrG   r   
functionalsoftmaxviewr   r   
contiguous)r   r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs              r0   eager_attention_forwardr      s     **R..D(<s}}Q':':;;gEL!#n4=((2(>>L#innQAq&A&AA=((6?([[L,|U33K''1--88::K$$r1   c                   >    e Zd ZdZ	 	 	 	 	 ddededed	ed
ededee         f fdZ		 	 	 	 dde
j        dee
j                 dee
j                 dee
j                 dee         dee         dee
j        ee
j                 eee
j                          f         fdZ xZS )SEWAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr   FTN	embed_dim	num_headsr   
is_decoderr   	is_causalr-   c                 
   t                                                       || _        || _        || _        ||z  | _        || _        | j        |z  | j        k    rt          d| j         d| d          | j        dz  | _        || _	        || _
        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   )r   )r   r    r   r   r   head_dimr-   r   r   r   r   r   ru   k_projv_projq_projout_proj)	r,   r   r   r   r   r   r   r-   r/   s	           r0   r    zSEWAttention.__init__	  s    	""!Y.MI%$.883dn 3 3%.3 3 3   }d*$"i	94@@@i	94@@@i	94@@@	)YTBBBr1   r5   key_value_statesr   layer_head_maskoutput_attentionsr   returnc                    |du}|j         dd         \  }}	|r|j         d         n|	}
||	d| j        f}||
d| j        f} |                     |          j        |                     dd          }|r|n|} |                     |          j        |                     dd          } |                     |          j        |                     dd          }t          }| j        j	        dk    rt          | j        j	                 } || ||||f| j        sdn| j        | j        ||d|\  }}|                    ||	d                                          }|                     |          }||dfS )z#Input shape: Batch x Time x ChannelNrF   r   rS   eagerr   )r   r   r   r   )shaper   r   r   rG   r   r   r   r-   _attn_implementationr   r   r   r   rz   r   r   )r,   r5   r   r   r   r   r   is_cross_attentionr{   r~   r|   q_input_shapekv_input_shapequery_statescurrent_states
key_statesvalue_statesattention_interfacer   r   s                       r0   r6   zSEWAttention.forward(  s    .T9 %*3B3/W/AN"(++wgr4=9wDM: 7t{{=116FPPQRTUVV-?R))]5T[[005~FPPQRTUVV
7t{{>227HRRSTVWXX(?;+w66"9$+:Z"[$7$7%
  $}>CC$,L/%%
 %
 %
 %
!\ "))#w;;FFHHmmK00L$..r1   )r   FTFN)NNNF)r9   r:   r;   r   intfloatboolr   r   r    r   Tensorr   r   tupler6   r<   r=   s   @r0   r   r     s^       GG  &*C CC C 	C
 C C C #C C C C C CD 481526,13/ 3/|3/ #5<03/ !.	3/
 "%,/3/ $D>3/ -.3/ 
u|Xel3XeEL>Q5RR	S3/ 3/ 3/ 3/ 3/ 3/ 3/ 3/r1   r   c                   $     e Zd Z fdZd Z xZS )SEWFeedForwardc                    t                                                       t          j        |j                  | _        t          j        |j        |j                  | _	        t          |j        t                    rt          |j                 | _        n|j        | _        t          j        |j        |j                  | _        t          j        |j                  | _        d S r3   )r   r    r   Dropoutactivation_dropoutintermediate_dropoutru   r]   intermediate_sizeintermediate_dense
isinstance
hidden_actstrr	   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutrw   s     r0   r    zSEWFeedForward.__init___  s    $&Jv/H$I$I!"$)F,>@X"Y"Yf'-- 	9'-f.?'@D$$'-'8D$If&>@RSS j)>??r1   c                     |                      |          }|                     |          }|                     |          }|                     |          }|                     |          }|S r3   )r   r   r   r   r   r4   s     r0   r6   zSEWFeedForward.forwardl  sg    //>>00??11-@@))-88++M::r1   r8   r=   s   @r0   r   r   ^  sL        @ @ @ @ @      r1   r   c                   &     e Zd Z fdZddZ xZS )SEWEncoderLayerc                    t                                                       t          |j        |j        |j        d|          | _        t          j        |j	                  | _
        t          j        |j        |j                  | _        t          |          | _        t          j        |j        |j                  | _        d S )NF)r   r   r   r   r-   eps)r   r    r   r]   num_attention_headsattention_dropout	attentionr   r   r   r   rB   layer_norm_epsrC   r   feed_forwardfinal_layer_normrw   s     r0   r    zSEWEncoderLayer.__init__w  s    %(0,
 
 
 z&"788,v'9v?TUUU*622 "V-?VEZ [ [ [r1   NFc                    |}|                      |||          \  }}}|                     |          }||z   }|                     |          }||                     |          z   }|                     |          }|f}|r||fz  }|S )Nr   r   )r   r   rC   r   r   )r,   r5   r   r   attn_residualr   _outputss           r0   r6   zSEWEncoderLayer.forward  s    %)-.L] *8 *
 *
&|Q ]33%566%(9(9-(H(HH--m<< " 	'&Gr1   r   r8   r=   s   @r0   r   r   v  sQ        \ \ \ \ \       r1   r   c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )
SEWEncoderc                    t                                                       | _        t                    | _        t          j        j        j                  | _        t          j	        j
        j                  | _        t          j        j                  | _        t          j        fdt#          j                  D                       | _        t)                    | _        d| _        d S )Nr   c                 .    g | ]}t                    S  )r   )r   r   r-   s     r0   r   z'SEWEncoder.__init__.<locals>.<listcomp>  s!    $f$f$f_V%<%<$f$f$fr1   F)r   r    r-   rQ   pos_conv_embedr   	AvgPool1dr`   poolrB   r]   r   rC   r   r   r   r   r   num_hidden_layerslayersrs   upsampler   rw   s    `r0   r    zSEWEncoder.__init__  s    8@@L!68MNN	,v'9v?TUUUz&"788m$f$f$f$feFLdFeFe$f$f$fgg%f--&+###r1   NFTc           	      `   |rdnd }|rdnd }||                     d                              dd|j        d                   }| j        j        dk    rd|| <   |d|v r|nd }nWd|| <   |                                                    d          }	|	| j        j        z  }
|j        d         | j        j        z  }t          j	        d||
j
                                      dd                              |
j        d         d          }||
                    dd          k                                     }d	|d d d d d d f                             |j        
          z
  }|t          j        |j                  j        z  }|                    |j        d         d|j        d         |j        d                   }|j        d         }|                    dd          }|                     |          }|                     |          }t!          |                    d          |                    d                    }|dd |f         |dd |f         z   }|                    dd          }|                     |          }|                     |          }t/                      pt1          |           }| j        D ]a}|r||fz   }t          j        g           }| j        o|| j        j        k     }|r|r ||||          }|d         }|rd}|r||d         fz   }b|r||fz   }|                     |          }|j        d         |k     r2t<          j                             |ddd||j        d         z
  f          }|stC          d |||fD                       S tE          |||          S )Nr   rF   r   rS   flash_attention_2r   r   device      ?dtype.r   NNc              3      K   | ]}||V  	d S r3   r   )r   vs     r0   	<genexpr>z%SEWEncoder.forward.<locals>.<genexpr>  s(      mmq_`_l_l_l_l_lmmr1   last_hidden_stater5   
attentions)#	unsqueezerepeatr   r-   r   longsumr`   r   aranger  r   expandtor	  finfominrG   r   r   ry   rC   r   r
   r   r  randr   	layerdropr  r   r   padr   r   )r,   r5   r   r   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsexpand_attention_maskinput_lengthsoutput_lengthsmax_encoder_lengthattention_idsn_input_timestepsposition_embeddingspooled_hidden_states
min_lengthsynced_gpusr   dropout_probabilityskip_the_layerlayer_outputss                         r0   r6   zSEWEncoder.forward  s=    #7@BBD$5?bb4%$2$<$<R$@$@$G$G1mNabcNd$e$e!{/3FFF8;4454B4NSTXfSfSfmq 9<445!/!4!4!6!6 ; ;B ? ?!.$+2L!L%2%8%;t{?Y%Y"L$6~?TUUUT!R[[VN03R88 
 #0.2E2Eb!2L2L"L!R!R!T!T "%~aaatQQQ6F'G'J'JQ^Qd'J'e'e!e!/%+m>Q2R2R2V!V!/!6!6"(+Q0DR0H.J^_aJb" " */2%//155"11-@@#yy77,11"557K7P7PQS7T7TUU
,S+:+-=>ATUXZe[eZeUeAff%//15566]33022R6LT6R6R[ 	P 	PE# I$58H$H! #(*R..!]Z/BT[EZ/ZN! 1[ 1 %!.Te! ! ! !.a 0 - ,  P&9]1=M<O&O# 	E 1]4D Dm44q!$555M--maAGX[h[nop[qGq=rssM 	nmm]4EGZ$[mmmmmm++*
 
 
 	
r1   )NFFTr8   r=   s   @r0   r   r     sb        	, 	, 	, 	, 	, "W
 W
 W
 W
 W
 W
 W
 W
r1   r   c                   x    e Zd ZU eed<   dZdZdZdZdZ	dZ
d Zdeej        ef         fdZd	ed
ej        fdZdS )SEWPreTrainedModelr-   sewr   TFc           
         t          |t                    rt          j                            |j        j        ddt          j        d|j        j	        d         |j        j
        z  z            z             t          j                            |j        j        d           nt          |t          j                  r-|j        j                            d| j        j                   nt          |t          j        t          j        f          r?|j        j                                         |j        j                            d           nCt          |t          j                  r(t-                      rddl}t1          |d          rzt1          |d	          rj|j                            |j        |j        gd
          5  t          j                            |j        j                   ddd           n# 1 swxY w Y   n|j                            |j        d
          5  t          j                            |j        j                   ddd           n# 1 swxY w Y   n)t          j                            |j        j                   t          |t          j        t          j        f          r'|j        "|j        j                                         dS dS dS )zInitialize the weightsr   rS   r   )meanstdr   r  Nri   rh   rW   )r   rQ   r   initnormal_r(   rY   mathsqrtr   in_channels	constant_r   ru   datar-   initializer_rangerB   rN   zero_fill_r$   r
   rc   rb   rd   re   ri   rh   kaiming_normal_)r,   r   rc   s      r0   _init_weightsz SEWPreTrainedModel._init_weights
  s   f899 	<GOO"	!v{'>q'AFKD['["\]]]    
 Gfk.2222	** 	< M&&CT[5R&SSSSr| <== 	<K""$$$M$$S))))	** 	<)++ 
<    6:.. D76:3N3N D"::FOV_;]mn:oo D D//0BCCCD D D D D D D D D D D D D D D #::6=XY:ZZ D D//0BCCCD D D D D D D D D D D D D D D ''(:;;;fry")455 	%&+:QK""$$$$$	% 	%:Q:Qs$   *G::G>G>'*II!$I!r"  c                 z    d }t          | j        j        | j        j                  D ]\  }} ||||          }|S )zH
        Computes the output length of the convolutional layers
        c                 <    t          j        | |z
  |d          dz   S )Nfloor)rounding_moder   )r   div)input_lengthr   r   s      r0   _conv_out_lengthzMSEWPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length/  s&     9\K7wWWWZ[[[r1   )zipr-   r%   r&   )r,   r"  rF  r   r   s        r0    _get_feat_extract_output_lengthsz3SEWPreTrainedModel._get_feat_extract_output_lengths*  s\    
	\ 	\ 	\
 $'t{'>@W#X#X 	Q 	QK,,]KPPMMr1   feature_vector_lengthr   c                    |                      |                    d                                        t          j                  }|j        d         }t          j        ||f|j        |j                  }d|t          j	        |j        d         |j                  |dz
  f<   |
                    dg                              d          
                    dg                                          }|S )NrF   r   )r	  r  r   r  )rH  r  r  r   r  r   zerosr	  r  r  flipcumsumr   )r,   rI  r   r#  
batch_sizes        r0   "_get_feature_vector_attention_maskz5SEWPreTrainedModel._get_feature_vector_attention_mask9  s    >>~?Q?QRT?U?UVVYYZ_Zdee#)!,
./~7KTbTi
 
 
 uv^%9!%<^EZ[[[]kno]opq',,bT2299"==BBB4HHMMOOr1   N)r9   r:   r;   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr?  r   r   
LongTensorr   rH  rO  r   r1   r0   r/  r/     s         $O&*#N% % %@eEDTVYDY>Z    
 
]b]m 
 
 
 
 
 
r1   r/  r   	mask_probmask_length	min_masksr   c                 @   | \  }dk     rt          d          k    rt          d d d          t          j                            d                                          fd}|9|                                                    d                                          nfd	t          |          D             }t          j	        |ft          
          }g }	 |          }
|
dk    r|S |D ]} ||          }t          j                            t          j        |dz
  z
            |d          }t          |          dk    rdz
  }n|d         }t          j        |t          j        |
|z
  t          j        
          |z  g          }|	                    |           t          j        |	          }	t          j        |	dddddf         ||
f          }	|	                    ||
z            }	t          j                  ddddf         }t          j        |||
f                              ||
z            }|	|z   }	|	                                dz
  k    rdz
  |	|	dz
  k    <   t          j        ||	dd           |S )an  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                     t          | z  z  z             }t          |          }|z  k    rz  }| dz
  z
  |k     rt          | dz
  z
  d          }|S )z;Given input length, compute how many spans should be maskedr   r   )r   max)rE  num_masked_spanepsilonrY  rX  rZ  sequence_lengths     r0   compute_num_masked_spanz6_compute_mask_indices.<locals>.compute_num_masked_spanl  s~    i,6DwNOOoy99 [(?::-<O ;?+o==!,+/"BAFFOr1   NrF   c                     g | ]}S r   r   )r   r   ra  s     r0   r   z)_compute_mask_indices.<locals>.<listcomp>  s    999!o999r1   r  r   F)replace)r   nprandomr  itemdetachr  tolistr   rK  r   choicer  lenconcatenateonesint32appendarraybroadcast_torz   r^  put_along_axis)r   rX  rY  r   rZ  rN  rb  r"  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanrE  r_  spec_aug_mask_idxdummy_mask_idxoffsetsr`  ra  s    `` `           @@r0   _compute_mask_indicesry  F  sP   0 #(JQABBB_$$:^i : :'6: : :
 
 	
 innQ$$&&G        $ % 	##B''..0009999uZ'8'8999  Hj/:$GGGM11/BBa% 5 511,?? I,,IlkAo677RW - 
 
  !!Q&& -q0NN.q1NN(;o(MUWU] ^ ^ ^ao op
 
 	!!"34444"455 111aaa:&5H+(V  ,33J@SVa@abb i$$T4]3Gog
4G'UVV^^'+5 G ,g5 /A"555GVYZGZ-!0CCD m%7B???r1   c                   *    e Zd Zdef fdZ	 	 ddej        deej                 deej                 fdZ	e
	 	 	 	 	 ddeej                 deej                 deej                 d	ee         d
ee         dee         deeef         fd            Z xZS )SEWModelr-   c                    t                                          |           || _        t          |          | _        t          j        |j        d         |j                  | _	        |j        d         |j
        k    | _        | j        r*t          j        |j        d         |j
                  | _        t          j        |j                  | _        |j        dk    s|j        dk    rBt          j        t)          j        |j
                                                            | _        t1          |          | _        |                                  d S )NrF   r   r   )r   r    r-   r   feature_extractorr   rB   r!   r   rC   r]   project_featuresru   feature_projectionr   feat_proj_dropoutfeature_dropoutmask_time_probmask_feature_prob	Parameterr   r   uniform_masked_spec_embedr   encoder	post_initrw   s     r0   r    zSEWModel.__init__  s      !26!:!:,vr':@UVVV & 3v7I I  	Y&(i0CVEW&X&XD#!z&*BCC 3&&&*BS*H*H%'\%,v?Q2R2R2[2[2]2]%^%^D"!&)) 	r1   Nr5   mask_time_indicesr   c                    t          | j        dd          s|S |                                \  }}}|#| j                            |j                  ||<   n| j        j        dk    r| j        r|t          ||f| j        j        | j        j	        || j        j
                  }t          j        ||j        t          j                  }| j                            |j                  ||<   | j        j        dk    r| j        rt          ||f| j        j        | j        j        | j        j                  }t          j        ||j        t          j                  }|dddf                             d|d          }d||<   |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTNr   )rX  rY  r   rZ  )r  r	  )rX  rY  rZ  rF   )getattrr-   ry   r  r  r	  r  r   ry  mask_time_lengthmask_time_min_masksr   tensorr  r   r  mask_feature_lengthmask_feature_min_masksr  )r,   r5   r  r   rN  ra  r]   mask_feature_indicess           r0   _mask_hidden_stateszSEWModel._mask_hidden_states  s    t{$8$?? 	!   4A3E3E3G3G0
O[(/3/E/H/HI\/]/]M+,,['!+++ 5_-+4 K8-+9! ! ! !&->}G[chcm n n n/3/E/H/HI\/]/]M+,;(1,,,#8[)+7 K;+<	$ $ $  $)<0D]Mainis#t#t#t #74#@#G#GO]_#`#` 23M./r1   r   r   r  r  r   c                 T   ||n| j         j        }||n| j         j        }||n| j         j        }|                     |          }|                    dd          }|                     |          }| j        r|                     |          }| 	                    |          }|!| 
                    |j        d         |          }|                     ||          }|                     |||||          }	|	d         }|s|f|	dd         z   S t          ||	j        |	j                  S )a/  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        Nr   rS   )r  r   r   r  r  r   r  )r-   r   r  use_return_dictr}  rG   rC   r~  r  r  rO  r   r  r  r   r5   r  )
r,   r   r   r  r   r  r  extract_featuresr5   encoder_outputss
             r0   r6   zSEWModel.forward  sr    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]11,??+55a;;??+;<<  	I#667GHH,,-=>>%!DD]EXYZE[]kllN00Rc0dd,,)/!5# ' 
 
 (* 	:!#oabb&999+)7&1
 
 
 	
r1   r
  NNNNN)r9   r:   r;   r   r    r   FloatTensorr   rW  r  r   r   r   r   r   r   r6   r<   r=   s   @r0   r{  r{    s;       y      . :>59	, ,(, $E$56, !!12	, , , ,\  269=,0/3&*3
 3
u|,3
 !.3
 $E$56	3

 $D>3
 'tn3
 d^3
 
uo%	&3
 3
 3
 ^3
 3
 3
 3
 3
r1   r{  zk
    SEW Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    )custom_introc                        e Zd Zddee         f fdZd Zd Zd Zd Z	e
	 	 	 	 	 ddeej                 d	eej                 d
ee         dee         dee         deej                 deeef         fd            Z xZS )	SEWForCTCNtarget_langc                    t                                          |           t          |          | _        t	          j        |j                  | _        || _        |j	        t          d| j         d          t          |d          r|j        r|j        n|j        }t	          j        ||j	                  | _        |                                  dS )a-  
        target_lang (`str`, *optional*):
            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
            adapter.<lang>.bin. Only relevant when using an instance of [`SEWForCTC`] with adapters. Uses 'eng' by
            default.
        NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `SEWForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.add_adapter)r   r    r{  r0  r   r   final_dropoutr   r  
vocab_sizer   r/   rb   r  output_hidden_sizer]   ru   lm_headr  )r,   r-   r  r  r/   s       r0   r    zSEWForCTC.__init__A  s     	   F##z&"677&$H H H H   *1)G)GvFL^vF%%djdv 	 y!3V5FGG 	r1   c                    | j         }|)t          | j        dd          t          d| d          |2t          | j        dd          t                              d           dS ||                     |d           dS dS )a'  
        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
        passing `target_lang=...` to `from_pretrained(...)`.

        This method is **not** supposed to be called by the user and is prone to be changed in the future.
        Nadapter_attn_dimzCannot pass `target_lang`: z- if `config.adapter_attn_dim` is not defined.z)By default `target_lang` is set to 'eng'.T)
force_load)r  r  r-   r   loggerinfoload_adapter)r,   r  s     r0   tie_weightszSEWForCTC.tie_weights^  s     &"wt{<NPT'U'U']u;uuuvvv WT[:Ld%S%S%_KKCDDDDD$kd;;;;; %$r1   c                 b    t          j        dt                     |                                  dS )
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.NwarningswarnFutureWarningfreeze_feature_encoderr,   s    r0   freeze_feature_extractorz"SEWForCTC.freeze_feature_extractors  ;    
 	Q	
 	
 	

 	##%%%%%r1   c                 B    | j         j                                         dS r  Nr0  r}  r   r  s    r0   r  z SEWForCTC.freeze_feature_encoder  !    
 	"5577777r1   c                 L    | j                                         D ]	}d|_        
dS z
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        FNr0  r   r   r   s     r0   freeze_base_modelzSEWForCTC.freeze_base_model  6    
 X((** 	( 	(E"'E	( 	(r1   r   r   r   r  r  labelsr   c           
      p   ||n| j         j        }|>|                                | j         j        k    rt	          d| j         j                   |                     |||||          }|d         }|                     |          }|                     |          }	d}
|Z||nt          j	        |t          j
                  }|                     |                    d                                        t          j
                  }|dk    }|                    d          }|                    |          }t          j                            |	dt          j                                      dd          }t          j        j                            d	
          5  t          j                            ||||| j         j        | j         j        | j         j                  }
ddd           n# 1 swxY w Y   |s|	f|t6          d         z   }|
|
f|z   n|S t9          |
|	|j        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nz$Label values must be <= vocab_size: r  r   r  rF   )r[   r	  r   F)enabled)blank	reductionzero_infinitylosslogitsr5   r  )r-   r  r^  r  r   r0  r   r  r   	ones_liker  rH  r  r  masked_selectr   r   log_softmaxfloat32rG   backendscudnnflagsctc_losspad_token_idctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   r5   r  )r,   r   r   r   r  r  r  r   r5   r  r  r"  labels_masktarget_lengthsflattened_targets	log_probsoutputs                    r0   r6   zSEWForCTC.forward  s   " &1%<kk$+B]&**,,$+2H"H"H\DKDZ\\]]](()/!5#  
 
  
]33m,, #1"<%/R^fkfpBqBqBq  !AA.BTBTUWBXBXYY\\]b]ghhM !A+K(__R00N & 4 4[ A A 11&b1VV``abdeffI%++E+:: 	 	}--%!"+2"k<"&+"? .  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	  	FY)F)G)G!HHF)-)9TGf$$vEfG4IV]Vh
 
 
 	
s    AG11G58G5r3   r  )r9   r:   r;   r   r   r    r  r  r  r  r   r   r   r   r   r   r   r6   r<   r=   s   @r0   r  r  ;  s>        HSM      :< < <*
& 
& 
&8 8 8( ( (  26,0/3&*)-D
 D
u|,D
 !.D
 $D>	D

 'tnD
 d^D
 &D
 
un$	%D
 D
 D
 ^D
 D
 D
 D
 D
r1   r  z
    SEW Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                        e Zd Z fdZd Zd Zd Ze	 	 	 	 	 ddee	j
                 dee	j
                 dee         d	ee         d
ee         dee	j
                 deeef         fd            Z xZS )SEWForSequenceClassificationc                    t                                          |           t          |d          r|j        rt	          d          t          |          | _        |j        dz   }|j        r.t          j
        t          j        |          |z            | _        t          j        |j        |j                  | _        t          j        |j        |j                  | _        |                                  d S )Nr  zZSequence classification does not support the use of SEW adapters (config.add_adapter=True)r   )r   r    rb   r  r   r{  r0  r   use_weighted_layer_sumr   r  r   rm  layer_weightsru   r]   classifier_proj_size	projector
num_labels
classifierr  )r,   r-   
num_layersr/   s      r0   r    z%SEWForSequenceClassification.__init__  s       6=)) 	f.@ 	l   F##-1
( 	S!#ej.D.Dz.Q!R!RD6#5v7RSS)F$?ARSS 	r1   c                 b    t          j        dt                     |                                  dS )z
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        r  Nr  r  s    r0   r  z5SEWForSequenceClassification.freeze_feature_extractor  r  r1   c                 B    | j         j                                         dS r  r  r  s    r0   r  z3SEWForSequenceClassification.freeze_feature_encoder  r  r1   c                 L    | j                                         D ]	}d|_        
dS r  r  r   s     r0   r  z.SEWForSequenceClassification.freeze_base_model  r  r1   Nr   r   r   r  r  r  r   c                 d   ||n| j         j        }| j         j        rdn|}|                     |||||          }| j         j        rx|t                   }t          j        |d          }t          j        	                    | j
        d          }	||	                    ddd          z                      d          }n|d         }|                     |          }||                    d          }
n|                     |j        d         |          }|                    d                              dd|j        d                   }d	|| <   |                    d          |                    d                              dd          z  }
|                     |
          }d}|Kt)                      } ||                    d| j         j                  |                    d                    }|s|f|t          d         z   }||f|z   n|S t-          |||j        |j        
          S )a  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
            into a tensor of type `torch.FloatTensor`. See [`SEWProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTr  r   r   rF   r   rS   r   r  )r-   r  r  r0  r  r   stackr   r   r   r  r   r  r  r2  rO  r   r  r  r  r   r  r   r5   r  )r,   r   r   r   r  r  r  r   r5   norm_weightspooled_outputpadding_maskexpand_padding_maskr  r  loss_fctr  s                    r0   r6   z$SEWForSequenceClassification.forward	  sW   . &1%<kk$+B]'+{'IcttOc(()/!5#  
 
 ;- 	'#$ABM!K1===M=001C0LLL*\->->r1a-H-HHMMRSMTTMM#AJM}55!)..1.55MMBB=CVWXCY[ijjL"."8"8"<"<"C"CAq-J]^_J`"a"a25M../)--!-44|7G7GA7G7N7N7S7STVXY7Z7ZZM//'))H8FKKDK,BCCV[[QS__UUD 	FY)F)G)G!HHF)-)9TGf$$vE'!/)	
 
 
 	
r1   r  )r9   r:   r;   r    r  r  r  r   r   r   r   r   r   r   r   r6   r<   r=   s   @r0   r  r    s           "
& 
& 
&8 8 8( ( (  26,0/3&*)-B
 B
u|,B
 !.B
 $D>	B

 'tnB
 d^B
 &B
 
u..	/B
 B
 B
 ^B
 B
 B
 B
 B
r1   r  )r  r  r{  r/  )Nr   Nrq   )Ar6  r  typingr   r   r   numpyre  r   r   torch.nnr   activationsr	   integrations.deepspeedr
   integrations.fsdpr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   ra   r   r   configuration_sewr   
get_loggerr9   r  r   r?   rI   ModulerQ   rk   rs   r   r   r   r   r   r   r   r   r/  r   r   rW  ndarrayry  r{  r  r  r  __all__r   r1   r0   <module>r     s  ,   , , , , , , , , , ,            % % % % % % ! ! ! ! ! ! @ @ @ @ @ @ 7 7 7 7 7 7 B B B B B B 9 9 9 9 9 9 Y Y Y Y Y Y Y Y Y Y F F F F F F F F & & & & & & , , , , , , , , ( ( ( ( ( ( 
	H	%	%    8   *    6   6    6   0( ( ( ( ( ( ( (V    bi       BI   ,# # # # #	 # # #X  $(,% %I%<% 
% <	%
 U\*% e_% % %% % % %<U/ U/ U/ U/ U/29 U/ U/ U/p    RY   0! ! ! ! !0 ! ! !Hc
 c
 c
 c
 c
 c
 c
 c
L B B B B B B B BR 26t tc?tt t U-.	t
 t Zt t t tn w
 w
 w
 w
 w
! w
 w
 w
t !"    
S
 S
 S
 S
 S
" S
 S
 
S
l   p
 p
 p
 p
 p
#5 p
 p
 p
f Z
Y
Yr1   