
     `iI                        d Z ddlZddlZddlmZmZ ddlZddlmZ ddlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZmZmZmZmZmZmZ ddl m!Z! dZ" G d de          Z# G d de          Z$ G d de          Z% G d dej&                  Z' G d de          Z( G d dej&                  Z) G d de          Z* G d de*          Z+ G d  d!e          Z, G d" d#e          Z- G d$ d%e          Z. G d& d'ej&                  Z/e G d( d)e                      Z0e G d* d+e0                      Z1 G d, d-e          Z2 G d. d/e          Z3g d0Z4dS )1zPyTorch SEW model.    N)OptionalUnion)nn   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)BaseModelOutput)PreTrainedModel)auto_docstring   )Wav2Vec2AttentionWav2Vec2EncoderLayerWav2Vec2FeatureEncoderWav2Vec2FeedForwardWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2GroupNormConvLayerWav2Vec2LayerNormConvLayerWav2Vec2NoLayerNormConvLayerWav2Vec2SamePadLayer_compute_mask_indices   )	SEWConfigc                       e Zd ZdS )SEWNoLayerNormConvLayerN__name__
__module____qualname__     w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/sew/modular_sew.pyr   r   1           Dr"   r   c                       e Zd ZdS )SEWLayerNormConvLayerNr   r!   r"   r#   r&   r&   5   r$   r"   r&   c                       e Zd ZdS )SEWGroupNormConvLayerNr   r!   r"   r#   r(   r(   9   r$   r"   r(   c                   $     e Zd Z fdZd Z xZS )SEWPositionalConvEmbeddingc                    t                                                       t          j        |j        |j        |j        |j        dz  |j        |j                  | _        t          j	        j
        }t          t          j	        j        d          rt          j	        j        j
        }t                      rdd l}|j                            | j        j        d          5   || j        dd          | _        d d d            n# 1 swxY w Y   t          | j        d          r-| j        j        j        j        }| j        j        j        j        }n| j        j        }| j        j        }|j                            | |           |j                            | |           n || j        dd          | _        t/          |j                  | _        t2          |j                 | _        d S )	Nr   )kernel_sizepaddinggroupsstrideweight_normr   modifier_rankweight)namedimparametrizations)super__init__r   Conv1dhidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupssqueeze_factorconvutilsr0   hasattrr6   r   	deepspeedzeroGatheredParametersr3   	original0	original1weight_gweight_vregister_external_parameterSEWSamePadLayerr-   r   feat_extract_activation
activation)selfconfigr0   rA   rF   rG   	__class__s         r#   r8   z#SEWPositionalConvEmbedding.__init__>   s   I62a77(
 
 
	 h*28,m<< 	@(3?K%'' 	E22493CST2UU I I'K	aHHH	I I I I I I I I I I I I I I Ity"455 .95<F95<F9-9-N66tXFFFN66tXFFFF#DIH!DDDDI&v'EFF !?@s    DD	D	c                     |                      |          }|                     |          }|                     |          }|S N)r>   r-   rK   )rL   hidden_statess     r#   forwardz"SEWPositionalConvEmbedding.forward`   s;    		-00]3366r"   r   r   r    r8   rR   __classcell__rN   s   @r#   r*   r*   =   sM         A  A  A  A  AD      r"   r*   c                       e Zd ZdS )rI   Nr   r!   r"   r#   rI   rI   h   r$   r"   rI   c                   $     e Zd Z fdZd Z xZS )SEWUpsamplingc                     t                                                       t          j        |j        |j        |j        z            | _        t          |j                 | _	        |j        | _        d S rP   )
r7   r8   r   Linearr:   r=   
projectionr   rJ   rK   rL   rM   rN   s     r#   r8   zSEWUpsampling.__init__m   sZ    )F$68JVMb8bcc !?@$3r"   c                 0   |                      |          }|                     |          }| j        dk    r`|                                \  }}}|| j        z  }|| j        z  }|                    ||| j        |          }|                    |||          }|S )Nr   )r[   rK   r=   sizereshape)rL   rQ   bszsrc_lensrc_embed_dimtgt_lentgt_embed_dims          r#   rR   zSEWUpsampling.forwards   s    6666""*7*<*<*>*>'C- 33G)T-@@M)11#w@SUbccM)11#wNNMr"   rS   rU   s   @r#   rX   rX   l   sG        4 4 4 4 4      r"   rX   c                       e Zd ZdS )SEWFeatureEncoderNr   r!   r"   r#   rf   rf      r$   r"   rf   c                        e Zd Z fdZ xZS )SEWFeatureExtractorc                     t                                          |           t          j        d| j        j         d| j        j        d         j         dt                     d S )NzThe class `zD` has been depreciated and will be removed in Transformers v5. Use `r   z
` instead.)r7   r8   warningswarnrN   r   	__bases__FutureWarningr\   s     r#   r8   zSEWFeatureExtractor.__init__   sy       E$.1 E EN,Q/8E E E 		
 	
 	
 	
 	
r"   )r   r   r    r8   rT   rU   s   @r#   rh   rh      s8        
 
 
 
 
 
 
 
 
r"   rh   c                       e Zd ZdS )SEWAttentionNr   r!   r"   r#   ro   ro      r$   r"   ro   c                       e Zd ZdS )SEWFeedForwardNr   r!   r"   r#   rq   rq      r$   r"   rq   c                       e Zd ZdS )SEWEncoderLayerNr   r!   r"   r#   rs   rs      r$   r"   rs   c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )
SEWEncoderc                    t                                                       | _        t                    | _        t          j        j        j                  | _        t          j	        j
        j                  | _        t          j        j                  | _        t          j        fdt#          j                  D                       | _        t)                    | _        d| _        d S )Nepsc                 .    g | ]}t                    S r!   )rs   ).0_rM   s     r#   
<listcomp>z'SEWEncoder.__init__.<locals>.<listcomp>   s!    $f$f$f_V%<%<$f$f$fr"   F)r7   r8   rM   r*   pos_conv_embedr   	AvgPool1dr=   pool	LayerNormr:   layer_norm_eps
layer_normDropouthidden_dropoutdropout
ModuleListrangenum_hidden_layerslayersrX   upsamplegradient_checkpointingr\   s    `r#   r8   zSEWEncoder.__init__   s    8@@L!68MNN	,v'9v?TUUUz&"788m$f$f$f$feFLdFeFe$f$f$fgg%f--&+###r"   NFTc           	      `   |rdnd }|rdnd }||                     d                              dd|j        d                   }| j        j        dk    rd|| <   |d|v r|nd }nWd|| <   |                                                    d          }	|	| j        j        z  }
|j        d         | j        j        z  }t          j	        d||
j
                                      dd                              |
j        d         d          }||
                    dd          k                                     }d	|d d d d d d f                             |j        
          z
  }|t          j        |j                  j        z  }|                    |j        d         d|j        d         |j        d                   }|j        d         }|                    dd          }|                     |          }|                     |          }t!          |                    d          |                    d                    }|dd |f         |dd |f         z   }|                    dd          }|                     |          }|                     |          }t/                      pt1          |           }| j        D ]a}|r||fz   }t          j        g           }| j        o|| j        j        k     }|r|r ||||          }|d         }|rd}|r||d         fz   }b|r||fz   }|                     |          }|j        d         |k     r2t<          j                             |ddd||j        d         z
  f          }|stC          d |||fD                       S tE          |||          S )Nr!   r   r   flash_attention_2        r   device      ?)dtype.)attention_maskoutput_attentionsNNc              3      K   | ]}||V  	d S rP   r!   )rz   vs     r#   	<genexpr>z%SEWEncoder.forward.<locals>.<genexpr>   s(      mmq_`_l_l_l_l_lmmr"   last_hidden_staterQ   
attentions)#	unsqueezerepeatshaperM   _attn_implementationlongsumr=   torcharanger   viewexpandtor   finfomin	transposer}   r   r^   r   r   r   r	   r   randtraining	layerdropr   r   
functionalpadtupler
   )rL   rQ   r   r   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsexpand_attention_maskinput_lengthsoutput_lengthsmax_encoder_lengthattention_idsn_input_timestepsposition_embeddingspooled_hidden_states
min_lengthsynced_gpuslayerdropout_probabilityskip_the_layerlayer_outputss                         r#   rR   zSEWEncoder.forward   s=    #7@BBD$5?bb4%$2$<$<R$@$@$G$G1mNabcNd$e$e!{/3FFF8;4454B4NSTXfSfSfmq 9<445!/!4!4!6!6 ; ;B ? ?!.$+2L!L%2%8%;t{?Y%Y"L$6~?TUUUT!R[[VN03R88 
 #0.2E2Eb!2L2L"L!R!R!T!T "%~aaatQQQ6F'G'J'JQ^Qd'J'e'e!e!/%+m>Q2R2R2V!V!/!6!6"(+Q0DR0H.J^_aJb" " */2%//155"11-@@#yy77,11"557K7P7PQS7T7TUU
,S+:+-=>ATUXZe[eZeUeAff%//15566]33022R6LT6R6R[ 	P 	PE# I$58H$H! #(*R..!]Z/BT[EZ/ZN! 1[ 1 %!.Te! ! ! !.a 0 - ,  P&9]1=M<O&O# 	E 1]4D Dm44q!$555M--maAGX[h[nop[qGq=rssM 	nmm]4EGZ$[mmmmmm++*
 
 
 	
r"   )NFFTrS   rU   s   @r#   ru   ru      sb        	, 	, 	, 	, 	, "W
 W
 W
 W
 W
 W
 W
 W
r"   ru   c                   x    e Zd ZU eed<   dZdZdZdZdZ	dZ
d Zdeej        ef         fdZd	ed
ej        fdZdS )SEWPreTrainedModelrM   sewinput_valuesTFc           
         t          |t                    rt          j                            |j        j        ddt          j        d|j        j	        d         |j        j
        z  z            z             t          j                            |j        j        d           nt          |t          j                  r-|j        j                            d| j        j                   nt          |t          j        t          j        f          r?|j        j                                         |j        j                            d           nCt          |t          j                  r(t-                      rddl}t1          |d          rzt1          |d	          rj|j                            |j        |j        gd
          5  t          j                            |j        j                   ddd           n# 1 swxY w Y   n|j                            |j        d
          5  t          j                            |j        j                   ddd           n# 1 swxY w Y   n)t          j                            |j        j                   t          |t          j        t          j        f          r'|j        "|j        j                                         dS dS dS )zInitialize the weightsr   r   r   )meanstdr   r   NrG   rF   r1   )
isinstancer*   r   initnormal_r>   r3   mathsqrtr,   in_channels	constant_biasrZ   datarM   initializer_ranger   	GroupNormzero_fill_r9   r   rA   r@   rB   rC   rG   rF   kaiming_normal_)rL   modulerA   s      r#   _init_weightsz SEWPreTrainedModel._init_weights  s   f899 	<GOO"	!v{'>q'AFKD['["\]]]    
 Gfk.2222	** 	< M&&CT[5R&SSSSr| <== 	<K""$$$M$$S))))	** 	<)++ 
<    6:.. D76:3N3N D"::FOV_;]mn:oo D D//0BCCCD D D D D D D D D D D D D D D #::6=XY:ZZ D D//0BCCCD D D D D D D D D D D D D D D ''(:;;;fry")455 	%&+:QK""$$$$$	% 	%:Q:Qs$   *G::G>G>'*II!$I!r   c                 z    d }t          | j        j        | j        j                  D ]\  }} ||||          }|S )zH
        Computes the output length of the convolutional layers
        c                 <    t          j        | |z
  |d          dz   S )Nfloor)rounding_moder   )r   div)input_lengthr,   r/   s      r#   _conv_out_lengthzMSEWPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length2  s&     9\K7wWWWZ[[[r"   )ziprM   conv_kernelconv_stride)rL   r   r   r,   r/   s        r#    _get_feat_extract_output_lengthsz3SEWPreTrainedModel._get_feat_extract_output_lengths-  s\    
	\ 	\ 	\
 $'t{'>@W#X#X 	Q 	QK,,]KPPMMr"   feature_vector_lengthr   c                    |                      |                    d                                        t          j                  }|j        d         }t          j        ||f|j        |j                  }d|t          j	        |j        d         |j                  |dz
  f<   |
                    dg                              d          
                    dg                                          }|S )Nr   r   )r   r   r   r   )r   r   r   r   r   r   zerosr   r   r   flipcumsumbool)rL   r   r   r   
batch_sizes        r#   "_get_feature_vector_attention_maskz5SEWPreTrainedModel._get_feature_vector_attention_mask<  s    >>~?Q?QRT?U?UVVYYZ_Zdee#)!,
./~7KTbTi
 
 
 uv^%9!%<^EZ[[[]kno]opq',,bT2299"==BBB4HHMMOOr"   N)r   r   r    r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr   r   r   
LongTensorintr   r   r!   r"   r#   r   r     s         $O&*#N% % %@eEDTVYDY>Z    
 
]b]m 
 
 
 
 
 
r"   r   c                   *    e Zd Zdef fdZ	 	 ddej        deej                 deej                 fdZ	e
	 	 	 	 	 ddeej                 deej                 deej                 d	ee         d
ee         dee         deeef         fd            Z xZS )SEWModelrM   c                    t                                          |           || _        t          |          | _        t          j        |j        d         |j                  | _	        |j        d         |j
        k    | _        | j        r*t          j        |j        d         |j
                  | _        t          j        |j                  | _        |j        dk    s|j        dk    rBt          j        t)          j        |j
                                                            | _        t1          |          | _        |                                  d S )Nr   rw   r   )r7   r8   rM   rf   feature_extractorr   r   conv_dimr   r   r:   project_featuresrZ   feature_projectionr   feat_proj_dropoutfeature_dropoutmask_time_probmask_feature_prob	Parameterr   Tensoruniform_masked_spec_embedru   encoder	post_initr\   s     r#   r8   zSEWModel.__init__K  s      !26!:!:,vr':@UVVV & 3v7I I  	Y&(i0CVEW&X&XD#!z&*BCC 3&&&*BS*H*H%'\%,v?Q2R2R2[2[2]2]%^%^D"!&)) 	r"   NrQ   mask_time_indicesr   c                    t          | j        dd          s|S |                                \  }}}|#| j                            |j                  ||<   n| j        j        dk    r| j        r|t          ||f| j        j        | j        j	        || j        j
                  }t          j        ||j        t          j                  }| j                            |j                  ||<   | j        j        dk    r| j        rt          ||f| j        j        | j        j        | j        j                  }t          j        ||j        t          j                  }|dddf                             d|d          }d||<   |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTNr   )	mask_probmask_lengthr   	min_masks)r   r   )r
  r  r  r   )getattrrM   r^   r  r   r   r   r   r   mask_time_lengthmask_time_min_masksr   tensorr   r   r   mask_feature_lengthmask_feature_min_masksr   )rL   rQ   r  r   r   sequence_lengthr:   mask_feature_indicess           r#   _mask_hidden_stateszSEWModel._mask_hidden_states_  s    t{$8$?? 	!   4A3E3E3G3G0
O[(/3/E/H/HI\/]/]M+,,['!+++ 5_-+4 K8-+9! ! ! !&->}G[chcm n n n/3/E/H/HI\/]/]M+,;(1,,,#8[)+7 K;+<	$ $ $  $)<0D]Mainis#t#t#t #74#@#G#GO]_#`#` 23M./r"   r   r   r   r   returnc                 T   ||n| j         j        }||n| j         j        }||n| j         j        }|                     |          }|                    dd          }|                     |          }| j        r|                     |          }| 	                    |          }|!| 
                    |j        d         |          }|                     ||          }|                     |||||          }	|	d         }|s|f|	dd         z   S t          ||	j        |	j                  S )a/  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        Nr   r   )r  )r   r   r   r   r   r   )rM   r   r   use_return_dictr   r   r   r   r   r   r   r   r  r  r
   rQ   r   )
rL   r   r   r  r   r   r   extract_featuresrQ   encoder_outputss
             r#   rR   zSEWModel.forward  sr    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]11,??+55a;;??+;<<  	I#667GHH,,-=>>%!DD]EXYZE[]kllN00Rc0dd,,)/!5# ' 
 
 (* 	:!#oabb&999+)7&1
 
 
 	
r"   r   )NNNNN)r   r   r    r   r8   r   FloatTensorr   r   r  r   r  r   r   r   r
   rR   rT   rU   s   @r#   r   r   I  s;       y      . :>59	, ,(, $E$56, !!12	, , , ,\  269=,0/3&*3
 3
u|,3
 !.3
 $E$56	3

 $D>3
 'tn3
 d^3
 
uo%	&3
 3
 3
 ^3
 3
 3
 3
 3
r"   r   c                       e Zd ZdS )	SEWForCTCNr   r!   r"   r#   r  r    r$   r"   r  c                       e Zd ZdS )SEWForSequenceClassificationNr   r!   r"   r#   r  r    r$   r"   r  )r  r  r   r   )5__doc__r   rj   typingr   r   r   r   activationsr   integrations.deepspeedr   integrations.fsdpr	   modeling_outputsr
   modeling_utilsr   r?   r   wav2vec2.modeling_wav2vec2r   r   r   r   r   r   r   r   r   r   r   configuration_sewr   _HIDDEN_STATES_START_POSITIONr   r&   r(   Moduler*   rI   rX   rf   rh   ro   rq   rs   ru   r   r   r  r  __all__r!   r"   r#   <module>r,     sS       " " " " " " " "        ! ! ! ! ! ! @ @ @ @ @ @ 7 7 7 7 7 7 / / / / / / - - - - - - # # # # # #                          ) ( ( ( ( ( !" 	 	 	 	 	: 	 	 		 	 	 	 	6 	 	 		 	 	 	 	6 	 	 	( ( ( ( ( ( ( (V	 	 	 	 	* 	 	 	    BI   ,	 	 	 	 	. 	 	 	
 
 
 
 
+ 
 
 
	 	 	 	 	$ 	 	 		 	 	 	 	( 	 	 		 	 	 	 	* 	 	 	c
 c
 c
 c
 c
 c
 c
 c
L B B B B B B B BJ w
 w
 w
 w
 w
! w
 w
 w
t	 	 	 	 	 	 	 		 	 	 	 	#D 	 	 	 Z
Y
Yr"   