
     `iZ                        d dl Z d dlmZmZ d dlZd dlmZ d dlmc mZ	 ddl
mZ ddlmZ ddlmZ ddlmZmZ ddlmZ dd	lmZ d
dlmZmZmZmZmZmZmZmZm Z  ddl!m"Z"  ej#        e$          Z% G d de          Z& G d de          Z' G d dej(                  Z) G d de          Z* G d de          Z+ G d de          Z, G d dej(                  Z- G d dej(                  Z. G d dej(                  Z/ G d  d!ee           Z0eZ1 G d" d#e          Z2 G d$ d%e          Z3 G d& d'e          Z4 G d( d)e          Z5 G d* d+e          Z6g d,Z7dS )-    N)OptionalUnion   )is_deepspeed_zero3_enabled)is_fsdp_managed_module)GradientCheckpointingLayer)BaseModelOutputWav2Vec2BaseModelOutput)PreTrainedModel)logging   )	Wav2Vec2FeatureProjectionWav2Vec2FeedForward#Wav2Vec2ForAudioFrameClassificationWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2ForXVectorWav2Vec2ModelWav2Vec2PositionalConvEmbeddingWav2Vec2PreTrainedModel   )WavLMConfigc                       e Zd ZdS )WavLMPositionalConvEmbeddingN__name__
__module____qualname__     {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/wavlm/modular_wavlm.pyr   r              Dr    r   c                       e Zd ZdS )WavLMFeatureProjectionNr   r   r    r!   r$   r$   #   r"   r    r$   c                       e Zd ZdZ	 	 	 	 ddededed	ed
edef fdZ	 	 	 	 ddej	        de
ej	                 de
ej	                 dedeej	        e
ej	                 e
eej	                          f         f
dZdej        deej        ej        f         dej        dedeej        ej        f         f
dZdededej        fdZdej        dej        fdZ xZS )WavLMAttentionz=Multi-headed attention from 'Attention Is All You Need' paper        @     T	embed_dim	num_headsdropoutnum_bucketsmax_distancehas_relative_position_biasc                    t                                                       || _        || _        || _        ||z  | _        | j        |z  | j        k    rt          d| j         d| d          | j        dz  | _        t          j	        ||          | _
        t          j	        ||          | _        t          j	        ||          | _        t          j	        ||          | _        || _        || _        t          j        t#          j        d| j        dd                    | _        t          j	        | j        d          | _        |r&t          j        | j        | j                  | _        d S d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      r      )super__init__r*   r+   r,   head_dim
ValueErrorscalingnnLineark_projv_projq_projout_projr-   r.   	Parametertorchonesgru_rel_pos_constgru_rel_pos_linear	Embeddingrel_attn_embed)selfr*   r+   r,   r-   r.   r/   	__class__s          r!   r3   zWavLMAttention.__init__*   sa    	""!Y.MI%$.883dn 3 3%.3 3 3   }d*i	955i	955i	955	)Y77&(!#ejDNAq.Q.Q!R!R"$)DM1"="=% 	Q"$,t/?"P"PD	Q 	Qr    NFr   hidden_statesattention_maskposition_biasoutput_attentionsreturnc                 F   |                                 \  }}}|^|                     ||          }|                    d                              |ddd                              || j        z  ||          }|                    |j        dd         | j        dfz             }	|	                    dddd          }	|                     |	          }
|
                    |	j        dd         dz             	                    d          }
t          j        |
                              dd          \  }}||| j        z  d	z
  z  d
z   }|                    || j        z  dd          |z  }|                    d||f          }|                     ||||          \  }}|||fS )z'Attention layer with relative attentionNr   r   r   r   )r      dim      ?g       @)sizecompute_bias	unsqueezerepeatviewr+   shapepermuterA   sumr>   sigmoidchunkr@   torch_multi_head_self_attention)rD   rF   rG   rH   rI   indexbsztgt_len_gated_hidden_statesrelative_position_projgate_agate_bgate_outputgated_position_biasattn_outputattn_weightss                    r!   forwardzWavLMAttention.forwardN   s    (,,..Wa   --gw??M''**11#q!Q??DDS4>EY[bdkll  ,001DSbS1IT^]_L`1`aa199!Q1EE "&!8!89L!M!M!7!<!<=P=VWZXZWZ=[^d=d!e!e!i!ijl!m!m '=>>DDQBDOO)? ?# EFL *..sT^/CRKKm[166GW7MNN$($H$H>+>@Q%
 %
!\ L-77r    re   c                    |                     dd          x}x}}||                    d          nd}dx}	}
d}t          j        |||| j        | j        t          j        dg          t          j        | j	        j
        | j        j
        | j        j
        f          |	|
|| j        | j        j        | j        j
        | j        |||d| j	        j        | j        j        | j        j                  \  }}|                     dd          }|E|dddf                             |j        dd         | j        fz   |j        dd         z             }||fS )zCsimple wrapper around torch's multi_head_attention_forward functionr   r   NFT)use_separate_proj_weightq_proj_weightk_proj_weightv_proj_weight)	transposeneFmulti_head_attention_forwardr*   r+   r>   emptycatr;   biasr9   r:   r,   r<   weighttrainingbroadcast_torV   )rD   rF   rG   re   rI   querykeyvaluekey_padding_maskbias_kbias_vadd_zero_attnrf   rg   s                 r!   r[   z.WavLMAttention.torch_multi_head_self_attentionw   sx    ,55a;;;;e3A3M>,,Q///SW  %&$BNNKIt{')94;;KLMMLM MM%)+,+,+,+%
 %
 %
!\2 "++Aq11# (40=="2A2&$.)::\=OPQPRPR=SS L L((r    query_length
key_lengthc                    t          j        |t           j                  d d d f         }t          j        |t           j                  d d d f         }||z
  }|                     |          }|                    | j        j        j                  }|                     |          }|                    g d          }|S )N)dtype)r   r   r   )	r>   arangelong_relative_positions_buckettorC   ru   devicerW   )rD   r   r   context_positionmemory_positionrelative_positionrelative_position_bucketvaluess           r!   rR   zWavLMAttention.compute_bias   s     <EJGGG4P,zDDDT111WM+.>>#'#B#BCT#U#U #;#>#>t?R?Y?`#a#a $$%=>>			**r    relative_positionsc                    | j         dz  }|dk                        t          j                  |z  }t          j        |          }|dz  }||k     }t          j        |                                |z            }|t          j        | j        |z            z  }|||z
  z  }||z                       t          j                  }t          j	        |t          j
        ||dz
                      }|t          j        |||          z  }|S )Nr   r   r   )r-   r   r>   r   abslogfloatmathr.   min	full_likewhere)rD   r   r-   relative_buckets	max_exactis_smallrelative_positions_if_largerelative_position_if_larges           r!   r   z)WavLMAttention._relative_positions_bucket   s   &!+.266uzBB[P"Y'9::1$	%	1&+i0B0H0H0J0JY0V&W&W#&ADHTM^ajMjDkDk&k#&A[S\E\&]#&/2M&M%Q%QRWR\%]%]"%*Y&8RT_bcTc(d(d&
 &
" 	EK2DF`aaar    )r'   r(   r)   TNNFr   )r   r   r   __doc__intr   boolr3   r>   Tensorr   tuplerh   FloatTensorr   
LongTensor
BoolTensorr[   rR   r   __classcell__rE   s   @r!   r&   r&   '   s       GG +/"Q "Q"Q "Q 	"Q
 "Q "Q %)"Q "Q "Q "Q "Q "QN 2604"''8 '8|'8 !.'8  -	'8
  '8 
u|Xel3XeEL>Q5RR	S'8 '8 '8 '8R5)(5) e.0@@A5) #.	5)
  5) 
u %"33	45) 5) 5) 5)n # %BS     U=N  SXSd                r    r&   c                       e Zd ZdS )WavLMFeedForwardNr   r   r    r!   r   r      r"   r    r   c                   2     e Zd Zd	dedef fdZd
dZ xZS )WavLMEncoderLayerTconfigr/   c                    t                                                       t          |j        |j        |j        |j        |j        |          | _        t          j
        |j                  | _        t          j        |j        |j                  | _        t!          |          | _        t          j        |j        |j                  | _        d S N)r*   r+   r,   r-   r.   r/   epsr2   r3   r&   hidden_sizenum_attention_headsattention_dropoutr-   max_bucket_distance	attentionr7   Dropouthidden_dropoutr,   	LayerNormlayer_norm_eps
layer_normr   feed_forwardfinal_layer_normrD   r   r/   rE   s      r!   r3   zWavLMEncoderLayer.__init__       '(0,*3'A
 
 
 z&"788,v'9v?TUUU,V44 "V-?VEZ [ [ [r    NFr   c                    |}|                      |||||          \  }}}|                     |          }||z   }|                     |          }||                     |          z   }|                     |          }||f}|r||fz  }|S )NrG   rH   rI   r\   )r   r,   r   r   r   )	rD   rF   rG   rH   rI   r\   attn_residualrg   outputss	            r!   rh   zWavLMEncoderLayer.forward   s    %59^^)'/ 6D 6
 6
2|] ]33%566%(9(9-(H(HH--m<< -0 	'&Gr    Tr   r   r   r   r   r   r3   rh   r   r   s   @r!   r   r      sm        \ \{ \ \ \ \ \ \ \       r    r   c                   2     e Zd Zddedef fdZd	dZ xZS )
 WavLMEncoderLayerStableLayerNormTr   r/   c                    t                                                       t          |j        |j        |j        |j        |j        |          | _        t          j
        |j                  | _        t          j        |j        |j                  | _        t!          |          | _        t          j        |j        |j                  | _        d S r   r   r   s      r!   r3   z)WavLMEncoderLayerStableLayerNorm.__init__   r   r    NFc                    |}|                      |          }|                     ||||          \  }}}|                     |          }||z   }||                     |                     |                    z   }||f}|r||fz  }|S )N)rG   rH   rI   )r   r   r,   r   r   )rD   rF   rG   rH   rI   r   rg   r   s           r!   rh   z(WavLMEncoderLayerStableLayerNorm.forward
  s    %6659^^)'/	 6D 6
 6
2|] ]33%5%(9(9$:O:OP]:^:^(_(__ -0 	'&Gr    r   )NNFr   r   s   @r!   r   r      sm        \ \{ \ \ \ \ \ \ \       r    r   c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )WavLMEncoderc                    t                                                       | _        t                    | _        t          j        j        j                  | _	        t          j
        j                  | _        t          j        fdt          j                  D                       | _        d| _        d S )Nr   c                 :    g | ]}t          |d k              S r   )r/   )r   .0ir   s     r!   
<listcomp>z)WavLMEncoder.__init__.<locals>.<listcomp>'  s,    uuuPQv16KKKuuur    Fr2   r3   r   r   pos_conv_embedr7   r   r   r   r   r   r   r,   
ModuleListrangenum_hidden_layerslayersgradient_checkpointingrD   r   rE   s    `r!   r3   zWavLMEncoder.__init__   s    :6BB,v'9v?TUUUz&"788muuuuUZ[a[sUtUtuuu
 
 ',###r    NFTc                    |rdnd }|rdnd }|;|                     d                              dd|j        d                   }d|| <   |                     |          }	||	z   }|                     |          }|                     |          }t                      pt          |           }
d }t          | j	                  D ]q\  }}|r||fz   }t          j        g           }| j        o|dk    o|| j        j        k     }|r|
r ||||||          }|d d         \  }}|rd}|r||d         fz   }r|r||fz   }|st          d |||fD                       S t!          |||	          S )
Nr   rL   r   r   r   r   NNNc              3      K   | ]}||V  	d S Nr   r   vs     r!   	<genexpr>z'WavLMEncoder.forward.<locals>.<genexpr>a  (      mmq_`_l_l_l_l_lmmr    last_hidden_staterF   
attentions)rS   rT   rV   r   r   r,   r   r   	enumerater   r>   randrv   r   	layerdropr   r	   rD   rF   rG   rI   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsexpand_attention_maskposition_embeddingssynced_gpusrH   r   layerdropout_probabilityskip_the_layerlayer_outputss                    r!   rh   zWavLMEncoder.forward+  s    #7@BBD$5?bb4%$2$<$<R$@$@$G$G1mNabcNd$e$e!45M001"11-@@%(;;66]33022R6LT6R6R!$+.. 	P 	PHAu# I$58H$H! #(*R..!]fq1uf:MPTP[Pe:eN! 
A[ 
A %!#1"/&7! ! ! 0=RaR/@,} 3 2  P&9]1=M<O&O# 	E 1]4D D 	nmm]4EGZ$[mmmmmm++*
 
 
 	
r    NFFTr   r   r   r3   rh   r   r   s   @r!   r   r     sZ        	, 	, 	, 	, 	, ";
 ;
 ;
 ;
 ;
 ;
 ;
 ;
r    r   c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )WavLMEncoderStableLayerNormc                    t                                                       | _        t                    | _        t          j        j        j                  | _	        t          j
        j                  | _        t          j        fdt          j                  D                       | _        d| _        d S )Nr   c                 :    g | ]}t          |d k              S r   )r   r   s     r!   r   z8WavLMEncoderStableLayerNorm.__init__.<locals>.<listcomp>q  s=        1UVZ[U[]]]  r    Fr   r   s    `r!   r3   z$WavLMEncoderStableLayerNorm.__init__j  s    :6BB,v'9v?TUUUz&"788m   v788  
 
 ',###r    NFTc                    |rdnd }|rdnd }|;|                     d                              dd|j        d                   }d|| <   |                     |          }	||	z   }|                     |          }t                      pt          |           }
d }t          | j                  D ]p\  }}|r||fz   }t          j
        g           }| j        o|dk    o|| j        j        k     }|r|
r |||||          }|d d         \  }}|rd}|r||d         fz   }q|                     |          }|r||fz   }|st          d |||fD                       S t!          |||	          S )
Nr   rL   r   r   r   )rG   rI   rH   r   c              3      K   | ]}||V  	d S r   r   r   s     r!   r   z6WavLMEncoderStableLayerNorm.forward.<locals>.<genexpr>  r   r    r   )rS   rT   rV   r   r,   r   r   r   r   r>   r   rv   r   r   r   r   r	   r   s                    r!   rh   z#WavLMEncoderStableLayerNorm.forwardx  s    #7@BBD$5?bb4%$2$<$<R$@$@$G$G1mNabcNd$e$e!45M001"11-@@%(;;]33022R6LT6R6R!$+.. 	P 	PHAu# I$58H$H! #(*R..!]fq1uf:MPTP[Pe:eN! 	A[ 	A !&!#1&7"/	! ! ! 0=RaR/@,} 3 2  P&9]1=M<O&O#66 	E 1]4D D 	nmm]4EGZ$[mmmmmm+;LYl
 
 
 	
r    r   r   r   s   @r!   r   r   i  sZ        , , , , ," "9
 9
 9
 9
 9
 9
 9
 9
r    r   c                   >     e Zd ZdZ fdZed             Zd Z xZS )WavLMGumbelVectorQuantizerz
    Vector quantization using gumbel softmax. See [CATEGORICAL REPARAMETERIZATION WITH
    GUMBEL-SOFTMAX](https://huggingface.co/papers/1611.01144) for more information.
    c                    t                                                       |j        | _        |j        | _        |j        | j        z  dk    r t          d|j         d| j         d          t          j	        t          j        d| j        | j        z  |j        | j        z                      | _        t          j        |j        d         | j        | j        z            | _        d| _        d S )Nr   z`config.codevector_dim z5 must be divisible by `config.num_codevector_groups` z for concatenation.r   rL   r   )r2   r3   num_codevector_groups
num_groupsnum_codevectors_per_groupnum_varscodevector_dimr5   r7   r=   r>   r   codevectorsr8   conv_dimweight_projtemperaturer   s     r!   r3   z#WavLMGumbelVectorQuantizer.__init__  s     68 4?2a77%&*? % %6:o% % %   <a4=!@&BW[_[jBjkk
 
 9V_R%8$/DM:YZZ r    c           	          |                      d          }t          j        t          j        |t          j        |dz             z  d                                                     }|S )Nr   rN   gHz>rL   )meanr>   exprX   r   )probsmarginal_probs
perplexitys      r!   _compute_perplexityz.WavLMGumbelVectorQuantizer._compute_perplexity  s^    **Y	.59^VZEZ;[;[*[ac d d ddeeiikk
r    c                    |j         \  }}}|                     |          }|                    ||z  | j        z  d          }| j        rt
          j                            |                                | j	        d          }|
                    |          }t          j        |                    ||z  | j        d                                          d          }|                     |          }n|                    d          } |j        |j                              d|                    dd          d          }|                    ||z  | j        d          }|                     |          }|                    ||z  d          }|                    d          | j        z  }	|	                    ||z  | j        | j        d          }
|
                    d                              ||d          }
|
|fS )NrL   T)tauhardrN   r   rP   )rV   r  rU   r   rv   r7   
functionalgumbel_softmaxr   r  type_asr>   softmaxr  argmax	new_zerosscatter_rS   r  r  rX   )rD   rF   
batch_sizesequence_lengthr   codevector_probscodevector_soft_distr  codevector_idxcodevectors_per_groupr  s              r!   rh   z"WavLMGumbelVectorQuantizer.forward  s   3@3F0
O[ ((77%**:+G$/+Y[]^^= 	D!};;M<O<O<Q<QW[Wgnr;ss/77FF $)="":#?RTUU[[]]ce$ $ $  112FGGJJ +11b199N6}68KLUUN''A..     044Z/5QSWSbdfgg112BCCJ+00o1MrRR 0 : :2 > >AQ Q+00o1Mt`d`moqrr!oob))..z?BOOJ&&r    )	r   r   r   r   r3   staticmethodr  rh   r   r   s   @r!   r   r     sl         
    *   \
"' "' "' "' "' "' "'r    r   c                   J    e Zd ZU eed<   dZdZdZdZdZ	dZ
d Zd Zd Zd	 Zd
S )WavLMPreTrainedModelr   wavlminput_valuesTFc           
      \   t          |t                    ro|j        j        j                            dd           |j        j        j                                         t          j	        
                    |j                   dS t          |t                    rt          j	                            |j        j        ddt          j        d|j        j        d         |j        j        z  z            z             t          j	                            |j        j        d           dS t          |t&                    r}t          j        d|j        j        z            }t          j	        
                    |j        j        | |           t          j	        
                    |j        j        | |           dS t          |t          j                  rT|j        j                            d| j        j                   |j         |j        j                                         dS dS t          |t          j        t          j        f          r?|j        j                                         |j        j                            d           dS t          |t          j                  rt          j	                            |j                   |j        [t          j        |j        |j        |j        d         z  z            }t          j	        
                    |j        | |           dS dS dS )	zInitialize the weightsr'   r   )r  stdr   r   )abNrP   )
isinstancer   r  ru   datanormal_rt   zero_r7   inituniform_r  r   convr   sqrtkernel_sizein_channels	constant_r$   
projectionin_featuresr8   r   initializer_ranger   	GroupNormfill_Conv1dkaiming_normal_groups)rD   moduleks      r!   _init_weightsz"WavLMPreTrainedModel._init_weights  s    f899 	9%*222CCC#(..000GV/00000 <== 	9GOO"	!v{'>q'AFKD['["\]]]    
 Gfk.22222 677 	9	!f/;;<<AGV.5!qAAAGV.3rQ?????	** 	9M&&CT[5R&SSS{& &&((((( '&r| <== 	9K""$$$M$$S)))))	** 	9G##FM222{&Ifmv/AFDVWXDY/YZ[[  a 88888	9 	9 '&r    c                      t          d          NzNot needed for WavLMAttributeErrorrD   s    r!   _get_adaptersz"WavLMPreTrainedModel._get_adapters$      3444r    c                      t          d          r?  r@  rB  s    r!   init_adapter_layersz(WavLMPreTrainedModel.init_adapter_layers'  rD  r    c                      t          d          r?  r@  rB  s    r!   load_adapterz!WavLMPreTrainedModel.load_adapter*  rD  r    N)r   r   r   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr=  rC  rF  rH  r   r    r!   r!  r!    s         $O&*# N9 9 9B5 5 55 5 55 5 5 5 5r    r!  c                       e Zd ZdS )
WavLMModelNr   r   r    r!   rQ  rQ  1  r"   r    rQ  c                       e Zd ZdS )WavLMForCTCNr   r   r    r!   rS  rS  5  r"   r    rS  c                       e Zd ZdS )WavLMForSequenceClassificationNr   r   r    r!   rU  rU  9  r"   r    rU  c                       e Zd ZdS ) WavLMForAudioFrameClassificationNr   r   r    r!   rW  rW  =  r"   r    rW  c                       e Zd ZdS )WavLMForXVectorNr   r   r    r!   rY  rY  A  r"   r    rY  )rW  rS  rU  rY  rQ  r!  )8r   typingr   r   r>   torch.nnr7   torch.nn.functionalr  rp   integrations.deepspeedr   integrations.fsdpr   modeling_layersr   modeling_outputsr	   r
   modeling_utilsr   utilsr   wav2vec2.modeling_wav2vec2r   r   r   r   r   r   r   r   r   configuration_wavlmr   
get_loggerr   loggerr   r$   Moduler&   r   r   r   r   r   r   r!  WavLMBaseModelOutputrQ  rS  rU  rW  rY  __all__r   r    r!   <module>rj     s1    " " " " " " " "                 @ @ @ @ @ @ 7 7 7 7 7 7 9 9 9 9 9 9 H H H H H H H H - - - - - -      
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 - , , , , , 
	H	%	%	 	 	 	 	#B 	 	 		 	 	 	 	6 	 	 	c  c  c  c  c RY c  c  c L	 	 	 	 	* 	 	 	& & & & &2 & & &R" " " " "'A " " "JG
 G
 G
 G
 G
29 G
 G
 G
TH
 H
 H
 H
 H
") H
 H
 H
VC' C' C' C' C' C' C' C'L15 15 15 15 15?,C 15 15 15h / 	 	 	 	 	 	 	 		 	 	 	 	. 	 	 		 	 	 	 	%F 	 	 		 	 	 	 	'J 	 	 		 	 	 	 	( 	 	 	  r    