
     `il                    N   d Z ddlZddlZddlmZ ddlmZmZmZ ddl	Z
ddlZddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3 ddl4m5Z5 dZ6dZ7 e2            rddl8m9Z9  e3j:        e;          Z<dZ=e e.d           G d de-                                  Z>	 	 dnde?e@e@f         deAde@d eejB                 d!e@d"e
jC        fd#ZD	 dod$e?d%e@d&ee
jC                 fd'ZE G d( d)e          ZF G d* d+e          ZG G d, d-e          ZH G d. d/ejI                  ZJ G d0 d1ejI                  ZK G d2 d3ejI                  ZL G d4 d5eL          ZM G d6 d7ejI                  ZN	 	 	 dpd9ejI        d:ejO        d;ejO        d<ejO        d eejO                 d=eeA         d>eAd?eejO                 fd@ZP G dA dBejI                  ZQ G dC dDejI                  ZR G dE dFe          ZS G dG dHe          ZT G dI dJejI                  ZU G dK dLejI                  ZV G dM dNejI                  ZW G dO dPejI                  ZX G dQ dRejI                  ZY G dS dTejI                  ZZe. G dU dVe)                      Z[e. G dW dXe[                      Z\ e.dY           G dZ d[e[                      Z]e. G d\ d]e[                      Z^ e.d^           G d_ d`e[                      Z_ e.da           G db dce[                      Z`e. G dd dee[                      Za G df dgejI                  Zb G dh diejI                  Zc e.dj           G dk dle[                      Zdg dmZedS )qzPyTorch Wav2Vec2 model.    N)	dataclass)CallableOptionalUnion)	load_file)nn)CrossEntropyLoss   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputCausalLMOutputMaskedLMOutputSequenceClassifierOutputTokenClassifierOutputWav2Vec2BaseModelOutputXVectorOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputauto_docstringcached_filecheck_torch_load_is_safeis_peft_availableis_torch_flex_attn_availablelogging   )Wav2Vec2Configzadapter.{}.binzadapter.{}.safetensors)make_flex_block_causal_mask   za
    Output type of [`Wav2Vec2ForPreTraining`], with potential hidden states and attentions.
    )custom_introc                   L   e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eej                 ed<   dZeej                 ed<   dZeeej                          ed<   dZeeej                          ed<   dZeej                 ed	<   dZeej                 ed
<   dS )Wav2Vec2ForPreTrainingOutputa  
    loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
        paper](https://huggingface.co/papers/2006.11477).
    projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
        Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
        projected quantized states.
    projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
        Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
        target vectors for contrastive loss.
    codevector_perplexity (`torch.FloatTensor` of shape `(1,)`):
        The perplexity of the codevector distribution, used to measure the diversity of the codebook.
    contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
        The contrastive loss (L_m) as stated in the [official paper](https://huggingface.co/papers/2006.11477).
    diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
        The diversity loss (L_d) as stated in the [official paper](https://huggingface.co/papers/2006.11477).
    Nlossprojected_statesprojected_quantized_statescodevector_perplexityhidden_states
attentionscontrastive_lossdiversity_loss)__name__
__module____qualname____doc__r*   r   torchFloatTensor__annotations__r+   r,   r-   r.   tupler/   r0   r1        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.pyr)   r)   I   s          $ )-D(5$
%,,,48hu01888>B): ;BBB9=8E$56===8<M8E%"345<<<59Ju01299948hu0188826NHU./66666r;   r)   shape	mask_probmask_lengthattention_mask	min_masksreturnc                 @   | \  }dk     rt          d          k    rt          d d d          t          j                            d                                          fd}|9|                                                    d                                          nfd	t          |          D             }t          j	        |ft          
          }g }	 |          }
|
dk    r|S |D ]} ||          }t          j                            t          j        |dz
  z
            |d          }t          |          dk    rdz
  }n|d         }t          j        |t          j        |
|z
  t          j        
          |z  g          }|	                    |           t          j        |	          }	t          j        |	dddddf         ||
f          }	|	                    ||
z            }	t          j                  ddddf         }t          j        |||
f                              ||
z            }|	|z   }	|	                                dz
  k    rdz
  |	|	dz
  k    <   t          j        ||	dd           |S )an  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r#   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                     t          | z  z  z             }t          |          }|z  k    rz  }| dz
  z
  |k     rt          | dz
  z
  d          }|S )z;Given input length, compute how many spans should be maskedr#   r   )intmax)input_lengthnum_masked_spanepsilonr?   r>   rA   sequence_lengths     r<   compute_num_masked_spanz6_compute_mask_indices.<locals>.compute_num_masked_span   s~    i,6DwNOOoy99 [(?::-<O ;?+o==!,+/"BAFFOr;   Nc                     g | ]}S r:   r:   ).0_rK   s     r<   
<listcomp>z)_compute_mask_indices.<locals>.<listcomp>   s    999!o999r;   dtyper   F)replace)
ValueErrornprandomranditemdetachsumtolistrangezerosboolchoicearangelenconcatenateonesint32appendarraybroadcast_toreshaperG   put_along_axis)r=   r>   r?   r@   rA   
batch_sizerL   input_lengthsspec_aug_maskspec_aug_mask_idxsmax_num_masked_spanrH   rI   spec_aug_mask_idxdummy_mask_idxoffsetsrJ   rK   s    `` `           @@r<   _compute_mask_indicesrs   l   sP   0 #(JQABBB_$$:^i : :'6: : :
 
 	
 innQ$$&&G        $ % 	##B''..0009999uZ'8'8999  Hj/:$GGGM11/BBa% 5 511,?? I,,IlkAo677RW - 
 
  !!Q&& -q0NN.q1NN(;o(MUWU] ^ ^ ^ao op
 
 	!!"34444"455 111aaa:&5H+(V  ,33J@SVa@abb i$$T4]3Gog
4G'UVV^^'+5 G ,g5 /A"555GVYZGZ-!0CCD m%7B???r;   features_shapenum_negativesmask_time_indicesc                    | \  }}t          j        |          }t          j        |||ft           j                  }||                    t
                    nt          j        | t
                    }t          |          D ]}||                                         dz
  }|||                  }	t          j	        t          j        |dz             dddf         |dz   |f          }
t           j
                            d||dz   |f          }|||
k    xx         dz  cc<   |	|         ||         ||         <   ||xx         ||z  z  cc<   |S )z>
    Sample `num_negatives` vectors from feature vectors.
    )r=   rS   NrR   r#   r   )size)rV   ra   r^   re   astyper_   rd   r]   r[   rh   rW   randint)rt   ru   rv   rk   rK   sequence_length_rangesampled_negative_indices	batch_idxhighmapped_masked_indicesfeature_indicessampled_indicess               r<   _sample_negative_indicesr      s    #1J Io66  "xz?M.Zbdbjkkk +<*G  &&&RWUckoMpMpMp  :&& K K	 +//11A5 56G	6R S/")D1H*=*=aaag*FPQS`Habb)++At4!8]:S+TT?:;;;q@;;; MbbqLr +,=i,HI 	!+++y?/JJ++++##r;   c                   &     e Zd Zd fd	Zd Z xZS )Wav2Vec2NoLayerNormConvLayerr   c                 Z   t                                                       |dk    r|j        |dz
           nd| _        |j        |         | _        t          j        | j        | j        |j        |         |j        |         |j	                  | _
        t          |j                 | _        d S )Nr   r#   kernel_sizestridebias)super__init__conv_dimin_conv_dimout_conv_dimr   Conv1dconv_kernelconv_stride	conv_biasconvr   feat_extract_activation
activationselfconfiglayer_id	__class__s      r<   r   z%Wav2Vec2NoLayerNormConvLayer.__init__  s    <DqLL6?8a<88a"OH5I*84%h/!
 
 
	 !!?@r;   c                 Z    |                      |          }|                     |          }|S N)r   r   r   r.   s     r<   forwardz$Wav2Vec2NoLayerNormConvLayer.forward  s*    		-0066r;   r   r2   r3   r4   r   r   __classcell__r   s   @r<   r   r     sR        A A A A A A      r;   r   c                   &     e Zd Zd fd	Zd Z xZS )Wav2Vec2LayerNormConvLayerr   c                    t                                                       |dk    r|j        |dz
           nd| _        |j        |         | _        t          j        | j        | j        |j        |         |j        |         |j	                  | _
        t          j        | j        d          | _        t          |j                 | _        d S )Nr   r#   r   T)elementwise_affine)r   r   r   r   r   r   r   r   r   r   r   	LayerNorm
layer_normr   r   r   r   s      r<   r   z#Wav2Vec2LayerNormConvLayer.__init__  s    <DqLL6?8a<88a"OH5I*84%h/!
 
 
	 ,t'8TRRR !?@r;   c                     |                      |          }|                    dd          }|                     |          }|                    dd          }|                     |          }|S )NrM   )r   	transposer   r   r   s     r<   r   z"Wav2Vec2LayerNormConvLayer.forward,  se    		-00%//B7766%//B7766r;   r   r   r   s   @r<   r   r     sR        A A A A A A      r;   r   c                   &     e Zd Zd fd	Zd Z xZS )Wav2Vec2GroupNormConvLayerr   c                    t                                                       |dk    r|j        |dz
           nd| _        |j        |         | _        t          j        | j        | j        |j        |         |j        |         |j	                  | _
        t          |j                 | _        t          j        | j        | j        d          | _        d S )Nr   r#   r   T)
num_groupsnum_channelsaffine)r   r   r   r   r   r   r   r   r   r   r   r   r   r   	GroupNormr   r   s      r<   r   z#Wav2Vec2GroupNormConvLayer.__init__8  s    <DqLL6?8a<88a"OH5I*84%h/!
 
 
	 !!?@,$2CRVRclpqqqr;   c                     |                      |          }|                     |          }|                     |          }|S r   )r   r   r   r   s     r<   r   z"Wav2Vec2GroupNormConvLayer.forwardH  s;    		-006666r;   r   r   r   s   @r<   r   r   7  sR        r r r r r r       r;   r   c                   $     e Zd Z fdZd Z xZS )Wav2Vec2PositionalConvEmbeddingc                    t                                                       t          j        |j        |j        |j        |j        dz  |j                  | _        t          j        j	        }t          t          j        j        d          rt          j        j        j	        }t                      rdd l}|j                            | j        j        d          5   || j        dd          | _        d d d            n# 1 swxY w Y   t          | j        d          r-| j        j        j        j        }| j        j        j        j        }n| j        j        }| j        j        }|j                            | |           |j                            | |           n || j        dd          | _        t-          |j                  | _        t0          |j                 | _        d S )	Nr&   )r   paddinggroupsweight_normr   )modifier_rankweight)namedimparametrizations)r   r   r   r   hidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupsr   utilsr   hasattrr   r   	deepspeedzeroGatheredParametersr   	original0	original1weight_gweight_vregister_external_parameterWav2Vec2SamePadLayerr   r   r   r   )r   r   r   r   r   r   r   s         r<   r   z(Wav2Vec2PositionalConvEmbedding.__init__P  s   I62a77
 
 
	 h*28,m<< 	@(3?K%'' 	E22493CST2UU I I'K	aHHH	I I I I I I I I I I I I I I Ity"455 .95<F95<F9-9-N66tXFFFN66tXFFFF#DIH!DDDDI+F,JKK !?@s   C??DDc                     |                     dd          }|                     |          }|                     |          }|                     |          }|                     dd          }|S Nr#   r&   )r   r   r   r   r   s     r<   r   z'Wav2Vec2PositionalConvEmbedding.forwardq  se    %//155		-00]3366%//155r;   r   r   s   @r<   r   r   O  sM        A A A A AB      r;   r   c                   $     e Zd Z fdZd Z xZS )r   c                 l    t                                                       |dz  dk    rdnd| _        d S )Nr&   r   r#   )r   r   num_pad_remove)r   r   r   s     r<   r   zWav2Vec2SamePadLayer.__init__}  s:    #:Q#>!#C#Caar;   c                 J    | j         dk    r|d d d d d | j          f         }|S Nr   )r   r   s     r<   r   zWav2Vec2SamePadLayer.forward  s;    "")!!!QQQ0F43F2F0F*FGMr;   r   r   s   @r<   r   r   |  sL        K K K K K      r;   r   c                   .     e Zd ZdZ fdZd Zd Z xZS )Wav2Vec2FeatureEncoderz.Construct the features from raw audio waveformc                    t                                                       j        dk    r7t          d          gfdt	          j        dz
            D             z   }nDj        dk    r!fdt	          j                  D             }nt          dj         d	          t          j        |          | _	        d
| _
        d| _        d S )Ngroupr   r   c                 8    g | ]}t          |d z             S )r#   r   )r   rO   ir   s     r<   rQ   z3Wav2Vec2FeatureEncoder.__init__.<locals>.<listcomp>  s>     N N NIJ,Va!eDDDN N Nr;   r#   layerc                 2    g | ]}t          |           S )r   )r   r   s     r<   rQ   z3Wav2Vec2FeatureEncoder.__init__.<locals>.<listcomp>  s4       CD*6A>>>  r;   z`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)r   r   feat_extract_normr   r]   num_feat_extract_layersrU   r   
ModuleListconv_layersgradient_checkpointing_requires_grad)r   r   r   r   s    ` r<   r   zWav2Vec2FeatureEncoder.__init__  s   #w..5fqIIIJ N N N NNSTZTruvTvNwNwN N N KK %00   HMfNlHmHm  KK t1Ittt   =55&+#"r;   c                 P    |                                  D ]	}d|_        
d| _        d S NF)
parametersrequires_gradr   r   params     r<   _freeze_parametersz)Wav2Vec2FeatureEncoder._freeze_parameters  s4    __&& 	( 	(E"'E#r;   c                 r    |d d d f         }| j         r| j        rd|_        | j        D ]} ||          }|S )NT)r   trainingr   r   )r   input_valuesr.   
conv_layers       r<   r   zWav2Vec2FeatureEncoder.forward  s[    $QQQW-  	/4= 	/*.M'* 	6 	6J&J}55MMr;   )r2   r3   r4   r5   r   r   r   r   r   s   @r<   r   r     s\        88# # # # #&$ $ $

 
 
 
 
 
 
r;   r   c                        e Zd Z fdZ xZS )Wav2Vec2FeatureExtractorc                     t                                          |           t          j        d| j        j         d| j        j        d         j         dt                     d S )NzThe class `zD` has been depreciated and will be removed in Transformers v5. Use `r   z
` instead.)r   r   warningswarnr   r2   	__bases__FutureWarningr   r   r   s     r<   r   z!Wav2Vec2FeatureExtractor.__init__  sy       E$.1 E EN,Q/8E E E 		
 	
 	
 	
 	
r;   )r2   r3   r4   r   r   r   s   @r<   r   r     s8        
 
 
 
 
 
 
 
 
r;   r   c                   $     e Zd Z fdZd Z xZS )Wav2Vec2FeatureProjectionc                 .   t                                                       t          j        |j        d         |j                  | _        t          j        |j        d         |j                  | _	        t          j
        |j                  | _        d S )NrM   eps)r   r   r   r   r   layer_norm_epsr   Linearr   
projectionDropoutfeat_proj_dropoutdropoutr   s     r<   r   z"Wav2Vec2FeatureProjection.__init__  sn    ,vr':@UVVV)FOB$79KLLz&":;;r;   c                     |                      |          }|                     |          }|                     |          }||fS r   )r   r   r  )r   r.   norm_hidden_statess      r<   r   z!Wav2Vec2FeatureProjection.forward  sC    !__];;(:;;]33000r;   r   r   s   @r<   r   r     sG        < < < < <1 1 1 1 1 1 1r;   r           modulequerykeyvaluescalingr  	head_maskc                    ||                     d          dz  }t          j        ||                    dd                    |z  }	||	|z   }	t          j                            |	d          }	||	|                    dddd          z  }	t          j                            |	|| j	                  }	t          j        |	|          }
|
                    dd          
                                }
|
|	fS )NrM         r&   r
   r   r#   )pr   )rx   r6   matmulr   r   
functionalsoftmaxviewr  r   
contiguous)r  r  r  r  r@   r	  r  r
  kwargsattn_weightsattn_outputs              r<   eager_attention_forwardr    s     **R..D(<s}}Q':':;;gEL!#n4=((2(>>L#innQAq&A&AA=((6?([[L,|U33K''1--88::K$$r;   c                   >    e Zd ZdZ	 	 	 	 	 ddededed	ed
ededee         f fdZ		 	 	 	 dde
j        dee
j                 dee
j                 dee
j                 dee         dee         dee
j        ee
j                 eee
j                          f         fdZ xZS )Wav2Vec2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperr  FTN	embed_dim	num_headsr  
is_decoderr   	is_causalr   c                 
   t                                                       || _        || _        || _        ||z  | _        || _        | j        |z  | j        k    rt          d| j         d| d          | j        dz  | _        || _	        || _
        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r  )r   )r   r   r  r  r  head_dimr   rU   r	  r  r  r   r   k_projv_projq_projout_proj)	r   r  r  r  r  r   r  r   r   s	           r<   r   zWav2Vec2Attention.__init__  s    	""!Y.MI%$.883dn 3 3%.3 3 3   }d*$"i	94@@@i	94@@@i	94@@@	)YTBBBr;   r.   key_value_statesr@   layer_head_maskoutput_attentionsr  rB   c                    |du}|j         dd         \  }}	|r|j         d         n|	}
||	d| j        f}||
d| j        f} |                     |          j        |                     dd          }|r|n|} |                     |          j        |                     dd          } |                     |          j        |                     dd          }t          }| j        j	        dk    rt          | j        j	                 } || ||||f| j        sdn| j        | j        ||d|\  }}|                    ||	d                                          }|                     |          }||dfS )z#Input shape: Batch x Time x ChannelNrM   r#   r&   eagerr  )r  r	  r&  r
  )r=   r  r"  r  r   r   r!  r  r   _attn_implementationr   r   r  r	  ri   r  r#  )r   r.   r$  r@   r%  r&  r  is_cross_attentionbsztgt_lensrc_lenq_input_shapekv_input_shapequery_statescurrent_states
key_statesvalue_statesattention_interfacer  r  s                       r<   r   zWav2Vec2Attention.forward
  s    .T9 %*3B3/W/AN"(++wgr4=9wDM: 7t{{=116FPPQRTUVV-?R))]5T[[005~FPPQRTUVV
7t{{>227HRRSTVWXX(?;+w66"9$+:Z"[$7$7%
  $}>CC$,L/%%
 %
 %
 %
!\ "))#w;;FFHHmmK00L$..r;   )r  FTFN)NNNF)r2   r3   r4   r5   rF   floatr_   r   r$   r   r6   Tensorr   r   r9   r   r   r   s   @r<   r  r    s^       GG  +/C CC C 	C
 C C C (C C C C C CD 481526,13/ 3/|3/ #5<03/ !.	3/
 "%,/3/ $D>3/ -.3/ 
u|Xel3XeEL>Q5RR	S3/ 3/ 3/ 3/ 3/ 3/ 3/ 3/r;   r  c                   $     e Zd Z fdZd Z xZS )Wav2Vec2FeedForwardc                    t                                                       t          j        |j                  | _        t          j        |j        |j                  | _	        t          |j        t                    rt          |j                 | _        n|j        | _        t          j        |j        |j                  | _        t          j        |j                  | _        d S r   )r   r   r   r   activation_dropoutintermediate_dropoutr   r   intermediate_sizeintermediate_dense
isinstance
hidden_actstrr   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutr   s     r<   r   zWav2Vec2FeedForward.__init__A  s    $&Jv/H$I$I!"$)F,>@X"Y"Yf'-- 	9'-f.?'@D$$'-'8D$If&>@RSS j)>??r;   c                     |                      |          }|                     |          }|                     |          }|                     |          }|                     |          }|S r   )r=  rA  r;  rB  rD  r   s     r<   r   zWav2Vec2FeedForward.forwardN  sg    //>>00??11-@@))-88++M::r;   r   r   s   @r<   r8  r8  @  sL        @ @ @ @ @      r;   r8  c                   &     e Zd Z fdZddZ xZS )Wav2Vec2EncoderLayerc                    t                                                       t          |j        |j        |j        d|          | _        t          j        |j	                  | _
        t          j        |j        |j                  | _        t          |          | _        t          j        |j        |j                  | _        d S )NFr  r  r  r  r   r   )r   r   r  r   num_attention_headsattention_dropout	attentionr   r   rC  r  r   r   r   r8  feed_forwardfinal_layer_normr   s     r<   r   zWav2Vec2EncoderLayer.__init__Y  s    *(0,
 
 
 z&"788,v'9v?TUUU/77 "V-?VEZ [ [ [r;   NFc                    |}|                      |||          \  }}}|                     |          }||z   }|                     |          }||                     |          z   }|                     |          }|f}|r||fz  }|S Nr@   r&  )rL  r  r   rM  rN  r   r.   r@   r&  attn_residualr  rP   outputss           r<   r   zWav2Vec2EncoderLayer.forwardh  s    %)-.L] *8 *
 *
&|Q ]33%566%(9(9-(H(HH--m<< " 	'&Gr;   r   r   r   s   @r<   rG  rG  X  sQ        \ \ \ \ \       r;   rG  c                   X     e Zd Z fdZ	 	 ddej        deej                 defdZ xZ	S )	#Wav2Vec2EncoderLayerStableLayerNormc                    t                                                       t          |j        |j        |j        d|          | _        t          j        |j	                  | _
        t          j        |j        |j                  | _        t          |          | _        t          j        |j        |j                  | _        t#          |dd           t%          |          | _        d S d | _        d S )NFrI  r   adapter_attn_dim)r   r   r  r   rJ  rK  rL  r   r   rC  r  r   r   r   r8  rM  rN  getattrWav2Vec2AttnAdapterLayeradapter_layerr   s     r<   r   z,Wav2Vec2EncoderLayerStableLayerNorm.__init__}  s    *(0,
 
 
 z&"788,v'9v?TUUU/77 "V-?VEZ [ [ [6-t44@!9&!A!AD!%Dr;   NFr.   r@   r&  c                 J   |}|                      |          }|                     |||          \  }}}|                     |          }||z   }||                     |                     |                    z   }| j        ||                     |          z   }|f}|r||fz  }|S rP  )r   rL  r  rM  rN  r[  rR  s           r<   r   z+Wav2Vec2EncoderLayerStableLayerNorm.forward  s     &66)-.L] *8 *
 *
&|Q ]33%5%(9(9$:O:OP]:^:^(_(__))D,>,>},M,MMM " 	'&Gr;   r   )
r2   r3   r4   r   r6   r6  r   r_   r   r   r   s   @r<   rV  rV  |  s~        & & & & &, 26"'	 | !.  	       r;   rV  c                        e Zd Z fdZ	 	 	 	 ddej        deej                 deded	ef
d
Z	de
ej        df         dej        fdZ xZS )Wav2Vec2Encoderc                    t                                                       | _        t                    | _        t          j        j        j                  | _	        t          j
        j                  | _        t          j        fdt          j                  D                       | _        d| _        d S )Nr   c                 .    g | ]}t                    S r:   )rG  rO   rP   r   s     r<   rQ   z,Wav2Vec2Encoder.__init__.<locals>.<listcomp>  s"    $k$k$ka%9&%A%A$k$k$kr;   Fr   r   r   r   pos_conv_embedr   r   r   r   r   r   rC  r  r   r]   num_hidden_layerslayersr   r   s    `r<   r   zWav2Vec2Encoder.__init__  s    =fEE,v'9v?TUUUz&"788m$k$k$k$k5QWQiKjKj$k$k$kll&+###r;   NFTr.   r@   r&  output_hidden_statesreturn_dictc                    |rdnd }|rdnd }|;|                     d                              dd|j        d                   }d|| <   |                     ||          }|                     |          }	||	z   }|                     |          }|                     |          }t                      pt          |           }
| j	        D ]a}|r||fz   }t          j        g           }| j        o|| j        j        k     }|r|
r ||||          }|d         }|rd}|r||d         fz   }b|r||fz   }|st          d |||fD                       S t!          |||	          S )
Nr:   rM   r#   r&   r   rQ  NNc              3      K   | ]}||V  	d S r   r:   rO   vs     r<   	<genexpr>z*Wav2Vec2Encoder.forward.<locals>.<genexpr>  (      mmq_`_l_l_l_l_lmmr;   last_hidden_stater.   r/   )	unsqueezerepeatr=   _update_full_maskrc  r   r  r   r   re  r6   rX   r   r   	layerdropr9   r   r   r.   r@   r&  rf  rg  all_hidden_statesall_self_attentionsexpand_attention_maskposition_embeddingssynced_gpusr   dropout_probabilityskip_the_layerlayer_outputss                  r<   r   zWav2Vec2Encoder.forward  s    #7@BBD$5?bb4%$2$<$<R$@$@$G$G1mNabcNd$e$e!45M001//
 

 #11-@@%(;;66]33022R6LT6R6R[ 	P 	PE# I$58H$H! #(*R..!]Z/BT[EZ/ZN! 1[ 1 %!.Te! ! ! !.a 0 - ,  P&9]1=M<O&O# 	E 1]4D D 	nmm]4EGZ$[mmmmmm++*
 
 
 	
r;   inputs_embedsc                 *   || j         j        dk    r	d|v r|nd }nw| j         j        dk    rt          ||j                  }nQ| j         j        dk    r,t	          |t
          j                  rt          |d          }nt          ||j                  }|S Nflash_attention_2r   sdpaflex_attentionF)r  	r   r)  r   rS   r>  r6   r6  r%   r   r   r@   r~  s      r<   rs  z!Wav2Vec2Encoder._update_full_mask      
 %{/3FFF343F3FD1V;; "E^UbUh!i!i15EEEnel;; b%@[`%a%a%aN "<NML_!`!`r;   NFFT)r2   r3   r4   r   r6   tensorr   r6  r_   r   r   rs  r   r   s   @r<   r^  r^    s        , , , , , 26"'%* :
 :
|:
 !.:
  	:

 #:
 :
 :
 :
 :
zelD01 |       r;   r^  c                   b     e Zd Z fdZ	 	 	 	 d	dZdeej        df         dej        fdZ xZ	S )
Wav2Vec2EncoderStableLayerNormc                    t                                                       | _        t                    | _        t          j        j        j                  | _	        t          j
        j                  | _        t          j        fdt          j                  D                       | _        d| _        d S )Nr   c                 .    g | ]}t                    S r:   )rV  ra  s     r<   rQ   z;Wav2Vec2EncoderStableLayerNorm.__init__.<locals>.<listcomp>  s"    bbbQ088bbbr;   Frb  r   s    `r<   r   z'Wav2Vec2EncoderStableLayerNorm.__init__	  s    =fEE,v'9v?TUUUz&"788mbbbb%H`BaBabbb
 
 ',###r;   NFTc                    |rdnd }|rdnd }|;|                     d                              dd|j        d                   }d|| <   |                     ||          }|                     |          }	||	z   }|                     |          }t                      pt          |           }
| j        D ]a}|r||fz   }t          j
        g           }| j        o|| j        j        k     }|r|
r ||||          }|d         }|rd}|r||d         fz   }b|                     |          }|r||fz   }|st          d |||fD                       S t!          |||	          S )
Nr:   rM   r#   r&   r   rQ  ri  c              3      K   | ]}||V  	d S r   r:   rk  s     r<   rm  z9Wav2Vec2EncoderStableLayerNorm.forward.<locals>.<genexpr>K  rn  r;   ro  )rq  rr  r=   rs  rc  r  r   r   re  r6   rX   r   r   rt  r   r9   r   ru  s                  r<   r   z&Wav2Vec2EncoderStableLayerNorm.forward  s    #7@BBD$5?bb4%$2$<$<R$@$@$G$G1mNabcNd$e$e!45M001//
 

 #11-@@%(;;]33022R6LT6R6R[ 	P 	PE# I$58H$H! #(*R..!]Z/BT[EZ/ZN! 1[ 1 !&!.Te! ! ! !.a 0 - ,  P&9]1=M<O&O#66 	E 1]4D D 	nmm]4EGZ$[mmmmmm++*
 
 
 	
r;   r@   r~  c                 *   || j         j        dk    r	d|v r|nd }nw| j         j        dk    rt          ||j                  }nQ| j         j        dk    r,t	          |t
          j                  rt          |d          }nt          ||j                  }|S r  r  r  s      r<   rs  z0Wav2Vec2EncoderStableLayerNorm._update_full_maskS  r  r;   r  )
r2   r3   r4   r   r   r   r6   r6  rs  r   r   s   @r<   r  r    s        	, 	, 	, 	, 	, "<
 <
 <
 <
~elD01 |       r;   r  c                   B     e Zd ZdZ fdZedd            ZddZ xZS )Wav2Vec2GumbelVectorQuantizerz
    Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH
    GUMBEL-SOFTMAX](https://huggingface.co/papers/1611.01144) for more information.
    c                    t                                                       |j        | _        |j        | _        |j        | j        z  dk    r t          d|j         d| j         d          t          j	        t          j        d| j        | j        z  |j        | j        z                      | _        t          j        |j        d         | j        | j        z            | _        d| _        d S )Nr   z`config.codevector_dim z5 must be divisible by `config.num_codevector_groups` z for concatenationr#   rM   r&   )r   r   num_codevector_groupsr   num_codevectors_per_groupnum_varscodevector_dimrU   r   	Parameterr6   r7   codevectorsr   r   weight_projtemperaturer   s     r<   r   z&Wav2Vec2GumbelVectorQuantizer.__init__p  s     68 4?2a77Y&*? Y Y59_Y Y Y   <a4=!@&BW[_[jBjkk
 
 9V_R%8$/DM:YZZ r;   Nc           	         ||                                 d d d d f                             | j                  }t          j        || t          j        |                     } |                     d          |                                z  }n|                     d          }t          j        t          j        |t          j	        |dz             z  d                                                     }|S )Nr   r  gHz>rM   )
flattenexpandr=   r6   where
zeros_liker[   meanexplog)probsmaskmask_extendedmarginal_probs
perplexitys        r<   _compute_perplexityz1Wav2Vec2GumbelVectorQuantizer._compute_perplexity  s     LLNN111dD=9@@MMMKue6Fu6M6MNNE"YY1Y--

:NN"ZZAZ..NY	.59^VZEZ;[;[*[ac d d ddeeiikk
r;   c                    |j         \  }}}|                     |          }|                    ||z  | j        z  d          }| j        rt
          j                            |                                | j	        d          
                    |          }t          j        |                    ||z  | j        d                                          d          }|                     ||          }n|                    d          }	|                    |j                                       d|	                    dd          d          }|                    ||z  | j        d          }|                     ||          }|                    ||z  d          }|                    d          | j        z  }
|
                    ||z  | j        | j        d          }|                    d                              ||d          }||fS )NrM   T)tauhardr  r#         ?r   )r=   r  r  r   r   r   r  gumbel_softmaxr5  r  type_asr6   r  r  argmax	new_zerosscatter_rq  r  r  r[   )r   r.   rv   rk   rK   r   codevector_probscodevector_soft_distr  codevector_idxcodevectors_per_groupr  s               r<   r   z%Wav2Vec2GumbelVectorQuantizer.forward  s   3@3F0
O[ ((77%**:+G$/+Y[]^^= 	W!};;##%%4+;$  <    gm$$ 
 $)="":#?RTUU[[]]ce$ $ $  112FHYZZJJ +11b199N,66}7JKKTTN''A..     044Z/5QSWSbdfgg112BDUVVJ+00o1MrRR 0 : :2 > >AQ Q+00o1Mt`d`moqrr!oob))..z?BOOJ&&r;   r   )	r2   r3   r4   r5   r   staticmethodr  r   r   r   s   @r<   r  r  j  sv         
    ( 	 	 	 \	#' #' #' #' #' #' #' #'r;   r  c                   $     e Zd Z fdZd Z xZS )Wav2Vec2Adapterc                    t                                                       j        j        k    rCt	          j        j        j                  | _        t	          j        j                  | _        nd x| _        | _        t	          j	        fdt          j                  D                       | _        j        | _        d S )Nc              3   6   K   | ]}t                    V  d S r   )Wav2Vec2AdapterLayerra  s     r<   rm  z+Wav2Vec2Adapter.__init__.<locals>.<genexpr>  s,      #k#kQ$8$@$@#k#k#k#k#k#kr;   )r   r   output_hidden_sizer   r   r   projr   proj_layer_normr   r]   num_adapter_layersre  rt  r   s    `r<   r   zWav2Vec2Adapter.__init__  s     $(:::	&"4f6OPPDI#%<0I#J#JD  /33DI,m#k#k#k#k%PVPiJjJj#k#k#kkk)r;   c                 X   | j         1| j        *|                      |          }|                     |          }|                    dd          }| j        D ]=}t          j                                        }| j        r|| j        k    r ||          }>|                    dd          }|S r   )r  r  r   re  rV   rW   r   rt  )r   r.   r   layerdrop_probs       r<   r   zWav2Vec2Adapter.forward  s    9 T%9%E IIm44M 00??M%//155[ 	5 	5EY--//N= 5^dn%D%D %m 4 4%//155r;   r   r   s   @r<   r  r    sG        * * * * *      r;   r  c                   $     e Zd Z fdZd Z xZS )r  c                     t                                                       t          j        |j        d|j        z  |j        |j        d          | _        d S )Nr&   r#   )r   r   )r   r   r   r   r  adapter_kernel_sizeadapter_strider   r   s     r<   r   zWav2Vec2AdapterLayer.__init__  sU    I%))&(
 
 
			r;   c                 r    |                      |          }t          j                            |d          }|S )Nr#   r  )r   r   r  glur   s     r<   r   zWav2Vec2AdapterLayer.forward  s3    		-00))-Q)??r;   r   r   s   @r<   r  r    sG        
 
 
 
 
      r;   r  c                   4     e Zd Z fdZdej        fdZ xZS )rZ  c                 t   t                                                       |j        | _        |j        | _        t          j        | j                  | _        t          j	        | j        | j                  | _
        t          j                    | _        t          j	        | j        | j                  | _        dS )z
        Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed
        up training throughput.
        N)r   r   rX  	input_dimr   
hidden_dimr   r   normr   linear_1ReLUact_fnlinear_2r   s     r<   r   z!Wav2Vec2AttnAdapterLayer.__init__  s    
 	0 ,L11		$/4>BBgii	$.$/BBr;   r.   c                     |                      |          }|                     |          }|                     |          }|                     |          }|S r   )r  r  r  r  r   s     r<   r   z Wav2Vec2AttnAdapterLayer.forward  sL    		-00m44M22m44r;   )r2   r3   r4   r   r6   r7   r   r   r   s   @r<   rZ  rZ    s[        C C C C CU%6        r;   rZ  c                       e Zd ZU eed<   dZdZdZdZdZ	dZ
d Z	 ddeej        ef         dee         fd	Z	 dd
edej        fdZd Zd ZddefdZdS )Wav2Vec2PreTrainedModelr   wav2vec2r   Tc           
         t          |t                    rL|j                                         |j                                         d|j        _        d|j        _        dS t          |t                    ro|j        j        j	        
                    dd           |j        j        j	                                         t          j                            |j                   dS t          |t"                    rt          j        
                    |j        j        ddt'          j        d|j        j        d         |j        j        z  z            z             t          j                            |j        j        d           dS t          |t0                    r}t'          j        d|j        j        z            }t          j                            |j        j        | |           t          j                            |j        j        | |           dS t          |t          j                  rT|j        j	        
                    d| j        j                   |j         |j        j	                                         dS dS t          |t          j        t          j        f          r?|j        j	                                         |j        j	                             d	           dS t          |t          j!                  rt          j        "                    |j                   |j        [t'          j        |j#        |j        |j        d         z  z            }t          j                            |j        | |           dS dS dS )
zInitialize the weightsTr  r#   )r  stdr   r&   )abNr  )$r>  Wav2Vec2ForPreTrainingproject_hidreset_parameters	project_q_is_hf_initializedr  r  r   datanormal_r   zero_r   inituniform_r  r   r   mathsqrtr   in_channels	constant_r   r   in_featuresr   r   initializer_ranger   r   fill_r   kaiming_normal_r   )r   r  ks      r<   _init_weightsz%Wav2Vec2PreTrainedModel._init_weights
  s    f455 "	9//111--///48F126F/// =>> 	9%*222CCC#(..000GV/00000 ?@@ 	9GOO"	!v{'>q'AFKD['["\]]]    
 Gfk.22222 9:: 	9	!f/;;<<AGV.5!qAAAGV.3rQ?????	** 	9M&&CT[5R&SSS{& &&((((( '&r| <== 	9K""$$$M$$S)))))	** 	9G##FM222{&Ifmv/AFDVWXDY/YZ[[  a 88888	9 	9 '&r;   Nrl   add_adapterc                    || j         j        n|}d }t          | j         j        | j         j                  D ]\  }} ||||          }|r3t          | j         j                  D ]} ||d| j         j                  }|S )zH
        Computes the output length of the convolutional layers
        Nc                 <    t          j        | |z
  |d          dz   S )Nfloor)rounding_moder#   )r6   divrH   r   r   s      r<   _conv_out_lengthzRWav2Vec2PreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length:  s&     9\K7wWWWZ[[[r;   r#   )r   r  zipr   r   r]   r  r  )r   rl   r  r  r   r   rP   s          r<    _get_feat_extract_output_lengthsz8Wav2Vec2PreTrainedModel._get_feat_extract_output_lengths1  s     2=1Ddk--+	\ 	\ 	\
 $'t{'>@W#X#X 	Q 	QK,,]KPPMM 	_4;9:: _ _ 0 04;C] ^ ^r;   feature_vector_lengthr@   c                    |                     d          d d df         }|                     ||          }|                    t          j                  }|j        d         }t          j        ||f|j        |j                  }d|t          j	        |j        d         |j                  |dz
  f<   |
                    dg                               d          
                    dg                                          }|S )NrM   r  r  r   )rS   devicer#   )r  )cumsumr  tor6   longr=   r^   rS   r  ra   flipr_   )r   r  r@   r  non_padded_lengthsoutput_lengthsrk   s          r<   "_get_feature_vector_attention_maskz:Wav2Vec2PreTrainedModel._get_feature_vector_attention_maskH  s   
 ,22r2::111b5A>>?Q_j>kk'**5:66#)!,
./~7KTbTi
 
 
 uv^%9!%<^EZ[[[]kno]opq',,bT2299"==BBB4HHMMOOr;   c                    | j         j        t          | j         d          i }|                                 D ]N\  }}t          |t                    r4|                                D ]\  }}||d                    ||g          <    Ot          | t                    r9| j
                                        D ]\  }}||d                    d|g          <    |S )NzF has no adapter layers. Make sure to define `config.adapter_attn_dim`..lm_head)r   rX  rU   r   named_modulesr>  rZ  named_parametersjoinWav2Vec2ForCTCr  )r   adapter_weightsr   r  
param_namer   s         r<   _get_adaptersz%Wav2Vec2PreTrainedModel._get_adapters\  s    ;'/vvvwww ..00 	J 	JLD&&":;; J)/)@)@)B)B J J%JDIOCHHdJ-?$@$@AAdN++ 	E#|<<>> E Ee?D)T): ; ;<<r;   c                     |                                  D ],}t          |t                    r|                     |           -t          | t                    r|                     | j                   dS dS )zc
        (Re-)initialize attention adapter layers and lm head for adapter-only fine-tuning
        N)modulesr>  rZ  r  r  r  )r   r  s     r<   init_adapter_layersz+Wav2Vec2PreTrainedModel.init_adapter_layersl  s}    
 llnn 	+ 	+F&":;; +""6*** dN++ 	-t|,,,,,	- 	-r;   target_langc                 2   | j         j        t          d| d          || j        k    r"|s t                              d| d           dS |                    dd          }|                    dd          }|                    d	d          }|                    d
d          }|                    dd          }|                    dd          }	|                    dd          }
|                    dd          }|                    dd          }|
-t          j        dt                     |	t          d          |
}	| j         j
        }d}|dur|t                              |          }	 t          |||||||	||	  	        }t          |          }n9# t          $ r |r Y n*t           $ r |rt          d| d| d| d          Y nw xY w|t"                              |          }	 t          |||||||	||	  	        }t%                       t'          j        |dd          }n;# t          $ r  t          $ r  t           $ r t          d| d| d| d          w xY w|                                 t-          |                                          t-                                                    z
  }t-                                                    t-          |                                          z
  }t1          |          dk    r)t          d| dd                    |           d          t1          |          dk    r)t          d| dd                    |           d          |d         j        d         }|| j         j        k    r=t9          j        | j         j        || j        | j                   | _!        || j         _        fd |"                                D             }| #                    |d!           || _        dS )"a  
        Load a language adapter model from a pre-trained adapter model.

        Parameters:
            target_lang (`str`):
                Has to be a language id of an existing adapter weight. Adapter weights are stored in the format
                adapter.<lang>.safetensors or adapter.<lang>.bin
            force_load (`bool`, defaults to `True`):
                Whether the weights shall be loaded even if `target_lang` matches `self.target_lang`.
            cache_dir (`Union[str, os.PathLike]`, *optional*):
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                cached versions if they exist.
            resume_download:
                Deprecated and ignored. All downloads are now resumed by default when possible.
                Will be removed in v5 of Transformers.
            proxies (`dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            local_files_only(`bool`, *optional*, defaults to `False`):
                Whether or not to only look at local files (i.e., do not try to download the model).
            token (`str` or `bool`, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
                the token generated when running `hf auth login` (stored in `~/.huggingface`).
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.

                <Tip>

                To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>"`.

                </Tip>

            mirror (`str`, *optional*):
                Mirror source to accelerate downloads in China. If you are from China and have an accessibility
                problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
                Please refer to the mirror site for more information.

        <Tip>

        Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to
        use this method in a firewalled environment.

        </Tip>

        Examples:

        ```python
        >>> from transformers import Wav2Vec2ForCTC, AutoProcessor

        >>> ckpt = "facebook/mms-1b-all"
        >>> processor = AutoProcessor.from_pretrained(ckpt)
        >>> model = Wav2Vec2ForCTC.from_pretrained(ckpt, target_lang="eng")
        >>> # set specific language
        >>> processor.tokenizer.set_target_lang("spa")
        >>> model.load_adapter("spa")
        ```
        NzCannot load_adapter for - if `config.adapter_attn_dim` is not defined.z#Adapter weights are already set to r  	cache_dirforce_downloadFresume_downloadproxieslocal_files_onlytokenuse_auth_tokenrevisionuse_safetensorszrThe `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.zV`token` and `use_auth_token` are both specified. Please set only the argument `token`.)filenamer  r  r  r  r  r  r  zCan't load the model for 'z'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'z=' is the correct path to a directory containing a file named cpuT)map_locationweights_onlyr   zThe adapter weights z has unexpected keys: z, z has missing keys: zlm_head.weightr  rS   c                 N    i | ]!\  }}||                     |                   "S r:   )r  )rO   r  rl  r	  s      r<   
<dictcomp>z8Wav2Vec2PreTrainedModel.load_adapter.<locals>.<dictcomp>5  s0    QQQdaaoa011QQQr;   )strict)$r   rX  rU   r  loggerwarningpopr   r   r   _name_or_pathWAV2VEC2_ADAPTER_SAFE_FILEformatr   safe_load_fileOSError	ExceptionWAV2VEC2_ADAPTER_PT_FILEr   r6   loadr  setkeysrb   r  r=   
vocab_sizer   r   r  r  rS   r  itemsload_state_dict)r   r  
force_loadr  r  r  r  r  r  r  r  r  r  model_path_or_id
state_dictfilepathweight_pathunexpected_keysmissing_keystarget_vocab_sizer	  s                       @r<   load_adapterz$Wav2Vec2PreTrainedModel.load_adaptery  s6   ~ ;'/rrrrsss$***:*NNOOOOPPPFJJ{D11	$4e<< **%6==**Y--!::&8%@@

7D))$4d;;::j$// **%6==%M E     l   #E;4
 %''188EEH)$%#1$3#%5%'
 
 
 ,K88

   "   
    " !J5E J J=MJ J ?GJ J J    /66{CCH#)$%#1$3#%5%'
 
 
 )***"Z!&!%  

           F1A F F9IF F ;CF F F   ,,..joo//0037K7K7M7M3N3NN?//1122S9J9J5K5KK!##tKttW[W`W`apWqWqtttuuu""nKnnTXT]T]^jTkTknnnooo ''78>qA 6669.0A$+]a]g  DL &7DK" RQQQj>N>N>P>PQQQ
Z666 's$   0'F G'$GG.=H, ,8I$r   )T)r2   r3   r4   r$   r8   base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr  r   r6   
LongTensorrF   r   r_   r  r  r  r  r@  r;  r:   r;   r<   r  r     s        "$O&*#N%9 %9 %9P Z^ "5#3S#89HPQU   0 Y] %(:?:J   (   - - -@' @' @' @' @' @' @' @'r;   r  c                   6    e Zd Zdef fdZd Zd Z	 	 ddej        de	ej                 de	ej
                 fd	Ze	 	 	 	 	 dd
e	ej                 de	ej                 de	ej                 de	e         de	e         de	e         deeef         fd            Z xZS )Wav2Vec2Modelr   c                    t                                          |           || _        t          |          | _        t          |          | _        |j        dk    s|j        dk    rBt          j
        t          j        |j                                                            | _        |j        rt#          |          | _        nt'          |          | _        |j        rt+          |          nd | _        |                                  d S )Nr  )r   r   r   r   feature_extractorr   feature_projectionmask_time_probmask_feature_probr   r  r6   r6  r   r  masked_spec_embeddo_stable_layer_normr  encoderr^  r  r  adapter	post_initr   s     r<   r   zWav2Vec2Model.__init__>  s       !7!?!?";F"C"C  3&&&*BS*H*H%'\%,v?Q2R2R2[2[2]2]%^%^D"& 	39&AADLL*622DL282DNv...$ 	r;   c                 b    t          j        dt                     |                                  dS z
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.Nr   r   r   freeze_feature_encoderr   s    r<   freeze_feature_extractorz&Wav2Vec2Model.freeze_feature_extractorR  ;    
 	Q	
 	
 	

 	##%%%%%r;   c                 8    | j                                          dS 
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)rF  r   rT  s    r<   rS  z$Wav2Vec2Model.freeze_feature_encoder^  s    
 	1133333r;   Nr.   rv   r@   c                    t          | j        dd          s|S |                                \  }}}|#| j                            |j                  ||<   n| j        j        dk    r| j        r|t          ||f| j        j        | j        j	        || j        j
                  }t          j        ||j        t          j                  }| j                            |j                  ||<   | j        j        dk    r| j        rt          ||f| j        j        | j        j        | j        j                  }t          j        ||j        t          j                  }|dddf                             d|d          }d||<   |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTNr   )r>   r?   r@   rA   r  )r>   r?   rA   rM   )rY  r   rx   rJ  r  rS   rH  r   rs   mask_time_lengthmask_time_min_masksr6   r  r  r_   rI  mask_feature_lengthmask_feature_min_masksr  )r   r.   rv   r@   rk   rK   r   mask_feature_indicess           r<   _mask_hidden_statesz!Wav2Vec2Model._mask_hidden_statese  s    t{$8$?? 	!   4A3E3E3G3G0
O[(/3/E/H/HI\/]/]M+,,['!+++ 5_-+4 K8-+9! ! ! !&->}G[chcm n n n/3/E/H/HI\/]/]M+,;(1,,,#8[)+7 K;+<	$ $ $  $)<0D]Mainis#t#t#t #74#@#G#GO]_#`#` 23M./r;   r   r&  rf  rg  rB   c                 :   ||n| j         j        }||n| j         j        }||n| j         j        }|                     |          }|                    dd          }|#|                     |j        d         |d          }|                     |          \  }}| 	                    |||          }| 
                    |||||          }	|	d         }| j        |                     |          }|s||f|	dd         z   S t          |||	j        |	j        	          S )
a/  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        Nr#   r&   Fr  )rv   r@   r@   r&  rf  rg  r   )rp  extract_featuresr.   r/   )r   r&  rf  use_return_dictrF  r   r  r=   rG  ra  rL  rM  r   r.   r/   )
r   r   r@   rv   r&  rf  rg  rd  r.   encoder_outputss
             r<   r   zWav2Vec2Model.forward  s|    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]11,??+55a;;%!DD &q)>u E  N +/*A*ABR*S*S''00->~ 1 
 
 ,,)/!5# ' 
 
 (*<# LL77M 	K!#34qrr7JJJ&+-)7&1	
 
 
 	
r;   ri  NNNNN)r2   r3   r4   r$   r   rU  rS  r6   r7   r   rB  ra  r   r6  r_   r   r9   r   r   r   r   s   @r<   rD  rD  <  sZ       ~      (
& 
& 
&4 4 4 :>59	, ,(, $E$56, !!12	, , , ,\  269=,0/3&*7
 7
u|,7
 !.7
 $E$56	7

 $D>7
 'tn7
 d^7
 
u--	.7
 7
 7
 ^7
 7
 7
 7
 7
r;   rD  z?
    Wav2Vec2 Model with a quantizer and `VQ` head on top.
    c                   X    e Zd Zdef fdZdefdZd Zd Ze		 dde
j        d	e
j        d
e
j        defd            Ze	 	 	 	 	 	 ddee
j                 dee
j                 dee
j                 dee
j                 dee         dee         dee         deeef         fd            Z xZS )r  r   c                    t                                          |           t          |          | _        t	          j        |j                  | _        t          |          | _	        t	          j
        |j        |j                  | _        t	          j
        |j        |j                  | _        |                                  d S r   )r   r   rD  r  r   r   feat_quantizer_dropoutdropout_featuresr  	quantizerr   r   proj_codevector_dimr  r  r  rN  r   s     r<   r   zWav2Vec2ForPreTraining.__init__  s       %f-- "
6+H I I6v>>9V%79STT6#8&:TUU 	r;   r  c                     || j         _        dS )zb
        Set the Gumbel softmax temperature to a given value. Only necessary for training
        N)rl  r  )r   r  s     r<   set_gumbel_temperaturez-Wav2Vec2ForPreTraining.set_gumbel_temperature  s     &1"""r;   c                 b    t          j        dt                     |                                  dS rP  rR  rT  s    r<   rU  z/Wav2Vec2ForPreTraining.freeze_feature_extractor  rV  r;   c                 B    | j         j                                         dS rX  r  rF  r   rT  s    r<   rS  z-Wav2Vec2ForPreTraining.freeze_feature_encoder  !    
 	'::<<<<<r;   皙?target_featuresnegative_featurespredicted_featuresc                     t          j        | |gd          } t          j        |                                |                                 d                              |           }||z  }|S )z
        Compute logits for contrastive loss based using cosine similarity as the distance measure between
        `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
        r   r  rM   )r6   catcosine_similarityr5  r  )ru  rv  rw  r  logitss        r<   compute_contrastive_logitsz1Wav2Vec2ForPreTraining.compute_contrastive_logits  ss      )_6G$HaPPP();)A)A)C)C_EZEZE\E\bdeeemm
 

 +%r;   Nr   r@   rv   r|   r&  rf  rg  rB   c           
      `   ||n| j         j        }||                    t          j                  }|                     ||||||          }|                     |d                   }	|                     |d                   }
|#|                     |
j	        d         |d          }| 
                    |
|          \  }}|                    | j        j        j                  }|                     |          }dx}x}}||j	        \  }}}|                    d|          |                                                    d                   }|                    ||d|                              d	ddd
          }|                     |dddf         ||	| j         j                  }||k                        d          }|                                rt+          d          |dd         |<   |                    dd	                              d|                    d                    }d|                                z
  dz                      dd                                          }t4          j                            |                                |d          }| j         j        | j         j        z  }||z
  |z  |                                z  }|| j         j         |z  z   }|s#|||	||f|d	d         z   S |	||f|d	d         z   S tC          ||	|||j"        |j#        ||          S )a  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        sampled_negative_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_negatives)`, *optional*):
            Indices indicating which quantized target vectors are used as negative sampled vectors in contrastive loss.
            Required input for pre-training.

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, Wav2Vec2ForPreTraining
        >>> from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices, _sample_negative_indices
        >>> from datasets import load_dataset

        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
        >>> model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base")

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values  # Batch size 1

        >>> # compute masked indices
        >>> batch_size, raw_sequence_length = input_values.shape
        >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length).item()
        >>> mask_time_indices = _compute_mask_indices(
        ...     shape=(batch_size, sequence_length), mask_prob=0.2, mask_length=2
        ... )
        >>> sampled_negative_indices = _sample_negative_indices(
        ...     features_shape=(batch_size, sequence_length),
        ...     num_negatives=model.config.num_negatives,
        ...     mask_time_indices=mask_time_indices,
        ... )
        >>> mask_time_indices = torch.tensor(data=mask_time_indices, device=input_values.device, dtype=torch.long)
        >>> sampled_negative_indices = torch.tensor(
        ...     data=sampled_negative_indices, device=input_values.device, dtype=torch.long
        ... )

        >>> with torch.no_grad():
        ...     outputs = model(input_values, mask_time_indices=mask_time_indices)

        >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states)
        >>> cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1)

        >>> # show that cosine similarity is much higher than random
        >>> cosine_sim[mask_time_indices.to(torch.bool)].mean() > 0.5
        tensor(True)

        >>> # for contrastive loss training model should be put into train mode
        >>> model = model.train()
        >>> loss = model(
        ...     input_values, mask_time_indices=mask_time_indices, sampled_negative_indices=sampled_negative_indices
        ... ).loss
        ```N)r@   r&  rf  rv   rg  r   r#   Fr  )rv   rM   r&   r
   z-infir[   )	reduction)r*   r+   r,   r-   r.   r/   r0   r1   )$r   re  r  r6   r_   r  r  rk  r  r=   rl  r  r   rS   r  r  permuter|  contrastive_logits_temperatureallanyr5  r   ri   rx   r  r   r  cross_entropyr  r  r[   diversity_loss_weightr)   r.   r/   )r   r   r@   rv   r|   r&  rf  rg  rT  transformer_featuresrd  quantized_featuresr-   r*   r0   r1   rk   rK   r   negative_quantized_featuresr{  
neg_is_postargetnum_codevectorss                           r<   r   zWav2Vec2ForPreTraining.forward  s   D &1%<kk$+B]( 1 4 4UZ @ @--)/!5/#   
 
  $//
;;  00<<%!DD &q)>u E  N 59NN0A 5C 5
 5
11 0224>3H3NOO!^^,>??3777.#/7I7O4J +=*A*A"k*R*R(--//44R88+' +F*J*JOR+ +gaAq!! ( 44"47++$:	 F -0KKPPQSTTJ~~ 7).vqrr
:& %%a++33BAGGF,11333t;FFq!LLTTVVF!}::6<<>>6]b:cc"kCdkFggO.1FF/Y]n]r]r]t]ttN $dk&G.&XXD 	c24FH]^ahijikikalll(*<>STW^_`_a_aWbbb+1'9"7!/)-)	
 	
 	
 		
r;   )rt  )NNNNNN)r2   r3   r4   r$   r   rF   ro  rU  rS  r  r6   r7   r|  r   r   r6  
BoolTensorr_   r   r9   r)   r   r   r   s   @r<   r  r    s       ~      1# 1 1 1 1
& 
& 
&= = = 
 	 * , "- 	   \(  268<?C,0/3&*\
 \
u|,\
 !.\
 $E$45	\

 #+5+;"<\
 $D>\
 'tn\
 d^\
 
u22	3\
 \
 \
 ^\
 \
 \
 \
 \
r;   r  c                        e Zd Z fdZe	 	 	 	 	 ddej        deej                 dee	         dee	         dee	         deej
                 d	eeef         fd
            Z xZS )Wav2Vec2ForMaskedLMc                 R   t                                          |           t          j        dt                     t          |          | _        t          j        |j	                  | _
        t          j        |j        |j                  | _        |                                  d S )NzSThe class `Wav2Vec2ForMaskedLM` is deprecated. Please use `Wav2Vec2ForCTC` instead.)r   r   r   r   r   rD  r  r   r   final_dropoutr  r   r   r0  r  rN  r   s     r<   r   zWav2Vec2ForMaskedLM.__init__  s       acp	
 	
 	
 &f--z&"677y!3V5FGG 	r;   Nr   r@   r&  rf  rg  labelsrB   c                    ||n| j         j        }|                     ||||          }|d         }|                     |          }|                     |          }	|s|	f|dd          z   }
|
S t          |	|j        |j                  S )N)r&  rf  rg  r   r&   )r{  r.   r/   )r   re  r  r  r  r   r.   r/   )r   r   r@   r&  rf  rg  r  rT  r.   r{  outputs              r<   r   zWav2Vec2ForMaskedLM.forward  s     &1%<kk$+B]--/!5#	   
 
  
]33m,, 	Y,FMV7;P]d]oppppr;   rg  )r2   r3   r4   r   r   r6   r7   r   rB  r_   r6  r   r9   r   r   r   r   s   @r<   r  r    s              6:,0/3&*)-q q'q !!12q $D>	q
 'tnq d^q &q 
un$	%q q q ^q q q q qr;   r  zp
    Wav2Vec2 Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    c                        e Zd Zddee         f fdZd Zd Zd Zd Z	e
	 	 	 	 	 ddeej                 d	eej                 d
ee         dee         dee         deej                 deeef         fd            Z xZS )r  Nr  c                    t                                          |           t          |          | _        t	          j        |j                  | _        || _        |j	        t          d| j         d          t          |d          r|j        r|j        n|j        }t	          j        ||j	                  | _        |                                  dS )a2  
        target_lang (`str`, *optional*):
            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
            adapter.<lang>.bin. Only relevant when using an instance of [`Wav2Vec2ForCTC`] with adapters. Uses 'eng' by
            default.
        NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `Wav2Vec2ForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.r  )r   r   rD  r  r   r   r  r  r  r0  rU   r   r   r  r  r   r   r  rN  )r   r   r  r  r   s       r<   r   zWav2Vec2ForCTC.__init__  s     	   %f--z&"677&$H H H H   *1)G)GvFL^vF%%djdv 	 y!3V5FGG 	r;   c                    | j         }|)t          | j        dd          t          d| d          |2t          | j        dd          t                              d           dS ||                     |d           dS dS )a'  
        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
        passing `target_lang=...` to `from_pretrained(...)`.

        This method is **not** supposed to be called by the user and is prone to be changed in the future.
        NrX  zCannot pass `target_lang`: r  z)By default `target_lang` is set to 'eng'.T)r3  )r  rY  r   rU   r#  infor;  )r   r  s     r<   tie_weightszWav2Vec2ForCTC.tie_weights   s     &"wt{<NPT'U'U']u;uuuvvv WT[:Ld%S%S%_KKCDDDDD$kd;;;;; %$r;   c                 b    t          j        dt                     |                                  dS rY  rQ  NrR  rT  s    r<   rU  z'Wav2Vec2ForCTC.freeze_feature_extractor  rV  r;   c                 B    | j         j                                         dS rX  rr  rT  s    r<   rS  z%Wav2Vec2ForCTC.freeze_feature_encoder!  rs  r;   c                 L    | j                                         D ]	}d|_        
dS z
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        FNr  r   r   r   s     r<   freeze_base_modelz Wav2Vec2ForCTC.freeze_base_model(  6    
 ]--// 	( 	(E"'E	( 	(r;   r   r@   r&  rf  rg  r  rB   c           
      p   ||n| j         j        }|>|                                | j         j        k    rt	          d| j         j                   |                     |||||          }|d         }|                     |          }|                     |          }	d}
|Z||nt          j	        |t          j
                  }|                     |                    d                                        t          j
                  }|dk    }|                    d          }|                    |          }t          j                            |	dt          j                                      dd          }t          j        j                            d	
          5  t          j                            ||||| j         j        | j         j        | j         j                  }
ddd           n# 1 swxY w Y   |s|	f|t6          d         z   }|
|
f|z   n|S t9          |
|	|j        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nz$Label values must be <= vocab_size: rc  r   rR   rM   )r   rS   r#   F)enabled)blankr~  zero_infinityr*   r{  r.   r/   )r   re  rG   r0  rU   r  r  r  r6   	ones_liker  r  r[   r  masked_selectr   r  log_softmaxfloat32r   backendscudnnflagsctc_losspad_token_idctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   r.   r/   )r   r   r@   r&  rf  rg  r  rT  r.   r{  r*   rl   labels_masktarget_lengthsflattened_targets	log_probsr  s                    r<   r   zWav2Vec2ForCTC.forward0  s   " &1%<kk$+B]&**,,$+2H"H"H\DKDZ\\]]]--)/!5#   
 
  
]33m,, #1"<%/R^fkfpBqBqBq  !AA.BTBTUWBXBXYY\\]b]ghhM !A+K(__R00N & 4 4[ A A 11&b1VV``abdeffI%++E+:: 	 	}--%!"+2"k<"&+"? .  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	  	FY)F)G)G!HHF)-)9TGf$$vEfG4IV]Vh
 
 
 	
s    AG11G58G5r   rg  )r2   r3   r4   r   r@  r   r  rU  rS  r  r   r6   r6  r_   r   r9   r   r   r   r   s   @r<   r  r    s>        HSM      :< < <*
& 
& 
&= = =( ( (  26,0/3&*)-D
 D
u|,D
 !.D
 $D>	D

 'tnD
 d^D
 &D
 
un$	%D
 D
 D
 ^D
 D
 D
 D
 D
r;   r  z
    Wav2Vec2 Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                        e Zd Z fdZd Zd Zd Ze	 	 	 	 	 ddee	j
                 dee	j
                 dee         d	ee         d
ee         dee	j
                 deeef         fd            Z xZS )!Wav2Vec2ForSequenceClassificationc                    t                                          |           t          |d          r|j        rt	          d          t          |          | _        |j        dz   }|j        r.t          j
        t          j        |          |z            | _        t          j        |j        |j                  | _        t          j        |j        |j                  | _        |                                  d S )Nr  z_Sequence classification does not support the use of Wav2Vec2 adapters (config.add_adapter=True)r#   )r   r   r   r  rU   rD  r  rd  use_weighted_layer_sumr   r  r6   rd   layer_weightsr   r   classifier_proj_size	projector
num_labels
classifierrN  r   r   
num_layersr   s      r<   r   z*Wav2Vec2ForSequenceClassification.__init__  s       6=)) 	f.@ 	q   &f---1
( 	S!#ej.D.Dz.Q!R!RD6#5v7RSS)F$?ARSS 	r;   c                 b    t          j        dt                     |                                  dS rP  rR  rT  s    r<   rU  z:Wav2Vec2ForSequenceClassification.freeze_feature_extractor  rV  r;   c                 B    | j         j                                         dS rX  rr  rT  s    r<   rS  z8Wav2Vec2ForSequenceClassification.freeze_feature_encoder  rs  r;   c                 L    | j                                         D ]	}d|_        
dS r  r  r   s     r<   r  z3Wav2Vec2ForSequenceClassification.freeze_base_model  r  r;   Nr   r@   r&  rf  rg  r  rB   c                 d   ||n| j         j        }| j         j        rdn|}|                     |||||          }| j         j        rx|t                   }t          j        |d          }t          j        	                    | j
        d          }	||	                    ddd          z                      d          }n|d         }|                     |          }||                    d          }
n|                     |j        d         |          }|                    d                              dd|j        d                   }d	|| <   |                    d          |                    d                              dd          z  }
|                     |
          }d}|Kt)                      } ||                    d| j         j                  |                    d                    }|s|f|t          d         z   }||f|z   n|S t-          |||j        |j        
          S )  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
            into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTrc  r#   r  rM   r   r&   r  r  )r   re  r  r  r  r6   stackr   r  r  r  r  r[   r  r  r  r=   rq  rr  r  r	   r  r   r.   r/   )r   r   r@   r&  rf  rg  r  rT  r.   norm_weightspooled_outputpadding_maskexpand_padding_maskr{  r*   loss_fctr  s                    r<   r   z)Wav2Vec2ForSequenceClassification.forward  sW   . &1%<kk$+B]'+{'IcttOc--)/!5#   
 
 ;- 	'#$ABM!K1===M=001C0LLL*\->->r1a-H-HHMMRSMTTMM#AJM}55!)..1.55MMBB=CVWXCY[ijjL"."8"8"<"<"C"CAq-J]^_J`"a"a25M../)--!-44|7G7GA7G7N7N7S7STVXY7Z7ZZM//'))H8FKKDK,BCCV[[QS__UUD 	FY)F)G)G!HHF)-)9TGf$$vE'!/)	
 
 
 	
r;   rg  )r2   r3   r4   r   rU  rS  r  r   r   r6   r6  r_   r   r9   r   r   r   r   s   @r<   r  r  x  s           "
& 
& 
&= = =( ( (  26,0/3&*)-B
 B
u|,B
 !.B
 $D>	B

 'tnB
 d^B
 &B
 
u..	/B
 B
 B
 ^B
 B
 B
 B
 B
r;   r  c                        e Zd Z fdZd Zd Zd Ze	 	 	 	 	 ddee	j
                 dee	j
                 dee	j
                 d	ee         d
ee         dee         deeef         fd            Z xZS )#Wav2Vec2ForAudioFrameClassificationc                    t                                          |           t          |d          r|j        rt	          d          t          |          | _        |j        dz   }|j        r.t          j
        t          j        |          |z            | _        t          j        |j        |j                  | _        |j        | _        |                                  d S )Nr  zbAudio frame classification does not support the use of Wav2Vec2 adapters (config.add_adapter=True)r#   )r   r   r   r  rU   rD  r  rd  r  r   r  r6   rd   r  r   r   r  r  init_weightsr  s      r<   r   z,Wav2Vec2ForAudioFrameClassification.__init__  s       6=)) 	f.@ 	t   &f---1
( 	S!#ej.D.Dz.Q!R!RD)F$68IJJ +r;   c                 b    t          j        dt                     |                                  dS r  rR  rT  s    r<   rU  z<Wav2Vec2ForAudioFrameClassification.freeze_feature_extractor  rV  r;   c                 B    | j         j                                         dS rX  rr  rT  s    r<   rS  z:Wav2Vec2ForAudioFrameClassification.freeze_feature_encoder  rs  r;   c                 L    | j                                         D ]	}d|_        
dS r  r  r   s     r<   r  z5Wav2Vec2ForAudioFrameClassification.freeze_base_model  r  r;   Nr   r@   r  r&  rf  rg  rB   c           	         ||n| j         j        }| j         j        rdn|}|                     |||||          }| j         j        rx|t                   }t          j        |d          }t          j        	                    | j
        d          }	||	                    ddd          z                      d          }n|d         }|                     |          }
d}|`t                      } ||
                    d| j                  t          j        |                    d| j                  d                    }|s|
f|t          d         z   }|S t#          ||
|j        |j        	          S )
r  NTrc  r#   r  rM   r   )axisr  )r   re  r  r  r  r6   r  r   r  r  r  r  r[   r  r	   r  r  r   r.   r/   )r   r   r@   r  r&  rf  rg  rT  r.   r  r{  r*   r  r  s                 r<   r   z+Wav2Vec2ForAudioFrameClassification.forward  s   . &1%<kk$+B]'+{'IcttOc--)/!5#   
 
 ;- 	'#$ABM!K1===M=001C0LLL*\->->r1a-H-HHMMRSMTTMM#AJM//'))H8FKKDO<<el6;;WY[_[jKkKkrs>t>t>tuuD 	Y)F)G)G!HHFM$!/)	
 
 
 	
r;   rg  )r2   r3   r4   r   rU  rS  r  r   r   r6   r6  r_   r   r9   r   r   r   r   s   @r<   r  r    s            
& 
& 
&= = =( ( (  26)-,0/3&*9
 9
u|,9
 !.9
 &	9

 $D>9
 'tn9
 d^9
 
u++	,9
 9
 9
 ^9
 9
 9
 9
 9
r;   r  c                   &     e Zd Zd fd	Zd Z xZS )AMSoftmaxLoss      >@皙?c                     t                                                       || _        || _        || _        t          j        t          j        ||          d          | _	        t          j
                    | _        d S )NT)r   )r   r   scalemarginr  r   r  r6   randnr   r	   r*   )r   r  r  r  r  r   s        r<   r   zAMSoftmaxLoss.__init__\  se    
$l5;y*#E#EUYZZZ'))			r;   c                    |                                 }t          j                            | j        d          }t          j                            |d          }t          j        ||          }|| j        z
  }t          j                            || j	                  }| j
        t          j        |                                ||          z  }|                     ||          }|S )Nr   r  r#   )r  r   r  	normalizer   r6   mmr  one_hotr  r  r  r_   r*   )	r   r.   r  r   	cos_thetapsionehotr{  r*   s	            r<   r   zAMSoftmaxLoss.forwardd  s    !!((!(<<//1/EEH]F33	$+%&&vt??ek&++--iHHHyy((r;   )r  r  r   r   s   @r<   r  r  [  sL        * * * * * *      r;   r  c                   D     e Zd Zd fd	Zdej        dej        fdZ xZS )	TDNNLayerr   c                    t                                                       |dk    r|j        |dz
           n|j        |         | _        |j        |         | _        |j        |         | _        |j        |         | _        t          j
        | j        | j        z  | j                  | _        t          j                    | _        d S )Nr   r#   )r   r   tdnn_dimr   r   tdnn_kernelr   tdnn_dilationdilationr   r   kernelr  r   r   s      r<   r   zTDNNLayer.__init__s  s    <DqLL6?8a<88fo^fNg"OH5!-h7,X6i 043C CTEVWW'))r;   r.   rB   c                 
   t                      rddlm} t                      r)t          | j        |          rt          j        d           |                    dd          }| j        j        	                    | j
        | j        | j                                      dd          }t          j                            ||| j        j        | j                  }|                    dd          }|                     |          }|S )Nr   )	LoraLayerzDetected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. You should exclude TDNNLayer from LoRA's target modules.r#   r&   )r  )r    peft.tuners.lorar  r>  r  r   r   r   r   r  r   r   r   r   r  conv1dr   r  r   )r   r.   r  r   s       r<   r   zTDNNLayer.forward}  s     	3222222 	$+y11 O   &//155#(():D<LdN^__iijkmnoo,,]FDKDT_c_l,mm%//15566r;   r   )r2   r3   r4   r   r6   r6  r   r   r   s   @r<   r  r  r  sc        $ $ $ $ $ $U\ el        r;   r  zl
    Wav2Vec2 Model with an XVector feature extraction head on top for tasks like Speaker Verification.
    c                       e Zd Z fdZd Zd Zd Zdeej	        e
f         fdZe	 	 	 	 	 ddeej                 d	eej                 d
ee         dee         dee         deej                 deeef         fd            Z xZS )Wav2Vec2ForXVectorc                    t                                                     t                    | _        j        dz   }j        r.t          j        t          j	        |          |z            | _
        t          j        j        j        d                   | _        fdt          t!          j                            D             }t          j        |          | _        t          j        j        d         dz  j                  | _        t          j        j        j                  | _        t-          j        j                  | _        |                                  d S )Nr#   r   c                 0    g | ]}t          |          S r:   )r  r   s     r<   rQ   z/Wav2Vec2ForXVector.__init__.<locals>.<listcomp>  s#    QQQy++QQQr;   rM   r&   )r   r   rD  r  rd  r  r   r  r6   rd   r  r   r   r  r  r]   rb   r   tdnnxvector_output_dimrF  r  r  r  	objectiver  )r   r   r  tdnn_layersr   s    `  r<   r   zWav2Vec2ForXVector.__init__  s)      %f---1
( 	S!#ej.D.Dz.Q!R!RD6#5vq7IJJQQQQU3v;O;O5P5PQQQM+..	!#6?2+>+BFD]!^!^)F$=v?XYY&v'@&BSTTr;   c                 b    t          j        dt                     |                                  dS r  rR  rT  s    r<   rU  z+Wav2Vec2ForXVector.freeze_feature_extractor  rV  r;   c                 B    | j         j                                         dS rX  rr  rT  s    r<   rS  z)Wav2Vec2ForXVector.freeze_feature_encoder  rs  r;   c                 L    | j                                         D ]	}d|_        
dS r  r  r   s     r<   r  z$Wav2Vec2ForXVector.freeze_base_model  r  r;   rl   c                 D    d }| j         j        D ]} |||d          }|S )z?
        Computes the output length of the TDNN layers
        c                     | |z
  |z  dz   S )Nr#   r:   r  s      r<   r  zEWav2Vec2ForXVector._get_tdnn_output_lengths.<locals>._conv_out_length  s     !;.69A==r;   r#   )r   r  )r   rl   r  r   s       r<   _get_tdnn_output_lengthsz+Wav2Vec2ForXVector._get_tdnn_output_lengths  sE    
	> 	> 	>
  ;2 	L 	LK,,]KKKMMr;   Nr   r@   r&  rf  rg  r  rB   c                 >   ||n| j         j        }| j         j        rdn|}|                     |||||          }| j         j        rx|t                   }t          j        |d          }t          j        	                    | j
        d          }	||	                    ddd          z                      d          }n|d         }|                     |          }| j        D ]}
 |
|          }|-|                    d          }|                    d          }n|                     |                    d                    }|                     |          }g }g }t'          |          D ]k\  }}|                    ||d|f                             d                     |                    ||d|f                             d                     lt          j        |          }t          j        |          }t          j        ||gd          }|                     |          }|                     |          }d}||                     ||          }|s||f|t          d         z   }||f|z   n|S t3          ||||j        |j                  S )	r  NTrc  r#   r  rM   r   )r*   r{  
embeddingsr.   r/   )r   re  r  r  r  r6   r  r   r  r  r  r  r[   r  r  r  r  r  r  	enumeraterf   ry  rF  r  r  r   r.   r/   )r   r   r@   r&  rf  rg  r  rT  r.   r  
tdnn_layermean_featuresstd_featuresfeat_extract_output_lengthstdnn_output_lengthsr   lengthstatistic_poolingoutput_embeddingsr{  r*   r  s                         r<   r   zWav2Vec2ForXVector.forward  s   . &1%<kk$+B]'+{'IcttOc--)/!5#   
 
 ;- 	'#$ABM!K1===M=001C0LLL*\->->r1a-H-HHMMRSMTTMM#AJM}55) 	6 	6J&J}55MM !)..1.55M(,,,33LL*.*O*OP^PbPbghPbPiPi*j*j'"&"?"?@["\"\ML&':;; J J	6$$]1gvg:%>%C%C%C%J%JKKK##M!WfW*$=$A$Aa$A$H$HIIII!K66M ;|44L!I}l&CLLL 223DEE!233>>&&11D 	F/07;X;Y;Y3ZZF)-)9TGf$$vE(!/)
 
 
 	
r;   rg  )r2   r3   r4   r   rU  rS  r  r   r6   rB  rF   r  r   r   r6  r_   r9   r   r   r   r   s   @r<   r  r    sB           &
& 
& 
&= = =( ( (eE<Lc<Q6R      26,0/3&*)-O
 O
u|,O
 !.O
 $D>	O

 'tnO
 d^O
 &O
 
um#	$O
 O
 O
 ^O
 O
 O
 O
 O
r;   r  )r  r  r  r  r  r  rD  r  r   r   )Nr  N)fr5   r  r   dataclassesr   typingr   r   r   numpyrV   r6   safetensors.torchr   r)  r   torch.nnr	   activationsr   integrations.deepspeedr   integrations.fsdpr   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   r   r   r   r   r   r    r!   r"   configuration_wav2vec2r$   r,  r'  integrations.flex_attentionr%   
get_loggerr2   r#  r  r)   r9   rF   r5  rB  ndarrayrs   r   r   r   r   Moduler   r   r   r   r   r6  r  r  r8  rG  rV  r^  r  r  r  r  rZ  r  rD  r  r  r  r  r  r  r  r  __all__r:   r;   r<   <module>r     s
       ! ! ! ! ! ! , , , , , , , , , ,      9 9 9 9 9 9       % % % % % % ! ! ! ! ! ! @ @ @ @ @ @ 7 7 7 7 7 7        C B B B B B 9 9 9 9 9 9                  G F F F F F F F & & & & & &                  3 2 2 2 2 2 , 5 !! KJJJJJJ 
	H	%	% !"    
7 7 7 7 7; 7 7  7B 26t tc?tt t U-.	t
 t Zt t t tp Z^!$ !$!$*-!$BJ2:BV!$ !$ !$ !$H    #=   *    !;   6    !;   0* * * * *bi * * *Z    29   % % % % %RY % % %P
 
 
 
 
5 
 
 
1 1 1 1 1	 1 1 1,  $(,% %I%<% 
% <	%
 U\*% e_% % %% % % %<U/ U/ U/ U/ U/	 U/ U/ U/p    ")   0! ! ! ! !5 ! ! !H+ + + + +*D + + +\[ [ [ [ [bi [ [ [|_ _ _ _ _RY _ _ _DI' I' I' I' I'BI I' I' I'X    bi   >    29   $    ry   2 x' x' x' x' x'o x' x' x'v	 N
 N
 N
 N
 N
+ N
 N
 N
b   
Y
 Y
 Y
 Y
 Y
4 Y
 Y
 
Y
x *q *q *q *q *q1 *q *q *qZ   
S
 S
 S
 S
 S
, S
 S
 
S
l   p
 p
 p
 p
 p
(? p
 p
 p
f f
 f
 f
 f
 f
*A f
 f
 f
R    BI   .    	   @   
N
 N
 N
 N
 N
0 N
 N
 
N
b	 	 	r;   