
     `i                     T   d dl Z d dlZd dlmZmZmZ d dlZd dlZd dlm	Z	 d dl
mZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZmZmZmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z*  e(            rddl+m,Z,  G d de          Z- G d de	j.                  Z/ G d de	j.                  Z0 G d de	j.                  Z1 G d de	j.                  Z2 G d de	j.                  Z3	 	 	 dQd e	j.        d!ej4        d"ej4        d#ej4        d$eej4                 d%ee5         d&e5d'eej4                 fd(Z6 G d) d*e	j.                  Z7 G d+ d,e	j.                  Z8 G d- d.e          Z9 G d/ d0e	j.                  Z: G d1 d2e	j.                  Z; G d3 d4e	j.                  Z<e& G d5 d6e"                      Z=	 	 dRd7e>e?e?f         d8e5d9e?d$eej@                 d:e?d;ejA        fd<ZBeZCe& G d= d>e=                      ZDd?ZE e&d@A           G dB dCe=                      ZF e&dDA           G dE dFe=                      ZGe& G dG dHe=                      ZH G dI dJe	j.                  ZI G dK dLe	j.                  ZJ e&dMA           G dN dOe=                      ZKg dPZLdS )S    N)CallableOptionalUnion)nn)CrossEntropyLoss   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputCausalLMOutputSequenceClassifierOutputTokenClassifierOutputWav2Vec2BaseModelOutputXVectorOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringis_peft_availableis_torch_flex_attn_available   )Data2VecAudioConfig)make_flex_block_causal_maskc                   &     e Zd Zd fd	Zd Z xZS )Data2VecAudioConvLayerr   c                    t                                                       |dk    r|j        |dz
           nd| _        |j        |         | _        t          j        | j        | j        |j        |         |j        |         |j	                  | _
        t          j        | j        d          | _        t          |j                 | _        d S )Nr   r   )kernel_sizestridebiasTelementwise_affine)super__init__conv_dimin_conv_dimout_conv_dimr   Conv1dconv_kernelconv_stride	conv_biasconv	LayerNorm
layer_normr	   feat_extract_activation
activationselfconfiglayer_id	__class__s      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/data2vec/modeling_data2vec_audio.pyr(   zData2VecAudioConvLayer.__init__8   s    <DqLL6?8a<88a"OH5I*84%h/!
 
 
	 ,t'8TRRR !?@    c                     |                      |          }|                    dd          }|                     |          }|                    dd          }|                     |          }|S )N)r0   	transposer2   r4   r6   hidden_statess     r:   forwardzData2VecAudioConvLayer.forwardG   se    		-00%//B7766%//B7766r;   r   __name__
__module____qualname__r(   rB   __classcell__r9   s   @r:   r    r    7   sR        A A A A A A      r;   r    c                   $     e Zd Z fdZd Z xZS )Data2VecAudioPadLayerc                 l    t                                                       |dz  dk    rdnd| _        d S )N   r   r   )r'   r(   num_pad_remove)r6   num_conv_pos_embeddingsr9   s     r:   r(   zData2VecAudioPadLayer.__init__S   s:    #:Q#>!#C#Caar;   c                 J    | j         dk    r|d d d d d | j          f         }|S Nr   )rN   r@   s     r:   rB   zData2VecAudioPadLayer.forwardW   s;    "")!!!QQQ0F43F2F0F*FGMr;   rD   rI   s   @r:   rK   rK   R   sL        K K K K K      r;   rK   c                   $     e Zd Z fdZd Z xZS ) Data2VecAudioPositionalConvLayerc                 \   t                                                       t          j        |j        |j        |j        |j        dz  |j                  | _        t          |j                  | _	        t          |j                 | _        t          j        |j        d          | _        d S )NrM   )r"   paddinggroupsFr%   )r'   r(   r   r,   hidden_sizeconv_pos_kernel_sizenum_conv_pos_embedding_groupsr0   rK   rU   r	   r3   r4   r1   r2   r6   r7   r9   s     r:   r(   z)Data2VecAudioPositionalConvLayer.__init__^   s    I3/147
 
 
	 -V-HII !?@,v'9eTTTr;   c                    |                      |          }|                     |          }|                    dd          }|                     |          }|                    dd          }|                     |          }|S Nr   rM   )r0   rU   r?   r2   r4   r@   s     r:   rB   z(Data2VecAudioPositionalConvLayer.forwardm   sv    		-00]33%//15566%//15566r;   rD   rI   s   @r:   rS   rS   ]   sL        U U U U U      r;   rS   c                   $     e Zd Z fdZd Z xZS )$Data2VecAudioPositionalConvEmbeddingc                     t                                                       t          j        fdt	          j                  D                       | _        d S )Nc                 .    g | ]}t                    S  )rS   .0_r7   s     r:   
<listcomp>zAData2VecAudioPositionalConvEmbedding.__init__.<locals>.<listcomp>|   s"    eee!-f55eeer;   )r'   r(   r   
ModuleListrangerO   layersrZ   s    `r:   r(   z-Data2VecAudioPositionalConvEmbedding.__init__y   sS    meeeeuVEc?d?deee
 
r;   c                     |                     dd          }| j        D ]} ||          }|                     dd          }|S r\   )r?   rh   )r6   rA   layers      r:   rB   z,Data2VecAudioPositionalConvEmbedding.forward   sT    %//155[ 	1 	1E!E-00MM%//155r;   rD   rI   s   @r:   r^   r^   x   sG        
 
 
 
 
      r;   r^   c                   .     e Zd ZdZ fdZd Zd Z xZS )Data2VecAudioFeatureEncoderz.Construct the features from raw audio waveformc                     t                                                       t          j        fdt	          j                  D                       | _        d| _        d| _        d S )Nc                 2    g | ]}t          |           S ))r8   )r    rc   ir7   s     r:   re   z8Data2VecAudioFeatureEncoder.__init__.<locals>.<listcomp>   s'    gggA#FQ777gggr;   FT)	r'   r(   r   rf   rg   num_feat_extract_layersconv_layersgradient_checkpointing_requires_gradrZ   s    `r:   r(   z$Data2VecAudioFeatureEncoder.__init__   sh    =ggggvGeAfAfggg
 
 ',#"r;   c                 P    |                                  D ]	}d|_        
d| _        d S NF)
parametersrequires_gradrt   r6   params     r:   _freeze_parametersz.Data2VecAudioFeatureEncoder._freeze_parameters   s4    __&& 	( 	(E"'E#r;   c                 r    |d d d f         }| j         r| j        rd|_        | j        D ]} ||          }|S )NT)rt   trainingrx   rr   )r6   input_valuesrA   
conv_layers       r:   rB   z#Data2VecAudioFeatureEncoder.forward   s[    $QQQW-  	/4= 	/*.M'* 	6 	6J&J}55MMr;   )rE   rF   rG   __doc__r(   r{   rB   rH   rI   s   @r:   rl   rl      s\        88# # # # #$ $ $

 
 
 
 
 
 
r;   rl   c                   $     e Zd Z fdZd Z xZS )Data2VecAudioFeatureProjectionc                 .   t                                                       t          j        |j        d         |j                  | _        t          j        |j        d         |j                  | _	        t          j
        |j                  | _        d S )Nr>   eps)r'   r(   r   r1   r)   layer_norm_epsr2   LinearrW   
projectionDropoutfeat_proj_dropoutdropoutrZ   s     r:   r(   z'Data2VecAudioFeatureProjection.__init__   sn    ,vr':@UVVV)FOB$79KLLz&":;;r;   c                     |                      |          }|                     |          }|                     |          }||fS N)r2   r   r   )r6   rA   norm_hidden_statess      r:   rB   z&Data2VecAudioFeatureProjection.forward   sC    !__];;(:;;]33000r;   rD   rI   s   @r:   r   r      sG        < < < < <1 1 1 1 1 1 1r;   r           modulequerykeyvalueattention_maskscalingr   	head_maskc                    ||                     d          dz  }t          j        ||                    dd                    |z  }	||	|z   }	t          j                            |	d          }	||	|                    dddd          z  }	t          j                            |	|| j	                  }	t          j        |	|          }
|
                    dd          
                                }
|
|	fS )Nr>         rM   r   dimr   )pr}   )sizetorchmatmulr?   r   
functionalsoftmaxviewr   r}   
contiguous)r   r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs              r:   eager_attention_forwardr      s     **R..D(<s}}Q':':;;gEL!#n4=((2(>>L#innQAq&A&AA=((6?([[L,|U33K''1--88::K$$r;   c                   >    e Zd ZdZ	 	 	 	 	 ddededed	ed
ededee         f fdZ		 	 	 	 dde
j        dee
j                 dee
j                 dee
j                 dee         dee         dee
j        ee
j                 eee
j                          f         fdZ xZS )Data2VecAudioAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr   FTN	embed_dim	num_headsr   
is_decoderr$   	is_causalr7   c                 
   t                                                       || _        || _        || _        ||z  | _        || _        | j        |z  | j        k    rt          d| j         d| d          | j        dz  | _        || _	        || _
        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   )r$   )r'   r(   r   r   r   head_dimr7   
ValueErrorr   r   r   r   r   k_projv_projq_projout_proj)	r6   r   r   r   r   r$   r   r7   r9   s	           r:   r(   zData2VecAudioAttention.__init__   s    	""!Y.MI%$.883dn 3 3%.3 3 3   }d*$"i	94@@@i	94@@@i	94@@@	)YTBBBr;   rA   key_value_statesr   layer_head_maskoutput_attentionsr   returnc                    |du}|j         dd         \  }}	|r|j         d         n|	}
||	d| j        f}||
d| j        f} |                     |          j        |                     dd          }|r|n|} |                     |          j        |                     dd          } |                     |          j        |                     dd          }t          }| j        j	        dk    rt          | j        j	                 } || ||||f| j        sdn| j        | j        ||d|\  }}|                    ||	d                                          }|                     |          }||dfS )z#Input shape: Batch x Time x ChannelNr>   r   rM   eagerr   )r   r   r   r   )shaper   r   r   r?   r   r   r   r7   _attn_implementationr   r}   r   r   reshaper   r   )r6   rA   r   r   r   r   r   is_cross_attentionbsztgt_lensrc_lenq_input_shapekv_input_shapequery_statescurrent_states
key_statesvalue_statesattention_interfacer   r   s                       r:   rB   zData2VecAudioAttention.forward   s    .T9 %*3B3/W/AN"(++wgr4=9wDM: 7t{{=116FPPQRTUVV-?R))]5T[[005~FPPQRTUVV
7t{{>227HRRSTVWXX(?;+w66"9$+:Z"[$7$7%
  $}>CC$,L/%%
 %
 %
 %
!\ "))#w;;FFHHmmK00L$..r;   )r   FTFN)NNNF)rE   rF   rG   r   intfloatboolr   r   r(   r   Tensorr   r   tuplerB   rH   rI   s   @r:   r   r      s_       GG  04C CC C 	C
 C C C ,-C C C C C CD 481526,13/ 3/|3/ #5<03/ !.	3/
 "%,/3/ $D>3/ -.3/ 
u|Xel3XeEL>Q5RR	S3/ 3/ 3/ 3/ 3/ 3/ 3/ 3/r;   r   c                   $     e Zd Z fdZd Z xZS )Data2VecAudioFeedForwardc                    t                                                       t          j        |j                  | _        t          j        |j        |j                  | _	        t          |j        t                    rt          |j                 | _        n|j        | _        t          j        |j        |j                  | _        t          j        |j                  | _        d S r   )r'   r(   r   r   activation_dropoutintermediate_dropoutr   rW   intermediate_sizeintermediate_dense
isinstance
hidden_actstrr	   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutrZ   s     r:   r(   z!Data2VecAudioFeedForward.__init__*  s    $&Jv/H$I$I!"$)F,>@X"Y"Yf'-- 	9'-f.?'@D$$'-'8D$If&>@RSS j)>??r;   c                     |                      |          }|                     |          }|                     |          }|                     |          }|                     |          }|S r   )r   r   r   r   r   r@   s     r:   rB   z Data2VecAudioFeedForward.forward7  sg    //>>00??11-@@))-88++M::r;   rD   rI   s   @r:   r   r   )  sL        @ @ @ @ @      r;   r   c                   &     e Zd Z fdZddZ xZS )Data2VecAudioEncoderLayerc                    t                                                       t          |j        |j        |j        d|          | _        t          j        |j	                  | _
        t          j        |j        |j                  | _        t          |          | _        t          j        |j        |j                  | _        d S )NF)r   r   r   r   r7   r   )r'   r(   r   rW   num_attention_headsattention_dropout	attentionr   r   r   r   r1   r   r2   r   feed_forwardfinal_layer_normrZ   s     r:   r(   z"Data2VecAudioEncoderLayer.__init__B  s    /(0,
 
 
 z&"788,v'9v?TUUU4V<< "V-?VEZ [ [ [r;   NFc                    |}|                      |||          \  }}}|                     |          }||z   }|                     |          }||                     |          z   }|                     |          }|f}|r||fz  }|S )Nr   r   )r   r   r2   r   r   )r6   rA   r   r   attn_residualr   rd   outputss           r:   rB   z!Data2VecAudioEncoderLayer.forwardQ  s    %)-.L] *8 *
 *
&|Q ]33%566%(9(9-(H(HH--m<< " 	'&Gr;   rv   rD   rI   s   @r:   r   r   A  sQ        \ \ \ \ \       r;   r   c                        e Zd Z fdZ	 	 	 	 ddej        deej                 deded	ef
d
Z	de
ej        df         dej        fdZ xZS )Data2VecAudioEncoderc                    t                                                       | _        t                    | _        t          j        j        j                  | _	        t          j
        j                  | _        t          j        fdt          j                  D                       | _        d| _        d S )Nr   c                 .    g | ]}t                    S ra   )r   rb   s     r:   re   z1Data2VecAudioEncoder.__init__.<locals>.<listcomp>l  s"    $p$p$p1%>v%F%F$p$p$pr;   F)r'   r(   r7   r^   pos_conv_embedr   r1   rW   r   r2   r   r   r   rf   rg   num_hidden_layersrh   rs   rZ   s    `r:   r(   zData2VecAudioEncoder.__init__f  s    B6JJ,v'9v?TUUUz&"788m$p$p$p$pPUV\VnPoPo$p$p$pqq&+###r;   NFTrA   r   r   output_hidden_statesreturn_dictc                    |rdnd }|rdnd }|;|                     d                              dd|j        d                   }d|| <   |                     ||          }|                     |          }	||	z   }|                     |          }|                     |          }t                      pt          |           }
| j	        D ]a}|r||fz   }t          j        g           }| j        o|| j        j        k     }|r|
r ||||          }|d         }|rd}|r||d         fz   }b|r||fz   }|st          d |||fD                       S t!          |||	          S )
Nra   r>   r   rM   r   r   NNc              3      K   | ]}||V  	d S r   ra   )rc   vs     r:   	<genexpr>z/Data2VecAudioEncoder.forward.<locals>.<genexpr>  s(      mmq_`_l_l_l_l_lmmr;   )last_hidden_staterA   
attentions)	unsqueezerepeatr   _update_full_maskr   r2   r   r
   r   rh   r   randr}   r7   	layerdropr   r   )r6   rA   r   r   r   r   all_hidden_statesall_self_attentionsexpand_attention_maskposition_embeddingssynced_gpusrj   dropout_probabilityskip_the_layerlayer_outputss                  r:   rB   zData2VecAudioEncoder.forwardo  s    #7@BBD$5?bb4%$2$<$<R$@$@$G$G1mNabcNd$e$e!45M001//
 

 #11-@@%(;;66]33022R6LT6R6R[ 	P 	PE# I$58H$H! #(*R..!]Z/BT[EZ/ZN! 1[ 1 %!.Te! ! ! !.a 0 - ,  P&9]1=M<O&O# 	E 1]4D D 	nmm]4EGZ$[mmmmmm++*
 
 
 	
r;   inputs_embedsc                 *   || j         j        dk    r	d|v r|nd }nw| j         j        dk    rt          ||j                  }nQ| j         j        dk    r,t	          |t
          j                  rt          |d          }nt          ||j                  }|S )Nflash_attention_2r   sdpaflex_attentionF)r   )	r7   r   r   dtyper   r   r   r   r   )r6   r   r  s      r:   r   z&Data2VecAudioEncoder._update_full_mask  s    
 %{/3FFF343F3FD1V;; "E^UbUh!i!i15EEEnel;; b%@[`%a%a%aN "<NML_!`!`r;   )NFFT)rE   rF   rG   r(   r   tensorr   r   r   rB   r   r   rH   rI   s   @r:   r   r   e  s        , , , , , 26"'%* :
 :
|:
 !.:
  	:

 #:
 :
 :
 :
 :
xelD01 |       r;   r   c                   $     e Zd Z fdZd Z xZS )Data2VecAudioAdapterLayerc                     t                                                       t          j        |j        d|j        z  |j        |j        d          | _        d S )NrM   r   )r#   rU   )r'   r(   r   r,   output_hidden_sizeadapter_kernel_sizeadapter_strider0   rZ   s     r:   r(   z"Data2VecAudioAdapterLayer.__init__  sU    I%))&(
 
 
			r;   c                 r    |                      |          }t          j                            |d          }|S )Nr   r   )r0   r   r   glur@   s     r:   rB   z!Data2VecAudioAdapterLayer.forward  s3    		-00))-Q)??r;   rD   rI   s   @r:   r  r    sG        
 
 
 
 
      r;   r  c                   $     e Zd Z fdZd Z xZS )Data2VecAudioAdapterc                    t                                                       j        j        k    rCt	          j        j        j                  | _        t	          j        j                  | _        nd x| _        | _        t	          j	        fdt          j                  D                       | _        j        | _        d S )Nc              3   6   K   | ]}t                    V  d S r   )r  rb   s     r:   r   z0Data2VecAudioAdapter.__init__.<locals>.<genexpr>  s,      #p#p!$=f$E$E#p#p#p#p#p#pr;   )r'   r(   r  rW   r   r   projr1   proj_layer_normrf   rg   num_adapter_layersrh   r   rZ   s    `r:   r(   zData2VecAudioAdapter.__init__  s     $(:::	&"4f6OPPDI#%<0I#J#JD  /33DI,m#p#p#p#puU[UnOoOo#p#p#ppp)r;   c                 X   | j         1| j        *|                      |          }|                     |          }|                    dd          }| j        D ]=}t          j                                        }| j        r|| j        k    r ||          }>|                    dd          }|S r\   )r  r  r?   rh   nprandomr}   r   )r6   rA   rj   layerdrop_probs       r:   rB   zData2VecAudioAdapter.forward  s    9 T%9%E IIm44M 00??M%//155[ 	5 	5EY--//N= 5^dn%D%D %m 4 4%//155r;   rD   rI   s   @r:   r  r    sG        * * * * *      r;   r  c                       e Zd ZU eed<   dZdZdZdZdZ	dZ
d Z	 ddeej        ef         dee         fd	Z	 dd
edej        fdZdS )Data2VecAudioPreTrainedModelr7   data2vec_audior~   Tc                    t          |t                    r}t          j        d|j        j        z            }t          j                            |j        j	        | |           t          j                            |j        j
        | |           dS t          |t                    r,t          j                            |j        j
        d           dS t          |t          j                  rT|j	        j                            d| j        j                   |j
         |j
        j                                         dS dS t          |t          j        t          j        f          rO|j
        |j
        j                                         |j	        !|j	        j                            d           dS dS t          |t          j                  rt          j                            |j	                   |j
        [t          j        |j        |j        |j        d         z  z            }t          j                            |j
        | |           dS dS dS )zInitialize the weightsr   )abr   r   )meanstdNg      ?)r   r   mathsqrtr   in_featuresr   inituniform_weightr$   rS   	constant_r0   r   datanormal_r7   initializer_rangezero_r1   	GroupNormfill_r,   kaiming_normal_rV   in_channelsr"   )r6   r   ks      r:   _init_weightsz*Data2VecAudioPreTrainedModel._init_weights  s   f<== 	9	!f/;;<<AGV.5!qAAAGV.3rQ????? @AA 	9Gfk.22222	** 	9M&&CT[5R&SSS{& &&((((( '&r| <== 
	9{& &&(((}("((----- )(	** 	9G##FM222{&Ifmv/AFDVWXDY/YZ[[  a 88888	9 	9 '&r;   Ninput_lengthsadd_adapterc                    || j         j        n|}d }t          | j         j        | j         j                  D ]\  }} ||||          }|r3t          | j         j                  D ]} ||d| j         j                  }|S )zH
        Computes the output length of the convolutional layers
        Nc                 <    t          j        | |z
  |d          dz   S )Nfloor)rounding_moder   )r   divinput_lengthr"   r#   s      r:   _conv_out_lengthzWData2VecAudioPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length  s&     9\K7wWWWZ[[[r;   r   )r7   r8  zipr-   r.   rg   r  r  )r6   r7  r8  r@  r"   r#   rd   s          r:    _get_feat_extract_output_lengthsz=Data2VecAudioPreTrainedModel._get_feat_extract_output_lengths  s     2=1Ddk--+	\ 	\ 	\
 $'t{'>@W#X#X 	Q 	QK,,]KPPMM 	_4;9:: _ _ 0 04;C] ^ ^r;   feature_vector_lengthr   c                    |                     d          d d df         }|                     ||          }|                    t          j                  }|j        d         }t          j        ||f|j        |j                  }d|t          j	        |j        d         |j                  |dz
  f<   |
                    dg                               d          
                    dg                                          }|S )Nr>   r   r8  r   )r	  devicer   )rF  )cumsumrB  tor   longr   zerosr	  rF  arangeflipr   )r6   rC  r   r8  non_padded_lengthsoutput_lengths
batch_sizes          r:   "_get_feature_vector_attention_maskz?Data2VecAudioPreTrainedModel._get_feature_vector_attention_mask-  s   
 ,22r2::111b5A>>?Q_j>kk'**5:66#)!,
./~7KTbTi
 
 
 uv^%9!%<^EZ[[[]kno]opq',,bT2299"==BBB4HHMMOOr;   r   )rE   rF   rG   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr6  r   r   
LongTensorr   r   r   rB  rP  ra   r;   r:   r  r    s         ($O&*#N9 9 94 Z^ "5#3S#89HPQU   0 Y] %(:?:J     r;   r  r   	mask_probmask_length	min_masksr   c                 @   | \  }dk     rt          d          k    rt          d d d          t          j                            d                                          fd}|9|                                                    d                                          nfd	t          |          D             }t          j	        |ft          
          }g }	 |          }
|
dk    r|S |D ]} ||          }t          j                            t          j        |dz
  z
            |d          }t          |          dk    rdz
  }n|d         }t          j        |t          j        |
|z
  t          j        
          |z  g          }|	                    |           t          j        |	          }	t          j        |	dddddf         ||
f          }	|	                    ||
z            }	t          j                  ddddf         }t          j        |||
f                              ||
z            }|	|z   }	|	                                dz
  k    rdz
  |	|	dz
  k    <   t          j        ||	dd           |S )an  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                     t          | z  z  z             }t          |          }|z  k    rz  }| dz
  z
  |k     rt          | dz
  z
  d          }|S )z;Given input length, compute how many spans should be maskedr   r   )r   max)r?  num_masked_spanepsilonrZ  rY  r[  sequence_lengths     r:   compute_num_masked_spanz6_compute_mask_indices.<locals>.compute_num_masked_spanh  s~    i,6DwNOOoy99 [(?::-<O ;?+o==!,+/"BAFFOr;   Nr>   c                     g | ]}S ra   ra   )rc   rd   rb  s     r:   re   z)_compute_mask_indices.<locals>.<listcomp>{  s    999!o999r;   r	  r   F)replace)r   r  r  r   itemdetachsumtolistrg   rJ  r   choicerK  lenconcatenateonesint32appendarraybroadcast_tor   r_  put_along_axis)r   rY  rZ  r   r[  rO  rc  r7  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanr?  r`  spec_aug_mask_idxdummy_mask_idxoffsetsra  rb  s    `` `           @@r:   _compute_mask_indicesrz  B  sP   0 #(JQABBB_$$:^i : :'6: : :
 
 	
 innQ$$&&G        $ % 	##B''..0009999uZ'8'8999  Hj/:$GGGM11/BBa% 5 511,?? I,,IlkAo677RW - 
 
  !!Q&& -q0NN.q1NN(;o(MUWU] ^ ^ ^ao op
 
 	!!"34444"455 111aaa:&5H+(V  ,33J@SVa@abb i$$T4]3Gog
4G'UVV^^'+5 G ,g5 /A"555GVYZGZ-!0CCD m%7B???r;   c                   0    e Zd Zdef fdZd Z	 	 ddej        deej                 deej	                 fdZ
e	 	 	 	 	 dd	eej                 deej                 deej                 d
ee         dee         dee         deeef         fd            Z xZS )Data2VecAudioModelr7   c                    t                                          |           || _        t          |          | _        t          |          | _        |j        dk    s|j        dk    rBt          j
        t          j        |j                                                            | _        t!          |          | _        |j        rt'          |          nd | _        |                                  d S )Nr   )r'   r(   r7   rl   feature_extractorr   feature_projectionmask_time_probmask_feature_probr   	Parameterr   r   rW   r*  masked_spec_embedr   encoderr8  r  adapter	post_initrZ   s     r:   r(   zData2VecAudioModel.__init__  s       !<V!D!D"@"H"H  3&&&*BS*H*H%'\%,v?Q2R2R2[2[2]2]%^%^D"+F337=7IS+F333t 	r;   c                 8    | j                                          dS 
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)r~  r{   r6   s    r:   freeze_feature_encoderz)Data2VecAudioModel.freeze_feature_encoder  s    
 	1133333r;   NrA   mask_time_indicesr   c                    t          | j        dd          s|S |                                \  }}}|#| j                            |j                  ||<   n| j        j        dk    r| j        r|t          ||f| j        j        | j        j	        || j        j
                  }t          j        ||j        t          j                  }| j                            |j                  ||<   | j        j        dk    r| j        rt          ||f| j        j        | j        j        | j        j                  }t          j        ||j        t          j                  }|dddf                             d|d          }d||<   |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTNr   )rY  rZ  r   r[  )rF  r	  )rY  rZ  r[  r>   )getattrr7   r   r  rH  r	  r  r}   rz  mask_time_lengthmask_time_min_masksr   r
  rF  r   r  mask_feature_lengthmask_feature_min_masksexpand)r6   rA   r  r   rO  rb  rW   mask_feature_indicess           r:   _mask_hidden_statesz&Data2VecAudioModel._mask_hidden_states  s    t{$8$?? 	!   4A3E3E3G3G0
O[(/3/E/H/HI\/]/]M+,,['!+++ 5_-+4 K8-+9! ! ! !&->}G[chcm n n n/3/E/H/HI\/]/]M+,;(1,,,#8[)+7 K;+<	$ $ $  $)<0D]Mainis#t#t#t #74#@#G#GO]_#`#` 23M./r;   r~   r   r   r   r   c                 :   ||n| j         j        }||n| j         j        }||n| j         j        }|                     |          }|                    dd          }|#|                     |j        d         |d          }|                     |          \  }}| 	                    |||          }| 
                    |||||          }	|	d         }| j        |                     |          }|s||f|	dd         z   S t          |||	j        |	j        	          S )
a/  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        Nr   rM   FrE  )r  r   r   r   r   r   r   )r   extract_featuresrA   r   )r7   r   r   use_return_dictr~  r?   rP  r   r  r  r  r  Data2VecAudioBaseModelOutputrA   r   )
r6   r~   r   r  r   r   r   r  rA   encoder_outputss
             r:   rB   zData2VecAudioModel.forward  s|    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]11,??+55a;;%!DD &q)>u E  N +/*A*ABR*S*S''00->~ 1 
 
 ,,)/!5# ' 
 
 (*<# LL77M 	K!#34qrr7JJJ++-)7&1	
 
 
 	
r;   r   NNNNN)rE   rF   rG   r   r(   r  r   FloatTensorr   rX  r  r   r   r   r   r   r  rB   rH   rI   s   @r:   r|  r|    sL       2      "4 4 4 :>59	, ,(, $E$56, !!12	, , , ,\  269=,0/3&*7
 7
u|,7
 !.7
 $E$56	7

 $D>7
 'tn7
 d^7
 
u22	37
 7
 7
 ^7
 7
 7
 7
 7
r;   r|  rM   zu
    Data2VecAudio Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    )custom_introc                        e Zd Z fdZd Zd Ze	 	 	 	 	 ddeej	                 deej	                 dee
         dee
         d	ee
         d
eej	                 deeef         fd            Z xZS )Data2VecAudioForCTCc                    t                                          |           t          |          | _        t	          j        |j                  | _        |j        t          d| j
         d          t          |d          r|j        r|j        n|j        }t	          j        ||j                  | _        |                                  dS )a7  
        target_lang (`str`, *optional*):
            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
            adapter.<lang>.bin. Only relevant when using an instance of [`Data2VecAudioForCTC`] with adapters. Uses 'eng' by
            default.
        NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `Data2VecAudioForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.r8  )r'   r(   r|  r   r   r   final_dropoutr   
vocab_sizer   r9   hasattrr8  r  rW   r   lm_headr  )r6   r7   r  r9   s      r:   r(   zData2VecAudioForCTC.__init__H  s     	   088z&"677$H H H H   *1)G)GvFL^vF%%djdv 	 y!3V5FGG 	r;   c                 b    t          j        dt                     |                                  dS r  The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.NwarningswarnFutureWarningr  r  s    r:   freeze_feature_extractorz,Data2VecAudioForCTC.freeze_feature_extractorc  ;    
 	Q	
 	
 	

 	##%%%%%r;   c                 B    | j         j                                         dS r  r   r~  r{   r  s    r:   r  z*Data2VecAudioForCTC.freeze_feature_encodero  "    
 	-@@BBBBBr;   Nr~   r   r   r   r   labelsr   c           
      p   ||n| j         j        }|>|                                | j         j        k    rt	          d| j         j                   |                     |||||          }|d         }|                     |          }|                     |          }	d}
|Z||nt          j	        |t          j
                  }|                     |                    d                                        t          j
                  }|dk    }|                    d          }|                    |          }t          j                            |	dt          j                                      dd          }t          j        j                            d	
          5  t          j                            ||||| j         j        | j         j        | j         j                  }
ddd           n# 1 swxY w Y   |s|	f|t6          d         z   }|
|
f|z   n|S t9          |
|	|j        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nz$Label values must be <= vocab_size: r  r   re  r>   )r   r	  r   F)enabled)blank	reductionzero_infinitylosslogitsrA   r   )r7   r  r_  r  r   r   r   r  r   	ones_likerI  rB  ri  rH  masked_selectr   r   log_softmaxfloat32r?   backendscudnnflagsctc_losspad_token_idctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   rA   r   )r6   r~   r   r   r   r   r  r   rA   r  r  r7  labels_masktarget_lengthsflattened_targets	log_probsoutputs                    r:   rB   zData2VecAudioForCTC.forwardv  s   " &1%<kk$+B]&**,,$+2H"H"H\DKDZ\\]]]%%)/!5# & 
 
  
]33m,, #1"<%/R^fkfpBqBqBq  !AA.BTBTUWBXBXYY\\]b]ghhM !A+K(__R00N & 4 4[ A A 11&b1VV``abdeffI%++E+:: 	 	}--%!"+2"k<"&+"? .  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	  	FY)F)G)G!HHF)-)9TGf$$vEfG4IV]Vh
 
 
 	
s    AG11G58G5r  )rE   rF   rG   r(   r  r  r   r   r   r   r   r   r   r   rB   rH   rI   s   @r:   r  r  B  s           6
& 
& 
&C C C  26,0/3&*)-D
 D
u|,D
 !.D
 $D>	D

 'tnD
 d^D
 &D
 
un$	%D
 D
 D
 ^D
 D
 D
 D
 D
r;   r  z
    Data2VecAudio Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                        e Zd Z fdZd Zd Zd Ze	 	 	 	 	 ddee	j
                 dee	j
                 dee         d	ee         d
ee         dee	j
                 deeef         fd            Z xZS )&Data2VecAudioForSequenceClassificationc                    t                                          |           t          |d          r|j        rt	          d          t          |          | _        |j        dz   }|j        r.t          j
        t          j        |          |z            | _        t          j        |j        |j                  | _        t          j        |j        |j                  | _        |                                  d S )Nr8  zdSequence classification does not support the use of Data2VecAudio adapters (config.add_adapter=True)r   )r'   r(   r  r8  r   r|  r   r   use_weighted_layer_sumr   r  r   rn  layer_weightsr   rW   classifier_proj_size	projector
num_labels
classifierr  r6   r7   
num_layersr9   s      r:   r(   z/Data2VecAudioForSequenceClassification.__init__  s       6=)) 	f.@ 	v   188-1
( 	S!#ej.D.Dz.Q!R!RD6#5v7RSS)F$?ARSS 	r;   c                 b    t          j        dt                     |                                  dS )z
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        r  Nr  r  s    r:   r  z?Data2VecAudioForSequenceClassification.freeze_feature_extractor  r  r;   c                 B    | j         j                                         dS r  r  r  s    r:   r  z=Data2VecAudioForSequenceClassification.freeze_feature_encoder  r  r;   c                 L    | j                                         D ]	}d|_        
dS z
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        FNr   rw   rx   ry   s     r:   freeze_base_modelz8Data2VecAudioForSequenceClassification.freeze_base_model  7    
 (3355 	( 	(E"'E	( 	(r;   Nr~   r   r   r   r   r  r   c                 d   ||n| j         j        }| j         j        rdn|}|                     |||||          }| j         j        rx|t                   }t          j        |d          }t          j        	                    | j
        d          }	||	                    ddd          z                      d          }n|d         }|                     |          }||                    d          }
n|                     |j        d         |          }|                    d                              dd|j        d                   }d	|| <   |                    d          |                    d                              dd          z  }
|                     |
          }d}|Kt)                      } ||                    d| j         j                  |                    d                    }|s|f|t          d         z   }||f|z   n|S t-          |||j        |j        
          S )  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
            into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTr  r   r   r>   r   rM   r   r  )r7   r  r  r   r  r   stackr   r   r   r  r   ri  r  r$  rP  r   r   r   r  r   r  r   rA   r   )r6   r~   r   r   r   r   r  r   rA   norm_weightspooled_outputpadding_maskexpand_padding_maskr  r  loss_fctr  s                    r:   rB   z.Data2VecAudioForSequenceClassification.forward  sY   . &1%<kk$+B]'+{'IcttOc%%)/!5# & 
 
 ;- 	'#$ABM!K1===M=001C0LLL*\->->r1a-H-HHMMRSMTTMM#AJM}55!)..1.55MMBB=CVWXCY[ijjL"."8"8"<"<"C"CAq-J]^_J`"a"a25M../)--!-44|7G7GA7G7N7N7S7STVXY7Z7ZZM//'))H8FKKDK,BCCV[[QS__UUD 	FY)F)G)G!HHF)-)9TGf$$vE'!/)	
 
 
 	
r;   r  )rE   rF   rG   r(   r  r  r  r   r   r   r   r   r   r   r   rB   rH   rI   s   @r:   r  r    s           "
& 
& 
&C C C( ( (  26,0/3&*)-B
 B
u|,B
 !.B
 $D>	B

 'tnB
 d^B
 &B
 
u..	/B
 B
 B
 ^B
 B
 B
 B
 B
r;   r  c                        e Zd Z fdZd Zd Zd Ze	 	 	 	 	 ddee	j
                 dee	j
                 dee	j
                 d	ee         d
ee         dee         deeef         fd            Z xZS )(Data2VecAudioForAudioFrameClassificationc                    t                                          |           t          |d          r|j        rt	          d          t          |          | _        |j        dz   }|j        r.t          j
        t          j        |          |z            | _        t          j        |j        |j                  | _        |j        | _        |                                  d S )Nr8  zgAudio frame classification does not support the use of Data2VecAudio adapters (config.add_adapter=True)r   )r'   r(   r  r8  r   r|  r   r   r  r   r  r   rn  r  r   rW   r  r  init_weightsr  s      r:   r(   z1Data2VecAudioForAudioFrameClassification.__init__9  s       6=)) 	f.@ 	y   188-1
( 	S!#ej.D.Dz.Q!R!RD)F$68IJJ +r;   c                 b    t          j        dt                     |                                  dS r  r  r  s    r:   r  zAData2VecAudioForAudioFrameClassification.freeze_feature_extractorI  r  r;   c                 B    | j         j                                         dS r  r  r  s    r:   r  z?Data2VecAudioForAudioFrameClassification.freeze_feature_encoderU  r  r;   c                 L    | j                                         D ]	}d|_        
dS r  r  ry   s     r:   r  z:Data2VecAudioForAudioFrameClassification.freeze_base_model\  r  r;   Nr~   r   r  r   r   r   r   c           	         ||n| j         j        }| j         j        rdn|}|                     |||||          }| j         j        rx|t                   }t          j        |d          }t          j        	                    | j
        d          }	||	                    ddd          z                      d          }n|d         }|                     |          }
d}|`t                      } ||
                    d| j                  t          j        |                    d| j                  d                    }|s|
f|t          d         z   }|S t#          ||
|j        |j        	          S )
r  NTr  r   r   r>   r   )axisr  )r7   r  r  r   r  r   r  r   r   r   r  r   ri  r  r   r  argmaxr   rA   r   )r6   r~   r   r  r   r   r   r   rA   r  r  r  r  r  s                 r:   rB   z0Data2VecAudioForAudioFrameClassification.forwardd  s   . &1%<kk$+B]'+{'IcttOc%%)/!5# & 
 
 ;- 	'#$ABM!K1===M=001C0LLL*\->->r1a-H-HHMMRSMTTMM#AJM//'))H8FKKDO<<el6;;WY[_[jKkKkrs>t>t>tuuD 	Y)F)G)G!HHFM$!/)	
 
 
 	
r;   r  )rE   rF   rG   r(   r  r  r  r   r   r   r   r   r   r   r   rB   rH   rI   s   @r:   r  r  7  s            
& 
& 
&C C C( ( (  26)-,0/3&*9
 9
u|,9
 !.9
 &	9

 $D>9
 'tn9
 d^9
 
u++	,9
 9
 9
 ^9
 9
 9
 9
 9
r;   r  c                   &     e Zd Zd fd	Zd Z xZS )AMSoftmaxLoss      >@皙?c                     t                                                       || _        || _        || _        t          j        t          j        ||          d          | _	        t          j
                    | _        d S )NT)rx   )r'   r(   scalemarginr  r   r  r   randnr+  r   r  )r6   	input_dimr  r  r  r9   s        r:   r(   zAMSoftmaxLoss.__init__  se    
$l5;y*#E#EUYZZZ'))			r;   c                    |                                 }t          j                            | j        d          }t          j                            |d          }t          j        ||          }|| j        z
  }t          j                            || j	                  }| j
        t          j        |                                ||          z  }|                     ||          }|S )Nr   r   r   )flattenr   r   	normalizer+  r   mmr  one_hotr  r  wherer   r  )	r6   rA   r  r+  	cos_thetapsionehotr  r  s	            r:   rB   zAMSoftmaxLoss.forward  s    !!((!(<<//1/EEH]F33	$+%&&vt??ek&++--iHHHyy((r;   )r  r  rD   rI   s   @r:   r  r    sL        * * * * * *      r;   r  c                   D     e Zd Zd fd	Zdej        dej        fdZ xZS )	TDNNLayerr   c                    t                                                       |dk    r|j        |dz
           n|j        |         | _        |j        |         | _        |j        |         | _        |j        |         | _        t          j
        | j        | j        z  | j                  | _        t          j                    | _        d S )Nr   r   )r'   r(   tdnn_dimr*   r+   tdnn_kernelr"   tdnn_dilationdilationr   r   kernelReLUr4   r5   s      r:   r(   zTDNNLayer.__init__  s    <DqLL6?8a<88fo^fNg"OH5!-h7,X6i 043C CTEVWW'))r;   rA   r   c                 
   t                      rddlm} t                      r)t          | j        |          rt          j        d           |                    dd          }| j        j        	                    | j
        | j        | j                                      dd          }t          j                            ||| j        j        | j                  }|                    dd          }|                     |          }|S )Nr   )	LoraLayerzDetected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. You should exclude TDNNLayer from LoRA's target modules.r   rM   )r	  )r   peft.tuners.lorar  r   r
  r  r  r?   r+  r   r+   r"   r*   r   r   conv1dr$   r	  r4   )r6   rA   r  r+  s       r:   rB   zTDNNLayer.forward  s     	3222222 	$+y11 O   &//155#(():D<LdN^__iijkmnoo,,]FDKDT_c_l,mm%//15566r;   rC   )rE   rF   rG   r(   r   r   rB   rH   rI   s   @r:   r  r    sc        $ $ $ $ $ $U\ el        r;   r  zq
    Data2VecAudio Model with an XVector feature extraction head on top for tasks like Speaker Verification.
    c                       e Zd Z fdZd Zd Zd Zdeej	        e
f         fdZe	 	 	 	 	 ddeej                 d	eej                 d
ee         dee         dee         deej                 deeef         fd            Z xZS )Data2VecAudioForXVectorc                    t                                                     t                    | _        j        dz   }j        r.t          j        t          j	        |          |z            | _
        t          j        j        j        d                   | _        fdt          t!          j                            D             }t          j        |          | _        t          j        j        d         dz  j                  | _        t          j        j        j                  | _        t-          j        j                  | _        |                                  d S )Nr   r   c                 0    g | ]}t          |          S ra   )r  ro   s     r:   re   z4Data2VecAudioForXVector.__init__.<locals>.<listcomp>  s#    QQQy++QQQr;   r>   rM   )r'   r(   r|  r   r   r  r   r  r   rn  r  r   rW   r  r  rg   rl  rf   tdnnxvector_output_dimr~  r  r  r  	objectiver  )r6   r7   r  tdnn_layersr9   s    `  r:   r(   z Data2VecAudioForXVector.__init__  s*      088-1
( 	S!#ej.D.Dz.Q!R!RD6#5vq7IJJQQQQU3v;O;O5P5PQQQM+..	!#6?2+>+BFD]!^!^)F$=v?XYY&v'@&BSTTr;   c                 b    t          j        dt                     |                                  dS r  r  r  s    r:   r  z0Data2VecAudioForXVector.freeze_feature_extractor  r  r;   c                 B    | j         j                                         dS r  r  r  s    r:   r  z.Data2VecAudioForXVector.freeze_feature_encoder  r  r;   c                 L    | j                                         D ]	}d|_        
dS r  r  ry   s     r:   r  z)Data2VecAudioForXVector.freeze_base_model  r  r;   r7  c                 D    d }| j         j        D ]} |||d          }|S )z?
        Computes the output length of the TDNN layers
        c                     | |z
  |z  dz   S )Nr   ra   r>  s      r:   r@  zJData2VecAudioForXVector._get_tdnn_output_lengths.<locals>._conv_out_length  s     !;.69A==r;   r   )r7   r  )r6   r7  r@  r"   s       r:   _get_tdnn_output_lengthsz0Data2VecAudioForXVector._get_tdnn_output_lengths  sE    
	> 	> 	>
  ;2 	L 	LK,,]KKKMMr;   Nr~   r   r   r   r   r  r   c                 >   ||n| j         j        }| j         j        rdn|}|                     |||||          }| j         j        rx|t                   }t          j        |d          }t          j        	                    | j
        d          }	||	                    ddd          z                      d          }n|d         }|                     |          }| j        D ]}
 |
|          }|-|                    d          }|                    d          }n|                     |                    d                    }|                     |          }g }g }t'          |          D ]k\  }}|                    ||d|f                             d                     |                    ||d|f                             d                     lt          j        |          }t          j        |          }t          j        ||gd          }|                     |          }|                     |          }d}||                     ||          }|s||f|t          d         z   }||f|z   n|S t3          ||||j        |j                  S )	r  NTr  r   r   r>   r   )r  r  
embeddingsrA   r   )r7   r  r  r   r  r   r  r   r   r   r  r   ri  r  r  r$  r%  rB  r  	enumeraterp  catr~  r  r  r   rA   r   )r6   r~   r   r   r   r   r  r   rA   r  
tdnn_layermean_featuresstd_featuresfeat_extract_output_lengthstdnn_output_lengthsrp   lengthstatistic_poolingoutput_embeddingsr  r  r  s                         r:   rB   zData2VecAudioForXVector.forward  s   . &1%<kk$+B]'+{'IcttOc%%)/!5# & 
 
 ;- 	'#$ABM!K1===M=001C0LLL*\->->r1a-H-HHMMRSMTTMM#AJM}55) 	6 	6J&J}55MM !)..1.55M(,,,33LL*.*O*OP^PbPbghPbPiPi*j*j'"&"?"?@["\"\ML&':;; J J	6$$]1gvg:%>%C%C%C%J%JKKK##M!WfW*$=$A$Aa$A$H$HIIII!K66M ;|44L!I}l&CLLL 223DEE!233>>&&11D 	F/07;X;Y;Y3ZZF)-)9TGf$$vE(!/)
 
 
 	
r;   r  )rE   rF   rG   r(   r  r  r  r   r   rX  r   r  r   r   r   r   r   r   rB   rH   rI   s   @r:   r  r    sE           &
& 
& 
&C C C( ( (eE<Lc<Q6R      26,0/3&*)-O
 O
u|,O
 !.O
 $D>	O

 'tnO
 d^O
 &O
 
um#	$O
 O
 O
 ^O
 O
 O
 O
 O
r;   r  )r  r  r  r  r|  r  )Nr   NrQ   )Mr&  r  typingr   r   r   numpyr  r   r   torch.nnr   activationsr	   integrations.deepspeedr
   integrations.fsdpr   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   configuration_data2vec_audior   integrations.flex_attentionr   r    ModulerK   rS   r^   rl   r   r   r   r   r   r   r   r   r  r  r  r   r   rX  ndarrayrz  r  r|  r  r  r  r  r  r  r  __all__ra   r;   r:   <module>r<     s  ,   , , , , , , , , , ,            % % % % % % ! ! ! ! ! ! @ @ @ @ @ @ 7 7 7 7 7 7 g g g g g g g g B B B B B B 9 9 9 9 9 9                G F F F F F F F & & & & & & T T T T T T T T T T = = = = = =  !! KJJJJJJ    7   6    BI       ry   6    29       ")   :1 1 1 1 1RY 1 1 1*  $(,% %I%<% 
% <	%
 U\*% e_% % %% % % %<U/ U/ U/ U/ U/RY U/ U/ U/p    ry   0! ! ! ! ! : ! ! !HZ Z Z Z Z29 Z Z Zz    	   $    29   > K K K K K? K K Kd 26t tc?tt t U-.	t
 t Zt t t tn  7  
 
 
 
 
5 
 
 
D !"    
t
 t
 t
 t
 t
6 t
 t
 
t
n   p
 p
 p
 p
 p
-I p
 p
 p
f f
 f
 f
 f
 f
/K f
 f
 f
R    BI   .    	   @   
N
 N
 N
 N
 N
: N
 N
 
N
b  r;   