
     `iu                        d dl Z d dlZd dlmZ d dlmZmZmZ d dlZ	d dl
Z
d dlmZ d dlmZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+  e(            rddl,m-Z-  e)j.        e/          Z0e e'd           G d de                                  Z1 G d dej2                  Z3 G d dej2                  Z4 G d de          Z5 G d de          Z6 G d d e          Z7 G d! d"ej2                  Z8 G d# d$ej2                  Z9	 	 	 dTd&ej2        d'e
j:        d(e
j:        d)e
j:        d*ee
j:                 d+ee;         d,e;d-ee
j:                 fd.Z< G d/ d0ej2                  Z= G d1 d2ej2                  Z> G d3 d4e          Z? G d5 d6ej2                  Z@ G d7 d8ej2                  ZA G d9 d:e          ZB G d; d<ej2                  ZC G d= d>ej2                  ZDe' G d? d@e#                      ZE	 	 dUdAeFeGeGf         dBe;dCeGd*ee
jH                 dDeGdEe	jI        fdFZJe ZKe' G dG dHeE                      ZL e'dI           G dJ dKeE                      ZMdLZN e'dM           G dN dOeE                      ZO e'dP           G dQ dReE                      ZPg dSZQdS )V    N)	dataclass)CallableOptionalUnion)CrossEntropyLoss   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputCausalLMOutputModelOutputSequenceClassifierOutputWav2Vec2BaseModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringis_torch_flex_attn_availablelogging   )UniSpeechConfig)make_flex_block_causal_maskzh
    Output type of [`UniSpeechForPreTrainingOutput`], with potential hidden states and attentions.
    )custom_introc                      e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eej                 ed<   dZeej                 ed<   dZeeej                          ed<   dZeeej                          ed<   dS )	UniSpeechForPreTrainingOutputa  
    loss (*optional*, returned when model is in train mode, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
        paper](https://huggingface.co/papers/2006.11477).
    projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
        Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
        projected quantized states.
    projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
        Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
        target vectors for contrastive loss.
    codevector_perplexity (`torch.FloatTensor` of shape `(1,)`):
        The perplexity of the codevector distribution, used to measure the diversity of the codebook.
    Nlossprojected_statesprojected_quantized_statescodevector_perplexityhidden_states
attentions)__name__
__module____qualname____doc__r!   r   torchFloatTensor__annotations__r"   r#   r$   r%   tupler&        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/unispeech/modeling_unispeech.pyr    r    :   s           )-D(5$
%,,,48hu01888>B): ;BBB9=8E$56===8<M8E%"345<<<59Ju01299999r0   r    c                   $     e Zd Z fdZd Z xZS )UniSpeechSamePadLayerc                 l    t                                                       |dz  dk    rdnd| _        d S )N   r   r   )super__init__num_pad_remove)selfnum_conv_pos_embeddings	__class__s     r1   r7   zUniSpeechSamePadLayer.__init__X   s:    #:Q#>!#C#Caar0   c                 J    | j         dk    r|d d d d d | j          f         }|S Nr   )r8   r9   r%   s     r1   forwardzUniSpeechSamePadLayer.forward\   s;    "")!!!QQQ0F43F2F0F*FGMr0   r'   r(   r)   r7   r?   __classcell__r;   s   @r1   r3   r3   W   sL        K K K K K      r0   r3   c                   $     e Zd Z fdZd Z xZS ) UniSpeechPositionalConvEmbeddingc                    t                                                       t          j        |j        |j        |j        |j        dz  |j                  | _        t          j        j	        }t          t          j        j        d          rt          j        j        j	        }t                      rdd l}|j                            | j        j        d          5   || j        dd          | _        d d d            n# 1 swxY w Y   t          | j        d          r-| j        j        j        j        }| j        j        j        j        }n| j        j        }| j        j        }|j                            | |           |j                            | |           n || j        dd          | _        t-          |j                  | _        t0          |j                 | _        d S )	Nr5   )kernel_sizepaddinggroupsweight_normr   )modifier_rankweight)namedimparametrizations)r6   r7   nnConv1dhidden_sizer:   num_conv_pos_embedding_groupsconvutilsrI   hasattrrN   r
   	deepspeedzeroGatheredParametersrK   	original0	original1weight_gweight_vregister_external_parameterr3   rG   r	   feat_extract_activation
activation)r9   configrI   rV   r[   r\   r;   s         r1   r7   z)UniSpeechPositionalConvEmbedding.__init__c   s   I62a77
 
 
	 h*28,m<< 	@(3?K%'' 	E22493CST2UU I I'K	aHHH	I I I I I I I I I I I I I I Ity"455 .95<F95<F9-9-N66tXFFFN66tXFFFF#DIH!DDDDI,V-KLL !?@s   C??DDc                     |                     dd          }|                     |          }|                     |          }|                     |          }|                     dd          }|S )Nr   r5   )	transposerS   rG   r_   r>   s     r1   r?   z(UniSpeechPositionalConvEmbedding.forward   se    %//155		-00]3366%//155r0   r@   rB   s   @r1   rD   rD   b   sM        A A A A AB      r0   rD   c                   &     e Zd Zd fd	Zd Z xZS )UniSpeechNoLayerNormConvLayerr   c                 Z   t                                                       |dk    r|j        |dz
           nd| _        |j        |         | _        t          j        | j        | j        |j        |         |j        |         |j	                  | _
        t          |j                 | _        d S )Nr   r   rF   stridebias)r6   r7   conv_dimin_conv_dimout_conv_dimrO   rP   conv_kernelconv_stride	conv_biasrS   r	   r^   r_   r9   r`   layer_idr;   s      r1   r7   z&UniSpeechNoLayerNormConvLayer.__init__   s    <DqLL6?8a<88a"OH5I*84%h/!
 
 
	 !!?@r0   c                 Z    |                      |          }|                     |          }|S N)rS   r_   r>   s     r1   r?   z%UniSpeechNoLayerNormConvLayer.forward   s*    		-0066r0   r   r@   rB   s   @r1   rd   rd      sR        A A A A A A      r0   rd   c                   &     e Zd Zd fd	Zd Z xZS )UniSpeechLayerNormConvLayerr   c                    t                                                       |dk    r|j        |dz
           nd| _        |j        |         | _        t          j        | j        | j        |j        |         |j        |         |j	                  | _
        t          j        | j        d          | _        t          |j                 | _        d S )Nr   r   rf   T)elementwise_affine)r6   r7   ri   rj   rk   rO   rP   rl   rm   rn   rS   	LayerNorm
layer_normr	   r^   r_   ro   s      r1   r7   z$UniSpeechLayerNormConvLayer.__init__   s    <DqLL6?8a<88a"OH5I*84%h/!
 
 
	 ,t'8TRRR !?@r0   c                     |                      |          }|                    dd          }|                     |          }|                    dd          }|                     |          }|S )N)rS   rb   ry   r_   r>   s     r1   r?   z#UniSpeechLayerNormConvLayer.forward   se    		-00%//B7766%//B7766r0   rs   r@   rB   s   @r1   ru   ru      sR        A A A A A A      r0   ru   c                   &     e Zd Zd fd	Zd Z xZS )UniSpeechGroupNormConvLayerr   c                    t                                                       |dk    r|j        |dz
           nd| _        |j        |         | _        t          j        | j        | j        |j        |         |j        |         |j	                  | _
        t          |j                 | _        t          j        | j        | j        d          | _        d S )Nr   r   rf   T)
num_groupsnum_channelsaffine)r6   r7   ri   rj   rk   rO   rP   rl   rm   rn   rS   r	   r^   r_   	GroupNormry   ro   s      r1   r7   z$UniSpeechGroupNormConvLayer.__init__   s    <DqLL6?8a<88a"OH5I*84%h/!
 
 
	 !!?@,$2CRVRclpqqqr0   c                     |                      |          }|                     |          }|                     |          }|S rr   )rS   ry   r_   r>   s     r1   r?   z#UniSpeechGroupNormConvLayer.forward   s;    		-006666r0   rs   r@   rB   s   @r1   r~   r~      sR        r r r r r r       r0   r~   c                   .     e Zd ZdZ fdZd Zd Z xZS )UniSpeechFeatureEncoderz.Construct the features from raw audio waveformc                    t                                                       j        dk    r7t          d          gfdt	          j        dz
            D             z   }nDj        dk    r!fdt	          j                  D             }nt          dj         d	          t          j        |          | _	        d
| _
        d| _        d S )Ngroupr   rp   c                 8    g | ]}t          |d z             S )r   r   )rd   .0ir`   s     r1   
<listcomp>z4UniSpeechFeatureEncoder.__init__.<locals>.<listcomp>   s@     O O O .fq1uEEEO O Or0   r   layerc                 2    g | ]}t          |           S )r   )ru   r   s     r1   r   z4UniSpeechFeatureEncoder.__init__.<locals>.<listcomp>   s4       DE+FQ???  r0   z`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)r6   r7   feat_extract_normr~   rangenum_feat_extract_layers
ValueErrorrO   
ModuleListconv_layersgradient_checkpointing_requires_grad)r9   r`   r   r;   s    ` r1   r7   z UniSpeechFeatureEncoder.__init__   s   #w..6vJJJK O O O Ov=ABBO O O KK %00   INvOmInIn  KK t1Ittt   =55&+#"r0   c                 P    |                                  D ]	}d|_        
d| _        d S NF)
parametersrequires_gradr   r9   params     r1   _freeze_parametersz*UniSpeechFeatureEncoder._freeze_parameters   s4    __&& 	( 	(E"'E#r0   c                 r    |d d d f         }| j         r| j        rd|_        | j        D ]} ||          }|S )NT)r   trainingr   r   )r9   input_valuesr%   
conv_layers       r1   r?   zUniSpeechFeatureEncoder.forward   s[    $QQQW-  	/4= 	/*.M'* 	6 	6J&J}55MMr0   )r'   r(   r)   r*   r7   r   r?   rA   rB   s   @r1   r   r      s\        88# # # # #($ $ $

 
 
 
 
 
 
r0   r   c                   $     e Zd Z fdZd Z xZS )UniSpeechFeatureProjectionc                 .   t                                                       t          j        |j        d         |j                  | _        t          j        |j        d         |j                  | _	        t          j
        |j                  | _        d S )Nr|   eps)r6   r7   rO   rx   ri   layer_norm_epsry   LinearrQ   
projectionDropoutfeat_proj_dropoutdropoutr9   r`   r;   s     r1   r7   z#UniSpeechFeatureProjection.__init__  sn    ,vr':@UVVV)FOB$79KLLz&":;;r0   c                     |                      |          }|                     |          }|                     |          }||fS rr   )ry   r   r   )r9   r%   norm_hidden_statess      r1   r?   z"UniSpeechFeatureProjection.forward  sC    !__];;(:;;]33000r0   r@   rB   s   @r1   r   r      sG        < < < < <1 1 1 1 1 1 1r0   r           modulequerykeyvalueattention_maskscalingr   	head_maskc                    ||                     d          dz  }t          j        ||                    dd                    |z  }	||	|z   }	t          j                            |	d          }	||	|                    dddd          z  }	t          j                            |	|| j	                  }	t          j        |	|          }
|
                    dd          
                                }
|
|	fS )Nr|         r5   r   rM   r   )pr   )sizer+   matmulrb   rO   
functionalsoftmaxviewr   r   
contiguous)r   r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs              r1   eager_attention_forwardr     s     **R..D(<s}}Q':':;;gEL!#n4=((2(>>L#innQAq&A&AA=((6?([[L,|U33K''1--88::K$$r0   c                   >    e Zd ZdZ	 	 	 	 	 ddededed	ed
ededee         f fdZ		 	 	 	 dde
j        dee
j                 dee
j                 dee
j                 dee         dee         dee
j        ee
j                 eee
j                          f         fdZ xZS )UniSpeechAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr   FTN	embed_dim	num_headsr   
is_decoderrh   	is_causalr`   c                 
   t                                                       || _        || _        || _        ||z  | _        || _        | j        |z  | j        k    rt          d| j         d| d          | j        dz  | _        || _	        || _
        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   )rh   )r6   r7   r   r   r   head_dimr`   r   r   r   r   rO   r   k_projv_projq_projout_proj)	r9   r   r   r   r   rh   r   r`   r;   s	           r1   r7   zUniSpeechAttention.__init__0  s    	""!Y.MI%$.883dn 3 3%.3 3 3   }d*$"i	94@@@i	94@@@i	94@@@	)YTBBBr0   r%   key_value_statesr   layer_head_maskoutput_attentionsr   returnc                    |du}|j         dd         \  }}	|r|j         d         n|	}
||	d| j        f}||
d| j        f} |                     |          j        |                     dd          }|r|n|} |                     |          j        |                     dd          } |                     |          j        |                     dd          }t          }| j        j	        dk    rt          | j        j	                 } || ||||f| j        sdn| j        | j        ||d|\  }}|                    ||	d                                          }|                     |          }||dfS )z#Input shape: Batch x Time x ChannelNr|   r   r5   eagerr   )r   r   r   r   )shaper   r   r   rb   r   r   r   r`   _attn_implementationr   r   r   r   reshaper   r   )r9   r%   r   r   r   r   r   is_cross_attentionbsztgt_lensrc_lenq_input_shapekv_input_shapequery_statescurrent_states
key_statesvalue_statesattention_interfacer   r   s                       r1   r?   zUniSpeechAttention.forwardO  s    .T9 %*3B3/W/AN"(++wgr4=9wDM: 7t{{=116FPPQRTUVV-?R))]5T[[005~FPPQRTUVV
7t{{>227HRRSTVWXX(?;+w66"9$+:Z"[$7$7%
  $}>CC$,L/%%
 %
 %
 %
!\ "))#w;;FFHHmmK00L$..r0   )r   FTFN)NNNF)r'   r(   r)   r*   intfloatboolr   r   r7   r+   Tensorr   r   r.   r?   rA   rB   s   @r1   r   r   -  s^       GG  ,0C CC C 	C
 C C C )C C C C C CD 481526,13/ 3/|3/ #5<03/ !.	3/
 "%,/3/ $D>3/ -.3/ 
u|Xel3XeEL>Q5RR	S3/ 3/ 3/ 3/ 3/ 3/ 3/ 3/r0   r   c                   $     e Zd Z fdZd Z xZS )UniSpeechFeedForwardc                    t                                                       t          j        |j                  | _        t          j        |j        |j                  | _	        t          |j        t                    rt          |j                 | _        n|j        | _        t          j        |j        |j                  | _        t          j        |j                  | _        d S rr   )r6   r7   rO   r   activation_dropoutintermediate_dropoutr   rQ   intermediate_sizeintermediate_dense
isinstance
hidden_actstrr	   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutr   s     r1   r7   zUniSpeechFeedForward.__init__  s    $&Jv/H$I$I!"$)F,>@X"Y"Yf'-- 	9'-f.?'@D$$'-'8D$If&>@RSS j)>??r0   c                     |                      |          }|                     |          }|                     |          }|                     |          }|                     |          }|S rr   )r   r   r   r   r   r>   s     r1   r?   zUniSpeechFeedForward.forward  sg    //>>00??11-@@))-88++M::r0   r@   rB   s   @r1   r   r     sL        @ @ @ @ @      r0   r   c                   &     e Zd Z fdZddZ xZS )UniSpeechEncoderLayerc                    t                                                       t          |j        |j        |j        d|          | _        t          j        |j	                  | _
        t          j        |j        |j                  | _        t          |          | _        t          j        |j        |j                  | _        d S )NFr   r   r   r   r`   r   )r6   r7   r   rQ   num_attention_headsattention_dropout	attentionrO   r   r   r   rx   r   ry   r   feed_forwardfinal_layer_normr   s     r1   r7   zUniSpeechEncoderLayer.__init__  s    +(0,
 
 
 z&"788,v'9v?TUUU088 "V-?VEZ [ [ [r0   NFc                    |}|                      |||          \  }}}|                     |          }||z   }|                     |          }||                     |          z   }|                     |          }|f}|r||fz  }|S Nr   r   )r  r   ry   r  r  r9   r%   r   r   attn_residualr   _outputss           r1   r?   zUniSpeechEncoderLayer.forward  s    %)-.L] *8 *
 *
&|Q ]33%566%(9(9-(H(HH--m<< " 	'&Gr0   r   r@   rB   s   @r1   r   r     sQ        \ \ \ \ \       r0   r   c                        e Zd Z fdZ	 	 	 	 ddej        deej                 deded	ef
d
Z	de
ej        df         dej        fdZ xZS )UniSpeechEncoderc                    t                                                       | _        t                    | _        t          j        j        j                  | _	        t          j
        j                  | _        t          j        fdt          j                  D                       | _        d| _        d S )Nr   c                 .    g | ]}t                    S r/   )r   r   r	  r`   s     r1   r   z-UniSpeechEncoder.__init__.<locals>.<listcomp>  s"    $l$l$lq%:6%B%B$l$l$lr0   Fr6   r7   r`   rD   pos_conv_embedrO   rx   rQ   r   ry   r   r   r   r   r   num_hidden_layerslayersr   r   s    `r1   r7   zUniSpeechEncoder.__init__  s    >vFF,v'9v?TUUUz&"788m$l$l$l$lERXRjLkLk$l$l$lmm&+###r0   NFTr%   r   r   output_hidden_statesreturn_dictc                    |rdnd }|rdnd }|;|                     d                              dd|j        d                   }d|| <   |                     ||          }|                     |          }	||	z   }|                     |          }|                     |          }t                      pt          |           }
| j	        D ]a}|r||fz   }t          j        g           }| j        o|| j        j        k     }|r|
r ||||          }|d         }|rd}|r||d         fz   }b|r||fz   }|st          d |||fD                       S t!          |||	          S )
Nr/   r|   r   r5   r   r  NNc              3      K   | ]}||V  	d S rr   r/   r   vs     r1   	<genexpr>z+UniSpeechEncoder.forward.<locals>.<genexpr>   (      mmq_`_l_l_l_l_lmmr0   last_hidden_stater%   r&   )	unsqueezerepeatr   _update_full_maskr  ry   r   r
   r   r  r+   randr   r`   	layerdropr.   r   r9   r%   r   r   r  r  all_hidden_statesall_self_attentionsexpand_attention_maskposition_embeddingssynced_gpusr   dropout_probabilityskip_the_layerlayer_outputss                  r1   r?   zUniSpeechEncoder.forward  s    #7@BBD$5?bb4%$2$<$<R$@$@$G$G1mNabcNd$e$e!45M001//
 

 #11-@@%(;;66]33022R6LT6R6R[ 	P 	PE# I$58H$H! #(*R..!]Z/BT[EZ/ZN! 1[ 1 %!.Te! ! ! !.a 0 - ,  P&9]1=M<O&O# 	E 1]4D D 	nmm]4EGZ$[mmmmmm++*
 
 
 	
r0   inputs_embedsc                 *   || j         j        dk    r	d|v r|nd }nw| j         j        dk    rt          ||j                  }nQ| j         j        dk    r,t	          |t
          j                  rt          |d          }nt          ||j                  }|S Nflash_attention_2r   sdpaflex_attentionF)r   	r`   r   r   dtyper   r+   r   r   r   r9   r   r-  s      r1   r!  z"UniSpeechEncoder._update_full_mask      
 %{/3FFF343F3FD1V;; "E^UbUh!i!i15EEEnel;; b%@[`%a%a%aN "<NML_!`!`r0   NFFT)r'   r(   r)   r7   r+   tensorr   r   r   r?   r   r!  rA   rB   s   @r1   r  r    s        , , , , , 26"'%* :
 :
|:
 !.:
  	:

 #:
 :
 :
 :
 :
xelD01 |       r0   r  c                   4     e Zd Z fdZdej        fdZ xZS )UniSpeechAttnAdapterLayerc                 t   t                                                       |j        | _        |j        | _        t          j        | j                  | _        t          j	        | j        | j                  | _
        t          j                    | _        t          j	        | j        | j                  | _        dS )z
        Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed
        up training throughput.
        N)r6   r7   adapter_attn_dim	input_dimrQ   
hidden_dimrO   rx   normr   linear_1ReLUact_fnlinear_2r   s     r1   r7   z"UniSpeechAttnAdapterLayer.__init__  s    
 	0 ,L11		$/4>BBgii	$.$/BBr0   r%   c                     |                      |          }|                     |          }|                     |          }|                     |          }|S rr   )r?  r@  rB  rC  r>   s     r1   r?   z!UniSpeechAttnAdapterLayer.forward-  sL    		-00m44M22m44r0   )r'   r(   r)   r7   r+   r,   r?   rA   rB   s   @r1   r:  r:    s[        C C C C CU%6        r0   r:  c                   X     e Zd Z fdZ	 	 ddej        deej                 defdZ xZ	S )	$UniSpeechEncoderLayerStableLayerNormc                    t                                                       t          |j        |j        |j        d|          | _        t          j        |j	                  | _
        t          j        |j        |j                  | _        t          |          | _        t          j        |j        |j                  | _        t#          |dd           t%          |          | _        d S d | _        d S )NFr   r   r<  )r6   r7   r   rQ   r   r   r  rO   r   r   r   rx   r   ry   r   r  r  getattrr:  adapter_layerr   s     r1   r7   z-UniSpeechEncoderLayerStableLayerNorm.__init__8  s    +(0,
 
 
 z&"788,v'9v?TUUU088 "V-?VEZ [ [ [6-t44@!:6!B!BD!%Dr0   NFr%   r   r   c                 J   |}|                      |          }|                     |||          \  }}}|                     |          }||z   }||                     |                     |                    z   }| j        ||                     |          z   }|f}|r||fz  }|S r  )ry   r  r   r  r  rI  r  s           r1   r?   z,UniSpeechEncoderLayerStableLayerNorm.forwardK  s     &66)-.L] *8 *
 *
&|Q ]33%5%(9(9$:O:OP]:^:^(_(__))D,>,>},M,MMM " 	'&Gr0   r   )
r'   r(   r)   r7   r+   r   r   r   r?   rA   rB   s   @r1   rF  rF  7  s~        & & & & &, 26"'	 | !.  	       r0   rF  c                   b     e Zd Z fdZ	 	 	 	 d	dZdeej        df         dej        fdZ xZ	S )
UniSpeechEncoderStableLayerNormc                    t                                                       | _        t                    | _        t          j        j        j                  | _	        t          j
        j                  | _        t          j        fdt          j                  D                       | _        d| _        d S )Nr   c                 .    g | ]}t                    S r/   )rF  r  s     r1   r   z<UniSpeechEncoderStableLayerNorm.__init__.<locals>.<listcomp>m  s"    ccca1&99cccr0   Fr  r   s    `r1   r7   z(UniSpeechEncoderStableLayerNorm.__init__f  s    >vFF,v'9v?TUUUz&"788mcccc5IaCbCbccc
 
 ',###r0   NFTc                    |rdnd }|rdnd }|;|                     d                              dd|j        d                   }d|| <   |                     ||          }|                     |          }	||	z   }|                     |          }t                      pt          |           }
| j        D ]a}|r||fz   }t          j
        g           }| j        o|| j        j        k     }|r|
r ||||          }|d         }|rd}|r||d         fz   }b|                     |          }|r||fz   }|st          d |||fD                       S t!          |||	          S )
Nr/   r|   r   r5   r   r  r  c              3      K   | ]}||V  	d S rr   r/   r  s     r1   r  z:UniSpeechEncoderStableLayerNorm.forward.<locals>.<genexpr>  r  r0   r  )r  r   r   r!  r  r   r
   r   r  r+   r"  r   r`   r#  ry   r.   r   r$  s                  r1   r?   z'UniSpeechEncoderStableLayerNorm.forwardq  s    #7@BBD$5?bb4%$2$<$<R$@$@$G$G1mNabcNd$e$e!45M001//
 

 #11-@@%(;;]33022R6LT6R6R[ 	P 	PE# I$58H$H! #(*R..!]Z/BT[EZ/ZN! 1[ 1 !&!.Te! ! ! !.a 0 - ,  P&9]1=M<O&O#66 	E 1]4D D 	nmm]4EGZ$[mmmmmm++*
 
 
 	
r0   r   r-  c                 *   || j         j        dk    r	d|v r|nd }nw| j         j        dk    rt          ||j                  }nQ| j         j        dk    r,t	          |t
          j                  rt          |d          }nt          ||j                  }|S r/  r3  r5  s      r1   r!  z1UniSpeechEncoderStableLayerNorm._update_full_mask  r6  r0   r7  )
r'   r(   r)   r7   r?   r   r+   r   r!  rA   rB   s   @r1   rL  rL  e  s        	, 	, 	, 	, 	, "<
 <
 <
 <
|elD01 |       r0   rL  c                   >     e Zd ZdZ fdZed             Zd Z xZS )UniSpeechGumbelVectorQuantizerz
    Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH
    GUMBEL-SOFTMAX](https://huggingface.co/papers/1611.01144) for more information.
    c                    t                                                       |j        | _        |j        | _        |j        | j        z  dk    r t          d|j         d| j         d          t          j	        t          j        d| j        | j        z  |j        | j        z                      | _        t          j        |j        d         | j        | j        z            | _        d| _        d S )Nr   z`config.codevector_dim z5 must be divisible by `config.num_codevector_groups` z for concatenationr   r|   r5   )r6   r7   num_codevector_groupsr   num_codevectors_per_groupnum_varscodevector_dimr   rO   	Parameterr+   r,   codevectorsr   ri   weight_projtemperaturer   s     r1   r7   z'UniSpeechGumbelVectorQuantizer.__init__  s     68 4?2a77Y&*? Y Y59_Y Y Y   <a4=!@&BW[_[jBjkk
 
 9V_R%8$/DM:YZZ r0   c           	          |                      d          }t          j        t          j        |t          j        |dz             z  d                                                     }|S )Nr   r   gHz>r|   )meanr+   expsumlog)probsmarginal_probs
perplexitys      r1   _compute_perplexityz2UniSpeechGumbelVectorQuantizer._compute_perplexity  s^    **Y	.59^VZEZ;[;[*[ac d d ddeeiikk
r0   c                    |j         \  }}}|                     |          }|                    ||z  | j        z  d          }| j        rt
          j                            |                                | j	        d          
                    |          }t          j        |                    ||z  | j        d                                          d          }|                     |          }n|                    d          } |j        |j                              d|                    dd          d          }|                    ||z  | j        d          }|                     |          }|                    ||z  d          }|                    d          | j        z  }	|	                    ||z  | j        | j        d          }
|
                    d                              ||d          }
|
|fS )Nr|   T)tauhardr   r         ?r{   )r   r[  r   r   r   rO   r   gumbel_softmaxr   r\  type_asr+   r   re  argmax	new_zerosscatter_r  rZ  rW  r`  )r9   r%   
batch_sizesequence_lengthrQ   codevector_probscodevector_soft_distrd  codevector_idxcodevectors_per_grouprZ  s              r1   r?   z&UniSpeechGumbelVectorQuantizer.forward  s   3@3F0
O[ ((77%**:+G$/+Y[]^^= 	D!};;##%%4+;$  <    gm$$ 
 $)="":#?RTUU[[]]ce$ $ $  112FGGJJ +11b199N6}68KLUUN''A..     044Z/5QSWSbdfgg112BCCJ+00o1MrRR 0 : :2 > >AQ Q+00o1Mt`d`moqrr!oob))..z?BOOJ&&r0   )	r'   r(   r)   r*   r7   staticmethodre  r?   rA   rB   s   @r1   rS  rS    sl         
    (   \
#' #' #' #' #' #' #'r0   rS  c                   x    e Zd ZU eed<   dZdZdZdZdZ	dZ
d Zdeej        ef         fdZded	ej        fd
ZdS )UniSpeechPreTrainedModelr`   	unispeechr   Tc           
      \   t          |t                    ro|j        j        j                            dd           |j        j        j                                         t          j	        
                    |j                   dS t          |t                    rt          j	                            |j        j        ddt          j        d|j        j        d         |j        j        z  z            z             t          j	                            |j        j        d           dS t          |t&                    r}t          j        d|j        j        z            }t          j	        
                    |j        j        | |           t          j	        
                    |j        j        | |           dS t          |t          j                  rT|j        j                            d| j        j                   |j         |j        j                                         dS dS t          |t          j        t          j        f          r?|j        j                                         |j        j                            d           dS t          |t          j                  rt          j	                            |j                   |j        [t          j        |j        |j        |j        d         z  z            }t          j	        
                    |j        | |           dS dS dS )	zInitialize the weightsr   r   )r^  stdr   r5   )abNri  )r   rS  r[  rK   datanormal_rh   zero_rO   inituniform_rZ  rD   rS   mathsqrtrF   in_channels	constant_r   r   in_featuresr   r`   initializer_rangerx   r   fill_rP   kaiming_normal_rH   )r9   r   ks      r1   _init_weightsz&UniSpeechPreTrainedModel._init_weights  s    f<== 	9%*222CCC#(..000GV/00000 @AA 	9GOO"	!v{'>q'AFKD['["\]]]    
 Gfk.22222 :;; 	9	!f/;;<<AGV.5!qAAAGV.3rQ?????	** 	9M&&CT[5R&SSS{& &&((((( '&r| <== 	9K""$$$M$$S)))))	** 	9G##FM222{&Ifmv/AFDVWXDY/YZ[[  a 88888	9 	9 '&r0   input_lengthsc                 z    d }t          | j        j        | j        j                  D ]\  }} ||||          }|S )zH
        Computes the output length of the convolutional layers
        c                 <    t          j        | |z
  |d          dz   S )Nfloor)rounding_moder   )r+   div)input_lengthrF   rg   s      r1   _conv_out_lengthzSUniSpeechPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length<  s&     9\K7wWWWZ[[[r0   )zipr`   rl   rm   )r9   r  r  rF   rg   s        r1    _get_feat_extract_output_lengthsz9UniSpeechPreTrainedModel._get_feat_extract_output_lengths7  s\    
	\ 	\ 	\
 $'t{'>@W#X#X 	Q 	QK,,]KPPMMr0   feature_vector_lengthr   c                    |                     d          d d df         }|                     |                              t          j                  }|j        d         }t          j        ||f|j        |j                  }d|t          j	        |j        d         |j                  |dz
  f<   |
                    dg                               d          
                    dg                                          }|S )Nr|   r   r   )r4  devicer   )r  )cumsumr  tor+   longr   zerosr4  r  arangeflipr   )r9   r  r   non_padded_lengthsoutput_lengthsro  s         r1   "_get_feature_vector_attention_maskz;UniSpeechPreTrainedModel._get_feature_vector_attention_maskF  s     ,22r2::111b5A>>?QRRUUV[V`aa#)!,
./~7KTbTi
 
 
 uv^%9!%<^EZ[[[]kno]opq',,bT2299"==BBB4HHMMOOr0   N)r'   r(   r)   r   r-   base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr  r   r+   
LongTensorr   r  r  r/   r0   r1   rw  rw    s         #$O&*#N9 9 9BeEDTVYDY>Z     ]b]m      r0   rw  r   	mask_probmask_length	min_masksr   c                 @   | \  }dk     rt          d          k    rt          d d d          t          j                            d                                          fd}|9|                                                    d                                          nfd	t          |          D             }t          j	        |ft          
          }g }	 |          }
|
dk    r|S |D ]} ||          }t          j                            t          j        |dz
  z
            |d          }t          |          dk    rdz
  }n|d         }t          j        |t          j        |
|z
  t          j        
          |z  g          }|	                    |           t          j        |	          }	t          j        |	dddddf         ||
f          }	|	                    ||
z            }	t          j                  ddddf         }t          j        |||
f                              ||
z            }|	|z   }	|	                                dz
  k    rdz
  |	|	dz
  k    <   t          j        ||	dd           |S )an  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                     t          | z  z  z             }t          |          }|z  k    rz  }| dz
  z
  |k     rt          | dz
  z
  d          }|S )z;Given input length, compute how many spans should be maskedr   r   )r   max)r  num_masked_spanepsilonr  r  r  rp  s     r1   compute_num_masked_spanz6_compute_mask_indices.<locals>.compute_num_masked_span|  s~    i,6DwNOOoy99 [(?::-<O ;?+o==!,+/"BAFFOr0   Nr|   c                     g | ]}S r/   r/   )r   r	  rp  s     r1   r   z)_compute_mask_indices.<locals>.<listcomp>  s    999!o999r0   r4  r   F)replace)r   nprandomr"  itemdetachr`  tolistr   r  r   choicer  lenconcatenateonesint32appendarraybroadcast_tor   r  put_along_axis)r   r  r  r   r  ro  r  r  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanr  r  spec_aug_mask_idxdummy_mask_idxoffsetsr  rp  s    `` `           @@r1   _compute_mask_indicesr  V  sP   0 #(JQABBB_$$:^i : :'6: : :
 
 	
 innQ$$&&G        $ % 	##B''..0009999uZ'8'8999  Hj/:$GGGM11/BBa% 5 511,?? I,,IlkAo677RW - 
 
  !!Q&& -q0NN.q1NN(;o(MUWU] ^ ^ ^ao op
 
 	!!"34444"455 111aaa:&5H+(V  ,33J@SVa@abb i$$T4]3Gog
4G'UVV^^'+5 G ,g5 /A"555GVYZGZ-!0CCD m%7B???r0   c                   *    e Zd Zdef fdZ	 	 ddej        deej                 deej                 fdZ	e
	 	 	 	 	 ddeej                 deej                 deej                 d	ee         d
ee         dee         deeef         fd            Z xZS )UniSpeechModelr`   c                    t                                          |           || _        t          |          | _        t          |          | _        |j        dk    s|j        dk    rBt          j
        t          j        |j                                                            | _        |j        rt#          |          | _        nt'          |          | _        |                                  d S )Nr   )r6   r7   r`   r   feature_extractorr   feature_projectionmask_time_probmask_feature_probrO   rY  r+   r   rQ   r  masked_spec_embeddo_stable_layer_normrL  encoderr  	post_initr   s     r1   r7   zUniSpeechModel.__init__  s       !8!@!@"<V"D"D 3&&&*BS*H*H%'\%,v?Q2R2R2[2[2]2]%^%^D"& 	4:6BBDLL+F33DL 	r0   Nr%   mask_time_indicesr   c                    t          | j        dd          s|S |                                \  }}}|#| j                            |j                  ||<   n| j        j        dk    r| j        r|t          ||f| j        j        | j        j	        || j        j
                  }t          j        ||j        t          j                  }| j                            |j                  ||<   | j        j        dk    r| j        rt          ||f| j        j        | j        j        | j        j                  }t          j        ||j        t          j                  }|dddf                             d|d          }d||<   |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTNr   )r  r  r   r  )r  r4  )r  r  r  r|   )rH  r`   r   r  r  r4  r  r   r  mask_time_lengthmask_time_min_masksr+   r8  r  r   r  mask_feature_lengthmask_feature_min_masksexpand)r9   r%   r  r   ro  rp  rQ   mask_feature_indicess           r1   _mask_hidden_statesz"UniSpeechModel._mask_hidden_states  s    t{$8$?? 	!   4A3E3E3G3G0
O[(/3/E/H/HI\/]/]M+,,['!+++ 5_-+4 K8-+9! ! ! !&->}G[chcm n n n/3/E/H/HI\/]/]M+,;(1,,,#8[)+7 K;+<	$ $ $  $)<0D]Mainis#t#t#t #74#@#G#GO]_#`#` 23M./r0   r   r   r  r  r   c                    ||n| j         j        }||n| j         j        }||n| j         j        }|                     |          }|                    dd          }|!|                     |j        d         |          }|                     |          \  }}| 	                    |||          }| 
                    |||||          }	|	d         }|s||f|	dd         z   S t          |||	j        |	j                  S )a/  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        Nr   r5   )r  r   r   r   r  r  r   )r  extract_featuresr%   r&   )r`   r   r  use_return_dictr  rb   r  r   r  r  r  UniSpeechBaseModelOutputr%   r&   )
r9   r   r   r  r   r  r  r  r%   encoder_outputss
             r1   r?   zUniSpeechModel.forward  sW    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]11,??+55a;;%!DDEUE[\]E^`nooN*.*A*ABR*S*S''00->~ 1 
 
 ,,)/!5# ' 
 
 (* 	K!#34qrr7JJJ'+-)7&1	
 
 
 	
r0   r  NNNNN)r'   r(   r)   r   r7   r+   r,   r   r  r  r   r   r   r   r.   r  r?   rA   rB   s   @r1   r  r    s<             ( :>59	, ,(, $E$56, !!12	, , , ,\  269=,0/3&*2
 2
u|,2
 !.2
 $E$56	2

 $D>2
 'tn2
 d^2
 
u..	/2
 2
 2
 ^2
 2
 2
 2
 2
r0   r  zZ
    UniSpeech Model with a vector-quantization module and ctc loss for pre-training.
    c                        e Zd Zdef fdZdefdZd Zd Ze		 dde
j        d	e
j        d
e
j        defd            Ze	 	 	 	 ddee
j                 dee
j                 dee         dee         dee         deeef         fd            Z xZS )UniSpeechForPreTrainingr`   c                    t                                          |           t          |          | _        t	          j        |j                  | _        t          |          | _	        t	          j
        |j        |j                  | _        t	          j
        |j        |j                  | _        t	          j
        |j        |j                  | _        t	          j        |j                  | _        |                                  d S rr   )r6   r7   r  rx  rO   r   feat_quantizer_dropoutdropout_featuresrS  	quantizerr   rX  proj_codevector_dim	project_qrQ   project_hidnum_ctc_classesctc_projfinal_dropoutr   r  r   s     r1   r7   z UniSpeechForPreTraining.__init__M  s       '// "
6+H I I7??6#8&:TUU9V%?ASTT	&"4f6LMMz&"677 	r0   r\  c                     || j         _        dS )zb
        Set the Gumbel softmax temperature to a given value. Only necessary for training
        N)r  r\  )r9   r\  s     r1   set_gumbel_temperaturez.UniSpeechForPreTraining.set_gumbel_temperature\  s     &1"""r0   c                 b    t          j        dt                     |                                  dS z
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.NwarningswarnFutureWarningfreeze_feature_encoderr9   s    r1   freeze_feature_extractorz0UniSpeechForPreTraining.freeze_feature_extractorb  ;    
 	Q	
 	
 	

 	##%%%%%r0   c                 B    | j         j                                         dS 
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        Nrx  r  r   r  s    r1   r  z.UniSpeechForPreTraining.freeze_feature_encodern  !    
 	(;;=====r0   r   target_featuresnegative_featurespredicted_featuresc                     t          j        | |gd          } t          j        |                                |                                 d          }|                    |           }||z  }|S )z
        Compute logits for contrastive loss based using cosine similarity as the distance measure between
        `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
        r   r   r|   )r+   catcosine_similarityr   rk  )r  r  r  r\  logitss        r1   compute_contrastive_logitsz2UniSpeechForPreTraining.compute_contrastive_logitsu  sq      )_6G$HaPPP();)A)A)C)C_EZEZE\E\bdeee00 +%r0   Nr   r   r   r  r  r   c                 N   ||n| j         j        }|                     |||||          }|d         }|                     |d                   }|                     |          \  }	}
|                     |	                    | j        j        j                            }	| 	                    |	          }	t          j        |                    d          |                    d                                        | j         j                  }|                    dd          }t          j        |                                                              |j                  }|                    dd          }|                    d          }|                    |d          |	                    | d          z   }|                     |          }|                     |          }d}|s#||||	|
f|dd         z   S ||	|
f|dd         z   S t/          |||	|
|j        |j                  S )	a  
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, UniSpeechForPreTraining

        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> # TODO: Add full pretraining example
        ```Nr  r   r   r|   r   r5   )r!   r"   r#   r$   r%   r&   )r`   r  rx  r  r  r  r  rK   r4  r  r+   emptyr   r  replace_probrb   	bernoullir   r  r  masked_fillr   r  r    r%   r&   )r9   r   r   r   r  r  r
  transformer_featuresr  quantized_featuresr$   prob_replace_matrixsampled_replace_matrixr  r!   s                  r1   r?   zUniSpeechForPreTraining.forward  sW   * &1%<kk$+B]..)/!5# ! 
 
  'qz  00<<48NNCS4T4T11 "^^,>,A,A$.BWB],^,^__!--.@AA#k*>*C*CA*F*FH\HaHabcHdHdeekkK$
 
 2;;AqAA!&1D!E!E!J!J!L!L!O!OPdPk!l!l!7!A!A!Q!G!G!7!A!A"!E!E%112H#NN**,B+BCHH

 f%%v&&  	c24FH]^ahijikikalll(*<>STW^_`_a_aWbbb,1'9"7!/)
 
 
 	
r0   )r   )NNNN)r'   r(   r)   r   r7   r   r  r  r  ru  r+   r,   r  r   r   r   r   r   r.   r    r?   rA   rB   s   @r1   r  r  G  su             1# 1 1 1 1
& 
& 
&> > > 
 	 * , "- 	   \&  26,0/3&*D
 D
u|,D
 !.D
 $D>	D

 'tnD
 d^D
 
u33	4D
 D
 D
 ^D
 D
 D
 D
 D
r0   r  r5   zq
    UniSpeech Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    c                        e Zd Zddee         f fdZd Zd Zd Zd Z	e
	 	 	 	 	 ddeej                 d	eej                 d
ee         dee         dee         deej                 deeef         fd            Z xZS )UniSpeechForCTCNtarget_langc                    t                                          |           t          |          | _        t	          j        |j                  | _        || _        |j	        t          d| j         d          t          |d          r|j        r|j        n|j        }t	          j        ||j	                  | _        |                                  dS )a3  
        target_lang (`str`, *optional*):
            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
            adapter.<lang>.bin. Only relevant when using an instance of [`UniSpeechForCTC`] with adapters. Uses 'eng' by
            default.
        NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `UniSpeechForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.add_adapter)r6   r7   r  rx  rO   r   r  r   r  
vocab_sizer   r;   rU   r  output_hidden_sizerQ   r   lm_headr  )r9   r`   r  r  r;   s       r1   r7   zUniSpeechForCTC.__init__  s     	   '//z&"677&$H H H H   *1)G)GvFL^vF%%djdv 	 y!3V5FGG 	r0   c                    | j         }|)t          | j        dd          t          d| d          |2t          | j        dd          t                              d           dS ||                     |d           dS dS )a'  
        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
        passing `target_lang=...` to `from_pretrained(...)`.

        This method is **not** supposed to be called by the user and is prone to be changed in the future.
        Nr<  zCannot pass `target_lang`: z- if `config.adapter_attn_dim` is not defined.z)By default `target_lang` is set to 'eng'.T)
force_load)r  rH  r`   r   loggerinfoload_adapter)r9   r  s     r1   tie_weightszUniSpeechForCTC.tie_weights  s     &"wt{<NPT'U'U']u;uuuvvv WT[:Ld%S%S%_KKCDDDDD$kd;;;;; %$r0   c                 b    t          j        dt                     |                                  dS )r  r  Nr  r  s    r1   r  z(UniSpeechForCTC.freeze_feature_extractor  r  r0   c                 B    | j         j                                         dS r  r  r  s    r1   r  z&UniSpeechForCTC.freeze_feature_encoder  r  r0   c                 L    | j                                         D ]	}d|_        
dS z
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        FNrx  r   r   r   s     r1   freeze_base_modelz!UniSpeechForCTC.freeze_base_model  6    
 ^..00 	( 	(E"'E	( 	(r0   r   r   r   r  r  labelsr   c           
      p   ||n| j         j        }|>|                                | j         j        k    rt	          d| j         j                   |                     |||||          }|d         }|                     |          }|                     |          }	d}
|Z||nt          j	        |t          j
                  }|                     |                    d                                        t          j
                  }|dk    }|                    d          }|                    |          }t          j                            |	dt          j                                      dd          }t          j        j                            d	
          5  t          j                            ||||| j         j        | j         j        | j         j                  }
ddd           n# 1 swxY w Y   |s|	f|t6          d         z   }|
|
f|z   n|S t9          |
|	|j        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nz$Label values must be <= vocab_size: r  r   r  r|   )rM   r4  r   F)enabled)blank	reductionzero_infinityr!   r  r%   r&   )r`   r  r  r  r   rx  r   r  r+   	ones_liker  r  r`  r  masked_selectrO   r   log_softmaxfloat32rb   backendscudnnflagsctc_losspad_token_idctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   r%   r&   )r9   r   r   r   r  r  r+  r
  r%   r  r!   r  labels_masktarget_lengthsflattened_targets	log_probsoutputs                    r1   r?   zUniSpeechForCTC.forward'  s   " &1%<kk$+B]&**,,$+2H"H"H\DKDZ\\]]]..)/!5# ! 
 
  
]33m,, #1"<%/R^fkfpBqBqBq  !AA.BTBTUWBXBXYY\\]b]ghhM !A+K(__R00N & 4 4[ A A 11&b1VV``abdeffI%++E+:: 	 	}--%!"+2"k<"&+"? .  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	  	FY)F)G)G!HHF)-)9TGf$$vEfG4IV]Vh
 
 
 	
s    AG11G58G5rr   r  )r'   r(   r)   r   r   r7   r#  r  r  r)  r   r+   r   r   r   r.   r   r?   rA   rB   s   @r1   r  r    s>        HSM      :< < <*
& 
& 
&> > >( ( (  26,0/3&*)-D
 D
u|,D
 !.D
 $D>	D

 'tnD
 d^D
 &D
 
un$	%D
 D
 D
 ^D
 D
 D
 D
 D
r0   r  z
    UniSpeech Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                        e Zd Z fdZd Zd Zd Ze	 	 	 	 	 ddee	j
                 dee	j
                 dee         d	ee         d
ee         dee	j
                 deeef         fd            Z xZS )"UniSpeechForSequenceClassificationc                    t                                          |           t          |d          r|j        rt	          d          t          |          | _        |j        dz   }|j        r.t          j
        t          j        |          |z            | _        t          j        |j        |j                  | _        t          j        |j        |j                  | _        |                                  d S )Nr  z`Sequence classification does not support the use of UniSpeech adapters (config.add_adapter=True)r   )r6   r7   rU   r  r   r  rx  r  use_weighted_layer_sumrO   rY  r+   r  layer_weightsr   rQ   classifier_proj_size	projector
num_labels
classifierr  )r9   r`   
num_layersr;   s      r1   r7   z+UniSpeechForSequenceClassification.__init__v  s       6=)) 	f.@ 	r   (//-1
( 	S!#ej.D.Dz.Q!R!RD6#5v7RSS)F$?ARSS 	r0   c                 b    t          j        dt                     |                                  dS r  r  r  s    r1   r  z;UniSpeechForSequenceClassification.freeze_feature_extractor  r  r0   c                 B    | j         j                                         dS r  r  r  s    r1   r  z9UniSpeechForSequenceClassification.freeze_feature_encoder  r  r0   c                 L    | j                                         D ]	}d|_        
dS r'  r(  r   s     r1   r)  z4UniSpeechForSequenceClassification.freeze_base_model  r*  r0   Nr   r   r   r  r  r+  r   c                 d   ||n| j         j        }| j         j        rdn|}|                     |||||          }| j         j        rx|t                   }t          j        |d          }t          j        	                    | j
        d          }	||	                    ddd          z                      d          }n|d         }|                     |          }||                    d          }
n|                     |j        d         |          }|                    d                              dd|j        d                   }d	|| <   |                    d          |                    d                              dd          z  }
|                     |
          }d}|Kt)                      } ||                    d| j         j                  |                    d                    }|s|f|t          d         z   }||f|z   n|S t-          |||j        |j        
          S )a  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
            into a tensor of type `torch.FloatTensor`. See [`UniSpeechProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTr  r   r   r|   r   r5   r   r1  )r`   r  rF  rx  r=  r+   stackrO   r   r   rG  r   r`  rI  r^  r  r   r  r   rK  r   rJ  r   r%   r&   )r9   r   r   r   r  r  r+  r
  r%   norm_weightspooled_outputpadding_maskexpand_padding_maskr  r!   loss_fctrB  s                    r1   r?   z*UniSpeechForSequenceClassification.forward  sW   . &1%<kk$+B]'+{'IcttOc..)/!5# ! 
 
 ;- 	'#$ABM!K1===M=001C0LLL*\->->r1a-H-HHMMRSMTTMM#AJM}55!)..1.55MMBB=CVWXCY[ijjL"."8"8"<"<"C"CAq-J]^_J`"a"a25M../)--!-44|7G7GA7G7N7N7S7STVXY7Z7ZZM//'))H8FKKDK,BCCV[[QS__UUD 	FY)F)G)G!HHF)-)9TGf$$vE'!/)	
 
 
 	
r0   r  )r'   r(   r)   r7   r  r  r)  r   r   r+   r   r   r   r.   r   r?   rA   rB   s   @r1   rD  rD  o  s           "
& 
& 
&> > >( ( (  26,0/3&*)-B
 B
u|,B
 !.B
 $D>	B

 'tnB
 d^B
 &B
 
u..	/B
 B
 B
 ^B
 B
 B
 B
 B
r0   rD  )r  r  rD  r  rw  )Nr   Nr=   )Rr  r  dataclassesr   typingr   r   r   numpyr  r+   torch.nnrO   r   activationsr	   integrations.deepspeedr
   integrations.fsdpr   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   processing_utilsr   rT   r   r   r   configuration_unispeechr   integrations.flex_attentionr   
get_loggerr'   r   r    Moduler3   rD   rd   ru   r~   r   r   r   r   r   r   r   r   r  r:  rF  rL  rS  rw  r.   r   r  ndarrayr  r  r  r  r=  r  rD  __all__r/   r0   r1   <module>rj     s  ,   ! ! ! ! ! ! , , , , , , , , , ,            % % % % % % ! ! ! ! ! ! @ @ @ @ @ @ 7 7 7 7 7 7 g g g g g g g g B B B B B B 9 9 9 9 9 9              G F F F F F F F & & & & & & J J J J J J J J J J 4 4 4 4 4 4  !! KJJJJJJ 
	H	%	%   
: : : : :K : :  :.    BI   * * * * *ry * * *Z    $>   *    "<   6    "<   0& & & & &bi & & &R1 1 1 1 1 1 1 1*  $(,% %I%<% 
% <	%
 U\*% e_% % %% % % %<U/ U/ U/ U/ U/ U/ U/ U/p    29   0! ! ! ! !6 ! ! !HZ Z Z Z Zry Z Z Zz    	   2+ + + + ++E + + +\^ ^ ^ ^ ^bi ^ ^ ^BC' C' C' C' C'RY C' C' C'L F F F F F F F FZ 26t tc?tt t U-.	t
 t Zt t t tn 3  s
 s
 s
 s
 s
- s
 s
 s
l   
B
 B
 B
 B
 B
6 B
 B
 
B
J !"    
S
 S
 S
 S
 S
. S
 S
 
S
l   p
 p
 p
 p
 p
)A p
 p
 p
f  r0   