
     `i_                        d dl Z d dlmZmZmZ d dlZd dlZd dlm	Z	 d dlm
Z
 ddlmZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z&  e#            rddl'm(Z(  e$j)        e*          Z+ G d de	j,                  Z- G d de	j,                  Z. G d de          Z/ G d de          Z0 G d de          Z1 G d de	j,                  Z2 G d de	j,                  Z3	 	 	 dJd!e	j,        d"ej4        d#ej4        d$ej4        d%eej4                 d&ee5         d'e5d(eej4                 fd)Z6 G d* d+e	j,                  Z7 G d, d-e	j,                  Z8 G d. d/e          Z9 G d0 d1e	j,                  Z: G d2 d3e	j,                  Z; G d4 d5e          Z< G d6 d7e	j,                  Z=e" G d8 d9e                      Z>	 	 dKd:e?e@e@f         d;e5d<e@d%eejA                 d=e@d>ejB        fd?ZCe" G d@ dAe>                      ZDdZE e"dBC           G dD dEe>                      ZF e"dFC           G dG dHe>                      ZGg dIZHdS )L    N)CallableOptionalUnion)CrossEntropyLoss   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputCausalLMOutputSequenceClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringis_torch_flex_attn_availablelogging   )HubertConfig)make_flex_block_causal_maskc                   $     e Zd Z fdZd Z xZS )HubertPositionalConvEmbeddingc                 &   t                                                       t          j        |j        |j        |j        |j        dz  |j                  | _        d | _        |j	        r t          j
        |j                  | _        nWt          j        j        }t          t          j        j        d          rt          j        j        j        }t                      rdd l}|j                            | j        j        d          5   || j        dd          | _        d d d            n# 1 swxY w Y   t          | j        d          r-| j        j        j        j        }| j        j        j        j        }n| j        j        }| j        j        }|j                            | |           |j                            | |           n || j        dd          | _        t3          |j                  | _        t6          |j                 | _        d S )	N   )kernel_sizepaddinggroupsweight_normr   modifier_rankweight)namedimparametrizations)super__init__nnConv1dhidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupsconv
batch_normconv_pos_batch_normBatchNorm1dutilsr"   hasattrr(   r	   	deepspeedzeroGatheredParametersr%   	original0	original1weight_gweight_vregister_external_parameterHubertSamePadLayerr    r   feat_extract_activation
activation)selfconfigr"   r6   r;   r<   	__class__s         ~/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/hubert/modeling_hubert.pyr*   z&HubertPositionalConvEmbedding.__init__3   s'   I62a77
 
 
	 % 	I nV-?@@DOO(.Krx0-@@ D h7C)++ I    ^66ty7GWX6YY M M +DIH! L L LDIM M M M M M M M M M M M M M M49&899 2#y9@JH#y9@JHH#y1H#y1H::4JJJ::4JJJJ'K	aHHH	)&*HII !?@s   D--D14D1c                    |                     dd          }| j        |                     |          }|                     |          }|                     |          }|                     |          }|                     dd          }|S )Nr   r   )	transposer1   r0   r    r@   rA   hidden_statess     rD   forwardz%HubertPositionalConvEmbedding.forwardX   s~    %//155?& OOM::M		-00]3366%//155    __name__
__module____qualname__r*   rI   __classcell__rC   s   @rD   r   r   2   sM        #A #A #A #A #AJ	 	 	 	 	 	 	rJ   r   c                   $     e Zd Z fdZd Z xZS )r>   c                 l    t                                                       |dz  dk    rdnd| _        d S )Nr   r   r   )r)   r*   num_pad_remove)rA   r.   rC   s     rD   r*   zHubertSamePadLayer.__init__e   s:    #:Q#>!#C#CaarJ   c                 J    | j         dk    r|d d d d d | j          f         }|S Nr   )rS   rG   s     rD   rI   zHubertSamePadLayer.forwardi   s;    "")!!!QQQ0F43F2F0F*FGMrJ   rK   rP   s   @rD   r>   r>   d   sL        K K K K K      rJ   r>   c                   &     e Zd Zd fd	Zd Z xZS )HubertNoLayerNormConvLayerr   c                 Z   t                                                       |dk    r|j        |dz
           nd| _        |j        |         | _        t          j        | j        | j        |j        |         |j        |         |j	                  | _
        t          |j                 | _        d S )Nr   r   r   stridebias)r)   r*   conv_dimin_conv_dimout_conv_dimr+   r,   conv_kernelconv_stride	conv_biasr0   r   r?   r@   rA   rB   layer_idrC   s      rD   r*   z#HubertNoLayerNormConvLayer.__init__p   s    <DqLL6?8a<88a"OH5I*84%h/!
 
 
	 !!?@rJ   c                 Z    |                      |          }|                     |          }|S N)r0   r@   rG   s     rD   rI   z"HubertNoLayerNormConvLayer.forward~   s*    		-0066rJ   r   rK   rP   s   @rD   rW   rW   o   sR        A A A A A A      rJ   rW   c                   &     e Zd Zd fd	Zd Z xZS )HubertLayerNormConvLayerr   c                    t                                                       |dk    r|j        |dz
           nd| _        |j        |         | _        t          j        | j        | j        |j        |         |j        |         |j	                  | _
        t          j        | j        d          | _        t          |j                 | _        d S )Nr   r   rY   T)elementwise_affine)r)   r*   r\   r]   r^   r+   r,   r_   r`   ra   r0   	LayerNorm
layer_normr   r?   r@   rb   s      rD   r*   z!HubertLayerNormConvLayer.__init__   s    <DqLL6?8a<88a"OH5I*84%h/!
 
 
	 ,t'8TRRR !?@rJ   c                     |                      |          }|                    dd          }|                     |          }|                    dd          }|                     |          }|S )N)r0   rF   rl   r@   rG   s     rD   rI   z HubertLayerNormConvLayer.forward   se    		-00%//B7766%//B7766rJ   rf   rK   rP   s   @rD   rh   rh      sR        A A A A A A      rJ   rh   c                   &     e Zd Zd fd	Zd Z xZS )HubertGroupNormConvLayerr   c                    t                                                       |dk    r|j        |dz
           nd| _        |j        |         | _        t          j        | j        | j        |j        |         |j        |         |j	                  | _
        t          |j                 | _        t          j        | j        | j        d          | _        d S )Nr   r   rY   T)
num_groupsnum_channelsaffine)r)   r*   r\   r]   r^   r+   r,   r_   r`   ra   r0   r   r?   r@   	GroupNormrl   rb   s      rD   r*   z!HubertGroupNormConvLayer.__init__   s    <DqLL6?8a<88a"OH5I*84%h/!
 
 
	 !!?@,$2CRVRclpqqqrJ   c                     |                      |          }|                     |          }|                     |          }|S re   )r0   rl   r@   rG   s     rD   rI   z HubertGroupNormConvLayer.forward   s;    		-006666rJ   rf   rK   rP   s   @rD   rq   rq      sR        r r r r r r       rJ   rq   c                   .     e Zd ZdZ fdZd Zd Z xZS )HubertFeatureEncoderz.Construct the features from raw audio waveformc                    t                                                       j        dk    r7t          d          gfdt	          j        dz
            D             z   }nDj        dk    r!fdt	          j                  D             }nt          dj         d	          t          j        |          | _	        d
| _
        d| _        d S )Ngroupr   rc   c                 8    g | ]}t          |d z             S )r   r|   )rW   .0irB   s     rD   
<listcomp>z1HubertFeatureEncoder.__init__.<locals>.<listcomp>   s>     L L LGH*6AEBBBL L LrJ   r   layerc                 2    g | ]}t          |           S )r|   )rh   r~   s     rD   r   z1HubertFeatureEncoder.__init__.<locals>.<listcomp>   s'    wwwA3FQGGGwwwrJ   z`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)r)   r*   feat_extract_normrq   rangenum_feat_extract_layers
ValueErrorr+   
ModuleListconv_layersgradient_checkpointing_requires_grad)rA   rB   r   rC   s    ` rD   r*   zHubertFeatureEncoder.__init__   s   #w..3FQGGGH L L L LLQRXRpstRtLuLuL L L KK %00wwwwQVW]WuQvQvwwwKKt1Ittt   =55&+#"rJ   c                 P    |                                  D ]	}d|_        
d| _        d S NF)
parametersrequires_gradr   rA   params     rD   _freeze_parametersz'HubertFeatureEncoder._freeze_parameters   s4    __&& 	( 	(E"'E#rJ   c                 r    |d d d f         }| j         r| j        rd|_        | j        D ]} ||          }|S )NT)r   trainingr   r   )rA   input_valuesrH   
conv_layers       rD   rI   zHubertFeatureEncoder.forward   s[    $QQQW-  	/4= 	/*.M'* 	6 	6J&J}55MMrJ   )rL   rM   rN   __doc__r*   r   rI   rO   rP   s   @rD   ry   ry      s\        88# # # # #"$ $ $

 
 
 
 
 
 
rJ   ry   c                   $     e Zd Z fdZd Z xZS )HubertFeatureProjectionc                 T   t                                                       |j        | _        | j        r+t          j        |j        d         |j                  | _        t          j        |j        d         |j	                  | _
        t          j        |j                  | _        d S )Nro   eps)r)   r*   feat_proj_layer_normr+   rk   r\   layer_norm_epsrl   Linearr-   
projectionDropoutfeat_proj_dropoutdropoutrA   rB   rC   s     rD   r*   z HubertFeatureProjection.__init__   s    $*$?!$ 	[ l6?2+>FDYZZZDO)FOB$79KLLz&":;;rJ   c                     | j         r|                     |          }|                     |          }|                     |          }|S re   )r   rl   r   r   rG   s     rD   rI   zHubertFeatureProjection.forward   sF    $ 	; OOM::M66]33rJ   rK   rP   s   @rD   r   r      sG        < < < < <      rJ   r           modulequerykeyvalueattention_maskscalingr   	head_maskc                    ||                     d          dz  }t          j        ||                    dd                    |z  }	||	|z   }	t          j                            |	d          }	||	|                    dddd          z  }	t          j                            |	|| j	                  }	t          j        |	|          }
|
                    dd          
                                }
|
|	fS )Nro         r   r   r'   r   )pr   )sizetorchmatmulrF   r+   
functionalsoftmaxviewr   r   
contiguous)r   r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs              rD   eager_attention_forwardr      s     **R..D(<s}}Q':':;;gEL!#n4=((2(>>L#innQAq&A&AA=((6?([[L,|U33K''1--88::K$$rJ   c                   >    e Zd ZdZ	 	 	 	 	 ddededed	ed
ededee         f fdZ		 	 	 	 dde
j        dee
j                 dee
j                 dee
j                 dee         dee         dee
j        ee
j                 eee
j                          f         fdZ xZS )HubertAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr   FTN	embed_dim	num_headsr   
is_decoderr[   	is_causalrB   c                 
   t                                                       || _        || _        || _        ||z  | _        || _        | j        |z  | j        k    rt          d| j         d| d          | j        dz  | _        || _	        || _
        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   )r[   )r)   r*   r   r   r   head_dimrB   r   r   r   r   r+   r   k_projv_projq_projout_proj)	rA   r   r   r   r   r[   r   rB   rC   s	           rD   r*   zHubertAttention.__init__  s    	""!Y.MI%$.883dn 3 3%.3 3 3   }d*$"i	94@@@i	94@@@i	94@@@	)YTBBBrJ   rH   key_value_statesr   layer_head_maskoutput_attentionsr   returnc                    |du}|j         dd         \  }}	|r|j         d         n|	}
||	d| j        f}||
d| j        f} |                     |          j        |                     dd          }|r|n|} |                     |          j        |                     dd          } |                     |          j        |                     dd          }t          }| j        j	        dk    rt          | j        j	                 } || ||||f| j        sdn| j        | j        ||d|\  }}|                    ||	d                                          }|                     |          }||dfS )z#Input shape: Batch x Time x ChannelNro   r   r   eagerr   )r   r   r   r   )shaper   r   r   rF   r   r   r   rB   _attn_implementationr   r   r   r   reshaper   r   )rA   rH   r   r   r   r   r   is_cross_attentionbsztgt_lensrc_lenq_input_shapekv_input_shapequery_statescurrent_states
key_statesvalue_statesattention_interfacer   r   s                       rD   rI   zHubertAttention.forward/  s    .T9 %*3B3/W/AN"(++wgr4=9wDM: 7t{{=116FPPQRTUVV-?R))]5T[[005~FPPQRTUVV
7t{{>227HRRSTVWXX(?;+w66"9$+:Z"[$7$7%
  $}>CC$,L/%%
 %
 %
 %
!\ "))#w;;FFHHmmK00L$..rJ   )r   FTFN)NNNF)rL   rM   rN   r   intfloatboolr   r   r*   r   Tensorr   r   tuplerI   rO   rP   s   @rD   r   r     s^       GG  )-C CC C 	C
 C C C &C C C C C CD 481526,13/ 3/|3/ #5<03/ !.	3/
 "%,/3/ $D>3/ -.3/ 
u|Xel3XeEL>Q5RR	S3/ 3/ 3/ 3/ 3/ 3/ 3/ 3/rJ   r   c                   $     e Zd Z fdZd Z xZS )HubertFeedForwardc                    t                                                       t          j        |j                  | _        t          j        |j        |j                  | _	        t          |j        t                    rt          |j                 | _        n|j        | _        t          j        |j        |j                  | _        t          j        |j                  | _        d S re   )r)   r*   r+   r   activation_dropoutintermediate_dropoutr   r-   intermediate_sizeintermediate_dense
isinstance
hidden_actstrr   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutr   s     rD   r*   zHubertFeedForward.__init__f  s    $&Jv/H$I$I!"$)F,>@X"Y"Yf'-- 	9'-f.?'@D$$'-'8D$If&>@RSS j)>??rJ   c                     |                      |          }|                     |          }|                     |          }|                     |          }|                     |          }|S re   )r   r   r   r   r   rG   s     rD   rI   zHubertFeedForward.forwards  sg    //>>00??11-@@))-88++M::rJ   rK   rP   s   @rD   r   r   e  sL        @ @ @ @ @      rJ   r   c                   &     e Zd Z fdZddZ xZS )HubertEncoderLayerc                    t                                                       t          |j        |j        |j        d|          | _        t          j        |j	                  | _
        t          j        |j        |j                  | _        t          |          | _        t          j        |j        |j                  | _        d S )NFr   r   r   r   rB   r   )r)   r*   r   r-   num_attention_headsattention_dropout	attentionr+   r   r   r   rk   r   rl   r   feed_forwardfinal_layer_normr   s     rD   r*   zHubertEncoderLayer.__init__~  s    ((0,
 
 
 z&"788,v'9v?TUUU-f55 "V-?VEZ [ [ [rJ   NFc                    |}|                      |||          \  }}}|                     |          }||z   }|                     |          }||                     |          z   }|                     |          }|f}|r||fz  }|S Nr   r   )r   r   rl   r   r   rA   rH   r   r   attn_residualr   _outputss           rD   rI   zHubertEncoderLayer.forward  s    %)-.L] *8 *
 *
&|Q ]33%566%(9(9-(H(HH--m<< " 	'&GrJ   r   rK   rP   s   @rD   r   r   }  sQ        \ \ \ \ \       rJ   r   c                        e Zd Z fdZ	 	 	 	 ddej        deej                 deded	ef
d
Z	de
ej        df         dej        fdZ xZS )HubertEncoderc                    t                                                       | _        t                    | _        t          j        j        j                  | _	        t          j
        j                  | _        t          j        fdt          j                  D                       | _        d| _        d S )Nr   c                 .    g | ]}t                    S  )r   r   r   rB   s     rD   r   z*HubertEncoder.__init__.<locals>.<listcomp>  s"    $i$i$iA%7%?%?$i$i$irJ   Fr)   r*   rB   r   pos_conv_embedr+   rk   r-   r   rl   r   r   r   r   r   num_hidden_layerslayersr   r   s    `rD   r*   zHubertEncoder.__init__  s    ;FCC,v'9v?TUUUz&"788m$i$i$i$ivOgIhIh$i$i$ijj&+###rJ   NFTrH   r   r   output_hidden_statesreturn_dictc                    |rdnd }|rdnd }|;|                     d                              dd|j        d                   }d|| <   |                     ||          }|                     |          }	||	z   }|                     |          }|                     |          }t                      pt          |           }
| j	        D ]a}|r||fz   }t          j        g           }| j        o|| j        j        k     }|r|
r ||||          }|d         }|rd}|r||d         fz   }b|r||fz   }|st          d |||fD                       S t!          |||	          S )
Nr  ro   r   r   r   r   NNc              3      K   | ]}||V  	d S re   r  r   vs     rD   	<genexpr>z(HubertEncoder.forward.<locals>.<genexpr>  (      mmq_`_l_l_l_l_lmmrJ   last_hidden_staterH   
attentions)	unsqueezerepeatr   _update_full_maskr  rl   r   r	   r
   r
  r   randr   rB   	layerdropr   r   rA   rH   r   r   r  r  all_hidden_statesall_self_attentionsexpand_attention_maskposition_embeddingssynced_gpusr   dropout_probabilityskip_the_layerlayer_outputss                  rD   rI   zHubertEncoder.forward  s    #7@BBD$5?bb4%$2$<$<R$@$@$G$G1mNabcNd$e$e!45M001//
 

 #11-@@%(;;66]33022R6LT6R6R[ 	P 	PE# I$58H$H! #(*R..!]Z/BT[EZ/ZN! 1[ 1 %!.Te! ! ! !.a 0 - ,  P&9]1=M<O&O# 	E 1]4D D 	nmm]4EGZ$[mmmmmm++*
 
 
 	
rJ   inputs_embedsc                 *   || j         j        dk    r	d|v r|nd }nw| j         j        dk    rt          ||j                  }nQ| j         j        dk    r,t	          |t
          j                  rt          |d          }nt          ||j                  }|S Nflash_attention_2r   sdpaflex_attentionF)r   	rB   r   r   dtyper   r   r   r   r   rA   r   r%  s      rD   r  zHubertEncoder._update_full_mask      
 %{/3FFF343F3FD1V;; "E^UbUh!i!i15EEEnel;; b%@[`%a%a%aN "<NML_!`!`rJ   NFFT)rL   rM   rN   r*   r   tensorr   r   r   rI   r   r  rO   rP   s   @rD   r  r    s        , , , , , 26"'%* :
 :
|:
 !.:
  	:

 #:
 :
 :
 :
 :
xelD01 |       rJ   r  c                   4     e Zd Z fdZdej        fdZ xZS )HubertAttnAdapterLayerc                 t   t                                                       |j        | _        |j        | _        t          j        | j                  | _        t          j	        | j        | j                  | _
        t          j                    | _        t          j	        | j        | j                  | _        dS )z
        Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed
        up training throughput.
        N)r)   r*   adapter_attn_dim	input_dimr-   
hidden_dimr+   rk   normr   linear_1ReLUact_fnlinear_2r   s     rD   r*   zHubertAttnAdapterLayer.__init__  s    
 	0 ,L11		$/4>BBgii	$.$/BBrJ   rH   c                     |                      |          }|                     |          }|                     |          }|                     |          }|S re   )r7  r8  r:  r;  rG   s     rD   rI   zHubertAttnAdapterLayer.forward  sL    		-00m44M22m44rJ   )rL   rM   rN   r*   r   FloatTensorrI   rO   rP   s   @rD   r2  r2    s[        C C C C CU%6        rJ   r2  c                   X     e Zd Z fdZ	 	 ddej        deej                 defdZ xZ	S )	!HubertEncoderLayerStableLayerNormc                    t                                                       t          |j        |j        |j        d|          | _        t          j        |j	                  | _
        t          j        |j        |j                  | _        t          |          | _        t          j        |j        |j                  | _        t#          |dd           t%          |          | _        d S d | _        d S )NFr   r   r4  )r)   r*   r   r-   r   r   r   r+   r   r   r   rk   r   rl   r   r   r   getattrr2  adapter_layerr   s     rD   r*   z*HubertEncoderLayerStableLayerNorm.__init__  s    ((0,
 
 
 z&"788,v'9v?TUUU-f55 "V-?VEZ [ [ [6-t44@!7!?!?D!%DrJ   NFrH   r   r   c                 J   |}|                      |          }|                     |||          \  }}}|                     |          }||z   }||                     |                     |                    z   }| j        ||                     |          z   }|f}|r||fz  }|S r   )rl   r   r   r   r   rB  r   s           rD   rI   z)HubertEncoderLayerStableLayerNorm.forward+  s     &66)-.L] *8 *
 *
&|Q ]33%5%(9(9$:O:OP]:^:^(_(__))D,>,>},M,MMM " 	'&GrJ   r   )
rL   rM   rN   r*   r   r   r   r   rI   rO   rP   s   @rD   r?  r?    s~        & & & & &, 26"'	 | !.  	       rJ   r?  c                   b     e Zd Z fdZ	 	 	 	 d	dZdeej        df         dej        fdZ xZ	S )
HubertEncoderStableLayerNormc                    t                                                       | _        t                    | _        t          j        j        j                  | _	        t          j
        j                  | _        t          j        fdt          j                  D                       | _        d| _        d S )Nr   c                 .    g | ]}t                    S r  )r?  r  s     rD   r   z9HubertEncoderStableLayerNorm.__init__.<locals>.<listcomp>M  s"    ```1.v66```rJ   Fr  r   s    `rD   r*   z%HubertEncoderStableLayerNorm.__init__F  s    ;FCC,v'9v?TUUUz&"788m````fF^@_@_```
 
 ',###rJ   NFTc                    |rdnd }|rdnd }|;|                     d                              dd|j        d                   }d|| <   |                     ||          }|                     |          }	||	z   }|                     |          }t                      pt          |           }
| j        D ]a}|r||fz   }t          j
        g           }| j        o|| j        j        k     }|r|
r ||||          }|d         }|rd}|r||d         fz   }b|                     |          }|r||fz   }|st          d |||fD                       S t!          |||	          S )
Nr  ro   r   r   r   r   r  c              3      K   | ]}||V  	d S re   r  r  s     rD   r  z7HubertEncoderStableLayerNorm.forward.<locals>.<genexpr>  r  rJ   r  )r  r  r   r  r  r   r	   r
   r
  r   r  r   rB   r  rl   r   r   r  s                  rD   rI   z$HubertEncoderStableLayerNorm.forwardQ  s    #7@BBD$5?bb4%$2$<$<R$@$@$G$G1mNabcNd$e$e!45M001//
 

 #11-@@%(;;]33022R6LT6R6R[ 	P 	PE# I$58H$H! #(*R..!]Z/BT[EZ/ZN! 1[ 1 !&!.Te! ! ! !.a 0 - ,  P&9]1=M<O&O#66 	E 1]4D D 	nmm]4EGZ$[mmmmmm++*
 
 
 	
rJ   r   r%  c                 *   || j         j        dk    r	d|v r|nd }nw| j         j        dk    rt          ||j                  }nQ| j         j        dk    r,t	          |t
          j                  rt          |d          }nt          ||j                  }|S r'  r+  r-  s      rD   r  z.HubertEncoderStableLayerNorm._update_full_mask  r.  rJ   r/  )
rL   rM   rN   r*   rI   r   r   r   r  rO   rP   s   @rD   rE  rE  E  s        	, 	, 	, 	, 	, "<
 <
 <
 <
|elD01 |       rJ   rE  c                   x    e Zd ZU eed<   dZdZdZdZdZ	dZ
d Zdeej        ef         fdZded	ej        fd
ZdS )HubertPreTrainedModelrB   hubertr   Tc                    t          |t          j                  rT|j        j                            d| j        j                   |j         |j        j        	                                 dS dS t          |t          j
        t          j        t          j        f          r?|j        j        	                                 |j        j                            d           dS t          |t          j                  rQt                      rddl}t#          |d          rzt#          |d          rj|j                            |j        |j        gd          5  t          j                            |j        j                   ddd           n# 1 swxY w Y   n|j                            |j        d          5  t          j                            |j        j                   ddd           n# 1 swxY w Y   n)t          j                            |j        j                   |j         |j        j        	                                 dS dS t          |t0                    r2t#          |d	          r |j        j                                         dS dS t          |t6                    rAt#          |d
          r3|j        j                            d| j        j        dz   z             dS dS dS )zInitialize the weightsr   )meanstdNg      ?r   r<   r;   r#   masked_spec_embedlayer_weightsr   )r   r+   r   r%   datanormal_rB   initializer_ranger[   zero_rk   rv   r3   fill_r,   r	   r6   r5   r7   r8   r<   r;   initkaiming_normal_HubertModelrQ  uniform_HubertForSequenceClassificationrR  r	  )rA   r   r6   s      rD   _init_weightsz#HubertPreTrainedModel._init_weights  sc   fbi(( 	[ M&&CT[5R&SSS{& &&((((( '&r|R^ LMM 	[K""$$$M$$S)))))	** 	[)++ 
<    6:.. D76:3N3N D"::FOV_;]mn:oo D D//0BCCCD D D D D D D D D D D D D D D #::6=XY:ZZ D D//0BCCCD D D D D D D D D D D D D D D ''(:;;;{& &&((((( '&,, 	[v233 9(-66888889 9 ?@@ 	[v// [$)//t{7TWX7X0YZZZZZ	[ 	[[ [s$   *F

FF7*G--G14G1input_lengthsc                 z    d }t          | j        j        | j        j                  D ]\  }} ||||          }|S )zH
        Computes the output length of the convolutional layers
        c                 <    t          j        | |z
  |d          dz   S )Nfloor)rounding_moder   )r   div)input_lengthr   rZ   s      rD   _conv_out_lengthzPHubertPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length  s&     9\K7wWWWZ[[[rJ   )ziprB   r_   r`   )rA   r^  re  r   rZ   s        rD    _get_feat_extract_output_lengthsz6HubertPreTrainedModel._get_feat_extract_output_lengths  s\    
	\ 	\ 	\
 $'t{'>@W#X#X 	Q 	QK,,]KPPMMrJ   feature_vector_lengthr   c                    |                      |                    d                                        t          j                  }|j        d         }t          j        ||f|j        |j                  }d|t          j	        |j        d         |j                  |dz
  f<   |
                    dg                              d          
                    dg                                          }|S )Nro   r   )r,  devicer   )rj  )rg  sumtor   longr   zerosr,  rj  arangeflipcumsumr   )rA   rh  r   output_lengths
batch_sizes        rD   "_get_feature_vector_attention_maskz8HubertPreTrainedModel._get_feature_vector_attention_mask  s    >>~?Q?QRT?U?UVVYYZ_Zdee#)!,
./~7KTbTi
 
 
 uv^%9!%<^EZ[[[]kno]opq',,bT2299"==BBB4HHMMOOrJ   N)rL   rM   rN   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr]  r   r   
LongTensorr   rg  rt  r  rJ   rD   rL  rL    s          $O&*#N[ [ [BeEDTVYDY>Z    
 
]b]m 
 
 
 
 
 
rJ   rL  r   	mask_probmask_length	min_masksr   c                 @   | \  }dk     rt          d          k    rt          d d d          t          j                            d                                          fd}|9|                                                    d                                          nfd	t          |          D             }t          j	        |ft          
          }g }	 |          }
|
dk    r|S |D ]} ||          }t          j                            t          j        |dz
  z
            |d          }t          |          dk    rdz
  }n|d         }t          j        |t          j        |
|z
  t          j        
          |z  g          }|	                    |           t          j        |	          }	t          j        |	dddddf         ||
f          }	|	                    ||
z            }	t          j                  ddddf         }t          j        |||
f                              ||
z            }|	|z   }	|	                                dz
  k    rdz
  |	|	dz
  k    <   t          j        ||	dd           |S )an  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                     t          | z  z  z             }t          |          }|z  k    rz  }| dz
  z
  |k     rt          | dz
  z
  d          }|S )z;Given input length, compute how many spans should be maskedr   r   )r   max)rd  num_masked_spanepsilonr~  r}  r  sequence_lengths     rD   compute_num_masked_spanz6_compute_mask_indices.<locals>.compute_num_masked_span  s~    i,6DwNOOoy99 [(?::-<O ;?+o==!,+/"BAFFOrJ   Nro   c                     g | ]}S r  r  )r   r   r  s     rD   r   z)_compute_mask_indices.<locals>.<listcomp>&  s    999!o999rJ   r,  r   F)replace)r   nprandomr  itemdetachrk  tolistr   rn  r   choicero  lenconcatenateonesint32appendarraybroadcast_tor   r  put_along_axis)r   r}  r~  r   r  rs  r  r^  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanrd  r  spec_aug_mask_idxdummy_mask_idxoffsetsr  r  s    `` `           @@rD   _compute_mask_indicesr    sP   0 #(JQABBB_$$:^i : :'6: : :
 
 	
 innQ$$&&G        $ % 	##B''..0009999uZ'8'8999  Hj/:$GGGM11/BBa% 5 511,?? I,,IlkAo677RW - 
 
  !!Q&& -q0NN.q1NN(;o(MUWU] ^ ^ ^ao op
 
 	!!"34444"455 111aaa:&5H+(V  ,33J@SVa@abb i$$T4]3Gog
4G'UVV^^'+5 G ,g5 /A"555GVYZGZ-!0CCD m%7B???rJ   c                   *    e Zd Zdef fdZ	 	 ddej        deej                 deej                 fdZ	e
	 	 	 	 	 ddeej                 deej                 deej                 d	ee         d
ee         dee         deeef         fd            Z xZS )rZ  rB   c                    t                                          |           || _        t          |          | _        t          |          | _        |j        dk    s|j        dk    rBt          j
        t          j        |j                                                            | _        |j        rt#          |          | _        nt'          |          | _        |                                  d S )Nr   )r)   r*   rB   ry   feature_extractorr   feature_projectionmask_time_probmask_feature_probr+   	Parameterr   r   r-   r[  rQ  do_stable_layer_normrE  encoderr  	post_initr   s     rD   r*   zHubertModel.__init__f  s       !5f!=!="9&"A"A  3&&&*BS*H*H%'\%,v?Q2R2R2[2[2]2]%^%^D"& 	17??DLL(00DL 	rJ   NrH   mask_time_indicesr   c                    t          | j        dd          s|S |                                \  }}}|#| j                            |j                  ||<   n| j        j        dk    r| j        r|t          ||f| j        j        | j        j	        || j        j
                  }t          j        ||j        t          j                  }| j                            |j                  ||<   | j        j        dk    r| j        rt          ||f| j        j        | j        j        | j        j                  }t          j        ||j        t          j                  }|dddf                             d|d          }d||<   |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTNr   )r}  r~  r   r  )rj  r,  )r}  r~  r  ro   )rA  rB   r   rQ  rl  r,  r  r   r  mask_time_lengthmask_time_min_masksr   r0  rj  r   r  mask_feature_lengthmask_feature_min_masksexpand)rA   rH   r  r   rs  r  r-   mask_feature_indicess           rD   _mask_hidden_stateszHubertModel._mask_hidden_statesx  s    t{$8$?? 	!   4A3E3E3G3G0
O[(/3/E/H/HI\/]/]M+,,['!+++ 5_-+4 K8-+9! ! ! !&->}G[chcm n n n/3/E/H/HI\/]/]M+,;(1,,,#8[)+7 K;+<	$ $ $  $)<0D]Mainis#t#t#t #74#@#G#GO]_#`#` 23M./rJ   r   r   r  r  r   c                    ||n| j         j        }||n| j         j        }||n| j         j        }|                     |          }|                    dd          }|!|                     |j        d         |          }|                     |          }| 	                    ||          }| 
                    |||||          }	|	d         }|s|f|	dd         z   S t          ||	j        |	j                  S )a1  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.

        Example:

        ```python
        >>> from transformers import AutoProcessor, HubertModel
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
        >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")


        >>> def map_to_array(example):
        ...     example["speech"] = example["audio"]["array"]
        ...     return example


        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.map(map_to_array)

        >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
        >>> hidden_states = model(input_values).last_hidden_state
        ```Nr   r   )r  r   r   r  r  r   r  )rB   r   r  use_return_dictr  rF   rt  r   r  r  r  r   rH   r  )
rA   r   r   r  r   r  r  extract_featuresrH   encoder_outputss
             rD   rI   zHubertModel.forward  s@   H 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]11,??+55a;;%!DDEUE[\]E^`nooN//0@AA00Rc0dd,,)/!5# ' 
 
 (* 	:!#oabb&999+)7&1
 
 
 	
rJ   r  NNNNN)rL   rM   rN   r   r*   r   r=  r   r|  r  r   r   r   r   r   r   rI   rO   rP   s   @rD   rZ  rZ  d  sK       |      * :>59	, ,(, $E$56, !!12	, , , ,\  269=,0/3&*D
 D
u|,D
 !.D
 $E$56	D

 $D>D
 'tnD
 d^D
 
uo%	&D
 D
 D
 ^D
 D
 D
 D
 D
rJ   rZ  zn
    Hubert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    )custom_introc                        e Zd Zddee         f fdZd Zd Zd Zd Z	e
	 	 	 	 	 ddeej                 d	eej                 d
ee         dee         dee         deej                 deeef         fd            Z xZS )HubertForCTCNtarget_langc                    t                                          |           t          |          | _        t	          j        |j                  | _        || _        |j	        t          d| j         d          t          |d          r|j        r|j        n|j        }t	          j        ||j	                  | _        |                                  dS )a0  
        target_lang (`str`, *optional*):
            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
            adapter.<lang>.bin. Only relevant when using an instance of [`HubertForCTC`] with adapters. Uses 'eng' by
            default.
        NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `HubertForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.add_adapter)r)   r*   rZ  rM  r+   r   final_dropoutr   r  
vocab_sizer   rC   r5   r  output_hidden_sizer-   r   lm_headr  )rA   rB   r  r  rC   s       rD   r*   zHubertForCTC.__init__  s     	   !&))z&"677&$H H H H   *1)G)GvFL^vF%%djdv 	 y!3V5FGG 	rJ   c                    | j         }|)t          | j        dd          t          d| d          |2t          | j        dd          t                              d           dS ||                     |d           dS dS )a'  
        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
        passing `target_lang=...` to `from_pretrained(...)`.

        This method is **not** supposed to be called by the user and is prone to be changed in the future.
        Nr4  zCannot pass `target_lang`: z- if `config.adapter_attn_dim` is not defined.z)By default `target_lang` is set to 'eng'.T)
force_load)r  rA  rB   r   loggerinfoload_adapter)rA   r  s     rD   tie_weightszHubertForCTC.tie_weights  s     &"wt{<NPT'U'U']u;uuuvvv WT[:Ld%S%S%_KKCDDDDD$kd;;;;; %$rJ   c                 b    t          j        dt                     |                                  dS )
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.NwarningswarnFutureWarningfreeze_feature_encoderrA   s    rD   freeze_feature_extractorz%HubertForCTC.freeze_feature_extractor)  ;    
 	Q	
 	
 	

 	##%%%%%rJ   c                 B    | j         j                                         dS r  NrM  r  r   r  s    rD   r  z#HubertForCTC.freeze_feature_encoder5  !    
 	%88:::::rJ   c                 L    | j                                         D ]	}d|_        
dS z
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        FNrM  r   r   r   s     rD   freeze_base_modelzHubertForCTC.freeze_base_model<  6    
 [++-- 	( 	(E"'E	( 	(rJ   r   r   r   r  r  labelsr   c           
      p   ||n| j         j        }|>|                                | j         j        k    rt	          d| j         j                   |                     |||||          }|d         }|                     |          }|                     |          }	d}
|Z||nt          j	        |t          j
                  }|                     |                    d                                        t          j
                  }|dk    }|                    d          }|                    |          }t          j                            |	dt          j                                      dd          }t          j        j                            d	
          5  t          j                            ||||| j         j        | j         j        | j         j                  }
ddd           n# 1 swxY w Y   |s|	f|t6          d         z   }|
|
f|z   n|S t9          |
|	|j        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nz$Label values must be <= vocab_size: r  r   r  ro   )r'   r,  r   F)enabled)blank	reductionzero_infinitylosslogitsrH   r  )rB   r  r  r  r   rM  r   r  r   	ones_likerm  rg  rk  rl  masked_selectr+   r   log_softmaxfloat32rF   backendscudnnflagsctc_losspad_token_idctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   rH   r  )rA   r   r   r   r  r  r  r   rH   r  r  r^  labels_masktarget_lengthsflattened_targets	log_probsoutputs                    rD   rI   zHubertForCTC.forwardD  s   " &1%<kk$+B]&**,,$+2H"H"H\DKDZ\\]]]++)/!5#  
 
  
]33m,, #1"<%/R^fkfpBqBqBq  !AA.BTBTUWBXBXYY\\]b]ghhM !A+K(__R00N & 4 4[ A A 11&b1VV``abdeffI%++E+:: 	 	}--%!"+2"k<"&+"? .  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	  	FY)F)G)G!HHF)-)9TGf$$vEfG4IV]Vh
 
 
 	
s    AG11G58G5re   r  )rL   rM   rN   r   r   r*   r  r  r  r  r   r   r   r   r   r   r   rI   rO   rP   s   @rD   r  r    s>        HSM      :< < <*
& 
& 
&; ; ;( ( (  26,0/3&*)-D
 D
u|,D
 !.D
 $D>	D

 'tnD
 d^D
 &D
 
un$	%D
 D
 D
 ^D
 D
 D
 D
 D
rJ   r  z
    Hubert Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                        e Zd Z fdZd Zd Zd Ze	 	 	 	 	 ddee	j
                 dee	j
                 dee         d	ee         d
ee         dee	j
                 deeef         fd            Z xZS )r\  c                    t                                          |           t          |d          r|j        rt	          d          t          |          | _        |j        dz   }|j        r.t          j
        t          j        |          |z            | _        t          j        |j        |j                  | _        t          j        |j        |j                  | _        |                                  d S )Nr  z]Sequence classification does not support the use of Hubert adapters (config.add_adapter=True)r   )r)   r*   r5   r  r   rZ  rM  r	  use_weighted_layer_sumr+   r  r   r  rR  r   r-   classifier_proj_size	projector
num_labels
classifierr  )rA   rB   
num_layersrC   s      rD   r*   z(HubertForSequenceClassification.__init__  s       6=)) 	f.@ 	o   "&))-1
( 	S!#ej.D.Dz.Q!R!RD6#5v7RSS)F$?ARSS 	rJ   c                 b    t          j        dt                     |                                  dS )z
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        r  Nr  r  s    rD   r  z8HubertForSequenceClassification.freeze_feature_extractor  r  rJ   c                 B    | j         j                                         dS r  r  r  s    rD   r  z6HubertForSequenceClassification.freeze_feature_encoder  r  rJ   c                 L    | j                                         D ]	}d|_        
dS r  r  r   s     rD   r  z1HubertForSequenceClassification.freeze_base_model  r  rJ   Nr   r   r   r  r  r  r   c                 d   ||n| j         j        }| j         j        rdn|}|                     |||||          }| j         j        rx|t                   }t          j        |d          }t          j        	                    | j
        d          }	||	                    ddd          z                      d          }n|d         }|                     |          }||                    d          }
n|                     |j        d         |          }|                    d                              dd|j        d                   }d	|| <   |                    d          |                    d                              dd          z  }
|                     |
          }d}|Kt)                      } ||                    d| j         j                  |                    d                    }|s|f|t          d         z   }||f|z   n|S t-          |||j        |j        
          S )a
  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
            into a tensor of type `torch.FloatTensor`. See [`HubertProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTr  r   r   ro   r   r   r   r  )rB   r  r  rM  r  r   stackr+   r   r   rR  r   rk  r  rO  rt  r   r  r  r  r   r  r   rH   r  )rA   r   r   r   r  r  r  r   rH   norm_weightspooled_outputpadding_maskexpand_padding_maskr  r  loss_fctr  s                    rD   rI   z'HubertForSequenceClassification.forward  sW   . &1%<kk$+B]'+{'IcttOc++)/!5#  
 
 ;- 	'#$ABM!K1===M=001C0LLL*\->->r1a-H-HHMMRSMTTMM#AJM}55!)..1.55MMBB=CVWXCY[ijjL"."8"8"<"<"C"CAq-J]^_J`"a"a25M../)--!-44|7G7GA7G7N7N7S7STVXY7Z7ZZM//'))H8FKKDK,BCCV[[QS__UUD 	FY)F)G)G!HHF)-)9TGf$$vE'!/)	
 
 
 	
rJ   r  )rL   rM   rN   r*   r  r  r  r   r   r   r   r   r   r   r   rI   rO   rP   s   @rD   r\  r\    s           "
& 
& 
&; ; ;( ( (  26,0/3&*)-B
 B
u|,B
 !.B
 $D>	B

 'tnB
 d^B
 &B
 
u..	/B
 B
 B
 ^B
 B
 B
 B
 B
rJ   r\  )r  r\  rZ  rL  )Nr   NrU   )Ir  typingr   r   r   numpyr  r   torch.nnr+   r   activationsr   integrations.deepspeedr	   integrations.fsdpr
   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   r4   r   r   r   configuration_hubertr   integrations.flex_attentionr   
get_loggerrL   r  Moduler   r>   rW   rh   rq   ry   r   r   r   r   r   r   r   r  r2  r?  rE  rL  r   r   r|  ndarrayr  rZ  r  r  r\  __all__r  rJ   rD   <module>r     sY  ,  , , , , , , , , , ,            % % % % % % ! ! ! ! ! ! @ @ @ @ @ @ 7 7 7 7 7 7 g g g g g g g g B B B B B B 9 9 9 9 9 9 Y Y Y Y Y Y Y Y Y Y F F F F F F F F & & & & & & J J J J J J J J J J . . . . . .  !! KJJJJJJ 
	H	%	%/ / / / /BI / / /d           !;   *    9   6    9   0# # # # #29 # # #L    bi   0  $(,% %I%<% 
% <	%
 U\*% e_% % %% % % %<U/ U/ U/ U/ U/bi U/ U/ U/p    	   0! ! ! ! !3 ! ! !HZ Z Z Z ZBI Z Z Zz    RY   2+ + + + +(B + + +\^ ^ ^ ^ ^29 ^ ^ ^B C C C C CO C C CT 26t tc?tt t U-.	t
 t Zt t t tn F
 F
 F
 F
 F
' F
 F
 F
R !"    
S
 S
 S
 S
 S
( S
 S
 
S
l   p
 p
 p
 p
 p
&; p
 p
 p
f f
e
erJ   