
     `i.                        d Z ddlmZmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZ d
dlmZmZmZmZmZmZmZ ddlmZ dZ G d dej                  Z G d de          Z G d de          Z G d dej                  Z  G d de          Z! G d de          Z"e G d de                      Z# G d dee#          Z$ G d de          Z% G d  d!e          Z&g d"Z'dS )#zPyTorch Hubert model.    )OptionalUnionN   )ACT2FN)is_deepspeed_zero3_enabled)BaseModelOutput)PreTrainedModel)auto_docstring   )Wav2Vec2EncoderWav2Vec2EncoderStableLayerNormWav2Vec2FeatureEncoderWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2ModelWav2Vec2SamePadLayer   )HubertConfigc                   $     e Zd Z fdZd Z xZS )HubertPositionalConvEmbeddingc                 &   t                                                       t          j        |j        |j        |j        |j        dz  |j                  | _        d | _        |j	        r t          j
        |j                  | _        nWt          j        j        }t          t          j        j        d          rt          j        j        j        }t                      rdd l}|j                            | j        j        d          5   || j        dd          | _        d d d            n# 1 swxY w Y   t          | j        d          r-| j        j        j        j        }| j        j        j        j        }n| j        j        }| j        j        }|j                            | |           |j                            | |           n || j        dd          | _        t3          |j                  | _        t6          |j                 | _        d S )	Nr   )kernel_sizepaddinggroupsweight_normr   modifier_rankweight)namedimparametrizations)super__init__nnConv1dhidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupsconv
batch_normconv_pos_batch_normBatchNorm1dutilsr   hasattrr!   r   	deepspeedzeroGatheredParametersr   	original0	original1weight_gweight_vregister_external_parameterHubertSamePadLayerr   r   feat_extract_activation
activation)selfconfigr   r/   r4   r5   	__class__s         }/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/hubert/modular_hubert.pyr#   z&HubertPositionalConvEmbedding.__init__+   s'   I62a77
 
 
	 % 	I nV-?@@DOO(.Krx0-@@ D h7C)++ I    ^66ty7GWX6YY M M +DIH! L L LDIM M M M M M M M M M M M M M M49&899 2#y9@JH#y9@JHH#y1H#y1H::4JJJ::4JJJJ'K	aHHH	)&*HII !?@s   D--D14D1c                    |                     dd          }| j        |                     |          }|                     |          }|                     |          }|                     |          }|                     dd          }|S )Nr   r   )	transposer*   r)   r   r9   r:   hidden_statess     r=   forwardz%HubertPositionalConvEmbedding.forwardP   s~    %//155?& OOM::M		-00]3366%//155    __name__
__module____qualname__r#   rB   __classcell__r<   s   @r=   r   r   *   sM        #A #A #A #A #AJ	 	 	 	 	 	 	rC   r   c                       e Zd ZdS )r7   NrE   rF   rG    rC   r=   r7   r7   \           DrC   r7   c                       e Zd ZdS )HubertFeatureEncoderNrK   rL   rC   r=   rO   rO   `   rM   rC   rO   c                   $     e Zd Z fdZd Z xZS )HubertFeatureProjectionc                 T   t                                                       |j        | _        | j        r+t          j        |j        d         |j                  | _        t          j        |j        d         |j	                  | _
        t          j        |j                  | _        d S )N)eps)r"   r#   feat_proj_layer_normr$   	LayerNormconv_dimlayer_norm_eps
layer_normLinearr&   
projectionDropoutfeat_proj_dropoutdropoutr:   r;   r<   s     r=   r#   z HubertFeatureProjection.__init__e   s    $*$?!$ 	[ l6?2+>FDYZZZDO)FOB$79KLLz&":;;rC   c                     | j         r|                     |          }|                     |          }|                     |          }|S )N)rU   rY   r[   r^   r@   s     r=   rB   zHubertFeatureProjection.forwardm   sF    $ 	; OOM::M66]33rC   rD   rI   s   @r=   rQ   rQ   d   sG        < < < < <      rC   rQ   c                       e Zd ZdS )HubertEncoderNrK   rL   rC   r=   rb   rb   v   rM   rC   rb   c                       e Zd ZdS )HubertEncoderStableLayerNormNrK   rL   rC   r=   rd   rd   z   rM   rC   rd   c                   x    e Zd ZU eed<   dZdZdZdZdZ	dZ
d Zdeej        ef         fdZded	ej        fd
ZdS )HubertPreTrainedModelr;   hubertinput_valuesTc                    t          |t          j                  rT|j        j                            d| j        j                   |j         |j        j        	                                 dS dS t          |t          j
        t          j        t          j        f          r?|j        j        	                                 |j        j                            d           dS t          |t          j                  rQt                      rddl}t#          |d          rzt#          |d          rj|j                            |j        |j        gd          5  t          j                            |j        j                   ddd           n# 1 swxY w Y   n|j                            |j        d          5  t          j                            |j        j                   ddd           n# 1 swxY w Y   n)t          j                            |j        j                   |j         |j        j        	                                 dS dS t          |t0                    r2t#          |d	          r |j        j                                         dS dS t          |t6                    rAt#          |d
          r3|j        j                            d| j        j        dz   z             dS dS dS )zInitialize the weights        )meanstdNg      ?r   r5   r4   r   masked_spec_embedlayer_weightsr   )
isinstancer$   rZ   r   datanormal_r;   initializer_rangebiaszero_rV   	GroupNormr,   fill_r%   r   r/   r.   r0   r1   r5   r4   initkaiming_normal_HubertModelrm   uniform_HubertForSequenceClassificationrn   num_hidden_layers)r:   moduler/   s      r=   _init_weightsz#HubertPreTrainedModel._init_weights   sc   fbi(( 	[ M&&CT[5R&SSS{& &&((((( '&r|R^ LMM 	[K""$$$M$$S)))))	** 	[)++ 
<    6:.. D76:3N3N D"::FOV_;]mn:oo D D//0BCCCD D D D D D D D D D D D D D D #::6=XY:ZZ D D//0BCCCD D D D D D D D D D D D D D D ''(:;;;{& &&((((( '&,, 	[v233 9(-66888889 9 ?@@ 	[v// [$)//t{7TWX7X0YZZZZZ	[ 	[[ [s$   *F

FF7*G--G14G1input_lengthsc                 z    d }t          | j        j        | j        j                  D ]\  }} ||||          }|S )zH
        Computes the output length of the convolutional layers
        c                 <    t          j        | |z
  |d          dz   S )Nfloor)rounding_moder   )torchdiv)input_lengthr   strides      r=   _conv_out_lengthzPHubertPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length   s&     9\K7wWWWZ[[[rC   )zipr;   conv_kernelconv_stride)r:   r   r   r   r   s        r=    _get_feat_extract_output_lengthsz6HubertPreTrainedModel._get_feat_extract_output_lengths   s\    
	\ 	\ 	\
 $'t{'>@W#X#X 	Q 	QK,,]KPPMMrC   feature_vector_lengthattention_maskc                    |                      |                    d                                        t          j                  }|j        d         }t          j        ||f|j        |j                  }d|t          j	        |j        d         |j                  |dz
  f<   |
                    dg                              d          
                    dg                                          }|S )NrS   r   )dtypedevicer   )r   )r   sumtor   longshapezerosr   r   arangeflipcumsumbool)r:   r   r   output_lengths
batch_sizes        r=   "_get_feature_vector_attention_maskz8HubertPreTrainedModel._get_feature_vector_attention_mask   s    >>~?Q?QRT?U?UVVYYZ_Zdee#)!,
./~7KTbTi
 
 
 uv^%9!%<^EZ[[[]kno]opq',,bT2299"==BBB4HHMMOOrC   N)rE   rF   rG   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr~   r   r   
LongTensorintr   r   rL   rC   r=   rf   rf   ~   s          $O&*#N[ [ [BeEDTVYDY>Z    
 
]b]m 
 
 
 
 
 
rC   rf   c                        e Zd Zdef fdZd Zd Z	 	 	 	 	 ddeej	                 deej	                 deej
                 d	ee         d
ee         dee         deeef         fdZ xZS )ry   r;   c                    t                                          |           || _        t          |          | _        t          |          | _        |j        dk    s|j        dk    rBt          j
        t          j        |j                                                            | _        |j        rt#          |          | _        nt'          |          | _        |                                  | `d S )Nrj   )r"   r#   r;   rO   feature_extractorrQ   feature_projectionmask_time_probmask_feature_probr$   	Parameterr   Tensorr&   rz   rm   do_stable_layer_normrd   encoderrb   	post_initadapterr_   s     r=   r#   zHubertModel.__init__   s       !5f!=!="9&"A"A 3&&&*BS*H*H%'\%,v?Q2R2R2[2[2]2]%^%^D"& 	17??DLL(00DL 	LLLrC   c                      t          d          NzNot needed for HubertAttributeErrorr:   s    r=   freeze_feature_extractorz$HubertModel.freeze_feature_extractor       4555rC   c                      t          d          r   r   r   s    r=   freeze_feature_encoderz"HubertModel.freeze_feature_encoder   r   rC   Nrh   r   mask_time_indicesoutput_attentionsoutput_hidden_statesreturn_dictreturnc                    ||n| j         j        }||n| j         j        }||n| j         j        }|                     |          }|                    dd          }|!|                     |j        d         |          }|                     |          }| 	                    ||          }| 
                    |||||          }	|	d         }|s|f|	dd         z   S t          ||	j        |	j                  S )a1  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.

        Example:

        ```python
        >>> from transformers import AutoProcessor, HubertModel
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
        >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")


        >>> def map_to_array(example):
        ...     example["speech"] = example["audio"]["array"]
        ...     return example


        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.map(map_to_array)

        >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
        >>> hidden_states = model(input_values).last_hidden_state
        ```Nr   r   )r   )r   r   r   r   r   )last_hidden_staterA   
attentions)r;   r   r   use_return_dictr   r?   r   r   r   _mask_hidden_statesr   r   rA   r   )
r:   rh   r   r   r   r   r   extract_featuresrA   encoder_outputss
             r=   rB   zHubertModel.forward   s@   F 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]11,??+55a;;%!DDEUE[\]E^`nooN//0@AA00Rc0dd,,)/!5# ' 
 
 (* 	:!#oabb&999+)7&1
 
 
 	
rC   )NNNNN)rE   rF   rG   r   r#   r   r   r   r   r   FloatTensorr   r   tupler   rB   rH   rI   s   @r=   ry   ry      s       |      &6 6 66 6 6 269=,0/3&*D
 D
u|,D
 !.D
 $E$56	D

 $D>D
 'tnD
 d^D
 
uo%	&D
 D
 D
 D
 D
 D
 D
 D
rC   ry   c                       e Zd ZdS )HubertForCTCNrK   rL   rC   r=   r   r   &  rM   rC   r   c                       e Zd ZdS )r{   NrK   rL   rC   r=   r{   r{   *  rM   rC   r{   )r   r{   ry   rf   )(__doc__typingr   r   r   torch.nnr$   activationsr   integrations.deepspeedr   modeling_outputsr   modeling_utilsr	   r-   r
   wav2vec2.modeling_wav2vec2r   r   r   r   r   r   r   configuration_hubertr   _HIDDEN_STATES_START_POSITIONModuler   r7   rO   rQ   rb   rd   rf   ry   r   r{   __all__rL   rC   r=   <module>r      s     " " " " " " " "        ! ! ! ! ! ! @ @ @ @ @ @ / / / / / / - - - - - - # # # # # #                  / . . . . . !" / / / / /BI / / /d	 	 	 	 	- 	 	 		 	 	 	 	1 	 	 	    bi   $	 	 	 	 	O 	 	 		 	 	 	 	#A 	 	 	 C C C C CO C C CL^
 ^
 ^
 ^
 ^
-!6 ^
 ^
 ^
B	 	 	 	 	> 	 	 		 	 	 	 	&G 	 	 	 f
e
erC   