
     `i$                     D   d Z ddlZddlZddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ d	d
lmZmZmZmZmZmZmZmZmZmZmZ ddlmZ  G d de          Z G d de          Z G d dej                  Z G d dej                  Z G d de          Z G d de          Z  G d de          Z! G d de          Z" G d dee          Z#e	Z$ G d d e#e          Z% G d! d"e#e          Z& G d# d$e          Z' G d% d&e          Z( G d' d(e          Z)g d)Z*dS )*zPyTorch Data2VecText model.    N)nn   )ACT2FN)GradientCheckpointingLayer)Wav2Vec2BaseModelOutput)PreTrainedModel   )Wav2Vec2AdapterWav2Vec2EncoderWav2Vec2FeatureEncoderWav2Vec2FeatureProjection#Wav2Vec2ForAudioFrameClassificationWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2ForXVectorWav2Vec2ModelWav2Vec2PreTrainedModelWav2Vec2SamePadLayer   )Data2VecAudioConfigc                   &     e Zd Zd fd	Zd Z xZS )Data2VecAudioConvLayerr   c                    t                                                       |dk    r|j        |dz
           nd| _        |j        |         | _        t          j        | j        | j        |j        |         |j        |         |j	                  | _
        t          j        | j        d          | _        t          |j                 | _        d S )Nr   r   )kernel_sizestridebiasTelementwise_affine)super__init__conv_dimin_conv_dimout_conv_dimr   Conv1dconv_kernelconv_stride	conv_biasconv	LayerNorm
layer_normr   feat_extract_activation
activation)selfconfiglayer_id	__class__s      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/data2vec/modular_data2vec_audio.pyr    zData2VecAudioConvLayer.__init__+   s    <DqLL6?8a<88a"OH5I*84%h/!
 
 
	 ,t'8TRRR !?@    c                     |                      |          }|                    dd          }|                     |          }|                    dd          }|                     |          }|S )N)r(   	transposer*   r,   r-   hidden_statess     r1   forwardzData2VecAudioConvLayer.forward:   se    		-00%//B7766%//B7766r2   )r   __name__
__module____qualname__r    r9   __classcell__r0   s   @r1   r   r   *   sR        A A A A A A      r2   r   c                       e Zd ZdS )Data2VecAudioPadLayerNr;   r<   r=    r2   r1   rA   rA   E           Dr2   rA   c                   $     e Zd Z fdZd Z xZS ) Data2VecAudioPositionalConvLayerc                 \   t                                                       t          j        |j        |j        |j        |j        dz  |j                  | _        t          |j                  | _	        t          |j                 | _        t          j        |j        d          | _        d S )Nr	   )r   paddinggroupsFr   )r   r    r   r$   hidden_sizeconv_pos_kernel_sizenum_conv_pos_embedding_groupsr(   rA   rH   r   r+   r,   r)   r*   r-   r.   r0   s     r1   r    z)Data2VecAudioPositionalConvLayer.__init__J   s    I3/147
 
 
	 -V-HII !?@,v'9eTTTr2   c                    |                      |          }|                     |          }|                    dd          }|                     |          }|                    dd          }|                     |          }|S Nr   r	   )r(   rH   r6   r*   r,   r7   s     r1   r9   z(Data2VecAudioPositionalConvLayer.forwardY   sv    		-00]33%//15566%//15566r2   r:   r?   s   @r1   rF   rF   I   sL        U U U U U      r2   rF   c                   $     e Zd Z fdZd Z xZS )$Data2VecAudioPositionalConvEmbeddingc                     t                                                       t          j        fdt	          j                  D                       | _        d S )Nc                 .    g | ]}t                    S rC   )rF   ).0_r.   s     r1   
<listcomp>zAData2VecAudioPositionalConvEmbedding.__init__.<locals>.<listcomp>h   s"    eee!-f55eeer2   )r   r    r   
ModuleListrangenum_conv_pos_embeddingslayersrM   s    `r1   r    z-Data2VecAudioPositionalConvEmbedding.__init__e   sS    meeeeuVEc?d?deee
 
r2   c                     |                     dd          }| j        D ]} ||          }|                     dd          }|S rO   )r6   rZ   )r-   r8   layers      r1   r9   z,Data2VecAudioPositionalConvEmbedding.forwardk   sT    %//155[ 	1 	1E!E-00MM%//155r2   r:   r?   s   @r1   rQ   rQ   d   sG        
 
 
 
 
      r2   rQ   c                       e Zd Zd ZdS )Data2VecAudioFeatureEncoderc                     t           j                            |            t          j        fdt	          j                  D                       | _        d| _        d| _        d S )Nc                 2    g | ]}t          |           S ))r/   )r   )rT   ir.   s     r1   rV   z8Data2VecAudioFeatureEncoder.__init__.<locals>.<listcomp>w   s'    gggA#FQ777gggr2   FT)	r   Moduler    rW   rX   num_feat_extract_layersconv_layersgradient_checkpointing_requires_gradr-   r.   s    `r1   r    z$Data2VecAudioFeatureEncoder.__init__t   sg    
	4   =ggggvGeAfAfggg
 
 ',#"r2   N)r;   r<   r=   r    rC   r2   r1   r^   r^   s   s#        # # # # #r2   r^   c                       e Zd ZdS )Data2VecAudioFeatureProjectionNrB   rC   r2   r1   ri   ri   }   rD   r2   ri   c                       e Zd ZdS )Data2VecAudioEncoderNrB   rC   r2   r1   rk   rk      rD   r2   rk   c                       e Zd ZdS )Data2VecAudioAdapterNrB   rC   r2   r1   rm   rm      rD   r2   rm   c                   J    e Zd ZU eed<   dZdZdZdZdZ	dZ
d Zd Zd Zd Zd	S )
Data2VecAudioPreTrainedModelr.   data2vec_audioinput_valuesTc                    t          |t                    r}t          j        d|j        j        z            }t          j                            |j        j	        | |           t          j                            |j        j
        | |           dS t          |t                    r,t          j                            |j        j
        d           dS t          |t          j                  rT|j	        j                            d| j        j                   |j
         |j
        j                                         dS dS t          |t          j        t          j        f          rO|j
        |j
        j                                         |j	        !|j	        j                            d           dS dS t          |t          j                  rt          j                            |j	                   |j
        [t          j        |j        |j        |j        d         z  z            }t          j                            |j
        | |           dS dS dS )zInitialize the weightsr   )abr           )meanstdNg      ?)
isinstanceri   mathsqrt
projectionin_featuresr   inituniform_weightr   rF   	constant_r(   Lineardatanormal_r.   initializer_rangezero_r)   	GroupNormfill_r$   kaiming_normal_rI   in_channelsr   )r-   moduleks      r1   _init_weightsz*Data2VecAudioPreTrainedModel._init_weights   s   f<== 	9	!f/;;<<AGV.5!qAAAGV.3rQ????? @AA 	9Gfk.22222	** 	9M&&CT[5R&SSS{& &&((((( '&r| <== 
	9{& &&(((}("((----- )(	** 	9G##FM222{&Ifmv/AFDVWXDY/YZ[[  a 88888	9 	9 '&r2   c                      t          d          NzNot needed for Data2VecAudioAttributeErrorr-   s    r1   _get_adaptersz*Data2VecAudioPreTrainedModel._get_adapters       ;<<<r2   c                      t          d          r   r   r   s    r1   init_adapter_layersz0Data2VecAudioPreTrainedModel.init_adapter_layers   r   r2   c                      t          d          r   r   r   s    r1   load_adapterz)Data2VecAudioPreTrainedModel.load_adapter   r   r2   N)r;   r<   r=   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr   r   r   r   rC   r2   r1   ro   ro      s         ($O&*#N9 9 92= = == = == = = = =r2   ro   c                   6     e Zd ZdefdZd Zd Z fdZ xZS )Data2VecAudioModelr.   c                    t                               | |           || _        t          |          | _        t          |          | _        |j        dk    s|j        dk    rBt          j
        t          j        |j                                                            | _        t!          |          | _        |j        rt'          |          nd | _        |                                  d S )Nru   )ro   r    r.   r^   feature_extractorri   feature_projectionmask_time_probmask_feature_probr   	ParametertorchTensorrJ   r~   masked_spec_embedrk   encoderadd_adapterrm   adapter	post_initrg   s     r1   r    zData2VecAudioModel.__init__   s    $--dF;;;!<V!D!D"@"H"H  3&&&*BS*H*H%'\%,v?Q2R2R2[2[2]2]%^%^D"+F337=7IS+F333t 	r2   c                      t          d          r   r   r   s    r1   freeze_feature_extractorz+Data2VecAudioModel.freeze_feature_extractor   r   r2   c                 8    | j                                          dS )z
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)r   _freeze_parametersr   s    r1   freeze_feature_encoderz)Data2VecAudioModel.freeze_feature_encoder   s    
 	1133333r2   c                 6     t                      j        di |S NrC   r   r9   r-   super_kwargsr0   s     r1   r9   zData2VecAudioModel.forward       uww.....r2   )	r;   r<   r=   r   r    r   r   r9   r>   r?   s   @r1   r   r      sr        2    "= = =4 4 4/ / / / / / / / /r2   r   c                   0     e Zd Zd Zd Zd Z fdZ xZS )Data2VecAudioForCTCc                    t                               | |           t          |          | _        t	          j        |j                  | _        |j        t          d| j
         d          t          |d          r|j        r|j        n|j        }t	          j        ||j                  | _        |                                  d S )NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `Data2VecAudioForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.r   )ro   r    r   rp   r   Dropoutfinal_dropoutdropout
vocab_size
ValueErrorr0   hasattrr   output_hidden_sizerJ   r   lm_headr   )r-   r.   r   s      r1   r    zData2VecAudioForCTC.__init__   s    $--dF;;;088z&"677$H H H H   *1)G)GvFL^vF%%djdv 	 y!3V5FGG 	r2   c                      t          d          r   r   r   s    r1   freeze_base_modelz%Data2VecAudioForCTC.freeze_base_model   r   r2   c                      t          d          r   r   r   s    r1   tie_weightszData2VecAudioForCTC.tie_weights   r   r2   c                 6     t                      j        di |S r   r   r   s     r1   r9   zData2VecAudioForCTC.forward   r   r2   )r;   r<   r=   r    r   r   r9   r>   r?   s   @r1   r   r      se          *= = == = =/ / / / / / / / /r2   r   c                       e Zd ZdS )&Data2VecAudioForSequenceClassificationNrB   rC   r2   r1   r   r      rD   r2   r   c                       e Zd ZdS )(Data2VecAudioForAudioFrameClassificationNrB   rC   r2   r1   r   r      rD   r2   r   c                       e Zd ZdS )Data2VecAudioForXVectorNrB   rC   r2   r1   r   r      rD   r2   r   )r   r   r   r   r   ro   )+__doc__ry   r   r   activationsr   modeling_layersr   modeling_outputsr   modeling_utilsr   wav2vec2.modeling_wav2vec2r
   r   r   r   r   r   r   r   r   r   r   configuration_data2vec_audior   r   rA   rb   rF   rQ   r^   ri   rk   rm   ro   Data2VecAudioBaseModelOutputr   r   r   r   r   __all__rC   r2   r1   <module>r      s   " !         ! ! ! ! ! ! 9 9 9 9 9 9 7 7 7 7 7 7 - - - - - -                          > = = = = =    7   6	 	 	 	 	0 	 	 	    ry   6    29   # # # # #"8 # # #	 	 	 	 	%> 	 	 		 	 	 	 	? 	 	 		 	 	 	 	? 	 	 	)= )= )= )= )=?4K )= )= )=X  7 / / / / /5} / / /@/ / / / /6 / / /@	 	 	 	 	-N 	 	 		 	 	 	 	/R 	 	 		 	 	 	 	0 	 	 	  r2   