
     `iG                        d Z ddlZddlZddlmZ ddlmZmZ ddlZddl	m
Z
 ddlmZmZ ddlmZ ddlmZmZ d	d
lmZmZmZmZmZmZmZmZmZ ddlmZ  ej        e           Z!e ed           G d de                                  Z" G d de          Z# G d de          Z$ G d de          Z% G d de          Z& G d de          Z' G d de          Z(e G d de                      Z)eZ* G d d e)e          Z+ ed!           G d" d#e)                      Z, G d$ d%e          Z- G d& d'e          Z.g d(Z/dS ))zPyTorch UniSpeech model.    N)	dataclass)OptionalUnion   )ModelOutputWav2Vec2BaseModelOutput)PreTrainedModel)auto_docstringlogging   )	Wav2Vec2EncoderWav2Vec2EncoderStableLayerNormWav2Vec2FeatureEncoderWav2Vec2FeatureProjectionWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2GumbelVectorQuantizerWav2Vec2ModelWav2Vec2PositionalConvEmbedding   )UniSpeechConfigzh
    Output type of [`UniSpeechForPreTrainingOutput`], with potential hidden states and attentions.
    )custom_introc                      e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eej                 ed<   dZeej                 ed<   dZeeej                          ed<   dZeeej                          ed<   dS )	UniSpeechForPreTrainingOutputa  
    loss (*optional*, returned when model is in train mode, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
        paper](https://huggingface.co/papers/2006.11477).
    projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
        Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
        projected quantized states.
    projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
        Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
        target vectors for contrastive loss.
    codevector_perplexity (`torch.FloatTensor` of shape `(1,)`):
        The perplexity of the codevector distribution, used to measure the diversity of the codebook.
    Nlossprojected_statesprojected_quantized_statescodevector_perplexityhidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r   r   tupler         /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/unispeech/modular_unispeech.pyr   r   -   s           )-D(5$
%,,,48hu01888>B): ;BBB9=8E$56===8<M8E%"345<<<59Ju01299999r*   r   c                       e Zd ZdS ) UniSpeechPositionalConvEmbeddingNr!   r"   r#   r)   r*   r+   r-   r-   J           Dr*   r-   c                       e Zd ZdS )UniSpeechFeatureEncoderNr.   r)   r*   r+   r1   r1   N   r/   r*   r1   c                       e Zd ZdS )UniSpeechFeatureProjectionNr.   r)   r*   r+   r3   r3   R   r/   r*   r3   c                       e Zd ZdS )UniSpeechEncoderNr.   r)   r*   r+   r5   r5   V   r/   r*   r5   c                       e Zd ZdS )UniSpeechEncoderStableLayerNormNr.   r)   r*   r+   r7   r7   Z   r/   r*   r7   c                   *    e Zd Zed             Zd ZdS )UniSpeechGumbelVectorQuantizerc           	          |                      d          }t          j        t          j        |t          j        |dz             z  d                                                     }|S )Nr   dimgHz>)meanr%   expsumlog)probsmarginal_probs
perplexitys      r+   _compute_perplexityz2UniSpeechGumbelVectorQuantizer._compute_perplexity_   s^    **Y	.59^VZEZ;[;[*[ac d d ddeeiikk
r*   c                    |j         \  }}}|                     |          }|                    ||z  | j        z  d          }| j        rt
          j                            |                                | j	        d          
                    |          }t          j        |                    ||z  | j        d                                          d          }|                     |          }n|                    d          } |j        |j                              d|                    dd          d          }|                    ||z  | j        d          }|                     |          }|                    ||z  d          }|                    d          | j        z  }	|	                    ||z  | j        | j        d          }
|
                    d                              ||d          }
|
|fS )Nr=   T)tauhardr;   r         ?)shapeweight_projview
num_groupstrainingnn
functionalgumbel_softmaxfloattemperaturetype_asr%   softmaxrE   argmax	new_zerosscatter_	unsqueezecodevectorsnum_varsr@   )selfr   
batch_sizesequence_lengthhidden_sizecodevector_probscodevector_soft_distrD   codevector_idxcodevectors_per_groupr[   s              r+   forwardz&UniSpeechGumbelVectorQuantizer.forwarde   s   3@3F0
O[ ((77%**:+G$/+Y[]^^= 	D!};;##%%4+;$  <    gm$$ 
 $)="":#?RTUU[[]]ce$ $ $  112FGGJJ +11b199N6}68KLUUN''A..     044Z/5QSWSbdfgg112BCCJ+00o1MrRR 0 : :2 > >AQ Q+00o1Mt`d`moqrr!oob))..z?BOOJ&&r*   N)r!   r"   r#   staticmethodrE   re   r)   r*   r+   r9   r9   ^   s<          \
#' #' #' #' #'r*   r9   c                   x    e Zd ZU eed<   dZdZdZdZdZ	dZ
d Zdeej        ef         fdZded	ej        fd
ZdS )UniSpeechPreTrainedModelconfig	unispeechinput_valuesTc           
      \   t          |t                    ro|j        j        j                            dd           |j        j        j                                         t          j	        
                    |j                   dS t          |t                    rt          j	                            |j        j        ddt          j        d|j        j        d         |j        j        z  z            z             t          j	                            |j        j        d           dS t          |t&                    r}t          j        d|j        j        z            }t          j	        
                    |j        j        | |           t          j	        
                    |j        j        | |           dS t          |t          j                  rT|j        j                            d| j        j                   |j         |j        j                                         dS dS t          |t          j        t          j        f          r?|j        j                                         |j        j                            d           dS t          |t          j                  rt          j	                            |j                   |j        [t          j        |j        |j        |j        d         z  z            }t          j	        
                    |j        | |           dS dS dS )	zInitialize the weights        r   )r>   stdr   r   )abNrI   )
isinstancer9   rL   weightdatanormal_biaszero_rP   inituniform_r[   r-   convmathsqrtkernel_sizein_channels	constant_r3   
projectionin_featuresLinearri   initializer_range	LayerNorm	GroupNormfill_Conv1dkaiming_normal_groups)r]   moduleks      r+   _init_weightsz&UniSpeechPreTrainedModel._init_weights   s    f<== 	9%*222CCC#(..000GV/00000 @AA 	9GOO"	!v{'>q'AFKD['["\]]]    
 Gfk.22222 :;; 	9	!f/;;<<AGV.5!qAAAGV.3rQ?????	** 	9M&&CT[5R&SSS{& &&((((( '&r| <== 	9K""$$$M$$S)))))	** 	9G##FM222{&Ifmv/AFDVWXDY/YZ[[  a 88888	9 	9 '&r*   input_lengthsc                 z    d }t          | j        j        | j        j                  D ]\  }} ||||          }|S )zH
        Computes the output length of the convolutional layers
        c                 <    t          j        | |z
  |d          dz   S )Nfloor)rounding_moder   )r%   div)input_lengthr|   strides      r+   _conv_out_lengthzSUniSpeechPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length   s&     9\K7wWWWZ[[[r*   )zipri   conv_kernelconv_stride)r]   r   r   r|   r   s        r+    _get_feat_extract_output_lengthsz9UniSpeechPreTrainedModel._get_feat_extract_output_lengths   s\    
	\ 	\ 	\
 $'t{'>@W#X#X 	Q 	QK,,]KPPMMr*   feature_vector_lengthattention_maskc                    |                     d          d d df         }|                     |                              t          j                  }|j        d         }t          j        ||f|j        |j                  }d|t          j	        |j        d         |j                  |dz
  f<   |
                    dg                               d          
                    dg                                          }|S )Nr=   r;   r   )dtypedevicer   )r   )cumsumr   tor%   longrK   zerosr   r   arangeflipbool)r]   r   r   non_padded_lengthsoutput_lengthsr^   s         r+   "_get_feature_vector_attention_maskz;UniSpeechPreTrainedModel._get_feature_vector_attention_mask   s     ,22r2::111b5A>>?QRRUUV[V`aa#)!,
./~7KTbTi
 
 
 uv^%9!%<^EZ[[[]kno]opq',,bT2299"==BBB4HHMMOOr*   N)r!   r"   r#   r   r'   base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr   r   r%   
LongTensorintr   r   r)   r*   r+   rh   rh      s         #$O&*#N9 9 9BeEDTVYDY>Z     ]b]m      r*   rh   c                       e Zd ZdefdZd Zd Z	 	 	 	 	 ddeej	                 deej	                 deej
                 d	ee         d
ee         dee         deeef         fdZdS )UniSpeechModelri   c                    t                               | |           || _        t          |          | _        t          |          | _        |j        dk    s|j        dk    rBt          j
        t          j        |j                                                            | _        |j        rt#          |          | _        nt'          |          | _        |                                  d S )Nrm   )rh   __init__ri   r1   feature_extractorr3   feature_projectionmask_time_probmask_feature_probrP   	Parameterr%   Tensorr`   rx   masked_spec_embeddo_stable_layer_normr7   encoderr5   	post_init)r]   ri   s     r+   r   zUniSpeechModel.__init__   s     ))$777!8!@!@"<V"D"D 3&&&*BS*H*H%'\%,v?Q2R2R2[2[2]2]%^%^D"& 	4:6BBDLL+F33DL 	r*   c                      t          d          NzNot needed for UniSpeechAttributeErrorr]   s    r+   freeze_feature_extractorz'UniSpeechModel.freeze_feature_extractor       7888r*   c                      t          d          r   r   r   s    r+   freeze_feature_encoderz%UniSpeechModel.freeze_feature_encoder   r   r*   Nrk   r   mask_time_indicesoutput_attentionsoutput_hidden_statesreturn_dictreturnc                    ||n| j         j        }||n| j         j        }||n| j         j        }|                     |          }|                    dd          }|!|                     |j        d         |          }|                     |          \  }}| 	                    |||          }| 
                    |||||          }	|	d         }|s||f|	dd         z   S t          |||	j        |	j                  S )a/  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        Nr   r   )r   r   r   r   r   r   r   )last_hidden_stateextract_featuresr   r    )ri   r   r   use_return_dictr   	transposer   rK   r   _mask_hidden_statesr   UniSpeechBaseModelOutputr   r    )
r]   rk   r   r   r   r   r   r   r   encoder_outputss
             r+   re   zUniSpeechModel.forward   sW    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]11,??+55a;;%!DDEUE[\]E^`nooN*.*A*ABR*S*S''00->~ 1 
 
 ,,)/!5# ' 
 
 (* 	K!#34qrr7JJJ'+-)7&1	
 
 
 	
r*   )NNNNN)r!   r"   r#   r   r   r   r   r   r%   r   r&   r   r   r(   r   re   r)   r*   r+   r   r      s            "9 9 99 9 9 269=,0/3&*2
 2
u|,2
 !.2
 $E$56	2

 $D>2
 'tn2
 d^2
 
u..	/2
 2
 2
 2
 2
 2
r*   r   zZ
    UniSpeech Model with a vector-quantization module and ctc loss for pre-training.
    c                        e Zd Zdef fdZdefdZd Zd Ze		 dde
j        d	e
j        d
e
j        defd            Ze	 	 	 	 ddee
j                 dee
j                 dee         dee         dee         deeef         fd            Z xZS )UniSpeechForPreTrainingri   c                    t                                          |           t          |          | _        t	          j        |j                  | _        t          |          | _	        t	          j
        |j        |j                  | _        t	          j
        |j        |j                  | _        t	          j
        |j        |j                  | _        t	          j        |j                  | _        |                                  d S )N)superr   r   rj   rP   Dropoutfeat_quantizer_dropoutdropout_featuresr9   	quantizerr   codevector_dimproj_codevector_dim	project_qr`   project_hidnum_ctc_classesctc_projfinal_dropoutdropoutr   )r]   ri   	__class__s     r+   r   z UniSpeechForPreTraining.__init__+  s       '// "
6+H I I7??6#8&:TUU9V%?ASTT	&"4f6LMMz&"677 	r*   rT   c                     || j         _        dS )zb
        Set the Gumbel softmax temperature to a given value. Only necessary for training
        N)r   rT   )r]   rT   s     r+   set_gumbel_temperaturez.UniSpeechForPreTraining.set_gumbel_temperature:  s     &1"""r*   c                 b    t          j        dt                     |                                  dS )z
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        zThe method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.N)warningswarnFutureWarningr   r   s    r+   r   z0UniSpeechForPreTraining.freeze_feature_extractor@  s;    
 	Q	
 	
 	

 	##%%%%%r*   c                 B    | j         j                                         dS )z
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)rj   r   _freeze_parametersr   s    r+   r   z.UniSpeechForPreTraining.freeze_feature_encoderL  s!    
 	(;;=====r*   r   target_featuresnegative_featurespredicted_featuresc                     t          j        | |gd          } t          j        |                                |                                 d          }|                    |           }||z  }|S )z
        Compute logits for contrastive loss based using cosine similarity as the distance measure between
        `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
        r   r;   r=   )r%   catcosine_similarityrS   rU   )r   r   r   rT   logitss        r+   compute_contrastive_logitsz2UniSpeechForPreTraining.compute_contrastive_logitsS  sq      )_6G$HaPPP();)A)A)C)C_EZEZE\E\bdeee00 +%r*   Nrk   r   r   r   r   r   c                 N   ||n| j         j        }|                     |||||          }|d         }|                     |d                   }|                     |          \  }	}
|                     |	                    | j        j        j                            }	| 	                    |	          }	t          j        |                    d          |                    d                                        | j         j                  }|                    dd          }t          j        |                                                              |j                  }|                    dd          }|                    d          }|                    |d          |	                    | d          z   }|                     |          }|                     |          }d}|s#||||	|
f|dd         z   S ||	|
f|dd         z   S t/          |||	|
|j        |j                  S )	a  
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, UniSpeechForPreTraining

        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> # TODO: Add full pretraining example
        ```Nr   r   r   r=   rm   r   )r   r   r   r   r   r    )ri   r   rj   r   r   r   r   rr   r   r   r%   emptysizer   replace_probr   	bernoullir   r   rZ   masked_fillr   r   r   r   r    )r]   rk   r   r   r   r   outputstransformer_featuresr   quantized_featuresr   prob_replace_matrixsampled_replace_matrixr   r   s                  r+   re   zUniSpeechForPreTraining.forwardg  sW   * &1%<kk$+B]..)/!5# ! 
 
  'qz  00<<48NNCS4T4T11 "^^,>,A,A$.BWB],^,^__!--.@AA#k*>*C*CA*F*FH\HaHabcHdHdeekkK$
 
 2;;AqAA!&1D!E!E!J!J!L!L!O!OPdPk!l!l!7!A!A!Q!G!G!7!A!A"!E!E%112H#NN**,B+BCHH

 f%%v&&  	c24FH]^ahijikikalll(*<>STW^_`_a_aWbbb,1'9"7!/)
 
 
 	
r*   )r   )NNNN)r!   r"   r#   r   r   r   r   r   r   rf   r%   r&   r   r
   r   r   r   r   r(   r   re   __classcell__)r   s   @r+   r   r   %  su             1# 1 1 1 1
& 
& 
&> > > 
 	 * , "- 	   \&  26,0/3&*D
 D
u|,D
 !.D
 $D>	D

 'tnD
 d^D
 
u33	4D
 D
 D
 ^D
 D
 D
 D
 D
r*   r   c                       e Zd ZdS )UniSpeechForCTCNr.   r)   r*   r+   r  r    r/   r*   r  c                       e Zd ZdS )"UniSpeechForSequenceClassificationNr.   r)   r*   r+   r  r    r/   r*   r  )r  r   r  r   rh   )0r$   rz   r   dataclassesr   typingr   r   r%   torch.nnrP   modeling_outputsr   r   modeling_utilsr	   utilsr
   r   wav2vec2.modeling_wav2vec2r   r   r   r   r   r   r   r   r   configuration_unispeechr   
get_loggerr!   loggerr   r-   r1   r3   r5   r7   r9   rh   r   r   r   r  r  __all__r)   r*   r+   <module>r     s       ! ! ! ! ! ! " " " " " " " "        D D D D D D D D - - - - - - , , , , , , , ,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 5 4 4 4 4 4 
	H	%	%   
: : : : :K : :  :.	 	 	 	 	'F 	 	 		 	 	 	 	4 	 	 		 	 	 	 	!: 	 	 		 	 	 	 	 	 	 		 	 	 	 	&D 	 	 	*' *' *' *' *'%B *' *' *'Z F F F F F F F FR 3 J
 J
 J
 J
 J
-} J
 J
 J
Z   
B
 B
 B
 B
 B
6 B
 B
 
B
J	 	 	 	 	n 	 	 		 	 	 	 	)J 	 	 	  r*   