
     `iA                    n   d Z ddlZddlmZmZ ddlZddlZddlmZ ddl	m
Z
mZmZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZm Z m!Z!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.  e)j/        e0          Z1dZ2dej3        de4de4fdZ5	 ddej3        de4deej3                 fdZ6	 	 dde7e4e4f         de8de4deej9                 de4d ej:        fd!Z; G d" d#e          Z< G d$ d%e          Z= G d& d'e          Z> G d( d)ej?                  Z@ G d* d+ej?                  ZA G d, d-ej?                  ZB G d. d/ej        j?                  ZC G d0 d1ej?                  ZD G d2 d3ej?                  ZE G d4 d5ej?                  ZF G d6 d7ej?                  ZG G d8 d9ej?                  ZH G d: d;ej?                  ZI G d< d=ej?                  ZJ G d> d?ej?        e%          ZK G d@ dAej?        e%          ZL G dB dCej?        e%          ZM G dD dEej?                  ZN G dF dGej?                  ZO G dH dIe          ZP G dJ dKe          ZQe( G dL dMe&                      ZR G dN dOeR          ZS G dP dQeR          ZT G dR dSeR          ZU G dT dUeR          ZV G dV dWeR          ZW G dX dYeR          ZX G dZ d[eR          ZY G d\ d]eR          ZZ G d^ d_ej?                  Z[ G d` daej?                  Z\ e(dbc           G dd deeR                      Z] e(dfc           G dg dheRe                      Z^	 	 	 	 	 	 	 	 ddmeRdej_        dneej_                 deej9                 doe8dpe8dqe8dreej?                 dse`dte`d eej_        e7ej_        ej_        f         f         fduZa e(dvc           G dw dxeR                      Zb e(dyc           G dz d{eR                      Zc G d| d}ej?                  Zd e(d~c           G d de&                      Zeg dZfdS )zPyTorch SpeechT5 model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossL1Loss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask!_prepare_4d_causal_attention_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSeq2SeqSpectrogramOutput)EmbeddingAccessMixinPreTrainedModel)auto_docstringlogging)deprecate_kwarg   )SpeechT5ConfigSpeechT5HifiGanConfig	input_idspad_token_iddecoder_start_token_idc                     |                      | j                  }| ddddf                                         |ddddf<   ||dddf<   |t          d          |                    |dk    |           |S )z1
    Shift input ids one token to the right.
    Nr   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclone
ValueErrormasked_fill_)r!   r"   r#   shifted_input_idss       /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/speecht5/modeling_speecht5.pyshift_tokens_rightr-   6   s     "++IO<<(CRC06688aaae4aaadLMMM""#4#<lKKK    input_valuesreduction_factorattention_maskc                    |dk    r&| dd|dz
  d|f         } ||dd|dz
  d|f         }|                      | j                  }| ddddf                                         |ddddf<   |                    |dk    d           ||fS )zw
    Shift input spectrograms one timestep to the right. Also applies the reduction factor to the sequence length.
    r   Nr%         Y        )r&   r'   r(   r*   )r/   r0   r1   shifted_input_valuess       r,   shift_spectrograms_rightr6   F   s     !#AAA'7!';'O?O'O$OP%+AAA/?!/C/WGW/W,WXN'11,2DEE".qqq#2#v"6"<"<">">ABB %%&:f&DcJJJ//r.   r'   	mask_probmask_length	min_masksreturnc                 @   | \  }dk     rt          d          k    rt          d d d          t          j                            d                                          fd}|9|                                                    d                                          nfd	t          |          D             }t          j	        |ft          
          }g }	 |          }
|
dk    r|S |D ]} ||          }t          j                            t          j        |dz
  z
            |d          }t          |          dk    rdz
  }n|d         }t          j        |t          j        |
|z
  t          j        
          |z  g          }|	                    |           t          j        |	          }	t          j        |	dddddf         ||
f          }	|	                    ||
z            }	t          j                  ddddf         }t          j        |||
f                              ||
z            }|	|z   }	|	                                dz
  k    rdz
  |	|	dz
  k    <   t          j        ||	dd           |S )an  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                     t          | z  z  z             }t          |          }|z  k    rz  }| dz
  z
  |k     rt          | dz
  z
  d          }|S )z;Given input length, compute how many spans should be maskedr   r   )intmax)input_lengthnum_masked_spanepsilonr8   r7   r9   sequence_lengths     r,   compute_num_masked_spanz6_compute_mask_indices.<locals>.compute_num_masked_span   s~    i,6DwNOOoy99 [(?::-<O ;?+o==!,+/"BAFFOr.   Nr%   c                     g | ]}S  rF   ).0_rC   s     r,   
<listcomp>z)_compute_mask_indices.<locals>.<listcomp>   s    999!o999r.   dtyper   F)replace)r)   nprandomranditemdetachsumtolistrangezerosboolchoicearangelenconcatenateonesint32appendarraybroadcast_toreshaper?   put_along_axis)r'   r7   r8   r1   r9   
batch_sizerD   input_lengthsspec_aug_maskspec_aug_mask_idxsmax_num_masked_spanr@   rA   spec_aug_mask_idxdummy_mask_idxoffsetsrB   rC   s    `` `           @@r,   _compute_mask_indicesrj   \   sP   0 #(JQABBB_$$:^i : :'6: : :
 
 	
 innQ$$&&G        $ % 	##B''..0009999uZ'8'8999  Hj/:$GGGM11/BBa% 5 511,?? I,,IlkAo677RW - 
 
  !!Q&& -q0NN.q1NN(;o(MUWU] ^ ^ ^ao op
 
 	!!"34444"455 111aaa:&5H+(V  ,33J@SVa@abb i$$T4]3Gog
4G'UVV^^'+5 G ,g5 /A"555GVYZGZ-!0CCD m%7B???r.   c                   &     e Zd Zd fd	Zd Z xZS )SpeechT5NoLayerNormConvLayerr   c                 Z   t                                                       |dk    r|j        |dz
           nd| _        |j        |         | _        t          j        | j        | j        |j        |         |j        |         |j	                  | _
        t          |j                 | _        d S )Nr   r   kernel_sizestridebias)super__init__conv_dimin_conv_dimout_conv_dimr   Conv1dconv_kernelconv_stride	conv_biasconvr
   feat_extract_activation
activationselfconfiglayer_id	__class__s      r,   rs   z%SpeechT5NoLayerNormConvLayer.__init__   s    <DqLL6?8a<88a"OH5I*84%h/!
 
 
	 !!?@r.   c                 Z    |                      |          }|                     |          }|S N)r{   r}   r   hidden_statess     r,   forwardz$SpeechT5NoLayerNormConvLayer.forward   s*    		-0066r.   r   __name__
__module____qualname__rs   r   __classcell__r   s   @r,   rl   rl      sR        A A A A A A      r.   rl   c                   &     e Zd Zd fd	Zd Z xZS )SpeechT5LayerNormConvLayerr   c                    t                                                       |dk    r|j        |dz
           nd| _        |j        |         | _        t          j        | j        | j        |j        |         |j        |         |j	                  | _
        t          j        | j        d          | _        t          |j                 | _        d S )Nr   r   rn   T)elementwise_affine)rr   rs   rt   ru   rv   r   rw   rx   ry   rz   r{   	LayerNorm
layer_normr
   r|   r}   r~   s      r,   rs   z#SpeechT5LayerNormConvLayer.__init__   s    <DqLL6?8a<88a"OH5I*84%h/!
 
 
	 ,t'8TRRR !?@r.   c                     |                      |          }|                    dd          }|                     |          }|                    dd          }|                     |          }|S )Nr%   )r{   	transposer   r}   r   s     r,   r   z"SpeechT5LayerNormConvLayer.forward   se    		-00%//B7766%//B7766r.   r   r   r   s   @r,   r   r      sR        A A A A A A      r.   r   c                   &     e Zd Zd fd	Zd Z xZS )SpeechT5GroupNormConvLayerr   c                    t                                                       |dk    r|j        |dz
           nd| _        |j        |         | _        t          j        | j        | j        |j        |         |j        |         |j	                  | _
        t          |j                 | _        t          j        | j        | j        d          | _        d S )Nr   r   rn   T)
num_groupsnum_channelsaffine)rr   rs   rt   ru   rv   r   rw   rx   ry   rz   r{   r
   r|   r}   	GroupNormr   r~   s      r,   rs   z#SpeechT5GroupNormConvLayer.__init__  s    <DqLL6?8a<88a"OH5I*84%h/!
 
 
	 !!?@,$2CRVRclpqqqr.   c                     |                      |          }|                     |          }|                     |          }|S r   )r{   r   r}   r   s     r,   r   z"SpeechT5GroupNormConvLayer.forward  s;    		-006666r.   r   r   r   s   @r,   r   r     sR        r r r r r r       r.   r   c            	           e Zd ZdZddededee         f fdZddededee         fdZeddededee         fd	            Z	 e
j                    dde
j        defd            Z	 dde
j        dedee         fdZ xZS )%SpeechT5SinusoidalPositionalEmbeddingzDThis module produces sinusoidal positional embeddings of any length.Nnum_positionsembedding_dimpadding_idxc                     t                                                       d| _        || _        || _        |                     || j        z   ||           d S N   )rr   rs   offsetr   r   make_weights)r   r   r   r   r   s       r,   rs   z.SpeechT5SinusoidalPositionalEmbedding.__init__"  sU    *&-$+5}kRRRRRr.   num_embeddingsc                     |                      |||          }t          | d          r+|                    | j        j        | j        j                  }|                     d|d           d S )NweightsrK   deviceF
persistent)get_embeddinghasattrtor   rK   r   register_buffer)r   r   r   r   emb_weightss        r,   r   z2SpeechT5SinusoidalPositionalEmbedding.make_weights)  sl    ((TT4## 	_%..t|/A$,J].^^KYFFFFFr.   c                    |dz  }t          j        d          |dz
  z  }t          j        t          j        |t          j                                                  | z            }t          j        | t          j                                                                      d          |                    d          z  }t          j        t          j	        |          t          j
        |          gd                              | d          }|dz  dk    r+t          j        |t          j        | d          gd          }|	d||ddf<   |                    t          j                              S )	z
        Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
        description in Section 3.5 of "Attention Is All You Need".
        r   i'  r   rJ   r   dimr%   N)mathlogtorchexprX   int64float	unsqueezecatsincosviewrU   r   get_default_dtype)r   r   r   half_dimembs        r,   r   z3SpeechT5SinusoidalPositionalEmbedding.get_embedding1  s?    !A%huooA.iXU[AAAGGIISDPQQl>===CCEEOOPQRRUXUbUbcdUeUeei338a@@@EEnVXYY1!!)S%+na"@"@AqIIIC""#CQQQvve-//000r.   r   r!   past_key_values_lengthc                    |                                 \  }}|                     || j        |                              |j                  }| j        dz   |z   }|| j                             d          k    r)|                     || j        z   | j        | j                   | j        	                    d|
                    d                    
                    ||d                                          S )Nr   r   r%   )size"create_position_ids_from_input_idsr   r   r   r   r   r   r   index_selectr   rQ   )r   r!   r   bszseq_lenposition_idsmax_poss          r,   r   z-SpeechT5SinusoidalPositionalEmbedding.forwardC  s     ~~''W>>y$JZ\rssvv
 

 "Q&0T\&&q))))g3T5GIYZZZ|((L,=,=b,A,ABBGGWVXYY``bbbr.   c                     |                     |                                          }t          j        |d                              |          |z   |z  }|                                |z   S )a  
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
        symbols are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            x: torch.Tensor x:
        Returns: torch.Tensor
        r   r   )ner>   r   cumsumtype_aslong)r   r!   r   r   maskincremental_indicess         r,   r   zHSpeechT5SinusoidalPositionalEmbedding.create_position_ids_from_input_idsR  sg     ||K((,,..$|Da888@@FFI__cgg"''))K77r.   r   r   )r   r   r   __doc__r>   r   rs   r   staticmethodr   r   no_gradTensorr   r   r   r   s   @r,   r   r     sl       NNS Sc S# SHUXM S S S S S SG G3 Gs GQYZ]Q^ G G G G 1 1c 1# 1HUXM 1 1 1 \1" U]__c c cs c c c _c bc8 88478QYZ]Q^8 8 8 8 8 8 8 8r.   r   c                   $     e Zd Z fdZd Z xZS )SpeechT5PositionalConvEmbeddingc                    t                                                       t          j        |j        |j        |j        |j        dz  |j                  | _        t          j        j	        }t          t          j        j        d          rt          j        j        j	        }t                      rdd l}|j                            | j        j        d          5   || j        dd          | _        d d d            n# 1 swxY w Y   t          | j        d          r-| j        j        j        j        }| j        j        j        j        }n| j        j        }| j        j        }|j                            | |           |j                            | |           n || j        dd          | _        t-          |j                  | _        t0          |j                 | _        d S )	Nr   )ro   paddinggroupsweight_normr   )modifier_rankweight)namer   parametrizations)rr   rs   r   rw   hidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupsr{   utilsr   r   r   r   	deepspeedzeroGatheredParametersr   	original0	original1weight_gweight_vregister_external_parameterSpeechT5SamePadLayerr   r
   r|   r}   )r   r   r   r   r   r   r   s         r,   rs   z(SpeechT5PositionalConvEmbedding.__init__e  s   I62a77
 
 
	 h*28,m<< 	@(3?K%'' 	E22493CST2UU I I'K	aHHH	I I I I I I I I I I I I I I Ity"455 .95<F95<F9-9-N66tXFFFN66tXFFFF#DIH!DDDDI+F,JKK !?@s   C??DDc                     |                     dd          }|                     |          }|                     |          }|                     |          }|                     dd          }|S Nr   r   )r   r{   r   r}   r   s     r,   r   z'SpeechT5PositionalConvEmbedding.forward  se    %//155		-00]3366%//155r.   r   r   s   @r,   r   r   d  sM        A A A A AB      r.   r   c                   *     e Zd ZdZd fd	Zd Z xZS ) SpeechT5ScaledPositionalEncodingu[   
    Scaled positional encoding, see §3.2 in https://huggingface.co/papers/1809.08895
      c                 >   t          j        ||          }t          j        d|                              d          }t          j        t          j        d|dt           j                                                  t          j        d          |z   z            }t          j	        |                                |z            |d d dd df<   t          j
        |                                |z            |d d dd df<   |                    d          }t                                                       |                     d|d           t          j        |	          | _        || _        t          j        t          j        d
                    | _        d S )Nr   r   r   rJ   g     @peFr   p      ?)r   rU   rX   r   r   r   r   r   r   r   r   rr   rs   r   r   Dropoutdropoutr   	Parametertensoralpha)r   r   r   max_lenr   positiondiv_termr   s          r,   rs   z)SpeechT5ScaledPositionalEncoding.__init__  sX   [#&&<7++55a889U\!S!5;GGGMMOOTXT\]dTeTehkTkRllmmi 0 08 ;<<111add7i 0 08 ;<<111add7\\!__T2%888zG,,,\%,s"3"344


r.   c                     || j         | j        d d d |                    d          f         z  z   }|                     |          }|S )Nr   )r   r   r   r   )r   r   s     r,   r   z(SpeechT5ScaledPositionalEncoding.forward  sG    DJMchhqkkM)9!:::ll3
r.   )r   )r   r   r   r   rs   r   r   r   s   @r,   r   r     sV         5 5 5 5 5 5      r.   r   c                   &     e Zd Zd fd	Zd Z xZS )"SpeechT5RelativePositionalEncoding  c                     t                                                       || _        || _        t          j                            d|z  |          | _        d S r   )rr   rs   r   
max_lengthr   r   	Embeddingpe_k)r   r   r	  r   s      r,   rs   z+SpeechT5RelativePositionalEncoding.__init__  sH    $H&&q:~s;;			r.   c                 T   |j         d         }t          j        d|                              |j        t          j                  }|d d d f         |d d d f         z
  }| j         ||| j         k     <   | j        dz
  ||| j        k    <   || j        z   }|                     |          S )Nr   r   r   rK   )r'   r   rX   r   r   r   r	  r  )r   r   r   pos_seqs       r,   r   z*SpeechT5RelativePositionalEncoding.forward  s    %a(,q'**--]5IQVQ[-\\!!!T'"WT111W%55/3.>4?**+.2o.A4?*+DO+yy!!!r.   )r  r   r   s   @r,   r  r    sL        < < < < < <	" 	" 	" 	" 	" 	" 	"r.   r  c                   $     e Zd Z fdZd Z xZS )r   c                 l    t                                                       |dz  dk    rdnd| _        d S )Nr   r   r   )rr   rs   num_pad_remove)r   r   r   s     r,   rs   zSpeechT5SamePadLayer.__init__  s:    #:Q#>!#C#Caar.   c                 J    | j         dk    r|d d d d d | j          f         }|S Nr   )r  r   s     r,   r   zSpeechT5SamePadLayer.forward  s;    "")!!!QQQ0F43F2F0F*FGMr.   r   r   s   @r,   r   r     sL        K K K K K      r.   r   c                   .     e Zd ZdZ fdZd Zd Z xZS )SpeechT5FeatureEncoderz.Construct the features from raw audio waveformc                    t                                                       j        dk    r7t          d          gfdt	          j        dz
            D             z   }nDj        dk    r!fdt	          j                  D             }nt          dj         d	          t          j        |          | _	        d
| _
        d| _        d S )Ngroupr   r   c                 8    g | ]}t          |d z             S )r   r  )rl   rG   ir   s     r,   rI   z3SpeechT5FeatureEncoder.__init__.<locals>.<listcomp>  s>     N N NIJ,Va!eDDDN N Nr.   r   layerc                 2    g | ]}t          |           S )r  )r   r  s     r,   rI   z3SpeechT5FeatureEncoder.__init__.<locals>.<listcomp>  s4       CD*6A>>>  r.   z`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)rr   rs   feat_extract_normr   rT   num_feat_extract_layersr)   r   
ModuleListconv_layersgradient_checkpointing_requires_grad)r   r   r!  r   s    ` r,   rs   zSpeechT5FeatureEncoder.__init__  s   #w..5fqIIIJ N N N NNSTZTruvTvNwNwN N N KK %00   HMfNlHmHm  KK t1Ittt   =55&+#"r.   c                 P    |                                  D ]	}d|_        
d| _        d S )NF)
parametersrequires_gradr#  )r   params     r,   _freeze_parametersz)SpeechT5FeatureEncoder._freeze_parameters  s4    __&& 	( 	(E"'E#r.   c                 r    |d d d f         }| j         r| j        rd|_        | j        D ]} ||          }|S NT)r#  trainingr&  r!  )r   r/   r   
conv_layers       r,   r   zSpeechT5FeatureEncoder.forward  s[    $QQQW-  	/4= 	/*.M'* 	6 	6J&J}55MMr.   )r   r   r   r   rs   r(  r   r   r   s   @r,   r  r    s\        88# # # # #&$ $ $

 
 
 
 
 
 
r.   r  c                   $     e Zd Z fdZd Z xZS )SpeechT5FeatureProjectionc                 .   t                                                       t          j        |j        d         |j                  | _        t          j        |j        d         |j                  | _	        t          j
        |j                  | _        d S )Nr%   eps)rr   rs   r   r   rt   layer_norm_epsr   Linearr   
projectionr   feat_proj_dropoutr   r   r   r   s     r,   rs   z"SpeechT5FeatureProjection.__init__  sn    ,vr':@UVVV)FOB$79KLLz&":;;r.   c                     |                      |          }|                     |          }|                     |          }||fS r   )r   r4  r   )r   r   norm_hidden_statess      r,   r   z!SpeechT5FeatureProjection.forward  sC    !__];;(:;;]33000r.   r   r   s   @r,   r.  r.    sG        < < < < <1 1 1 1 1 1 1r.   r.  c                       e Zd Z fdZd Z	 	 ddej        deej                 deej	                 fdZ
dedej        fd	Zd
eej        ef         fdZ	 	 ddej	        deej	                 deej                 fdZ xZS )SpeechT5SpeechEncoderPrenetc                    t                                                       || _        t          |          | _        t          |          | _        |j        dk    s|j        dk    rBt          j
        t          j        |j                                                            | _        t!          |          | _        t%          |j        |j        z   dz   |j        |j                  | _        d S )Nr4   r   )rr   rs   r   r  feature_encoderr.  feature_projectionmask_time_probmask_feature_probr   r   r   r   r   uniform_masked_spec_embedr   pos_conv_embedr   max_speech_positionsr"   pos_sinusoidal_embedr6  s     r,   rs   z$SpeechT5SpeechEncoderPrenet.__init__  s    5f==";F"C"C  3&&&*BS*H*H%'\%,v?Q2R2R2[2[2]2]%^%^D"=fEE$I'&*==A%
 %
!!!r.   c                 8    | j                                          d S r   )r<  r(  r   s    r,   freeze_feature_encoderz2SpeechT5SpeechEncoderPrenet.freeze_feature_encoder  s    //11111r.   Nr/   r1   mask_time_indicesc                 (   |                      |          }|                    dd          }|!|                     |j        d         |          }|                     |          \  }}|                     |||          }|                     |          }||z   }|(|                    d                                          }n3t          j
        |j        d d         t          j        |j                  }|                     |          }||z   }||fS )Nr   r   )rH  r1   r   )r<  r   "_get_feature_vector_attention_maskr'   r=  _mask_hidden_statesrB  r   r   r   rU   r   rD  )	r   r/   r1   rH  extract_featuresr   positional_conv_embeddingpadding_mask positional_sinusoidal_embeddingss	            r,   r   z#SpeechT5SpeechEncoderPrenet.forward  s/     //==+55a;;%!DD &q) N
 +/*A*ABR*S*S''00->~ 1 
 
 %)$7$7$F$F!%(AA%),,Q//4466LL ;}':2A2'>ejYfYmnnnL+/+D+D\+R+R(%(HHn,,r.   feature_vector_lengthc                    |                     d          d d df         }|                     |                              t          j                  }|j        d         }t          j        ||f|j        |j                  }d|t          j	        |j        d         |j                  |dz
  f<   |
                    dg                               d          
                    dg                                          }|S )Nr%   r   r   r   r   r   )r    _get_feat_extract_output_lengthsr   r   r   r'   rU   rK   r   rX   fliprV   )r   rP  r1   non_padded_lengthsoutput_lengthsrb   s         r,   rJ  z>SpeechT5SpeechEncoderPrenet._get_feature_vector_attention_mask9  s     ,22r2::111b5A>>?QRRUUV[V`aa#)!,
./~7KTbTi
 
 
 uv^%9!%<^EZ[[[]kno]opq',,bT2299"==BBB4HHMMOOr.   rc   c                 z    d }t          | j        j        | j        j                  D ]\  }} ||||          }|S )zH
        Computes the output length of the convolutional layers
        c                 <    t          j        | |z
  |d          dz   S )Nfloor)rounding_moder   )r   div)r@   ro   rp   s      r,   _conv_out_lengthzVSpeechT5SpeechEncoderPrenet._get_feat_extract_output_lengths.<locals>._conv_out_lengthN  s&     9\K7wWWWZ[[[r.   )zipr   rx   ry   )r   rc   r\  ro   rp   s        r,   rS  z<SpeechT5SpeechEncoderPrenet._get_feat_extract_output_lengthsI  s\    
	\ 	\ 	\
 $'t{'>@W#X#X 	Q 	QK,,]KPPMMr.   r   c                    t          | j        dd          s|S |                                \  }}}|#| j                            |j                  ||<   n| j        j        dk    r| j        r|t          ||f| j        j        | j        j	        || j        j
                  }t          j        ||j        t          j                  }| j                            |j                  ||<   | j        j        dk    r| j        rt          ||f| j        j        | j        j        | j        j                  }t          j        ||j        t          j                  }|dddf                             d|d          }d||<   |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTNr   )r7   r8   r1   r9   r  )r7   r8   r9   r%   )getattrr   r   rA  r   rK   r>  r+  rj   mask_time_lengthmask_time_min_masksr   r   r   rV   r?  mask_feature_lengthmask_feature_min_masksexpand)r   r   rH  r1   rb   rC   r   mask_feature_indicess           r,   rK  z/SpeechT5SpeechEncoderPrenet._mask_hidden_statesY  s    t{$8$?? 	!   4A3E3E3G3G0
O[(/3/E/H/HI\/]/]M+,,['!+++ 5_-+4 K8-+9! ! ! !&->}G[chcm n n n/3/E/H/HI\/]/]M+,;(1,,,#8[)+7 K;+<	$ $ $  $)<0D]Mainis#t#t#t #74#@#G#GO]_#`#` 23M./r.   NN)r   r   r   rs   rG  r   r   r   
LongTensorFloatTensorr   r>   rJ  r   rS  rK  r   r   s   @r,   r:  r:    s6       
 
 
 
 
"2 2 2 6:9=	 -  -l - !!12 - $E$56	 -  -  -  -F ]b]m     eEDTVYDY>Z    & :>59	, ,(, $E$56, !!12	, , , , , , , ,r.   r:  c                   X     e Zd Z fdZd Z	 ddej        deej                 fdZ xZ	S )SpeechT5SpeechDecoderPrenetc                    t                                                       | _        t          j        fdt          j                  D                       | _        t          j        j	        j
                  | _        t          j        j
        j                  | _        t          j        j        j
        z   j
                  | _        d S )Nc                 h    g | ].}t          j        |d k    rj        nj        j                  /S r   )r   r3  num_mel_binsspeech_decoder_prenet_unitsr  s     r,   rI   z8SpeechT5SpeechDecoderPrenet.__init__.<locals>.<listcomp>  sS       
 	 	+,66F''v7Y6   r.   )rr   rs   r   r   r   rT   speech_decoder_prenet_layerslayersr3  ro  r   final_layerr   positional_dropoutrC  encode_positionsspeaker_embedding_dimspeaker_embeds_layerr6  s    `r,   rs   z$SpeechT5SpeechDecoderPrenet.__init__  s    m   
 vBCC  
 
 9V%GI[\\ @%'!
 !

 %'If.JVM_._agas$t$t!!!r.   c                     t          j        |d         |          }|                    d                              |                    d          dd          }t          j        |dk    |d          dz  d|z
  z  S )Nr   r   r   )r   	bernoullir   repeatr   where)r   inputs_embedsr   r   	all_maskss        r,   _consistent_dropoutz/SpeechT5SpeechDecoderPrenet._consistent_dropout  sr    }Q/1555NN1%%,,]-?-?-B-BAqII	{9>=!<<q@AEJJr.   Nr/   speaker_embeddingsc                 R   |}| j         D ]J}t          j                             ||                    }|                     || j        j                  }K|                     |          }|                     |          }|t          j        	                    |          }|
                    d                              d|                    d          d          }t          j        ||gd          }t          j                            |                     |                    }|S )Nr   r%   r   )rq  r   
functionalrelur}  r   speech_decoder_prenet_dropoutrr  rt  	normalizer   re  r   r   r   rv  )r   r/   r~  r{  r  s        r,   r   z#SpeechT5SpeechDecoderPrenet.forward  s    %[ 	o 	oEM..uu]/C/CDDM 44]DKDmnnMM((77--m<<)!#!8!89K!L!L!3!=!=a!@!@!G!GML^L^_`LaLace!f!f!I}6H&IrRRRMM..t/H/H/W/WXXMr.   r   )
r   r   r   rs   r}  r   r   r   r   r   r   s   @r,   rk  rk    s        u u u u u,K K K 6: l %U\2       r.   rk  c                   &     e Zd Zd fd	Zd Z xZS )SpeechT5BatchNormConvLayerr   c                    t                                                       |dk    r|j        }n|j        }||j        dz
  k    r|j        }n|j        }t          j        |||j        d|j        dz
  dz  d          | _        t          j	        |          | _
        ||j        dz
  k     rt          j                    | _        nd | _        t          j        |j                  | _        d S )Nr   r   r   F)ro   rp   r   rq   )rr   rs   rn  speech_decoder_postnet_unitsspeech_decoder_postnet_layersr   rw   speech_decoder_postnet_kernelr{   BatchNorm1d
batch_normTanhr}   r   speech_decoder_postnet_dropoutr   )r   r   r   ru   rv   r   s        r,   rs   z#SpeechT5BatchNormConvLayer.__init__  s    q== -KK =Kv;a???!.LL!>LI<9A=!C
 
 
	 .66f:Q>>> giiDOO"DOz&"GHHr.   c                     |                      |          }|                     |          }| j        |                     |          }|                     |          }|S r   )r{   r  r}   r   r   s     r,   r   z"SpeechT5BatchNormConvLayer.forward  sT    		-0066?& OOM::M]33r.   r   r   r   s   @r,   r  r    sR        I I I I I I<      r.   r  c                   J     e Zd Z fdZdej        fdZdej        fdZ xZS )SpeechT5SpeechDecoderPostnetc                 f   t                                                       | _        t          j        j        j        j        z            | _        t          j        j        j                  | _	        t          j
        fdt          j                  D                       | _        d S )Nc                 0    g | ]}t          |          S rF   )r  r  s     r,   rI   z9SpeechT5SpeechDecoderPostnet.__init__.<locals>.<listcomp>  s$    hhhq'22hhhr.   )rr   rs   r   r   r3  r   rn  r0   feat_outprob_outr   rT   r  rq  r6  s    `r,   rs   z%SpeechT5SpeechDecoderPostnet.__init__  s    	&"4f6IFLc6cdd	&"4f6MNNmhhhhE&Bf<g<ghhh
 
r.   r   c                 <   |                      |                              |                    d          d| j        j                  }|                     |          }|                     |                              |                    d          d          }|||fS )Nr   r%   )r  r   r   r   rn  postnetr  )r   r   outputs_before_postnetoutputs_after_postnetlogitss        r,   r   z$SpeechT5SpeechDecoderPostnet.forward  s    !%}!=!=!B!B=CUCUVWCXCXZ\^b^i^v!w!w $-C D D}--22=3E3Ea3H3H"MM%'<fDDr.   c                     |                     dd          }| j        D ]} ||          }||                     dd          z   S r   )r   rq  )r   r   layer_outputr  s       r,   r  z$SpeechT5SpeechDecoderPostnet.postnet  sT    $..q!44[ 	/ 	/E 5..LL|55a;;;;r.   )	r   r   r   rs   r   r   r   r  r   r   s   @r,   r  r    sw        	
 	
 	
 	
 	
EU\ E E E E<U\ < < < < < < < <r.   r  c                   4     e Zd Z fdZdej        fdZ xZS )SpeechT5TextEncoderPrenetc                     t                                                       || _        t          j        |j        |j        |j                  | _        t          |j
        |j        |j                  | _        d S r   )rr   rs   r   r   r
  
vocab_sizer   r"   embed_tokensr   rs  max_text_positionsrt  r6  s     r,   rs   z"SpeechT5TextEncoderPrenet.__init__  sj    L):F<NPVPcdd @%%!
 !
r.   r!   c                 Z    |                      |          }|                     |          }|S r   )r  rt  )r   r!   r{  s      r,   r   z!SpeechT5TextEncoderPrenet.forward  s.    )))44--m<<r.   )r   r   r   rs   r   r   r   r   r   s   @r,   r  r    sU        
 
 
 
 
        r.   r  c                   d     e Zd Z fdZ	 	 ddej        deej                 dee         fdZ	 xZ
S )SpeechT5TextDecoderPrenetc                    t                                                       || _        t          j        |j                  | _        |j        rt          j	        |j
                  nd| _        t          j        |j        |j
        |j                  | _        t!          |j        |j        z   dz   |j
        |j                  | _        d S )Nr   r   )rr   rs   r   r   r   rs  r   scale_embeddingr   sqrtr   embed_scaler
  r  r"   r  r   r  embed_positionsr6  s     r,   rs   z"SpeechT5TextDecoderPrenet.__init__  s    z&";<<<B<R[49V%7888X[L):F<NPVPcddD%(;;a? 
  
r.   Nr!   r1   past_key_valuesc                    |1|                                 }|                    d|d                   }nt          d          d}|Bt          |t                    s|d         d         j        d         n|                                }|                     ||          }|                     |          | j	        z  }||z  }| 
                    |          }||fS )Nr%   z'You have to specify `decoder_input_ids`r   r   )r   r   r)   
isinstancer   r'   get_seq_lengthr  r  r  r   )r   r!   r1   r  input_shaper   	positionsr{  s           r,   r   z!SpeechT5TextDecoderPrenet.forward  s      #..**K!r;r?;;IIFGGG!"& "/5996"1%+B//$3355 # ((4JKK	)))44t7GG"]33n,,r.   rg  )r   r   r   rs   r   r   r   rh  r   r   r   r   s   @r,   r  r    s        
 
 
 
 
" 6:+/	- -<- !!12- "%	- - - - - - - -r.   r  c                   @     e Zd Z fdZdej        fdZd Zd Z xZ	S )SpeechT5TextDecoderPostnetc                     t                                                       || _        t          j        |j        |j        d          | _        d S )NFrq   )rr   rs   r   r   r3  r   r  lm_headr6  s     r,   rs   z#SpeechT5TextDecoderPostnet.__init__;  sB    y!3V5FUSSSr.   r   c                 ,    |                      |          S r   r  r   s     r,   r   z"SpeechT5TextDecoderPostnet.forward@  s    ||M***r.   c                     | j         S r   r  rF  s    r,   get_output_embeddingsz0SpeechT5TextDecoderPostnet.get_output_embeddingsC  s     |r.   c                     || _         d S r   r  r   new_embeddingss     r,   set_output_embeddingsz0SpeechT5TextDecoderPostnet.set_output_embeddingsH  s    %r.   )
r   r   r   rs   r   r   r   r  r  r   r   s   @r,   r  r  :  sx        T T T T T
+U\ + + + +  
& & & & & & &r.   r  c                       e Zd ZdZ	 	 	 	 ddededee         d	ee         d
ee         dee         f fdZ e	ddd          	 	 	 	 	 	 	 dde
j        dee
j                 dee         dee
j                 dee
j                 dee
j                 dedee
j                 dee
j        ee
j                 ee         f         fd            Z xZS )SpeechT5Attentionz
    Multi-headed attention from 'Attention Is All You Need' paper with relative position bias (see
    https://aclanthology.org/N18-2074.pdf)
    r4   FTN	embed_dim	num_headsr   
is_decoderrq   	layer_idxc                    t                                                       || _        || _        || _        ||z  | _        | j        |z  | j        k    rt          d| j         d| d          | j        dz  | _        || _        || _	        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      r  )rr   rs   r  r  r   head_dimr)   scalingr  r  r   r3  k_projv_projq_projout_proj)r   r  r  r   r  rq   r  r   s          r,   rs   zSpeechT5Attention.__init__R  s	    	""!Y.MI%$.883dn 3 3%.3 3 3   }d*$"i	94@@@i	94@@@i	94@@@	)YTBBBr.   past_key_valuer  4.58new_nameversionr   key_value_statesr1   layer_head_maskposition_biasoutput_attentionscache_positionr:   c	                    |du}	|                                 \  }
}}|                     |          | j        z  }d}|Ht          |t                    r1|j                            | j                  }|	r|j        }n
|j	        }n|}|	r|n|}|	r3|1|r/|j
        | j                 j        }|j
        | j                 j        }n|                     |          }|                     |          }|                    |
d| j        | j                                      dd          }|                    |
d| j        | j                                      dd          }|N|	s|nd}|                    ||| j        d|i          \  }}|	r$t          |t                    rd|j        | j        <   |
| j        z  d| j        f}|                    |
|| j        | j                                      dd          } |j        | } |j        | } |j        | }|                     d          }t+          j        ||                    dd                    }|                                 |
| j        z  ||fk    r2t/          d|
| j        z  ||f d	|                                            ||                                                    |
| j        z  d| j                                      d
d          }t+          j        ||                    dd                    }|                    d
d                              |
| j        z  |                     d
          |                     d                    }||z  }||                                 |
d||fk    r+t/          d|
d||f d	|                                            |                    |
| j        ||          |z   }|                    |
| j        z  ||          }t4          j                            |d          }||                                 | j        fk    r-t/          d| j        f d	|                                            |                    dddd          |                    |
| j        ||          z  }|                    |
| j        z  ||          }|r=|                    |
| j        ||          }|                    |
| j        z  ||          }nd}t4          j                            || j        | j                  }t+          j        ||          }|                                 |
| j        z  || j        fk    r5t/          d|
| j        || j        f d	|                                            |                    |
| j        || j                  }|                    dd          }|                    |
|| j                  }|                      |          }||fS )z#Input shape: Batch x Time x ChannelNFr%   r   r   r  Tz$Attention weights should be of size z	, but is r   r   z!Attention mask should be of size r   z/Head mask for a single layer should be of size )r   r+  z `attn_output` should be of size )!r   r  r  r  r   
is_updatedgetr  cross_attention_cacheself_attention_cacherq  keysvaluesr  r  r   r  r  r   updater`   r   bmmr)   
contiguousmatmulr   r  softmaxr   r+  r  r  )r   r   r  r  r1   r  r  r  r  is_cross_attentionr   tgt_lenrH   query_statesr  curr_past_key_valuecurrent_states
key_statesvalue_states
proj_shapesrc_lenattn_weights	reshape_qrel_pos_biasattn_weights_reshaped
attn_probsattn_outputs                              r,   r   zSpeechT5Attention.forwardo  s     .T9',,..Wa {{=11DL@
&/+>?? 6,7;;DNKK
% O*9*O''*9*N''&5#-?R))] 	F/"=*"=,3DNCHJ.5dnELLL^44J;;~66L#b$.$-PPZZ[\^_``J',,S"dndmTT^^_`bcddL*7I!St+>+E+Edn?OQ_>`, ,(
L & F*_FY*Z*Z FAEO.t~>DN*B>
#((gt~t}UU__`acdee+|+Z8'Z'4
+|+Z8//!$$yz/C/CAq/I/IJJ3#7'"JJJ*dn8LgW^7_ * * %%''* *   $$//1166sT^7KRQUQ^__iijkmnooI <	=3J3J2r3R3RSSL'11!Q77<<dn$m&8&8&;&;]=O=OPQ=R=R L L(L%""$$a'(BBB ta'8Rtt]k]p]p]r]rtt   (,,S$.'7SSVddL',,S4>-A7GTTL},,\r,BB&##%%$.)::: 1t~FW 1 1',,..1 1   +//2q!<<|?P?PQTVZVdfmov?w?wwL',,S4>-A7GTTL 	)
 %1$5$5c4>7T[$\$\!055cDN6JGU\]]LL$(!]**<4<RVR_*``
i
L99#"6!OOO)CRVR_3` ) )$$&&) )  
 "&&sDNGT]SS!++Aq11 "))#wGGmmK00111r.   )r4   FTN)NNNNNFN)r   r   r   r   r>   r   r   rV   rs   r   r   r   r   tupler   r   r   s   @r,   r  r  L  s         $'%*#$(C CC C %	C
 TNC tnC D>C C C C C C: _%0A6RRR 48+/152604"'152 2|2 #5<02 "%	2
 !.2 "%,/2  -2  2 !.2 
u|Xel3Xe_D	E2 2 2 SR2 2 2 2 2r.   r  c                   $     e Zd Z fdZd Z xZS )SpeechT5FeedForwardc                    t                                                       t          j        |j                  | _        t          j        |j        |          | _        t          |j
        t                    rt          |j
                 | _        n|j
        | _        t          j        ||j                  | _        t          j        |j                  | _        d S r   )rr   rs   r   r   activation_dropoutintermediate_dropoutr3  r   intermediate_denser  
hidden_actstrr
   intermediate_act_fnoutput_densehidden_dropoutoutput_dropout)r   r   intermediate_sizer   s      r,   rs   zSpeechT5FeedForward.__init__  s    $&Jv/H$I$I!"$)F,>@Q"R"Rf'-- 	9'-f.?'@D$$'-'8D$I&79KLL j)>??r.   c                     |                      |          }|                     |          }|                     |          }|                     |          }|                     |          }|S r   )r  r  r  r  r  r   s     r,   r   zSpeechT5FeedForward.forward   sg    //>>00??11-@@))-88++M::r.   r   r   s   @r,   r  r    sL        @ @ @ @ @      r.   r  c                        e Zd Zdef fdZ	 	 	 	 ddej        deej                 deej                 deej                 d	ef
d
Z	 xZ
S )SpeechT5EncoderLayerr   c                    t                                                       t          |j        |j        |j        d          | _        t          j        |j	                  | _
        t          j        |j        |j                  | _        t          ||j                  | _        t          j        |j        |j                  | _        d S )NF)r  r  r   r  r0  )rr   rs   r  r   encoder_attention_headsattention_dropout	attentionr   r   r  r   r   r2  r   r  encoder_ffn_dimfeed_forwardfinal_layer_normr6  s     r,   rs   zSpeechT5EncoderLayer.__init__  s    *(4,	
 
 
 z&"788,v'9v?TUUU/8NOO "V-?VEZ [ [ [r.   NFr   r1   r  r  r  c                    |}|                      |||||          \  }}|                     |          }||z   }|                     |          }||                     |          z   }|                     |          }|f}|r||fz  }|S )as  
        Args:
            hidden_states (`torch.FloatTensor`):
                input to the layer of shape `(batch, seq_len, hidden_size)`
            attention_mask (`torch.FloatTensor`):
                attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
                large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(config.encoder_attention_heads,)`.
            position_bias (`torch.FloatTensor`):
                relative position embeddings of size `(seq_len, seq_len, hidden_size // encoder_attention_heads)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   r1   r  r  r  )r  r   r   r  r  )	r   r   r1   r  r  r  residualr  outputss	            r,   r   zSpeechT5EncoderLayer.forward  s    . !&*nn')+'/ '5 '
 '
#| ]33 =066%(9(9-(H(HH--m<< " 	'&Gr.   )NNNF)r   r   r   r   rs   r   r   r   rV   r   r   r   s   @r,   r  r  
  s        \~ \ \ \ \ \ \  262604"', ,|, !., "%,/	,
  -,  , , , , , , , ,r.   r  c                   D    e Zd Zddef fdZ eddd          	 	 	 	 	 	 	 	 	 dd
ej        deej                 deej                 deej                 deej                 deej                 dee	         dee
         dee
         deej                 fd            Z xZS )SpeechT5DecoderLayerNr   c                 6   t                                                       t          |j        |j        |j        d|          | _        t          j        |j	                  | _
        t          j        |j        |j                  | _        t          |j        |j        |j        d|          | _        t          j        |j        |j                  | _        t!          ||j                  | _        t          j        |j        |j                  | _        d S )NT)r  r  r   r  r  r0  )r   r  r  )rr   rs   r  r   decoder_attention_headsr  	self_attnr   r   r  r   r   r2  self_attn_layer_normencoder_attnencoder_attn_layer_normr  decoder_ffn_dimr  r  )r   r   r  r   s      r,   rs   zSpeechT5DecoderLayer.__init__H  s    *(4,
 
 
 z&"788$&L1CI^$_$_$_!-*,
 
 
 (*|F4FFLa'b'b'b$/8NOO "V-?VEZ [ [ [r.   r  r  r  r  FTr   r1   encoder_hidden_statesencoder_attention_maskr  cross_attn_layer_head_maskr  	use_cacher  c           	         |}|                      ||||||
          \  }}|                     |          }||z   }|                     |          }d}|P|}|                     |||||||
          \  }}|                     |          }||z   }|                     |          }||                     |          z   }|                     |          }|f}|r|||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, hidden_size)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, hidden_size)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size `(decoder_attention_heads,)`.
            past_key_values (`Cache`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   r  r1   r  r  r  N)r   r  r1   r  r  r  r  )r  r   r  r  r  r  r  )r   r   r1   r  r  r  r  r  r  r  r  r  self_attn_weightscross_attn_weightsr	  s                  r,   r   zSpeechT5DecoderLayer.forward`  s8   @ ! ,0>>'+)+/) ,: ,
 ,
(( ]33 =011-@@ " ,$H040A0A+!65 : /"3- 1B 1 1-M- !LL77M$}4M 88GGM &(9(9-(H(HH--m<< " 	?)+=>>Gr.   r   )	NNNNNNFTN)r   r   r   r   rs   r   r   r   r   r   rV   r   r   r   s   @r,   r  r  G  sU       \ \~ \ \ \ \ \ \0 _%0A6RRR 268<9=26=A+/,1$(15I I|I !.I  (5	I
 !) 6I "%,/I %-U\$:I "%I $D>I D>I !.I I I SRI I I I Ir.   r  c                   <    e Zd ZU eed<   dZdZdZdej	        fdZ
dS )SpeechT5PreTrainedModelr   speecht5r/   Tmodulec           
         | j         j        }t          |t                    rt          j                            |j        j        ddt          j
        d|j        j        d         |j        j        z  z            z             t          j                            |j        j        d           nt          |t                    r!|j        j                            d           nlt          |t&                    r}t          j
        d|j        j        z            }t          j                            |j        j        | |           t          j                            |j        j        | |           nt          |t          j                  rH|j        j                            d|           |j        |j        j                                         nxt          |t          j        t          j        t          j        f          r?|j        j                                         |j        j                            d           nt          |t          j                  rt          j                            |j                   |j        Wt          j
        |j        |j        |j        d         z  z            }t          j                            |j        | |           nkt          |t          j                  rQ|j        j                            d|           |j         )|j        j        |j                                                   tC          |d	          r&t          j                            |j"                   dS dS )
zInitialize the weightsr   r   r   meanstdr   )abr4   NrA  )#r   initializer_ranger  r   r   initnormal_r{   r   r   r  ro   in_channels	constant_rq   r   r   datafill_r.  r4  in_featuresr@  r3  zero_r   r   r  rw   kaiming_normal_r   r
  r   r   rA  )r   r  r!  ks       r,   _init_weightsz%SpeechT5PreTrainedModel._init_weights  s   k+f=>> 	?GOO"	!v{'>q'AFKD['["\]]]    
 Gfk.2222 @AA 	?L##C(((( 9:: 	?	!f/;;<<AGV.5!qAAAGV.3rQ????	** 	?M&&CS&999{& &&(((r|R^ LMM 	?K""$$$M$$S))))	** 	?G##FM222{&Ifmv/AFDVWXDY/YZ[[  a 888-- 	?M&&CS&999!-"6#56<<>>>6.// 	7GV566666	7 	7r.   N)r   r   r   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointingr   Moduler/  rF   r.   r,   r  r    sP         "$O&*#"7BI "7 "7 "7 "7 "7 "7r.   r  c                        e Zd ZdZdef fdZ	 	 	 	 	 ddej        deej	                 deej	                 dee
         d	ee
         d
ee
         deeef         fdZ xZS )SpeechT5Encoderzu
    Transformer encoder consisting of *config.encoder_layers* layers. Each layer is a [`SpeechT5EncoderLayer`].
    r   c                    t                                                     t          j        j        j                  | _        t          j        j                  | _	        j
        | _        t          j        fdt          j                  D                       | _        t!          j        j        z  j                  | _        d| _        |                                  d S )Nr0  c                 .    g | ]}t                    S rF   )r  )rG   rH   r   s     r,   rI   z,SpeechT5Encoder.__init__.<locals>.<listcomp>  s"    $h$h$ha%9&%A%A$h$h$hr.   F)rr   rs   r   r   r   r2  r   r   r  r   encoder_layerdrop	layerdropr   rT   encoder_layersrq  r  r  encoder_max_relative_positionr  r"  	post_initr6  s    `r,   rs   zSpeechT5Encoder.__init__  s       ,v'9v?TUUUz&"7881m$h$h$h$h5QWQfKgKg$h$h$hiiA&"@@&Bf 
  
 ',# 	r.   Nr   r1   	head_maskr  output_hidden_statesreturn_dictr:   c                    ||n| j         j        }||n| j         j        }||n| j         j        }|t	          ||j                  }|                     |          }|                     |          }|                     |          }t                      pt          |           }|rdnd}	|rdnd}
|p|                                d         t          | j                  k    r@t          dt          | j                   d|                                d          d          t          | j                  D ]m\  }}|r|	|fz   }	d}| j        rt#          j        g           }|| j        k     }|r|r" |||||||         nd|          }|d         }|rd	}|r|
|d
         fz   }
n|r|	|fz   }	|st)          d ||	|
fD                       S t+          ||	|
          S )a  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
                Features extracted from the speech or text input by the encoder prenet.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NrF   r   z&The head_mask should be specified for  layers, but it is for .F)r1   r  r  r  rg  r   c              3      K   | ]}||V  	d S r   rF   rG   vs     r,   	<genexpr>z*SpeechT5Encoder.forward.<locals>.<genexpr>O  s(      mmq_`_l_l_l_l_lmmr.   last_hidden_stater   
attentions)r   r  r?  use_return_dictr   rK   r   r   r  r   r   r   rY   rq  r)   	enumerater+  r   rO   r:  r  r   )r   r   r1   r>  r  r?  r@  r  synced_gpusall_hidden_statesall_self_attentionsidxencoder_layerskip_the_layerdropout_probabilitylayer_outputss                   r,   r   zSpeechT5Encoder.forward  s   H 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] %7H[\\N66]33,,];;022R6LT6R6R"6@BBD$5?bb4  ~~"c$+&6&666 /S=M=M / /!((+/ / /  
 #,DK"8"8 	P 	PC# I$58H$H! #N} F&+jnn#!4t~!E! 	1[ 	1 -!#1"/7@7LYs^^RV&7! ! ! !.a 0 - ,  P&9]1=M<O&O# 	E 1]4D D 	nmm]4EGZ$[mmmmmm++*
 
 
 	
r.   NNNNNr   r   r   r   r   rs   r   ri  r   r   rV   r   r  r   r   r   r   s   @r,   r6  r6    s         ~      ( 26,0,0/3&*f
 f
(f
 !.f
 EL)	f

 $D>f
 'tnf
 d^f
 
uo%	&f
 f
 f
 f
 f
 f
 f
 f
r.   r6  c                        e Zd ZdZdef fdZ	 	 	 	 	 ddej        deej	                 deej	                 dee
         d	ee
         d
ee
         deeef         fdZ xZS )SpeechT5EncoderWithSpeechPrenetz
    Wrapper around SpeechT5Encoder that applies SpeechT5SpeechEncoderPrenet to convert the audio waveform data to
    hidden features.
    r   c                     t                                          |           t          |          | _        t	          |          | _        |                                  d S r   )rr   rs   r:  prenetr6  wrapped_encoderr=  r6  s     r,   rs   z(SpeechT5EncoderWithSpeechPrenet.__init__^  R       1&99.v66 	r.   Nr/   r1   r>  r  r?  r@  r:   c                 n    |                      ||          \  }}|                     ||||||          }|S N)r   r1   r>  r  r?  r@  rZ  r[  	r   r/   r1   r>  r  r?  r@  r   r	  s	            r,   r   z'SpeechT5EncoderWithSpeechPrenet.forwardf  sO     )-L.(Q(Q%~&&')/!5# ' 
 
 r.   rU  rV  r   s   @r,   rX  rX  X  s         
~       26,0,0/3&* ' !. EL)	
 $D> 'tn d^ 
uo%	&       r.   rX  c                        e Zd ZdZdef fdZd Zd Z	 	 	 	 	 ddej	        de
ej                 d	e
ej                 d
e
e         de
e         de
e         deeef         fdZ xZS )SpeechT5EncoderWithTextPrenetz|
    Wrapper around SpeechT5Encoder that applies SpeechT5TextEncoderPrenet to convert the input_ids to hidden features.
    r   c                     t                                          |           t          |          | _        t	          |          | _        |                                  d S r   )rr   rs   r  rZ  r6  r[  r=  r6  s     r,   rs   z&SpeechT5EncoderWithTextPrenet.__init__  R       /77.v66 	r.   c                 4    | j                                         S r   rZ  get_input_embeddingsrF  s    r,   rg  z2SpeechT5EncoderWithTextPrenet.get_input_embeddings      {//111r.   c                 :    | j                             |           d S r   rZ  set_input_embeddingsr   values     r,   rk  z2SpeechT5EncoderWithTextPrenet.set_input_embeddings      ((/////r.   Nr/   r1   r>  r  r?  r@  r:   c                 f    |                      |          }|                     ||||||          }|S r^  r_  r`  s	            r,   r   z%SpeechT5EncoderWithTextPrenet.forward  sH     L11&&')/!5# ' 
 
 r.   rU  )r   r   r   r   r   rs   rg  rk  r   ri  r   r   rV   r   r  r   r   r   r   s   @r,   rb  rb  }  s        ~      2 2 20 0 0 26,0,0/3&* ' !. EL)	
 $D> 'tn d^ 
uo%	&       r.   rb  c                        e Zd ZdZdef fdZ	 	 	 	 	 ddej        deej	                 deej	                 dee
         d	ee
         d
ee
         deeef         fdZ xZS )SpeechT5EncoderWithoutPrenet
    This wrapper class is a helper class to correctly load pretrained checkpoints when used in combination with
    [`SpeechT5Model`].
    r   c                     t                                          |           t          |          | _        |                                  d S r   )rr   rs   r6  r[  r=  r6  s     r,   rs   z%SpeechT5EncoderWithoutPrenet.__init__  C       .v66 	r.   Nr/   r1   r>  r  r?  r@  r:   c                 8    |                      ||||||          S r^  )r[  )r   r/   r1   r>  r  r?  r@  s          r,   r   z$SpeechT5EncoderWithoutPrenet.forward  s3     ##&)/!5# $ 
 
 	
r.   rU  rV  r   s   @r,   rq  rq    s         
~       26,0,0/3&*
 
'
 !.
 EL)	

 $D>
 'tn
 d^
 
uo%	&
 
 
 
 
 
 
 
r.   rq  c                   d    e Zd ZdZdef fdZ	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej	                 deej                 deej	                 d	eej
                 d
eej
                 dee         dee         dee         dee         dee         deej
                 deeef         fdZ xZS )SpeechT5Decoderzt
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SpeechT5DecoderLayer`]
    r   c                    t                                                     j        | _        t	          j        fdt          j                  D                       | _        d| _	        | 
                                 d S )Nc                 2    g | ]}t          |           S ))r  )r  r  s     r,   rI   z,SpeechT5Decoder.__init__.<locals>.<listcomp>  s(    $u$u$uST%9&A%N%N%N$u$u$ur.   F)rr   rs   decoder_layerdropr:  r   r   rT   decoder_layersrq  r"  r=  r6  s    `r,   rs   zSpeechT5Decoder.__init__  sy       1m$u$u$u$uX]^d^sXtXt$u$u$uvv&+# 	r.   Nr   r1   r  r  r>  cross_attn_head_maskr  r  r  r?  r@  r  r:   c                    |	|	n| j         j        }	|
|
n| j         j        }
||n| j         j        }||n| j         j        }|                                dd         }| j        r%| j        r|rt          	                    d           d}|r8|6t          t          | j                   t          | j                             }|rCt          |t                    r.t          	                    d           t          j        |          }||                                nd}t!          ||||          }||t#          ||j        |d                   }t'                      pt)          |           }|
rd	nd}|	rd	nd}|	r|d	nd}t+          ||gd
dg          D ]z\  }}|s|                                d         t-          | j                  k    rCt1          d| dt-          | j                   d|                                d          d          {t3          | j                  D ]\  }}|
r||fz   }d}| j        rt5          j        g           }|| j        k     }|r|s: ||||||||         nd|||         nd||	||
  
        }|d         }|	r||d         fz   }|||d         fz   }|
r||fz   }|st          d |||||fD                       S t;          |||||          S )aA  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
                Features extracted from the speech or text input by the decoder prenet.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
                cross-attention on hidden heads. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr%   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   )r  rF   r>  r|  zThe `z` should be specified for rB  rC  )r  r  r  r  r  r  r  r   r   c              3      K   | ]}||V  	d S r   rF   rE  s     r,   rG  z*SpeechT5Decoder.forward.<locals>.<genexpr>y  s0        =  === r.   )rI  r  r   rJ  cross_attentions)r   r  r?  r  rK  r   r"  r+  loggerwarning_oncer   r   r  r  from_legacy_cacher  r   r   rK   r   r   r]  rY   rq  r)   rL  r   rO   r:  r   )r   r   r1   r  r  r>  r|  r  r  r  r?  r@  r  r  r   rM  rN  rO  all_cross_attentions	attn_mask	mask_namerP  decoder_layerrR  rS  rT  s                             r,   r   zSpeechT5Decoder.forward  s4   P 2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	%0%<kk$+B]#((**3B3/& 	"4= 	" "##p   "	 	v01,dk2R2R2RT`hlhsTtTtTtuuO 	UOU;; 	U\  
 2COTTOETE`!?!?!A!A!Afg:K8N
 

 !,1G1S%?&(;[QS_& & &" 122R6LT6R6R #7@BBD$5?bb4&7h<Q<]rrdh %(4H(IKYoKp$q$q 	 	 Iy$>>##A&3t{+;+;<<$3	 3 3SEUEU 3 3%NN,,Q/3 3 3  
 #,DK"8"8 	V 	VC# I$58H$H! #N} F&+jnn#!4t~!E k )M%'=3<3H3dI]Ii,@,E,Eos /"3#-  M *!,M  V&9]1=M<O&O#(4+?=QRCSBU+U( 	E 1]4D D 	  ':KM`bvw      9+++*1
 
 
 	
r.   NNNNNNNNNNNNr   r   r   r   r   rs   r   r   ri  rh  r   r   rV   r   r  r   r   r   r   s   @r,   rw  rw    sw        	~ 	 	 	 	 	 	 6:59=A=A,07;+/$(,0/3&*15n
 n
 12n
 !!12n
  ((9:	n

 !))9 :n
 EL)n
 'u|4n
 "%n
 D>n
 $D>n
 'tnn
 d^n
 !.n
 
u??	@n
 n
 n
 n
 n
 n
 n
 n
r.   rw  c                       e Zd ZdZdef fdZ	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej	                 deej                 deej	                 d	eej
                 d
eej
                 deej
                 dee         dee         dee         dee         dee         deej
                 deeef         fdZ xZS )SpeechT5DecoderWithSpeechPrenetz
    Wrapper around SpeechT5Decoder that applies SpeechT5SpeechDecoderPrenet to convert log-mel filterbanks to hidden
    features.
    r   c                     t                                          |           t          |          | _        t	          |          | _        |                                  d S r   )rr   rs   rk  rZ  rw  wrapped_decoderr=  r6  s     r,   rs   z(SpeechT5DecoderWithSpeechPrenet.__init__  r\  r.   Nr/   r1   r  r  r~  r>  r|  r  r  r  r?  r@  r  r:   c                 t    |                      ||          }|                     ||||||||	|
|||          }|S N)r   r1   r  r  r>  r|  r  r  r  r?  r@  r  rZ  r  )r   r/   r1   r  r  r~  r>  r|  r  r  r  r?  r@  r  decoder_hidden_statesr	  s                   r,   r   z'SpeechT5DecoderWithSpeechPrenet.forward  s^      !%L:L M M&&/)"7#9!5+/!5#) ' 
 
 r.   )NNNNNNNNNNNNNr  r   s   @r,   r  r    su        
~       5959=A=A59,07;+/$(,0/3&*15! !u01! !!12!  ((9:	!
 !))9 :! %U\2! EL)! 'u|4! "%! D>! $D>! 'tn! d^! !.! 
u??	@! ! ! ! ! ! ! !r.   r  c                   p    e Zd ZdZdef fdZd Zd Z	 	 	 	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j                 d	ee	j
                 d
ee	j                 dee	j                 dee	j                 dee         dee         dee         dee         dee         dee	j                 deeef         fdZ xZS )SpeechT5DecoderWithTextPrenetz{
    Wrapper around SpeechT5Decoder that applies SpeechT5TextDecoderPrenet to convert input tokens to hidden features.
    r   c                     t                                          |           t          |          | _        t	          |          | _        |                                  d S r   )rr   rs   r  rZ  rw  r  r=  r6  s     r,   rs   z&SpeechT5DecoderWithTextPrenet.__init__  rd  r.   c                 4    | j                                         S r   rf  rF  s    r,   rg  z2SpeechT5DecoderWithTextPrenet.get_input_embeddings  rh  r.   c                 :    | j                             |           d S r   rj  rl  s     r,   rk  z2SpeechT5DecoderWithTextPrenet.set_input_embeddings  rn  r.   Nr/   r1   r  r  r>  r|  r  r  r  r?  r@  r  r:   c                 |    |                      |||          \  }}|                     |||||||||	|
||          }|S r  r  )r   r/   r1   r  r  r>  r|  r  r  r  r?  r@  r  r  r	  s                  r,   r   z%SpeechT5DecoderWithTextPrenet.forward  se     15L.Zi0j0j-~&&/)"7#9!5+/!5#) ' 
 
 r.   r  )r   r   r   r   r   rs   rg  rk  r   r   ri  rh  r   r   rV   r   r  r   r   r   r   s   @r,   r  r    s        ~      2 2 20 0 0
 5959=A=A,07;+/$(,0/3&*15   u01  !!12   ((9:	 
 !))9 :  EL)  'u|4  "%  D>  $D>  'tn  d^  !.  
u??	@               r.   r  c                   d    e Zd ZdZdef fdZ	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej	                 deej                 deej	                 d	eej
                 d
eej
                 dee         dee         dee         dee         dee         deej
                 deeef         fdZ xZS )SpeechT5DecoderWithoutPrenetrr  r   c                     t                                          |           t          |          | _        |                                  d S r   )rr   rs   rw  r  r=  r6  s     r,   rs   z%SpeechT5DecoderWithoutPrenet.__init__  rt  r.   Nr/   r1   r  r  r>  r|  r  r  r  r?  r@  r  r:   c                 H    |                      |||||||||	|
||          }|S r  )r  )r   r/   r1   r  r  r>  r|  r  r  r  r?  r@  r  r	  s                 r,   r   z$SpeechT5DecoderWithoutPrenet.forward  sI     &&&)"7#9!5+/!5#) ' 
 
 r.   r  r  r   s   @r,   r  r    sa        
~       5959=A=A,07;+/$(,0/3&*15 u01 !!12  ((9:	
 !))9 : EL) 'u|4 "% D> $D> 'tn d^ !. 
u??	@       r.   r  c                        e Zd ZdZdef fdZdej        dej        dej        dej	        fdZ
d	 Zed
             Z xZS )$SpeechT5GuidedMultiheadAttentionLossz
    Guided attention loss from the paper [Efficiently Trainable Text-to-Speech System Based on Deep Convolutional
    Networks with Guided Attention](https://huggingface.co/papers/1710.08969), adapted for multi-head attention.
    r   c                 x    t                                                       |j        | _        |j        | _        d S r   )rr   rs   guided_attention_loss_sigmasigmaguided_attention_loss_scalescaler6  s     r,   rs   z-SpeechT5GuidedMultiheadAttentionLoss.__init__#  s1    7
7


r.   rJ  input_masksoutput_masksr:   c                 V   |                      |||j                  }|                    d          |                    d          z  }|                    |j                                      d          }||z  }t	          j        |                    |                    }| j        |z  S )aY  
        Compute the attention loss.

        Args:
            attentions (`torch.FloatTensor` of shape `(batch_size, layers * heads, output_sequence_length, input_sequence_length)`):
                Batch of multi-head attention weights
            input_masks (`torch.BoolTensor` of shape `(batch_size, input_sequence_length)`):
                Input attention mask as booleans.
            output_masks (`torch.BoolTensor` of shape `(batch_size, output_sequence_length)`):
                Target attention mask as booleans.

        Returns:
            `torch.Tensor` with the loss value
        r%   r   r   )_make_guided_attention_masksr   r   r   r   r   masked_selectr  )r   rJ  r  r  guided_attn_masksmaskslosseslosss           r,   r   z,SpeechT5GuidedMultiheadAttentionLoss.forward(  s    " !==k<YcYjkk&&r**[-B-B2-F-FF*++55a88"Z/z&..u5566zD  r.   c                    |                     d          }|                     d          }t          j        t          |          |j        d         |j        d         f|          }t          t          ||                    D ]/\  }\  }}	|                     ||	| j        |          ||d |	d |f<   0|	                    d          S )Nr%   r   rR  )
rR   r   rU   rY   r'   rL  r]  _make_guided_attention_maskr  r   )
r   r  r  r   rc   rV  r  rP  ilenolens
             r,   r  zASpeechT5GuidedMultiheadAttentionLoss._make_guided_attention_masksA  s    #++%))"--!K[)9)9<;Ma;PR]RcdeRf(gpvwww!*3}n+M+M!N!N 	t 	tC$373S3STXZ^`d`jlr3s3sc5D5%4%/00 **1---r.   c                 0   t          j        t          j        | |          t          j        ||          d          \  }}|                                |z  }|                                | z  }dt          j        ||z
  dz   d|dz  z  z            z
  S )NrR  xy)indexingr   r   )r   meshgridrX   r   r   )r@   output_lengthr  r   grid_ygrid_xs         r,   r  z@SpeechT5GuidedMultiheadAttentionLoss._make_guided_attention_maskL  s    Lf555Lv666
 
 

 -/,.UY&6/a!78ANKLLLLr.   )r   r   r   r   r   rs   r   ri  
BoolTensorr   r   r  r   r  r   r   s   @r,   r  r    s         
8~ 8 8 8 8 8 8
!+!:?:J!Z_Zj!	! ! ! !2	. 	. 	. M M \M M M M Mr.   r  c                        e Zd ZdZdef fdZ	 ddej        dej        dej        dej        d	ej        d
e	ej                 dej
        fdZ xZS )SpeechT5SpectrogramLossz;
    Loss computation used by SpeechT5ForTextToSpeech.
    r   c                 >   t                                                       |j        | _        |j        | _        |j        | _        t                      | _        t          t          j	        d                    | _
        | j        rt          |          | _        d S d S )Ng      @)
pos_weight)rr   rs   use_guided_attention_lossguided_attention_loss_num_headsr0   r   l1_criterionr   r   r   bce_criterionr  attn_criterionr6  s     r,   rs   z SpeechT5SpectrogramLoss.__init__]  s    )/)I&/5/U, & 7"HH.%,s:K:KLLL) 	O"Fv"N"ND	O 	Or.   Nr1   r  r  r  labelsr  r:   c                 b    |dk    }|                     |          }|                     |          }|                     |          }                     ||                               ||          z   }|d d d d df         }	t          j        |	 dz  t          j        |	                    d          d                              |	j                  gd          }
|
d d dd f                              |	          }
|                     |	          }                     ||
          }||z   } j	        rzt          j         fd|D             d          }|dk    }|d d d d df         } j
        dk    r|d d  j
        dz
  d  j
        f         }                     |||          }||z  }|S )Nr3   r   r   r   r   c                 6    g | ]}|d d d j         f         S r   )r  )rG   xr   s     r,   rI   z3SpeechT5SpectrogramLoss.forward.<locals>.<listcomp>  s0    eeeqa#IT%I#I IJeeer.   )r  r  r   r   r[   r   r   r   r  r  r0   r  )r   r1   r  r  r  r  r  rN  l1_lossr  stop_labelsbce_lossr  attnr  r  	attn_losss   `                r,   r   zSpeechT5SpectrogramLoss.forwardi  s    ' %%l33!7!E!El!S!S 5 C CL Q Q ##$96BBTEVEVWmouEvEvv QQQ1W%i%#uz%**Q--/K/K/N/Nu|/\/\ ]cdeee!!!!QRR%(66u==%%e,, %%fk:: ! ) 	9eeeeTdeeeklmmmD(A-K'111a0L$q((+AAAt/Dq/H/aDLa/a,ab++D+|LLIIDr.   r   )r   r   r   r   r   rs   r   rh  ri  r   r   r   r   r   s   @r,   r  r  X  s         
O~ 
O 
O 
O 
O 
O 
O& 9=) )() !& 1)  %0	)
 !) !) #5#45) 
) ) ) ) ) ) ) )r.   r  zv
    The bare SpeechT5 Encoder-Decoder Model outputting raw hidden-states without any specific pre- or post-nets.
    custom_introc            $       D    e Zd Z	 	 ddedeej                 deej                 f fdZd Zd Z	d Z
d	 Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd
eej                 deej                 deej                 deej                 deej                 deej                 deej                 deeeej                                   dee         dee         deej                 dee         dee         dee         deej                 deeej                 ef         f d            Z xZS )SpeechT5ModelNr   encoderdecoderc                     t                                          |           || _        |t          |          n|| _        |t          |          n|| _        |                                  dS )z
        encoder (`PreTrainedModel`, *optional*):
            The encoder model to use.
        decoder (`PreTrainedModel`, *optional*):
            The decoder model to use.
        N)rr   rs   r   rq  r  r  r  r=  )r   r   r  r  r   s       r,   rs   zSpeechT5Model.__init__  sp     	   ?F3F;;;T[?F3F;;;T[ 	r.   c                     t          | j        t                    r| j                                        S t          | j        t
                    r| j                                        S t          r   )r  r  rb  rg  r  r  NotImplementedErrorrF  s    r,   rg  z"SpeechT5Model.get_input_embeddings  sZ    dl$ABB 	7<44666dl$ABB 	7<44666!!r.   c                     t          | j        t                    r| j                            |           t          | j        t
                    r| j                            |           d S d S r   )r  r  rb  rk  r  r  rl  s     r,   rk  z"SpeechT5Model.set_input_embeddings  sh    dl$ABB 	5L--e444dl$ABB 	5L--e44444	5 	5r.   c                     | j         S r   )r  rF  s    r,   get_encoderzSpeechT5Model.get_encoder  s
    |r.   c                 z    t          | j        t                    r | j        j                                         dS dS z
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)r  r  rX  rZ  rG  rF  s    r,   rG  z$SpeechT5Model.freeze_feature_encoder  s@    
 dl$CDD 	9L6688888	9 	9r.   r/   r1   decoder_input_valuesdecoder_attention_maskr>  decoder_head_maskr|  encoder_outputsr  r  r~  r  r?  r@  r  r:   c                 6   ||n| j         j        }||n| j         j        }|
|
n| j         j        }
||n| j         j        }||                     ||||||          }ne|rct          |t                    sNt          |d         t          |          dk    r|d         ndt          |          dk    r|d         nd          }|Lt          | j        t                    r2| j        j
                            |d         j        d         |          }n|}t          | j        t                    rd|i}ni } | j        d
|||d         ||||	|
||||d|}|s||z   S t          |j        |j        |j        |j        |j        |j        |j        |j        	          S )a  
        input_values (`torch.Tensor` of shape `(batch_size, sequence_length)`):
            Depending on which encoder is being used, the `input_values` are either: float values of the input raw
            speech waveform, or indices of input sequence tokens in the vocabulary, or hidden states.
        decoder_input_values (`torch.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Depending on which decoder is being used, the `decoder_input_values` are either: float values of log-mel
            filterbank features extracted from the raw speech waveform, or indices of decoder input sequence tokens in
            the vocabulary, or hidden states.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        N)r/   r1   r>  r  r?  r@  r   r   r   rH  r~  )r/   r1   r  r  r>  r|  r  r  r  r?  r@  r  )rI  r  r  decoder_attentionsr  encoder_last_hidden_stater  encoder_attentionsrF   )r   r  r?  r  rK  r  r  r   rY   rX  rZ  rJ  r'   r  r  r   rI  r  r   rJ  r  )r   r/   r1   r  r  r>  r  r|  r  r  r  r~  r  r?  r@  r  r  decoder_argsdecoder_outputss                      r,   r   zSpeechT5Model.forward  s-   T 2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	%0%<kk$+B] ""ll)-#"3%9' +  OO  	O_!M!M 	-"1!"4474H4H14L4Loa00RV14_1E1E1I1I?1--t  O %*T\Cb*c*c%%)\%8%[%["(+^& &"" &4"dl$CDD 	02DELLL&$, 
-1"1!"4#9'!5+/!5#)
 
 
 
   	5"_44!-?+;"1"?.9,=&5&G"1"?.9	
 	
 	
 		
r.   rg  NNNNNNNNNNNNNNN)r   r   r   r   r   r   r4  rs   rg  rk  r  rG  r   r   r   rh  ri  r  r   rV   r   r   r   r   r   s   @r,   r  r    s8        (,'+	  ")$ ")$	     (" " "5 5 5  9 9 9  04597;=A159=7;EI+/$(:>,0/3&*15!k
 k
u|,k
 !!12k
 'u|4	k

 !))9 :k
 E-.k
 $E$56k
 'u|4k
 "%e.?(@"ABk
 "%k
 D>k
 %U%67k
 $D>k
 'tnk
 d^k
  !.!k
" 
uU&');;	<#k
 k
 k
 ^k
 k
 k
 k
 k
r.   r  zB
    SpeechT5 Model with a speech encoder and a text decoder.
    c            $            e Zd ZdgZdef fdZd Zd Zd Zd Z	d Z
e	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd
eej                 deej                 deej                 deej                 deej                 deej                 deej                 deeeej                                   dee         dee         dee         dee         dee         deej                 deej                 deeef         f d            Z xZS )SpeechT5ForSpeechToTextz#text_decoder_postnet.lm_head.weightr   c                 @   t                                          |           |j        t          d| j         d          t          |          }t          |          }t          |||          | _        t          |          | _
        |                                  d S )NYou are trying to instantiate a    with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `SpeechT5ForSpeechToText.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.)rr   rs   r  r)   r   rX  r  r  r  r  text_decoder_postnetr=  )r   r   speech_encodertext_decoderr   s       r,   rs   z SpeechT5ForSpeechToText.__init__>  s       $/ / / /   9@@4V<<%fnlKK$>v$F$F! 	r.   c                 4    | j                                         S r   r  r  rF  s    r,   r  z#SpeechT5ForSpeechToText.get_encoderR      }((***r.   c                 4    | j                                         S r   r  get_decoderrF  s    r,   r  z#SpeechT5ForSpeechToText.get_decoderU  r  r.   c                 \    |                                  j                                         dS r  r  rZ  rG  rF  s    r,   rG  z.SpeechT5ForSpeechToText.freeze_feature_encoderX  +    
 	!88:::::r.   c                 4    | j                                         S r   )r  r  rF  s    r,   r  z-SpeechT5ForSpeechToText.get_output_embeddings_  s    (>>@@@r.   c                 :    | j                             |           d S r   )r  r  r  s     r,   r  z-SpeechT5ForSpeechToText.set_output_embeddingsb  s    !77GGGGGr.   Nr/   r1   decoder_input_idsr  r>  r  r|  r  r  r  r  r?  r@  r  r  r:   c                 8   ||n| j         j        }|'|%t          || j         j        | j         j                  }|                     |||||||||	|
||d|          }|                     |d                   }d}|Kt                      } ||                    d| j         j	                  |                    d                    }|s|f|dd         z   }||f|z   n|S t          |||j        |j        |j        |j        |j        |j        |j        	  	        S )a(  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            SpeechT5 uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
            or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is
            only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

            Label indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToText
        >>> from datasets import load_dataset

        >>> dataset = load_dataset(
        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
        ... )  # doctest: +IGNORE_RESULT
        >>> dataset = dataset.sort("id")
        >>> sampling_rate = dataset.features["audio"].sampling_rate

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
        >>> model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr")

        >>> # audio file is decoded on the fly
        >>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
        >>> predicted_ids = model.generate(**inputs, max_length=100)

        >>> # transcribe speech
        >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
        >>> transcription[0]
        'mister quilter is the apostle of the middle classes and we are glad to welcome his gospel'
        ```

        ```python
        >>> inputs["labels"] = processor(text_target=dataset[0]["text"], return_tensors="pt").input_ids

        >>> # compute loss
        >>> loss = model(**inputs).loss
        >>> round(loss.item(), 2)
        19.68
        ```
        NT)r/   r1   r  r  r>  r  r|  r  r  r  r  r?  r@  r  r   r%   r   )	r  r  r  r  r  r  r  r  r  )r   rK  r-   r"   r#   r  r  r   r   r  r   r  r  r  r  r  r  r  )r   r/   r1   r  r  r>  r  r|  r  r  r  r  r?  r@  r  r  r	  r  r  loss_fctoutputs                        r,   r   zSpeechT5ForSpeechToText.forwarde  sb   v &1%<kk$+B] ($6DK4dk6X% %! --%)!2#9/!5++/!5)   
 
" **71:66'))H8FKKDK,BCCV[[QS__UUD 	FY,F)-)9TGf$$vE#3")"?&9$5&-&G")"?&9

 

 

 
	
r.   r  )r   r   r   _tied_weights_keysr   rs   r  r  rG  r  r  r   r   r   ri  rh  r   r  r   rV   r   r   r   r   r   s   @r,   r  r  6  s"        @@~      (+ + ++ + +; ; ;A A AH H H  59598<=A159=7;EI+/$(,0/3&*-115!H
 H
u01H
 !!12H
 $E$45	H

 !))9 :H
 E-.H
 $E$56H
 'u|4H
 "%e.?(@"ABH
 "%H
 D>H
 $D>H
 'tnH
 d^H
 )*H
  !.!H
" 
uo%	&#H
 H
 H
 ^H
 H
 H
 H
 H
r.   r        ?r4         4@Fmodelr~  	thresholdminlenratiomaxlenratiovocoderoutput_cross_attentionsreturn_output_lengthsc
           
        "#$ |t          d          |&d|| j        j        k                                    z
  }
n|}
|                    d          }| j                            ||
d          }|j        }t          | j        j        t                    r6| j        j        j
                            |d         j        d         |
          }
t          |                    d          |z  | j        j        z            }t          |                    d          |z  | j        j        z            }|                    |d| j        j                  }g }g }d }d}i "	 |dz  }| j        j        
                    ||          }| j        j                            |d d dd f         d ||
|d|d          }|r.|                    t'          j        |j        d                     |j                            d          }|j        }| j                            |          }|                    || j        j        | j        j                  }|                    |           |d d dd d f                             |d| j        j                  }t'          j        ||fd          }t'          j        | j                            |                    }||k     r~||k     rGt'          j        |d          |k    }t'          j        |          d                                         }ntA          tC          |                    }"fd	|D             }tC          |          dk    rht'          j"        |          }|#                    dd          $                    dd
          }| j        %                    |          }|D ]}||         "|<   tC          "          |k    rn"fdtA          tC          "                    D             }|	s|dk    r|d         n*t&          j&        j'        j(        )                    |d          }| ||          }n|}|rlt'          j        |d
          }|dk    rL |j        |t          |                    d          |z            g|                                dd          R  }||f}n*g #tA          |          D ]0} #                    ||                              d                     1|0t&          j&        j'        j(        )                    |d          }|#f}nKg $t&          j&        j'        j(        )                    |d          } ||          $#$fd#D             }!$|!f}|rit'          j        |d
          } |j        |t          |                    d          |z            g|                                dd          R  }g ||R }|S )Na  `speaker_embeddings` must be specified. For example, you can use a speaker embeddings by following
                    the code snippet provided in this link:
                    https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors
                    r   r   T)r/   r1   r@  r%   )r   r1   r  r  r  r  r  r@  r   c                     g | ]}|v|	S rF   rF   rG   r  result_spectrograms     r,   rI   z$_generate_speech.<locals>.<listcomp>R	  s$    SSS!q@R7R7RA7R7R7Rr.   r   c                      g | ]
}|         S rF   rF   r  s     r,   rI   z$_generate_speech.<locals>.<listcomp>[	  s    RRRa&q)RRRr.   )batch_firstc                 z    g | ]7}t                              d           t                    z            |z  8S r   )r>   r   r?   )rG   r  spectrogram_lengths	waveformss     r,   rI   z$_generate_speech.<locals>.<listcomp>u	  sB    sssZ[INN1$5$5<O8P8P$P Q QTU Usssr.   )*r)   r   r"   r>   r   r  r  rI  r  rX  rZ  rJ  r'   r0   r&   rn  r  r  r]   r   r   r  squeezer  speech_decoder_postnetr  r   sigmoidr  rR   rz  rS   rT   rY   stackr   flattenr  r   r   rnnpad_sequence)%r  r/   r~  r1   r  r  r  r  r   r  r  r   encoder_outr  maxlenminlenoutput_sequencespectrogramr  r  rP  r  decoder_outlast_decoder_outputspectrumnew_spectrogramprobmeet_thresholdsmeet_indexesspectrograms
meet_indexr	  r  waveform_lengthsr  r  r  s%                                     @@@r,   _generate_speechr#    s    !
 
 	
 !"lel6O&O%T%T%V%V!V!/


A

C.((!- )  K !, = %.(*IJJ 
!&!7!>!a!aN #%;"
 "
 *//22[@5<C``aaF*//22[@5<C``aaF 099#q%,B[\\OKO
C4q !& 6 = =oOa b bn,<</2337";#9+5 = 	
 	
 # 	T##EIk.JPQ$R$R$RSSS);CCAFF%5 /889LMM==el&CU\E^__8$$$ #111b!!!8,11#q%,:STT)_o$FANNN}U9BBCVWWXX<< V||"')Db"9"9"9Y"F${?;;A>EEGG$SYY//SSSS|SSSL<  1$$${;77+55a;;CCAqII$;CCLQQ". N NJ5A*5M&z22%&&#--i4j SRRR5=O9P9P3Q3QRRRL   3),l1ooux~7I7V7VWcqu7V7v7vgk**GG!G" 	2$y)9qAAAQww#8#3#8-22155;<<$?O?T?T?V?VWYWZWZ?[$ $ $   01G !s 	@ 	@A&&|A';';A'>'>????? 8>-::<UY:ZZL#%89GGI 8>-::<UY:ZZL--Isssss_rsss "23G" 	3$y)9qAAA4/4S)..q11C788 ;K;P;P;R;RSUSVSV;W      32!122GNr.   zB
    SpeechT5 Model with a text encoder and a speech decoder.
    c            (       ,    e Zd ZdZdef fdZedefd            Zd Z	d Z
e	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d&deej                 d	eej                 d
eej                 deej                 deej                 deej                 deej                 deeeej                                   dee         dee         dee         dee         dee         deej                 deej                 deej                 deej                 deeef         f$d            Z ej                    	 	 	 	 	 	 	 	 d'dej        d	eej                 deej                 deded ed!eej                 d"ed#edeej        eej        ej        f         f         fd$            Z ej                    	 	 	 	 	 	 	 	 d'dej        deej                 d	eej                 deded ed!eej                 d"ed#edeej        eej        ej        f         f         fd%            Z xZS )(SpeechT5ForTextToSpeechr!   r   c                 @   t                                          |           |j        t          d| j         d          t          |          }t          |          }t          |||          | _        t          |          | _
        |                                  d S )Nr  a    with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `SpeechT5ForTextToSpeech.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.)rr   rs   r  r)   r   rb  r  r  r  r  r  r=  )r   r   text_encoderspeech_decoderr   s       r,   rs   z SpeechT5ForTextToSpeech.__init__	  s       $/ / / /   5V<<8@@%flNKK&B6&J&J# 	r.   r:   c                     dS r*  rF   )clss    r,   can_generatez$SpeechT5ForTextToSpeech.can_generate	  s	    
 tr.   c                 4    | j                                         S r   r  rF  s    r,   r  z#SpeechT5ForTextToSpeech.get_encoder	  r  r.   c                 4    | j                                         S r   r  rF  s    r,   r  z#SpeechT5ForTextToSpeech.get_decoder	  r  r.   Nr1   r  r  r>  r  r|  r  r  r  r  r?  r@  r~  r  r  r  c                    ||n| j         j        }|.|t          || j         j        |          \  }}| j         j        rd}|                     |||||||||	|
|||d|          }|                     |d                   \  }}}d}|)t          | j                   } |||||||j                  }|s|f|dd         z   }||f|z   n|S t          |||j
        |j        |j        |j        |j        |j        |j        	  	        S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
            [`~PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
            Float values of input mel spectrogram.

            SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
            Float values of target mel spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
            computation. Spectrograms can be obtained using [`SpeechT5Processor`]. See [`SpeechT5Processor.__call__`]
            for details.
        stop_labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Binary tensor indicating the position of the stop token in the sequence.

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, set_seed
        >>> import torch

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        >>> model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        >>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
        >>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

        >>> set_seed(555)  # make deterministic

        >>> # generate speech
        >>> speech = model.generate(inputs["input_ids"], speaker_embeddings=speaker_embeddings, vocoder=vocoder)
        >>> speech.shape
        torch.Size([15872])
        ```
        NTr/   r1   r  r  r>  r  r|  r  r  r  r~  r  r?  r@  r  r   r   	r  r  r  r  r  r  r  r  r  )r   rK  r6   r0   r  r  r  r  r  r   r  r  r  r  r  r  )r   r!   r1   r  r  r>  r  r|  r  r  r  r  r?  r@  r~  r  r  r  r	  r  r  r  r  	criterionr  s                            r,   r   zSpeechT5ForTextToSpeech.forward	  s   Z &1%<kk$+B]#+?WDK8:P@ @<$&< {4 )$(!--")!5#9/!5++1/!5)   
 
$ AE@[@[\cde\f@g@g= 5v/<<I9&%( D  	F+-;F)-)9TGf$$vE'-#3")"?&9$5&-&G")"?&9

 

 

 
	
r.   r  r4   r  Fr  r  r  r  r   r  c
                    |m|                     d          }|                     d          |k    r?|                     d          dk    r|                    |d          }nt          d          t          | |||||||||	
  
        S )aE  
        Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
        speech waveform using a vocoder.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

                Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
                [`~PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Attention mask from the tokenizer, required for batched inference to signal to the model where to
                ignore padded tokens from the input_ids.
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
        Nr   r   zUThe first dimension of speaker_embeddings must be either 1 or the same as batch_size.r   ry  r)   r#  )r   r!   r1   r~  r  r  r  r  r   r  kwargsrb   s               r,   generatez SpeechT5ForTextToSpeech.generate0
  s    J )"**J!&&q))Z77%**1--22);)B)B:q)Q)Q&&$o    #!
 
 	
r.   c
                    |m|                     d          }
|                     d          |
k    r?|                     d          dk    r|                    |
d          }nt          d          t          | |||||||||	
  
        S )a  
        Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
        speech waveform using a vocoder.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

                Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
                [`~PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*, defaults to `None`):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
        Nr   r   zUThe first dimension of speaker_embeddings must be either 1 or the same as batch size.r3  )r   r!   r~  r1   r  r  r  r  r   r  rb   s              r,   generate_speechz'SpeechT5ForTextToSpeech.generate_speech
  s    R )"**J!&&q))Z77%**1--22);)B)B:q)Q)Q&&$o    #!
 
 	
r.   NNNNNNNNNNNNNNNNNNNr  r4   r  NFF)r   r   r   r2  r   rs   classmethodrV   r+  r  r  r   r   r   rh  ri  r   r  r   r   r   r   r   r   r   r4  r5  r7  r   r   s   @r,   r%  r%  	  s        "O~      ( T    [+ + ++ + +  1559<@=A159=7;EI+/$(,0/3&*:>.2.215%D
 D
E,-D
 !!12D
 'u'89	D

 !))9 :D
 E-.D
 $E$56D
 'u|4D
 "%e.?(@"ABD
 "%D
 D>D
 $D>D
 'tnD
 d^D
 %U%67D
  *+!D
" el+#D
$ !.%D
& 
u..	/'D
 D
 D
 ^D
L U]__ 6::> !'+(-&+Y
 Y
#Y
 !!12Y
 %U%67	Y

 Y
 Y
 Y
 ")$Y
 "&Y
  $Y
 
u %(95;L(L"MM	NY
 Y
 Y
 _Y
v U]__ ;?59 !'+(-&+]
 ]
#]
 %U%67]
 !!12	]

 ]
 ]
 ]
 ")$]
 "&]
  $]
 
u %(95;L(L"MM	N]
 ]
 ]
 _]
 ]
 ]
 ]
 ]
r.   r%  zD
    SpeechT5 Model with a speech encoder and a speech decoder.
    c            (           e Zd Zdef fdZd Zd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d%de	e
j                 de	e
j                 d	e	e
j                 d
e	e
j                 de	e
j                 de	e
j                 de	e
j                 de	eee
j                                   de	e         de	e         de	e         de	e         de	e         de	e
j                 de	e
j                 de	e
j                 de	e
j                 deeef         f$d            Z e
j                    	 	 	 	 	 	 	 	 d&de
j        de	e
j                 de	e
j                 deded ed!e	ej                 d"ed#ede
j        fd$            Z xZS )'SpeechT5ForSpeechToSpeechr   c                    t                                          |           t          |          }t          |          }t	          |||          | _        t          |          | _        |                                  d S r   )	rr   rs   rX  r  r  r  r  r  r=  )r   r   r  r(  r   s       r,   rs   z"SpeechT5ForSpeechToSpeech.__init__
  sp       8@@8@@%fnnMM&B6&J&J# 	r.   c                 4    | j                                         S r   r  rF  s    r,   r  z%SpeechT5ForSpeechToSpeech.get_encoder
  r  r.   c                 4    | j                                         S r   r  rF  s    r,   r  z%SpeechT5ForSpeechToSpeech.get_decoder  r  r.   c                 \    |                                  j                                         dS r  r  rF  s    r,   rG  z0SpeechT5ForSpeechToSpeech.freeze_feature_encoder  r  r.   Nr/   r1   r  r  r>  r  r|  r  r  r  r  r?  r@  r~  r  r  r  r:   c                    ||n| j         j        }| |t          || j         j        |          \  }}|                     |||||||||	|
|||d|          }|                     |d                   \  }}}d}|s|f|dd         z   }||f|z   n|S t          |||j        |j        |j	        |j
        |j        |j        |j        	  	        S )a  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding and conversion into
            a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
        decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
            Float values of input mel spectrogram.

            SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
            Float values of target mel spectrogram. Spectrograms can be obtained using [`SpeechT5Processor`]. See
            [`SpeechT5Processor.__call__`] for details.
        stop_labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Binary tensor indicating the position of the stop token in the sequence.

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan, set_seed
        >>> from datasets import load_dataset
        >>> import torch

        >>> dataset = load_dataset(
        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
        ... )  # doctest: +IGNORE_RESULT
        >>> dataset = dataset.sort("id")
        >>> sampling_rate = dataset.features["audio"].sampling_rate

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc")
        >>> model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc")
        >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        >>> # audio file is decoded on the fly
        >>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

        >>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

        >>> set_seed(555)  # make deterministic

        >>> # generate speech
        >>> speech = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder)
        >>> speech.shape
        torch.Size([77824])
        ```
        NTr/  r   r   r0  )r   rK  r6   r0   r  r  r   r  r  r  r  r  r  r  )r   r/   r1   r  r  r>  r  r|  r  r  r  r  r?  r@  r~  r  r  r  r	  rH   r  r  r  r  s                           r,   r   z!SpeechT5ForSpeechToSpeech.forward  s0   h &1%<kk$+B]#+?WDK8:P@ @<$&< --%)!5#9/!5++1/!5)   
 
$ "&!<!<WQZ!H!H; 	F!^gabbk1F)-)9TGf$$vE'##3")"?&9$5&-&G")"?&9

 

 

 
	
r.   r  r4   r  Fr  r  r  r  r   r  c
                 l    |t          j        d|j                  }t          | |||||||||	
  
        S )a'  
        Converts a raw speech waveform into a sequence of mel spectrograms, which are subsequently turned back into a
        speech waveform using a vocoder.

        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
                Float values of input raw speech waveform.

                Values can be obtained by loading a *.flac* or *.wav* audio file into an array of type `list[float]`,
                a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`)
                or the soundfile library (`pip install soundfile`).
                To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding and
                conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*, defaults to `None`):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
        N)r   i   rR  )r   rU   r   r#  )
r   r/   r~  r1   r  r  r  r  r   r  s
             r,   r7  z)SpeechT5ForSpeechToSpeech.generate_speech  sS    T %!&Xl>Q!R!R!R#!
 
 	
r.   r8  r9  )r   r   r   r   rs   r  r  rG  r   r   r   ri  rh  r   r  r   rV   r   r   r   r   r   r   r4  r7  r   r   s   @r,   r<  r<  
  s       
~ 
 
 
 
 
 
+ + ++ + +; ; ;  5959<@=A159=7;EI+/$(,0/3&*:>.2.215%
 
u01
 !!12
 'u'89	

 !))9 :
 E-.
 $E$56
 'u|4
 "%e.?(@"AB
 "%
 D>
 $D>
 'tn
 d^
 %U%67
  *+!
" el+#
$ !.%
& 
u..	/'
 
 
 ^
B U]__ ;?59 !'+(-&+W
 W
'W
 %U%67W
 !!12	W

 W
 W
 W
 ")$W
 "&W
  $W
 
	W
 W
 W
 _W
 W
 W
 W
 W
r.   r<  c                   :     e Zd Zd
 fd	ZddZd Zd Zd	 Z xZS )HifiGanResidualBlockr	   r   r	      皙?c                 d    t                                                       | _        t          j         fdt          t                              D                        _        t          j         fdt          t                              D                        _        d S )Nc                     g | ]<}t          j        d |                             |                             =S r   )rp   dilationr   r   rw   get_padding)rG   r  channelsrK  ro   r   s     r,   rI   z1HifiGanResidualBlock.__init__.<locals>.<listcomp>  sf     
 
 
  	%a[ ,,[(1+FF  
 
 
r.   c                 l    g | ]0}t          j        d d                     d                     1S rJ  rL  )rG   rH   rN  ro   r   s     r,   rI   z1HifiGanResidualBlock.__init__.<locals>.<listcomp>  s^     
 
 
  	 ,,[!<<  
 
 
r.   )	rr   rs   leaky_relu_sloper   r   rT   rY   convs1convs2)r   rN  ro   rK  rP  r   s   ```` r,   rs   zHifiGanResidualBlock.__init__  s     0m
 
 
 
 
 
 
 s8}}--
 
 

 
 m
 
 
 
 
 
 s8}}--
 
 

 
r.   r   c                     ||z  |z
  dz  S r   rF   )r   ro   rK  s      r,   rM  z HifiGanResidualBlock.get_padding	  s    h&1a77r.   c                     t           j        j        }t          t           j        j        d          rt           j        j        j        }| j        D ]} ||           | j        D ]} ||           d S Nr   )r   r   r   r   r   rQ  rR  r   r   r  s      r,   apply_weight_normz&HifiGanResidualBlock.apply_weight_norm  s    h*28,m<< 	@(3?K[ 	 	EK[ 	 	EK	 	r.   c                     | j         D ]!}t          j                            |           "| j        D ]!}t          j                            |           "d S r   )rQ  r   r   remove_weight_normrR  r   r  s     r,   rY  z'HifiGanResidualBlock.remove_weight_norm  s`    [ 	/ 	/EH''....[ 	/ 	/EH''....	/ 	/r.   c                    t          | j        | j                  D ]l\  }}|}t          j                            || j                  } ||          }t          j                            || j                  } ||          }||z   }m|S r   )r]  rQ  rR  r   r  
leaky_relurP  )r   r   conv1conv2r  s        r,   r   zHifiGanResidualBlock.forward  s    T[99 	5 	5LE5$HM44]DDYZZM!E-00MM44]DDYZZM!E-00M)H4MMr.   )r	   rE  rG  r
  )	r   r   r   rs   rM  rW  rY  r   r   r   s   @r,   rD  rD    s~        
 
 
 
 
 
>8 8 8 8  / / /      r.   rD  z
    HiFi-GAN vocoder.
    c                        e Zd ZU eed<   dZdef fdZdej        fdZ	d Z
d Z ed	          dej        d
ej        fd            Z xZS )SpeechT5HifiGanr   r  c                 |   t                                          |           t          |j                  | _        t          |j                  | _        t          j        |j	        |j
        ddd          | _        t          j                    | _        t          t          |j        |j                            D ]X\  }\  }}| j                            t          j        |j
        d|z  z  |j
        d|dz   z  z  ||||z
  dz                       Yt          j                    | _        t)          t          | j                            D ]a}|j
        d|dz   z  z  }t          |j        |j                  D ]4\  }}| j                            t-          ||||j                             5bt          j        |dddd          | _        |                     dt5          j        |j	                             |                     dt5          j        |j	                             |                                  d S )N   r   r	   )ro   rp   r   r   r   r  )rr   rs   rY   resblock_kernel_sizesnum_kernelsupsample_ratesnum_upsamplesr   rw   model_in_dimupsample_initial_channelconv_prer   	upsamplerrL  r]  upsample_kernel_sizesr]   ConvTranspose1d	resblocksrT   resblock_dilation_sizesrD  rP  	conv_postr   r   rU   r[   r=  )r   r   r  upsample_ratero   rN  rK  r   s          r,   rs   zSpeechT5HifiGan.__init__0  s5      v;<< !677	+
 
 
 /8V=RTZTp9q9q/r/r 		 		+A+{N!!"31=3a!eE +((=8Q>      s4>**++ 	v 	vA61Q<HH),V-I6Ki)j)j v v%X%%&:8[RZ\b\s&t&tuuuuv 8QAaQRSSSVU[1D%E%EFFFWej1D&E&EFFF 	r.   r  c                     t          |t          j        t          j        f          rR|j        j                            d| j        j                   |j	        "|j	        j        
                                 dS dS dS )zInitialize the weights.r4   r  N)r  r   rw   rl  r   r)  r&  r   r$  rq   r,  )r   r  s     r,   r/  zSpeechT5HifiGan._init_weightsV  sv    fry"*<=>> 	)M&&CT[5R&SSS{& &&(((((	) 	)&&r.   c                 8   t           j        j        }t          t           j        j        d          rt           j        j        j        } || j                   | j        D ]} ||           | j        D ]}|                                  || j	                   d S rU  )
r   r   r   r   r   ri  rj  rm  rW  ro  rV  s      r,   rW  z!SpeechT5HifiGan.apply_weight_norm]  s    h*28,m<< 	@(3?KDM"""^ 	 	EK^ 	& 	&E##%%%%DN#####r.   c                 $   t           j                            | j                   | j        D ]!}t           j                            |           "| j        D ]}|                                 t           j                            | j                   d S r   )r   r   rY  ri  rj  rm  ro  rZ  s     r,   rY  z"SpeechT5HifiGan.remove_weight_normi  s    
##DM222^ 	/ 	/EH''....^ 	' 	'E$$&&&&
##DN33333r.   a  
        Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
        of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
        waveform.
        r  r:   c                    | j         j        r|| j        z
  | j        z  }|                                dk    }|s|                    d          }|                    dd          }|                     |          }t          | j	                  D ]}t          j                            || j         j                  } | j        |         |          } | j        || j        z           |          }t          d| j                  D ]&}| | j        || j        z  |z            |          z  }'|| j        z  }t          j                            |          }|                     |          }t%          j        |          }|s=|                    d                              dd                              d          }n|                    d          }|S )a  
        spectrogram (`torch.FloatTensor`):
            Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
            config.model_in_dim)`, or un-batched and of shape `(sequence_length, config.model_in_dim)`.

        Returns:
            `torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
            shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
        r	   r   r   r   r%   )r   normalize_beforer   r  r   r   r   ri  rT   rf  r   r  r\  rP  rj  rm  rd  ro  r   tanhr  r   )r   r  
is_batchedr   r  	res_statejwaveforms           r,   r   zSpeechT5HifiGan.forwardq  s   " ;' 	A&2dj@K __&&!+
 	3%//22K#--a33m44t)** 	9 	9AM44]DKD`aaM-DN1-m<<M<q4+;';<]KKI1d.// U UET^A0@,@1,DEmTTT		%(88MM00??}55
=11 	0$,,Q//99!Q??DDRHHHH %,,Q//Hr.   )r   r   r   r    r0  r2  rs   r   r4  r/  rW  rY  r   r   ri  r   r   r   s   @r,   r`  r`  '  s          "!!!#O$4 $ $ $ $ $ $L)BI ) ) ) )
$ 
$ 
$4 4 4 ^  (5#4 (9J ( ( ( ( ( ( ( (r.   r`  )r  r<  r%  r  r  r`  )r   Nr  r9  )gr   r   typingr   r   numpyrM   r   r   torch.nnr   r   r   activationsr
   cache_utilsr   r   r   
generationr   integrations.deepspeedr   integrations.fsdpr   modeling_attn_mask_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   r   r   r   utils.deprecationr   configuration_speecht5r   r    
get_loggerr   r  _HIDDEN_STATES_START_POSITIONr   r>   r-   r6   r  r   rh  ndarrayrj   rl   r   r   r4  r   r   r   r  r   r  r.  r:  rk  r  r  r  r  r  r  r  r  r  r  r6  rX  rb  rq  rw  r  r  r  r  r  r  r  ri  rV   r#  r%  r<  rD  r`  __all__rF   r.   r,   <module>r     sb      " " " " " " " "            @ @ @ @ @ @ @ @ @ @ ! ! ! ! ! ! C C C C C C C C C C ) ) ) ) ) ) @ @ @ @ @ @ 7 7 7 7 7 7 e e e e e e e e 9 9 9 9 9 9              D C C C C C C C , , , , , , , , 0 0 0 0 0 0 I I I I I I I I 
	H	%	% !" %, c [^    " ei0 0,0250KSTYT`Ka0 0 0 04 26t tc?tt t U-.	t
 t Zt t t tp    #=   ,    !;   8    !;   2A8 A8 A8 A8 A8BI A8 A8 A8J* * * * *bi * * *Z    ry   0" " " " " " " "(    29   % % % % %RY % % %R1 1 1 1 1	 1 1 1D D D D D") D D DN1 1 1 1 1") 1 1 1h% % % % % % % %P< < < < <29 < < <2    	+?   ")- )- )- )- )-	+? )- )- )-X& & & & &,@ & & &$c2 c2 c2 c2 c2	 c2 c2 c2L    ")   0: : : : :5 : : :zc c c c c5 c c cL (7 (7 (7 (7 (7o (7 (7 (7V|
 |
 |
 |
 |
- |
 |
 |
~" " " " "&= " " "J' ' ' ' '$; ' ' 'T
 
 
 
 
#: 
 
 
@~
 ~
 ~
 ~
 ~
- ~
 ~
 ~
B/ / / / /&= / / /d3 3 3 3 3$; 3 3 3l* * * * *#: * * *Z8M 8M 8M 8M 8M29 8M 8M 8Mv: : : : :bi : : :z   
Y
 Y
 Y
 Y
 Y
+ Y
 Y
 
Y
x   
s
 s
 s
 s
 s
5 s
 s
 
s
r 7;15#'$)"'L L"L#L !!23L U-.	L
 L L L bi L "L  L 5eE$5u7H$HIIJL L L L^   
e
 e
 e
 e
 e
5 e
 e
 
e
P   
t
 t
 t
 t
 t
 7 t
 t
 
t
n; ; ; ; ;29 ; ; ;|   
t t t t to t t 
tn  r.   