
     `i                        d Z ddlZddlmZ ddlmZmZmZ ddlZ	ddl
Z
ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZ ddlmZ  ej         e!          Z"e ed           G d de                                  Z#e ed           G d de                                  Z$e
j%        j&        d             Z'	 	 	 	 	 dCdZ(d Z) G d de
j        j*                  Z+ G d  d!ej*                  Z, G d" d#ej*                  Z- G d$ d%ej*                  Z. G d& d'ej*                  Z/ G d( d)ej*                  Z0 G d* d+ej*                  Z1 G d, d-ej*                  Z2 G d. d/ej*                  Z3 G d0 d1ej*                  Z4 G d2 d3ej*                  Z5 G d4 d5ej*                  Z6 G d6 d7ej*                  Z7 G d8 d9e          Z8 G d: d;ej*                  Z9 G d< d=ej*                  Z:e G d> d?e                      Z; ed@           G dA dBe;                      Z<dBd?gZ=dS )DzPyTorch VITS model.    N)	dataclass)AnyOptionalUnion)nn   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputModelOutput)PreTrainedModel)auto_docstringlogging   )
VitsConfigz`
    Describes the outputs for the VITS model, with potential hidden states and attentions.
    )custom_introc                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej                          ed<   dZeeej                          ed<   dZeeej                          ed<   dS )VitsModelOutputa"  
    waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
        The final audio waveform predicted by the model.
    sequence_lengths (`torch.FloatTensor` of shape `(batch_size,)`):
        The length in samples of each element in the `waveform` batch.
    spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
        The log-mel spectrogram predicted at the output of the flow model. This spectrogram is passed to the Hi-Fi
        GAN decoder model to obtain the final audio waveform.
    Nwaveformsequence_lengthsspectrogramhidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   tupler   r        z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/vits/modeling_vits.pyr   r   '   s           -1Hhu()00048hu018886:K% 123:::8<M8E%"345<<<59Ju01299999r&   r   zm
    Describes the outputs for the VITS text encoder model, with potential hidden states and attentions.
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eej                 ed<   dZeeej                          ed<   dZeeej                          ed<   dS )VitsTextEncoderOutputa  
    prior_means (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        The predicted mean values of the prior distribution for the latent text variables.
    prior_log_variances (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        The predicted log-variance values of the prior distribution for the latent text variables.
    Nlast_hidden_stateprior_meansprior_log_variancesr   r   )r   r   r   r    r*   r   r!   r"   r#   r+   r,   r   r$   r   r%   r&   r'   r)   r)   ?   s           6:x 12999/3K%+,3337;%"34;;;8<M8E%"345<<<59Ju01299999r&   r)   c                     | |z   }t          j        |d d d |d d f                   }t          j        |d d |d d d f                   }||z  }|S N)r!   tanhsigmoid)input_ainput_bnum_channelsin_actt_acts_actactss          r'   fused_add_tanh_sigmoid_multiplyr8   T   sh    wFJvaaa,1233EM&LMM111!4566E5=DKr&   F      @MbP?c	                    | | k    | |k    z  }	|	 }
t          j        |           }t          j        |           }t          j        t          j        d|z
            dz
            }t
          j                            |d          }||d<   ||d<   | |
         ||
<   d||
<   t          | |	         ||	ddf         ||	ddf         ||	ddf         |||||	  	        \  ||	<   ||	<   ||fS )	a	  
    This transformation represents a monotonically increasing piecewise rational quadratic function. Outside of the
    `tail_bound`, the transform behaves as an identity function.

    Args:
        inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Second half of the hidden-states input to the Vits convolutional flow module.
        unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        reverse (`bool`, *optional*, defaults to `False`):
            Whether the model is being run in reverse mode.
        tail_bound (`float`, *optional* defaults to 5):
            Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
            transform behaves as an identity function.
        min_bin_width (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the width dimension for the piecewise rational quadratic function.
        min_bin_height (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the height dimension for the piecewise rational quadratic function.
        min_derivative (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the derivatives for the piecewise rational quadratic function.
    Returns:
        outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Hidden-states as transformed by the piecewise rational quadratic function with the `tail_bound` limits
            applied.
        log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Logarithm of the absolute value of the determinants corresponding to the `outputs` with the `tail_bound`
            limits applied.
    r   )r   r   )pad.r   .        N)	inputsunnormalized_widthsunnormalized_heightsunnormalized_derivativesreverse
tail_boundmin_bin_widthmin_bin_heightmin_derivative)	r!   
zeros_likenplogexpr   
functionalr<   _rational_quadratic_spline)rA   rB   rC   rD   rE   rF   rG   rH   rI   inside_interval_maskoutside_interval_maskoutputslog_abs_detconstants                 r'   (_unconstrained_rational_quadratic_splinerU   ]   s<   \ #zk1f
6JK11v&&G"6**KvbfQ/001455H!}001Iv0VV'/V$(0W%%+,A%BG!"),K%&Ga*+/0Daaa0GH12F2IJ!9:NPQPQPQ:Q!R#%%
H 
H 
HDG !;/C#D Kr&   c	                 	   |}	| }
t          j        |           |
k     st          j        |           |	k    rt          d          |j        d         }||z  dk    rt          d| d|           ||z  dk    rt          d| d|           t
          j                            |d          }|d||z  z
  |z  z   }t          j        |d          }t
          j        	                    |d	d
d          }|	|
z
  |z  |
z   }|
|d<   |	|d<   |dddf         |dddf         z
  }|t
          j        
                    |          z   }t
          j                            |d          }|d||z  z
  |z  z   }t          j        |d          }t
          j        	                    |d	d
d          }|	|
z
  |z  |
z   }|
|d<   |	|d<   |dddf         |dddf         z
  }|r|n|}|dxx         dz  cc<   t          j        | d         |k    d          dz
  }|d         }|                    d|          d         }|                    d|          d         }|                    d|          d         }||z  }|                    d|          d         }|                    d|          d         }|dddf                             d|          d         }|                    d|          d         }||z   d|z  z
  }|s| |z
  |z  }|d|z
  z  }|||                    d          z  ||z  z   z  }|||z  z   }|||z  z   } |                    d          ||                    d          z  d|z  |z  z   |d|z
                      d          z  z   z  }!t          j        |!          dt          j        |          z  z
  }"| |"fS | |z
  }#|#|z  }$|||z
  z  |$z   }%||z  |$z
  }&| |#z  }'|&                    d          d|%z  |'z  z
  }(|(dk                                    st!          d|(           d|'z  |& t          j        |(          z
  z  })|)|z  |z   } |)d|)z
  z  }|||z  z   }|                    d          ||)                    d          z  d|z  |z  z   |d|)z
                      d          z  z   z  }!t          j        |!          dt          j        |          z  z
  }"| |" fS )a(	  
    This transformation represents a monotonically increasing piecewise rational quadratic function. Unlike the
    function `_unconstrained_rational_quadratic_spline`, the function behaves the same across the `tail_bound`.

    Args:
        inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Second half of the hidden-states input to the Vits convolutional flow module.
        unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        reverse (`bool`):
            Whether the model is being run in reverse mode.
        tail_bound (`float`):
            Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
            transform behaves as an identity function.
        min_bin_width (`float`):
            Minimum bin value across the width dimension for the piecewise rational quadratic function.
        min_bin_height (`float`):
            Minimum bin value across the height dimension for the piecewise rational quadratic function.
        min_derivative (`float`):
            Minimum bin value across the derivatives for the piecewise rational quadratic function.
    Returns:
        outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Hidden-states as transformed by the piecewise rational quadratic function.
        log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Logarithm of the absolute value of the determinants corresponding to the `outputs`.
    z-Input to a transform is not within its domainr?         ?zMinimal bin width z" too large for the number of bins zMinimal bin height dimr   )r   r   rT   r@   )r<   modevaluer=   r>   .Ngư>).N      r   zinvalid discriminant )r!   minmax
ValueErrorshaper   rN   softmaxcumsumr<   softplussumgatherpowrL   allRuntimeErrorsqrt)*rA   rB   rC   rD   rE   rF   rG   rH   rI   upper_boundlower_boundnum_binswidths	cumwidthsderivativesheights
cumheightsbin_locationsbin_idxinput_cumwidthsinput_bin_widthsinput_cumheightsdeltainput_deltainput_derivativesinput_derivatives_plus_oneinput_heightsintermediate1thetatheta_one_minus_theta	numeratordenominatorrR   derivative_numeratorrS   intermediate2intermediate3abcdiscriminantroots*                                             r'   rO   rO      s   X K+Ky;&&%)F*;*;k*I*IHIII"(,Hx#%%imii_giijjj 3&&k~kkaikklll]""#6B"??Fa-("::fDDFV,,,I!!)jPS!TTI{*i7+EI#If$IgsABBw)C"H"55F 2=#9#9:R#S#SSKm##$8b#AAGNX$= =HHGg2...J"":6
RU"VVJ+z9KGJ$Jv%Jwabb!JsCRCx$88G")8JJyM'd"iy)]:CCCaGGi G&&r733F;O}}R11&9!((W55f=fE,,r7++F3K#**2w77?!,S!""W!5!<!<R!I!I&!QNN2w//7M%(BBQ_TM &%/)-== %U 3![599Q<<%?BSVkBk%kl	!M4I$II"Y%<<*q11&15+o 5561u9//!"4"445 

 i 455EIk<R<R8RR## !11%5[+<<=M--=L=(uuQxx!a%!)+!&&(( 	GE|EEFFFA1"uz,7778))O; $D 1!M4I$II*q11&!4+o 5561t8.."3"334 

 i 455EIk<R<R8RR$$r&   c                   6     e Zd Zdedef fdZddZd Z xZS )VitsWaveNetconfig
num_layersc                 B   t                                                       |j        | _        || _        t          j                                        | _        t          j                                        | _        t          j	        |j
                  | _        t          t
          j        j        d          rt
          j        j        j        }nt
          j        j        }|j        dk    rCt          j                            |j        d|j        z  |z  d          } ||d          | _        t'          |          D ]}|j        |z  }|j        |z  |z
  dz  }t          j                            |j        d|j        z  |j        ||          } ||d          }| j                            |           ||dz
  k     rd|j        z  }	n|j        }	t          j                            |j        |	d          }
 ||
d          }
| j                            |
           d S )Nweight_normr   r\   r   weight)name)in_channelsout_channelskernel_sizedilationpadding)super__init__hidden_sizer   r!   r   
ModuleList	in_layersres_skip_layersDropoutwavenet_dropoutdropouthasattrutilsparametrizationsr   speaker_embedding_sizeConv1d
cond_layerrangewavenet_dilation_ratewavenet_kernel_sizeappend)selfr   r   r   r   ir   r   in_layerres_skip_channelsres_skip_layer	__class__s              r'   r   zVitsWaveNet.__init__0  s   !-$,,..$x2244z&"89928,m<< 	/(3?KK(.K(A--)FFL^H^akHkmnooJ)k*8DDDDOz"" 	8 	8A3Q6H1H<xGAMGx".!33"6! '  H #{8(;;;HN!!(+++ :>!!$%(:$:!!$*$6!"X__V-?ARTUVVN([hGGGN ''7777+	8 	8r&   Nc                    t          j        |          }t          j        | j        g          }||                     |          }t          | j                  D ]} | j        |         |          }|*|dz  | j        z  }|d d ||d| j        z  z   d d f         }	nt          j        |          }	t          ||	|d                   }
| 	                    |
          }
 | j
        |         |
          }|| j        dz
  k     r8|d d d | j        d d f         }||z   |z  }||d d | j        d d d f         z   }||z   }||z  S )Nr\   r   r   )r!   rJ   	IntTensorr   r   r   r   r   r8   r   r   )r   rA   padding_maskglobal_conditioningrR   num_channels_tensorr   r   cond_offsetglobal_statesr7   res_skip_actsres_actss                r'   forwardzVitsWaveNet.forwardY  s   "6**#ot/?.@AA*"&//2E"F"Ft'' 	2 	2A-DN1-f55M".!ed&66 3AAA{[STW[WgSgEg7gijijij4j k % 0 ? ?2=-QdefQghhD<<%%D3D03D99M4?Q&&&(,>d.>,>)AB 8+|;!M!!!T5E5G5G2J$KK!M1%%r&   c                 &   | j         dk    r)t          j        j                            | j                   | j        D ]&}t          j        j                            |           '| j        D ]&}t          j        j                            |           'd S )Nr   )r   r!   r   r   remove_weight_normr   r   r   r   layers     r'   r   zVitsWaveNet.remove_weight_normv  s    &!++HN--do>>>^ 	5 	5EHN--e4444) 	5 	5EHN--e4444	5 	5r&   r.   )	r   r   r   r   intr   r   r   __classcell__r   s   @r'   r   r   /  so        '8z '8s '8 '8 '8 '8 '8 '8R& & & &:5 5 5 5 5 5 5r&   r   c                   ,     e Zd Zdef fdZddZ xZS )VitsPosteriorEncoderr   c                 0   t                                                       |j        | _        t	          j        |j        |j        d          | _        t          ||j
                  | _        t	          j        |j        | j        dz  d          | _        d S )Nr   r   r\   )r   r   	flow_sizer   r   r   spectrogram_binsr   conv_prer   $posterior_encoder_num_wavenet_layerswavenet	conv_projr   r   r   s     r'   r   zVitsPosteriorEncoder.__init__  sz    ",	&"96;MqQQ"6f6abbb6#5t7H17LaPPr&   Nc                 6   |                      |          |z  }|                     |||          }|                     |          |z  }t          j        || j        d          \  }}|t          j        |          t          j        |          z  z   |z  }|||fS )Nr   rX   )r   r   r   r!   splitr   
randn_likerM   )r   rA   r   r   statsmean
log_stddevsampleds           r'   r   zVitsPosteriorEncoder.forward  s    v&&5fl4GHHv&&5 ;ud.?QGGGj%*40059Z3H3HHHLXj((r&   r.   r   r   r   r   r   r   r   r   s   @r'   r   r     s_        Qz Q Q Q Q Q Q) ) ) ) ) ) ) )r&   r   c                   :     e Zd Zd
 fd	ZddZd Zd Zd	 Z xZS )HifiGanResidualBlockr   r   r      皙?c                 d    t                                                       | _        t          j         fdt          t                              D                        _        t          j         fdt          t                              D                        _        d S )Nc                     g | ]<}t          j        d |                             |                             =S r   )strider   r   r   r   get_padding).0r   channelsr   r   r   s     r'   
<listcomp>z1HifiGanResidualBlock.__init__.<locals>.<listcomp>  sf     
 
 
  	%a[ ,,[(1+FF  
 
 
r&   c                 l    g | ]0}t          j        d d                     d                     1S r   r   )r   _r   r   r   s     r'   r   z1HifiGanResidualBlock.__init__.<locals>.<listcomp>  s^     
 
 
  	 ,,[!<<  
 
 
r&   )	r   r   leaky_relu_sloper   r   r   lenconvs1convs2)r   r   r   r   r   r   s   ```` r'   r   zHifiGanResidualBlock.__init__  s     0m
 
 
 
 
 
 
 s8}}--
 
 

 
 m
 
 
 
 
 
 s8}}--
 
 

 
r&   r   c                     ||z  |z
  dz  S )Nr\   r%   )r   r   r   s      r'   r   z HifiGanResidualBlock.get_padding  s    h&1a77r&   c                     t           j        j        }t          t           j        j        d          rt           j        j        j        }| j        D ]} ||           | j        D ]} ||           d S Nr   )r   r   r   r   r   r   r   r   r   r   s      r'   apply_weight_normz&HifiGanResidualBlock.apply_weight_norm  s    h*28,m<< 	@(3?K[ 	 	EK[ 	 	EK	 	r&   c                     | j         D ]!}t          j                            |           "| j        D ]!}t          j                            |           "d S r.   )r   r   r   r   r   r   s     r'   r   z'HifiGanResidualBlock.remove_weight_norm  s`    [ 	/ 	/EH''....[ 	/ 	/EH''....	/ 	/r&   c                    t          | j        | j                  D ]l\  }}|}t          j                            || j                  } ||          }t          j                            || j                  } ||          }||z   }m|S r.   )zipr   r   r   rN   
leaky_relur   )r   r   conv1conv2residuals        r'   r   zHifiGanResidualBlock.forward  s    T[99 	5 	5LE5$HM44]DDYZZM!E-00MM44]DDYZZM!E-00M)H4MMr&   )r   r   r   r   )	r   r   r   r   r   r   r   r   r   r   s   @r'   r   r     s~        
 
 
 
 
 
>8 8 8 8  / / /      r&   r   c                   r     e Zd Zdef fdZd Zd Z	 d
dej        de	ej                 dej        fd	Z
 xZS )VitsHifiGanr   c                    t                                                       || _        t          |j                  | _        t          |j                  | _        t          j	        |j
        |j        ddd          | _        t          j                    | _        t          t!          |j        |j                            D ]X\  }\  }}| j                            t          j        |j        d|z  z  |j        d|dz   z  z  ||||z
  dz                       Yt          j                    | _        t+          t          | j                            D ]a}|j        d|dz   z  z  }t!          |j        |j                  D ]4\  }}| j                            t/          ||||j                             5bt          j	        |ddddd          | _        |j        dk    r't          j	        |j        |j        d          | _        d S d S )	N   r   r   )r   r   r   r\   F)r   r   r   biasr   )r   r   r   r   resblock_kernel_sizesnum_kernelsupsample_ratesnum_upsamplesr   r   r   upsample_initial_channelr   r   	upsampler	enumerater   upsample_kernel_sizesr   ConvTranspose1d	resblocksr   resblock_dilation_sizesr   r   	conv_postr   cond)r   r   r   upsample_rater   r   r   r   s          r'   r   zVitsHifiGan.__init__  s   v;<< !677	+
 
 
 /8V=RTZTp9q9q/r/r 		 		+A+{N!!"31=3a!eE +((=8Q>      s4>**++ 	v 	vA61Q<HH),V-I6Ki)j)j v v%X%%&:8[RZ\b\s&t&tuuuuv 8QAaQRY^___(A--	&"?A`bcddDIII .-r&   c                     t           j        j        }t          t           j        j        d          rt           j        j        j        }| j        D ]} ||           | j        D ]}|                                 d S r   )r   r   r   r   r   r   r  r   r   s      r'   r   zVitsHifiGan.apply_weight_norm  s    h*28,m<< 	@(3?K^ 	 	EK^ 	& 	&E##%%%%	& 	&r&   c                     | j         D ]!}t          j                            |           "| j        D ]}|                                 d S r.   )r   r   r   r   r  r   s     r'   r   zVitsHifiGan.remove_weight_norm  s\    ^ 	/ 	/EH''....^ 	' 	'E$$&&&&	' 	'r&   Nr   r   returnc                 j   |                      |          }|||                     |          z   }t          | j                  D ]}t          j                            || j        j                  } | j	        |         |          } | j
        || j        z           |          }t          d| j                  D ]&}| | j
        || j        z  |z            |          z  }'|| j        z  }t          j                            |          }|                     |          }t          j        |          }|S )aG  
        Converts a spectrogram into a speech waveform.

        Args:
            spectrogram (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`):
                Tensor containing the spectrograms.
            global_conditioning (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_size, 1)`, *optional*):
                Tensor containing speaker embeddings, for multispeaker models.

        Returns:
            `torch.FloatTensor`: Tensor of shape shape `(batch_size, 1, num_frames)` containing the speech waveform.
        Nr   )r   r  r   r   r   rN   r   r   r   r   r  r   r  r!   r/   )r   r   r   r   r   	res_statejr   s           r'   r   zVitsHifiGan.forward  s.    k22*)DII6I,J,JJMt)** 	9 	9AM44]DKD`aaM-DN1-m<<M<q4+;';<]KKI1d.// U UET^A0@,@1,DEmTTT		%(88MM00??}55:m,,r&   r.   )r   r   r   r   r   r   r   r!   r"   r   r   r   r   s   @r'   r   r     s        "ez "e "e "e "e "e "eH& & &' ' ' bf    , CKEL]C^ 		               r&   r   c                   ,     e Zd Zdef fdZddZ xZS )VitsResidualCouplingLayerr   c                 0   t                                                       |j        dz  | _        t	          j        | j        |j        d          | _        t          ||j	                  | _
        t	          j        |j        | j        d          | _        d S )Nr\   r   r   )r   r   r   half_channelsr   r   r   r   r    prior_encoder_num_wavenet_layersr   r  r   s     r'   r   z"VitsResidualCouplingLayer.__init__)  sz    #-2	$"4f6H!LL"6f6]^^^6#5t7I1MMr&   NFc                    t          j        || j        gdz  d          \  }}|                     |          |z  }|                     |||          }|                     |          |z  }t          j        |          }	|sP||t          j        |	          z  |z  z   }t          j        ||gd          }
t          j	        |	ddg          }|
|fS ||z
  t          j        |	           z  |z  }t          j        ||gd          }
|
d fS )Nr\   r   rX   )
r!   r   r  r   r   r  rJ   rM   catre   )r   rA   r   r   rE   
first_halfsecond_halfr   r   r   rR   log_determinants               r'   r   z!VitsResidualCouplingLayer.forward1  s   "'+ft7I6JQ6NTU"V"V"V
Kj11L@]LBUVV~~m,,|;%d++
 	!uy/D/D!D|!SSKi[ 9qAAAG#i
QF;;OO++&-J;1G1GG,VKi[ 9qAAAGD= r&   NFr   r   s   @r'   r  r  (  s_        Nz N N N N N N! ! ! ! ! ! ! !r&   r  c                   ,     e Zd Zdef fdZddZ xZS )VitsResidualCouplingBlockr   c                     t                                                       t          j                    | _        t          |j                  D ])}| j                            t          |                     *d S r.   )	r   r   r   r   flowsr   prior_encoder_num_flowsr   r  )r   r   r   r   s      r'   r   z"VitsResidualCouplingBlock.__init__D  sp    ]__
v566 	A 	AAJ7??@@@@	A 	Ar&   NFc                     |s1| j         D ](} ||||          \  }}t          j        |dg          })n?t          | j                   D ]*}t          j        |dg          } ||||d          \  }}+|S )Nr   TrE   )r  r!   flipreversed)r   rA   r   r   rE   flowr   s          r'   r   z!VitsResidualCouplingBlock.forwardJ  s     	Z
 1 1 D7JKK	FQC001 !,, Z ZFQC00 D7JTXYYY	r&   r  r   r   s   @r'   r  r  C  s_        Az A A A A A A	 	 	 	 	 	 	 	r&   r  c                   .     e Zd Zddef fdZddZ xZS )VitsDilatedDepthSeparableConvr@   r   c                 ,   t                                                       |j        }|j        }|j        | _        t          j        |          | _        t          j	                    | _
        t          j	                    | _        t          j	                    | _        t          j	                    | _        t          | j                  D ]}||z  }||z  |z
  dz  }| j
                            t          j        ||||||                     | j                            t          j        ||d                     | j                            t          j        |                     | j                            t          j        |                     d S )Nr\   )r   r   r   groupsr   r   r   )r   r   duration_predictor_kernel_sizer   depth_separable_num_layersr   r   r   r   r   convs_dilatedconvs_pointwisenorms_1norms_2r   r   r   	LayerNorm)	r   r   dropout_rater   r   r   r   r   r   s	           r'   r   z&VitsDilatedDepthSeparableConv.__init__W  se   ;% ;z,//]__!}}}t'' 	8 	8A"A~H"X-8Q>G%%	 (!) +#%#  	 	 	  ''	(Ha(H(HIIILX 6 6777LX 6 67777	8 	8r&   Nc                 R   |||z   }t          | j                  D ]} | j        |         ||z            } | j        |         |                    dd                                        dd          }t
          j                            |          } | j        |         |          } | j	        |         |                    dd                                        dd          }t
          j                            |          }| 
                    |          }||z   }||z  S Nr   r?   )r   r   r)  r+  	transposer   rN   gelur*  r,  r   )r   rA   r   r   r   r   s         r'   r   z%VitsDilatedDepthSeparableConv.forwards  s   *11Ft'' 	, 	,A1D.q1&<2GHHM+DLOM,C,CAr,J,JKKUUVWY[\\MM..}==M3D03MBBM+DLOM,C,CAr,J,JKKUUVWY[\\MM..}==M LL77Mm+FF$$r&   )r@   r.   r   r   s   @r'   r$  r$  V  s]        8 8z 8 8 8 8 8 88% % % % % % % %r&   r$  c                   ,     e Zd Zdef fdZddZ xZS )VitsConvFlowr   c                    t                                                       |j        | _        |j        dz  | _        |j        | _        |j        | _	        t          j        | j        | j        d          | _        t          |          | _        t          j        | j        | j        | j        dz  dz
  z  d          | _        d S )Nr\   r   r   )r   r   r   filter_channelsdepth_separable_channelsr  duration_predictor_flow_binsrm   duration_predictor_tail_boundrF   r   r   r   r$  conv_ddsr   r   s     r'   r   zVitsConvFlow.__init__  s    %1#<A; >	$"4d6JANN5f==4#79Kt}_`O`cdOd9eghiir&   NFc                    t          j        || j        gdz  d          \  }}|                     |          }|                     |||          }|                     |          |z  }|j        \  }}	}
|                    ||	d|
                              dddd          }|dd | j	        f         t          j        | j                  z  }|d| j	        d| j	        z  f         t          j        | j                  z  }|dd| j	        z  d f         }t          |||||| j                  \  }}t          j        ||gd          |z  }|st          j        ||z  ddg          }||fS |d fS )	Nr\   r   rX   r?   r   r   .)rE   rF   )r!   r   r  r   r:  r   ra   reshapepermuterm   mathrj   r6  rU   rF   r  re   )r   rA   r   r   rE   r  r  r   
batch_sizer   lengthrB   rC   rD   rS   rR   r  s                    r'   r   zVitsConvFlow.forward  s   "'+ft7I6JQ6NTU"V"V"V
Kj11m\CVWW}55D'1'7$
Hf%--j(BOOWWXY[\^_abcc+C4=,@ADIdNbDcDcc,S$-!dmBS2S-STW[W`aeauWvWvv#0a$-6G6I6I1I#J #K $$
 $
 $
 [ )Z51===L 	!#il(BQFKKOO++D= r&   r  r   r   s   @r'   r4  r4    s_        	jz 	j 	j 	j 	j 	j 	j! ! ! ! ! ! ! !r&   r4  c                   ,     e Zd Zdef fdZddZ xZS )VitsElementwiseAffiner   c                 $   t                                                       |j        | _        t	          j        t          j        | j        d                    | _        t	          j        t          j        | j        d                    | _	        d S Nr   )
r   r   r7  r   r   	Parameterr!   zeros	translate	log_scaler   s     r'   r   zVitsElementwiseAffine.__init__  se    7ek$-&C&CDDek$-&C&CDDr&   NFc                     |sL| j         t          j        | j                  |z  z   }||z  }t          j        | j        |z  ddg          }||fS || j         z
  t          j        | j                   z  |z  }|d fS Nr   r\   )rG  r!   rM   rH  re   )r   rA   r   r   rE   rR   r  s          r'   r   zVitsElementwiseAffine.forward  s     	!nuy'@'@6'IIG,G#i(E1vNNOO++.%)T^O2L2LL|[GD= r&   r  r   r   s   @r'   rB  rB    s_        Ez E E E E E E! ! ! ! ! ! ! !r&   rB  c                   &     e Zd Z fdZddZ xZS )VitsStochasticDurationPredictorc                    t                                                       |j        }|j        }t	          j        ||d          | _        t	          j        ||d          | _        t          ||j	                  | _
        |dk    rt	          j        ||d          | _        t	          j                    | _        | j                            t          |                     t!          |j                  D ])}| j                            t%          |                     *t	          j        d|d          | _        t	          j        ||d          | _        t          ||j	                  | _        t	          j                    | _        | j                            t          |                     t!          |j                  D ])}| j                            t%          |                     *d S )Nr   )r.  r   )r   r   r   r   r   r   r   r   r$  duration_predictor_dropoutr:  r  r   r  r   rB  r   duration_predictor_num_flowsr4  post_conv_prepost_conv_projpost_conv_dds
post_flows)r   r   	embed_dimr6  r   r   s        r'   r   z(VitsStochasticDurationPredictor.__init__  s   1	 ,	/?AFF?OQGG5:
 
 

 >>	)_a@@DI]__

/77888v:;; 	4 	4AJl6223333Yq/1== i!LL::
 
 

 -//4V<<===v:;; 	9 	9AO""<#7#78888	9 	9r&   NFrW   c                    t          j        |          }|                     |          }|,t          j        |          }||                     |          z   }|                     ||          }|                     |          |z  }|s|                     |          }|                     ||          }|                     |          |z  }t          j	        |
                    d          d|
                    d                                        |j        |j                  |z  }d}	|}
| j        D ]1} ||
|||z             \  }
}t          j        |
dg          }
|	|z  }	2t          j        |
ddgd          \  }}|	t          j        t$          j                            |          t$          j                            |           z   |z  ddg          z  }	t          j        dt+          j        dt*          j        z            |dz  z   z  |z  ddg          |	z
  }|t          j        |          z
  |z  }t          j        t          j        |d                    |z  }t          j        | ddg          }t          j        ||gd          }| j        D ].} ||||          \  }}t          j        |dg          }||z  }/t          j        d	t+          j        dt*          j        z            |dz  z   z  |z  ddg          |z
  }||z   S t9          t;          | j                            }|d d
         |d         gz   }t          j	        |
                    d          d|
                    d                                        |j        |j                  |z  }|D ]*}t          j        |dg          } ||||d          \  }}+t          j        |ddgd          \  }}|S )Nr   r\   )devicedtype)r   r   rX         gh㈵>g      ?r?   T)r   rE   )r!   detachr   r  r:  r   rP  rR  rQ  randnsizetorV  rW  rS  r   r   re   r   rN   
logsigmoidr>  rL   pir0   	clamp_minr  r  listr!  )r   rA   r   r   	durationsrE   noise_scaler   random_posteriorlog_determinant_posterior_sumlatents_posteriorr"  r  r  r  logqlog_determinant_sumlatentsnllr  r   log_durations                         r'   r   z'VitsStochasticDurationPredictor.forward  s+   f%%v&&*"',/B"C"Cdii(;<<<Fv|44'',6 5	  ..y99M ..}lKKM //>>MM INN1--q)..2C2CDDGGv}djdpGqq  -.) 0 A A59T%|R_I_6 6 62!? %*J/@1#$F$F!-@--&+k2CaVQR&S&S&S#J)UY))*558P8PR\Q\8]8]]ammpqstou. . ) 	$$(1tw;"7"7;KQ;N"OPS__bcefaghh/0 
 $emJ&?&??<OJ5?:t#D#DEETJ"')ZK!Q"@"@i[ 9qAAAG
 7 7+/4[a+b+b+b(*Wqc22#6##)C48AK#8#8GQJ#GH<WZ[]^Y_``cvvC:$*--..E#2#J%),E FKKNNAv{{1~~>>AA^d^jAkk   c c*Wqc22!T'<V]abbb
#k'Aq6qAAAOL!r&   )NNFrW   r   r   r   r   r   r   r   s   @r'   rL  rL    sU        9 9 9 9 9@@  @  @  @  @  @  @  @ r&   rL  c                   &     e Zd Z fdZddZ xZS )VitsDurationPredictorc                 D   t                                                       |j        }|j        }t	          j        |j                  | _        t	          j        |j	        |||dz            | _
        t	          j        ||j                  | _        t	          j        ||||dz            | _        t	          j        ||j                  | _        t	          j        |dd          | _        |j        dk    r't	          j        |j        |j	        d          | _        d S d S )Nr\   )r   epsr   r   )r   r   r'  "duration_predictor_filter_channelsr   r   rN  r   r   r   conv_1r-  layer_norm_epsnorm_1conv_2norm_2projr   r  )r   r   r   r6  r   s       r'   r   zVitsDurationPredictor.__init__&  s    ; Cz&"CDDi 2O[ZeijZjkkkl?8MNNNi+WbfgWghhhl?8MNNNIoq!44	(A--	&"?ASUVWWDIII .-r&   Nc                    t          j        |          }|,t          j        |          }||                     |          z   }|                     ||z            }t          j        |          }|                     |                    dd                                        dd          }|                     |          }|                     ||z            }t          j        |          }| 	                    |                    dd                                        dd          }|                     |          }| 
                    ||z            }||z  S r0  )r!   rZ  r  rs  reluru  r1  r   rv  rw  rx  )r   rA   r   r   s       r'   r   zVitsDurationPredictor.forward5  s.   f%%*"',/B"C"Cdii(;<<<FVl233F##V--a4455??2FFf%%Vl233F##V--a4455??2FFf%%6L011$$r&   r.   rl  r   s   @r'   rn  rn  %  sQ        X X X X X% % % % % % % %r&   rn  c                        e Zd ZdZdef fdZdej        dedefdZ		 	 	 	 dd
ej        de
ej                 de
ej                 de
ej                 dedeej        e
ej                 f         fdZd Zd Zd Z xZS )VitsAttentionz?Multi-headed attention with relative positional representation.r   c                    t                                                       |j        | _        |j        | _        |j        | _        |j        | _        | j        | j        z  | _	        | j	        dz  | _
        | j	        | j        z  | j        k    r t          d| j         d| j         d          t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _        | j        rt          j        t)          j        d| j        dz  dz   | j	                  | j
        z            | _        t          j        t)          j        d| j        dz  dz   | j	                  | j
        z            | _        d S d S )NrX  zIhidden_size must be divisible by num_attention_heads (got `hidden_size`: z and `num_attention_heads`: z).)r   r   r\   )r   r   r   rT  num_attention_heads	num_headsattention_dropoutr   window_sizehead_dimscalingr`   r   Linearuse_biask_projv_projq_projout_projrE  r!   r[  	emb_rel_k	emb_rel_vr   s     r'   r   zVitsAttention.__init__M  s   +3/!-$.8}d*MDN*t~==B\`\j B B/3~B B B  
 iV_UUUiV_UUUiV_UUU	$.$.vWWW 	r\%+a9IA9MPQ9QSWS`*a*adhdp*pqqDN\%+a9IA9MPQ9QSWS`*a*adhdp*pqqDNNN	r 	rr&   tensorseq_lenbszc                     |                     ||| j        | j                                      dd                                          S rJ  )viewr  r  r1  
contiguous)r   r  r  r  s       r'   _shapezVitsAttention._shapef  s<    {{3GGQQRSUVWWbbdddr&   NFr   key_value_statesattention_masklayer_head_maskoutput_attentionsr
  c                 ^	   |                                 \  }}}|                     |          | j        z  }	|                     |                     |          d|          }
|                     |                     |          d|          }|| j        z  d| j        f} |                     |	||          j        | }	 |
j        | }
 |j        | }|
                     d          }t          j
        |	|
                    dd                    }|                                 || j        z  ||fk    r2t          d|| j        z  ||f d|                                            | j        ^|                     | j        |          }t          j        |	|                    dd                    }|                     |          }||z  }||                                 |d||fk    r+t          d|d||f d|                                            |                    || j        ||          |z   }|                    || j        z  ||          }t$          j                            |d	          }||                                 | j        fk    r-t          d
| j        f d|                                            |                    dddd          |                    || j        ||          z  }|                    || j        z  ||          }|r=|                    || j        ||          }|                    || j        z  ||          }nd}t$          j                            || j        | j                  }t          j
        ||          }|                                 || j        z  || j        fk    r5t          d|| j        || j        f d|                                            | j        J|                     | j        |          }|                     |          }t          j        ||          }||z  }|                    || j        || j                  }|                    dd          }|                    ||| j                  }|                     |          }||fS )z#Input shape: Batch x Time x Channelr?   r   r\   z$Attention weights should be of size z	, but is NrY  z!Attention mask should be of size rX   z/Head mask for a single layer should be of size )ptrainingz `attn_output` should be of size )r\  r  r  r  r  r  r  r  r  r!   bmmr1  r`   r  _get_relative_embeddingsr  matmul'_relative_position_to_absolute_positionr   rN   rb   r   r  r  '_absolute_position_to_relative_positionr<  rT  r  )r   r   r  r  r  r  r  tgt_lenr   query_states
key_statesvalue_states
proj_shapesrc_lenattn_weightskey_relative_embeddingsrelative_logitsrel_pos_biasattn_weights_reshaped
attn_probsattn_outputvalue_relative_embeddingsrelative_weightss                          r'   r   zVitsAttention.forwardi  s    (,,..Wa {{=11DL@ [[]!;!;REE
{{4;;}#=#=r3GGDN*B>
Ct{{<#>>CZP$Z_j1
(|(*5//!$$yz/C/CAq/I/IJJ3#7'"JJJ*dn8LgW^7_ * * %%''* *  
 '&*&C&CDNT[&\&\##l<9P9Z9Z[]_a9b9bccOGGXXLL(L%""$$a'(BBB ta'8Rtt]k]p]p]r]rtt   (,,S$.'7SSVddL',,S4>-A7GTTL},,\r,BB&##%%$.)::: 1t~FW 1 1',,..1 1   +//2q!<<|?P?PQTVZVdfmov?w?wwL',,S4>-A7GTTL 	)
 %1$5$5c4>7T[$\$\!055cDN6JGU\]]LL$(!]**<4<RVR_*``
i
L99#"6!OOO)CRVR_3` ) )$$&&) )  
 '(,(E(EdnV](^(^%#KKJWW <(8:STTL<'K!&&sDNGT]SS!++Aq11 "))#wGGmmK00111r&   c           	          t          || j        dz   z
  d          }|dk    r&t          j                            |dd||ddg          }t          | j        dz   |z
  d          }|d|z  z   dz
  }|d d ||f         S )Nr   r   r\   )r_   r  r   rN   r<   )r   relative_embeddingsr@  
pad_lengthslice_start_positionslice_end_positions         r'   r  z&VitsAttention._get_relative_embeddings  s    4#3a#78!<<
>>"$-"3"34G!QPZ\fhiklIm"n"n"D$4q$8F#BAFF1AJ>B"111&:;M&M#MNNr&   c                 l   |                                 \  }}}t          j                            |g d          }|                    ||dz  |z  g          }t          j                            |d|dz
  ddg          }|                    ||dz   d|z  dz
  g          }|d d d ||dz
  d f         }|S )N)r   r   r   r   r   r   r\   r   r   r\  r   rN   r<   r  r   xbatch_headsr@  r   x_flatx_finals          r'   r  z5VitsAttention._relative_position_to_absolute_position  s    !"VQ Ma!3!3!344 fqj6&9:;;""6Avz1a+@AA ++{FQJF
QGHH!!!WfWfqjll23r&   c           	      d   |                                 \  }}}t          j                            |d|dz
  ddddg          }|                    ||d|z  dz
  z  g          }t          j                            ||dddg          }|                    ||d|z  g          d d d d dd f         }|S )Nr   r   r\   r  r  s          r'   r  z5VitsAttention._absolute_position_to_relative_position  s    !"VQ Ma!VaZAq!!<==fF
Q&?@AA ""6FAq!+<==++{FAJ?@@AAAqrrJr&   )NNNF)r   r   r   r    r   r   r!   Tensorr   r  r   boolr$   r   r  r  r  r   r   s   @r'   r|  r|  J  sJ       IIrz r r r r r r2eU\ eC ec e e e e 481526"'`2 `2|`2 #5<0`2 !.	`2
 "%,/`2  `2 
u|Xel33	4`2 `2 `2 `2DO O O  
 
 
 
 
 
 
r&   r|  c                   $     e Zd Z fdZd Z xZS )VitsFeedForwardc                    t                                                       t          j        |j        |j        |j                  | _        t          j        |j        |j        |j                  | _        t          j	        |j
                  | _        t          |j        t                    rt          |j                 | _        n|j        | _        |j        dk    r&|j        dz
  dz  }|j        dz  }||ddddg| _        d S d | _        d S )Nr   r\   r   )r   r   r   r   r   ffn_dimffn_kernel_sizers  rv  r   activation_dropoutr   
isinstance
hidden_actstrr	   act_fnr   )r   r   pad_left	pad_rightr   s       r'   r   zVitsFeedForward.__init__  s    i 2FNFDZ[[i0BFDZ[[z&";<<f'-- 	, !23DKK +DK!A%%.2q8H.!3I$iAq!<DLLLDLLLr&   c                    |                     ddd          }|                     ddd          }||z  }| j        %t          j                            || j                  }|                     |          }|                     |          }|                     |          }||z  }| j        %t          j                            || j                  }|                     |          }||z  }|                     ddd          }|S )Nr   r\   r   )	r=  r   r   rN   r<   rs  r  r   rv  )r   r   r   s      r'   r   zVitsFeedForward.forward  s    %--aA66#++Aq!44%4<#M--mT\JJMM22M22]33%4<#M--mT\JJMM22%4%--aA66r&   rl  r   s   @r'   r  r    sG                 $      r&   r  c            	       l     e Zd Zdef fdZ	 	 d
dej        dej        deej                 de	fd	Z
 xZS )VitsEncoderLayerr   c                 h   t                                                       t          |          | _        t	          j        |j                  | _        t	          j        |j	        |j
                  | _        t          |          | _        t	          j        |j	        |j
                  | _        d S )Nrp  )r   r   r|  	attentionr   r   hidden_dropoutr   r-  r   rt  
layer_normr  feed_forwardfinal_layer_normr   s     r'   r   zVitsEncoderLayer.__init__  s    &v..z&"788,v'9v?TUUU+F33 "V-?VEZ [ [ [r&   NFr   r   r  r  c                 :   |}|                      |||          \  }}|                     |          }|                     ||z             }|}|                     ||          }|                     |          }|                     ||z             }|f}|r||fz  }|S )N)r   r  r  )r  r   r  r  r  )r   r   r   r  r  r   r  rR   s           r'   r   zVitsEncoderLayer.forward#  s     !&*nn')/ '5 '
 '
#| ]33=(@AA ))-FF]33--h.FGG " 	'&Gr&   r  )r   r   r   r   r   r!   r  r"   r   r  r   r   r   s   @r'   r  r    s        \z \ \ \ \ \ \ 26"' | ' !.	
         r&   r  c                        e Zd Zdef fdZ	 	 	 	 ddej        dej        deej                 dee	         dee	         d	ee	         d
e
eef         fdZ xZS )VitsEncoderr   c                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        j	        | _	        d S )Nc                 .    g | ]}t                    S r%   )r  )r   r   r   s     r'   r   z(VitsEncoder.__init__.<locals>.<listcomp>E  s"    $g$g$g!%5f%=%=$g$g$gr&   F)
r   r   r   r   r   r   num_hidden_layerslayersgradient_checkpointing	layerdropr   s    `r'   r   zVitsEncoder.__init__B  sh    m$g$g$g$guVMeGfGf$g$g$ghh&+#)r&   Nr   r   r  r  output_hidden_statesreturn_dictr
  c                    |rdnd }|rdnd }|t          ||j                  }||z  }t                      pt          |           }	| j        D ]i}
|r||fz   }t
          j                            dd          }| j        o
|| j	        k     }|r|	r |
||||          }|d         }|rd}|r||d         fz   }j||z  }|r||fz   }|st          d |||fD                       S t          |||          S )Nr%   r   r   )r  r   r  )NNc              3      K   | ]}||V  	d S r.   r%   )r   vs     r'   	<genexpr>z&VitsEncoder.forward.<locals>.<genexpr>|  s(      mmq_`_l_l_l_l_lmmr&   )r*   r   r   )r   rW  r
   r   r  rK   randomuniformr  r  r$   r   )r   r   r   r  r  r  r  all_hidden_statesall_self_attentionssynced_gpusencoder_layerdropout_probabilityskip_the_layerlayer_outputss                 r'   r   zVitsEncoder.forwardI  s    #7@BBD$5?bb4 %7H[\\N%4022R6LT6R6R![ 	P 	PM# I$58H$H! #%)"3"3Aq"9"9!]U0Cdn0TN! 1[ 1 -!#1!-&7	! ! ! !.a 0 - ,  P&9]1=M<O&O#%4 	E 1]4D D 	nmm]4EGZ$[mmmmmm++*
 
 
 	
r&   )NNNN)r   r   r   r   r   r!   r"   r   r  r  r   r$   r   r   r   r   s   @r'   r  r  A  s        *z * * * * * * 26,0/3&*9
 9
(9
 '9
 !.	9

 $D>9
 'tn9
 d^9
 
uo%	&9
 9
 9
 9
 9
 9
 9
 9
r&   r  c                        e Zd ZdZdef fdZ	 	 	 	 ddej        dej        de	ej                 d	e	e
         d
e	e
         de	e
         deeej                 ef         fdZ xZS )VitsTextEncoderzs
    Transformer encoder that uses relative positional representation instead of absolute positional encoding.
    r   c                 $   t                                                       || _        t          j        |j        |j        |j                  | _        t          |          | _
        t          j        |j        |j        dz  d          | _        d S )Nr\   r   )r   )r   r   r   r   	Embedding
vocab_sizer   pad_token_idembed_tokensr  encoderr   r   projectr   s     r'   r   zVitsTextEncoder.__init__  sw    L):F<NPVPcdd"6**y!3V5E5IWXYYYr&   NT	input_idsr   r  r  r  r  r
  c                    |                      |          t          j        | j        j                  z  }|                     ||||||          }|s|d         n|j        }	|                     |	                    dd                                        dd          |z  }
t          j
        |
| j        j        d          \  }}|s|	||f|dd          z   }|S t          |	|||j        |j                  S )N)r   r   r  r  r  r  r   r   r\   rX   )r*   r+   r,   r   r   )r  r>  rj   r   r   r  r*   r  r1  r!   r   r   r)   r   r   )r   r  r   r  r  r  r  r   encoder_outputsr*   r   r+   r,   rR   s                 r'   r   zVitsTextEncoder.forward  s!    )))44tyAX7Y7YY,,'%)/!5# ' 
 
 7BhOA..Gh.88A>>??II!QOOR^^+0;udk>SYZ+[+[+[(( 	(+7JKo^_^`^`NaaGN$/# 3)7&1
 
 
 	
r&   )NNNT)r   r   r   r    r   r   r!   r  r"   r   r  r   r$   r)   r   r   r   s   @r'   r  r    s         Zz Z Z Z Z Z Z 26,0/3&*#
 #
<#
 '#
 !.	#

 $D>#
 'tn#
 d^#
 
uU\"$99	:#
 #
 #
 #
 #
 #
 #
 #
r&   r  c                   <    e Zd ZU eed<   dZdZdZdej	        fdZ
dS )VitsPreTrainedModelr   vitsr  Tmodulec                    | j         j        }t          |t          j                  rJ|j        j                            d|           |j         |j        j        	                                 dS dS t          |t          j
                  r?|j        j        	                                 |j        j                            d           dS t          |t          j        t          j        f          rt          j                            |j                   |j        Yt!          j        |j        |j        |j        d         z  z            }t          j                            |j        | |           dS dS t          |t          j                  rU|j        j                            d|           |j        +|j        j        |j                 	                                 dS dS t          |t0                    r{| j         j        rm| j         j        | j         j        z  }t          j                            |j        |dz             t          j                            |j        |dz             dS dS t          |t<                    r>|j        j        	                                 |j         j        	                                 dS dS )	zInitialize the weightsr@   )r   stdNrW   r   )r   r   rX  )r  )!r   initializer_ranger  r   r  r   datanormal_r   zero_r-  fill_r   r  initkaiming_normal_r>  rj   r&  r   r   uniform_r  padding_idxr|  r  r   r~  r  r  rB  rG  rH  )r   r  r  kr  s        r'   _init_weightsz!VitsPreTrainedModel._init_weights  s   k+fbi(( 	*M&&CS&999{& &&((((( '&-- 	*K""$$$M$$S)))))B,> ?@@ 	*G##FM222{&Ifmv/AFDVWXDY/YZ[[  a 88888 '& -- 	*M&&CS&999!-"6#56<<>>>>> .-.. 	*{& F;2dk6UU 0hnEEE 0hnEEEEEF F  566 	*!'')))!'')))))	* 	*r&   N)r   r   r   r   r#   base_model_prefixmain_input_namesupports_gradient_checkpointingr   Moduler  r%   r&   r'   r  r    sP         !O&*#*BI * * * * * *r&   r  z@
    The complete VITS model, for text-to-speech synthesis.
    c                        e Zd Zdef fdZd Ze	 	 	 	 	 	 	 ddeej	                 deej	                 dee
         dee         d	ee         d
ee         deej                 deee         ef         fd            Z xZS )	VitsModelr   c                 &   t                                          |           || _        t          |          | _        t          |          | _        t          |          | _        |j	        rt          |          | _        nt          |          | _        |j        dk    r$t          j        |j        |j                  | _        t%          |          | _        |j        | _        |j        | _        |j        | _        |                                  d S rD  )r   r   r   r  text_encoderr  r"  r   decoder"use_stochastic_duration_predictionrL  duration_predictorrn  num_speakersr   r  r   embed_speakerr   posterior_encoderspeaking_raterc  noise_scale_duration	post_initr   s     r'   r   zVitsModel.__init__  s       +F33-f55	"6**4 	D&Ef&M&MD##&;F&C&CD#""!#f.A6C`!a!aD "6f!=!= $1!-$*$?! 	r&   c                     | j         S r.   )r  )r   s    r'   get_encoderzVitsModel.get_encoder  s      r&   Nr  r  
speaker_idr  r  r  labelsr
  c                     ||n| j         j        }||n| j         j        }||n| j         j        }|t	          d          | j        j        j        j        }|)|	                    d          
                    |          }	n:t          j        |          	                    d          
                    |          }	| j         j        dk    r|d|cxk    r| j         j        k     s"n t          d| j         j        dz
   d          t          |t                     rt          j        d|| j        	          }|                     |          	                    d          }
nd}
|                     ||	||||
          }|s|d         n|j        }|                    dd          }|	                    dd          }	|s|d         n|j        }|s|d         n|j        }| j         j        r |                     ||	|
d| j                  }n|                     ||	|
          }d| j        z  }t          j        t          j        |          |	z  |z            }t          j        t          j        |ddg          d                                           }t          j!        |"                                |j        |j                  }|	                    d          |	                    d          k     }|	                    d          
                    |	j                  }t          j	        |	d          t          j	        |d          z  }|j#        \  }}}}t          j$        |d          %                    ||z  d          }t          j!        ||j        |j                  }|	                    d          |k     }|
                    |j                  %                    |||          }|tL          j'        (                    |g d          ddddf         z
  }|	                    d                              dd          |z  }t          j)        |*                    d          |                              dd          }t          j)        |*                    d          |                              dd          }|t          j+        |          t          j        |          z  | j,        z  z   }| -                    |||
d          }||z  } | .                    | |
          }!|!*                    d          }!|t_          j0        | j         j1                  z  }"|s|!|"| f|dd         z   }#|#S te          |!|"| |j3        |j4                  S )a  
        speaker_id (`int`, *optional*):
            Which speaker embedding to use. Only used for multispeaker models.
        labels (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`, *optional*):
            Float values of target spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
            computation.

        Example:

        ```python
        >>> from transformers import VitsTokenizer, VitsModel, set_seed
        >>> import torch

        >>> tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
        >>> model = VitsModel.from_pretrained("facebook/mms-tts-eng")

        >>> inputs = tokenizer(text="Hello - my dog is cute", return_tensors="pt")

        >>> set_seed(555)  # make deterministic

        >>> with torch.no_grad():
        ...     outputs = model(inputs["input_ids"])
        >>> outputs.waveform.shape
        torch.Size([1, 45824])
        ```
        Nz&Training of VITS is not supported yet.r?   r   r   z Set `speaker_id` in the range 0-.r   )r\  
fill_valuerV  )r  r   r  r  r  r  r\   T)rE   rc  rW   )rW  rV  )r   r   r   r   r   r   r   r  )r   r   r   r   r   )5r   r  r  use_return_dictNotImplementedErrorr  r  r   rW  	unsqueezer]  r!   	ones_liker  r`   r  r   fullrV  r  r*   r1  r+   r,   r  r  r  r  ceilrM   r`  re   longaranger_   ra   rc   r  r   rN   r<   r  squeezer   rc  r"  r  rK   prodr   r   r   r   )$r   r  r  r  r  r  r  r  
mask_dtypeinput_padding_maskspeaker_embeddingstext_encoder_outputr   r+   r,   rk  length_scaledurationpredicted_lengthsindicesoutput_padding_mask	attn_maskr?  r   output_lengthinput_lengthcum_durationvalid_indicespadded_indicesattnprior_latentsri  r   r   r   rR   s$                                       r'   r   zVitsModel.forward  s   J 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]%&NOOO&3:@
%!/!9!9"!=!=!@!@!L!L!&!;!;!E!Eb!I!I!L!LZ!X!X;#a''J,B
====T[%===== !cDKD\_`D`!c!c!cddd*c** ^"ZTjQUQ\]]]
!%!3!3J!?!?!I!I"!M!M!%"//+)/!5# 0 
 
 7Bl+A..GZGl%//155/99!Q??4?d)!,,EXEd<Gt1!44M`Mt;9 		j22"" 5 3  LL  22=BTVhiiLT//:ei558JJ\YZZ!OEIhA,G,GKKPPRR ,04466>O>U^o^vwww%//225F5P5PQR5S5SS1;;A>>AABTBZ[[ O$6::U_M`bd=e=ee	5>_2
A}l|Hb1166zL7PRSTT,}HN8?[[[))!,,|;%((99>>z<Yfgg&):):=J\J\J\)])]^_^_^_adbdad^d)ee''**44Q::YF l4<<??K@@JJ1aPP#l4<<??<OPPZZ[\^_``#e&6{&C&CeiPcFdFd&dgkgw&ww))M+>@R\`)aa 33<<-?@@##A&&,rwt{7Q/R/RR 	!1;?BUVWVXVXBYYGN-#-;*5
 
 
 	
r&   )NNNNNNN)r   r   r   r   r   r  r   r   r!   r  r   r  r"   r   r$   r   r   r   r   r   s   @r'   r  r    s"       z      4! ! !  -115$(,0/3&*.2~
 ~
EL)~
 !.~
 SM	~

 $D>~
 'tn~
 d^~
 *+~
 
uSz?*	+~
 ~
 ~
 ^~
 ~
 ~
 ~
 ~
r&   r  )Fr9   r:   r:   r:   )>r    r>  dataclassesr   typingr   r   r   numpyrK   r!   r   activationsr	   integrations.deepspeedr
   integrations.fsdpr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   r   r   configuration_vitsr   
get_loggerr   loggerr   r)   jitscriptr8   rU   rO   r
  r   r   r   r   r  r  r$  r4  rB  rL  rn  r|  r  r  r  r  r  r  __all__r%   r&   r'   <module>rJ     s      ! ! ! ! ! ! ' ' ' ' ' ' ' ' ' '            ! ! ! ! ! ! @ @ @ @ @ @ 7 7 7 7 7 7 B B B B B B 9 9 9 9 9 9 < < < < < < < < - - - - - - , , , , , , , , * * * * * * 
	H	%	%   
: : : : :k : :  :$   
: : : : :K : :  :    G  G  G  G TE% E% E%PM5 M5 M5 M5 M5%(/ M5 M5 M5`) ) ) ) )29 ) ) )&; ; ; ; ;29 ; ; ;|U U U U U") U U Up! ! ! ! !	 ! ! !6    	   &+% +% +% +% +%BI +% +% +%\(! (! (! (! (!29 (! (! (!V! ! ! ! !BI ! ! !$a  a  a  a  a bi a  a  a H"% "% "% "% "%BI "% "% "%Jc c c c cBI c c cL' ' ' ' 'bi ' ' 'T$ $ $ $ $1 $ $ $NA
 A
 A
 A
 A
") A
 A
 A
H/
 /
 /
 /
 /
bi /
 /
 /
d  *  *  *  *  */  *  *  *F   
]
 ]
 ]
 ]
 ]
# ]
 ]
 
]
@ -
.r&   