
     `id                        d Z ddlmZ ddlmZmZ ddlZddlmZ ddlm	Z	 ddl
mZ dd	lmZmZ d
dlmZ  ej        e          Ze ed           G d de	                                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Ze G d de                      ZdgZdS )zPyTorch UnivNetModel model.    )	dataclass)OptionalUnionN)nn   )ModelOutput)PreTrainedModel)auto_docstringlogging   )UnivNetConfigz
    Output class for the [`UnivNetModel`], which includes the generated audio waveforms and the original unpadded
    lengths of those waveforms (so that the padding can be removed by [`UnivNetModel.batch_decode`]).
    )custom_introc                   \    e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dS )UnivNetModelOutputa"  
    waveforms (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
        Batched 1D (mono-channel) output audio waveforms.
    waveform_lengths (`torch.FloatTensor` of shape `(batch_size,)`):
        The batched length in samples of each unpadded waveform in `waveforms`.
    N	waveformswaveform_lengths)
__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/univnet/modeling_univnet.pyr   r      sO           .2Ix)*11148hu0188888r   r   c                   J     e Zd ZdZdef fdZdej        fdZd Z	d Z
 xZS )#UnivNetKernelPredictorResidualBlockz
    Implementation of the residual block for the kernel predictor network inside each location variable convolution
    block (LVCBlock).

    Parameters:
        config: (`UnivNetConfig`):
            Config for the `UnivNetModel` model.
    configc                    t                                                       |j        | _        |j        | _        |j        | _        |j        | _        | j        dz
  dz  }t          j
        | j                  | _        t          j        | j        | j        | j        |d          | _        t          j        | j        | j        | j        |d          | _        d S )Nr      Tpaddingbias)super__init__model_in_channelschannelskernel_predictor_conv_sizekernel_sizekernel_predictor_dropoutdropout_probleaky_relu_sloper   DropoutdropoutConv1dconv1conv2)selfr   r#   	__class__s      r   r&   z,UnivNetKernelPredictorResidualBlock.__init__<   s     	0!<"; & 7#a'A-z$"344Yt}dmT=MW^eijjj
Yt}dmT=MW^eijjj


r   hidden_statesc                 "   |}|                      |          }|                     |          }t          j                            || j                  }|                     |          }t          j                            || j                  }||z   S N)r/   r1   r   
functional
leaky_relur-   r2   )r3   r5   residuals      r   forwardz+UnivNetKernelPredictorResidualBlock.forwardL   sy     ]33

=1100@UVV

=1100@UVVx''r   c                     t           j        j        }t          t           j        j        d          rt           j        j        j        } || j                    || j                   d S Nweight_norm)r   utilsr>   hasattrparametrizationsr1   r2   r3   r>   s     r   apply_weight_normz5UnivNetKernelPredictorResidualBlock.apply_weight_normV   s[    h*28,m<< 	@(3?KDJDJr   c                     t           j                            | j                   t           j                            | j                   d S r7   )r   r?   remove_weight_normr1   r2   r3   s    r   rE   z6UnivNetKernelPredictorResidualBlock.remove_weight_norm^   s8    
##DJ///
##DJ/////r   )r   r   r   r   r   r&   r   r   r;   rC   rE   __classcell__r4   s   @r   r   r   2   s         kk k k k k k (U%6 ( ( ( (     0 0 0 0 0 0 0r   r   c                   X     e Zd ZdZ	 	 ddededef fdZdej        fd	Z	d
 Z
d Z xZS )UnivNetKernelPredictora  
    Implementation of the kernel predictor network which supplies the kernel and bias for the location variable
    convolutional layers (LVCs) in each UnivNet LVCBlock.

    Based on the KernelPredictor implementation in
    [maum-ai/univnet](https://github.com/maum-ai/univnet/blob/9bb2b54838bb6d7ce767131cc7b8b61198bc7558/model/lvcnet.py#L7).

    Parameters:
        config: (`UnivNetConfig`):
            Config for the `UnivNetModel` model.
        conv_kernel_size (`int`, *optional*, defaults to 3):
            The kernel size for the location variable convolutional layer kernels (convolutional weight tensor).
        conv_layers (`int`, *optional*, defaults to 4):
            The number of location variable convolutional layers to output kernels and biases for.
    r      r   conv_kernel_sizeconv_layersc                    t                                                       j        | _        dj        z  | _        || _        || _        | j        | j        z  | j        z  | j        z  | _        | j        | j        z  | _        j	        | _
        j        | _        j        | _        j        | _        j        | _        | j        dz
  dz  }t%          j        | j
        | j        ddd          | _        t%          j        fdt-          | j                  D                       | _        t%          j        | j        | j        | j        |d          | _        t%          j        | j        | j        | j        |d          | _        d S )Nr!   r      Tr"   c                 .    g | ]}t                    S r   )r   ).0_r   s     r   
<listcomp>z3UnivNetKernelPredictor.__init__.<locals>.<listcomp>   s#    't't'tXY(KF(S(S't't'tr   )r%   r&   model_hidden_channelsconv_in_channelsconv_out_channelsrL   rM   kernel_channelsbias_channelsnum_mel_binsresnet_in_channels kernel_predictor_hidden_channelsresnet_hidden_channelsr)   resnet_kernel_sizekernel_predictor_num_blocks
num_blocksr-   r   r0   
input_conv
ModuleListrange	resblockskernel_conv	bias_conv)r3   r   rL   rM   r#   r4   s    `   r   r&   zUnivNetKernelPredictor.__init__t   s    	 & <!"V%A!A 0& !D$::T=RRUYUee 	 "3d6FF"("5&,&M#"("C < & 7*Q.14)D$;T=XZ[efmqrrr't't't't]bcgcr]s]s't't'tuu9')=t?V`gnr
 
 
 ');T=T^elp
 
 
r   spectrogramc                    |j         \  }}}|                     |          }t          j                            || j                  }| j        D ]} ||          }|                     |          }|                     |          }|	                    || j
        | j        | j        | j        |                                          }	|	                    || j
        | j        |                                          }
|	|
fS )a  
        Maps a conditioning log-mel spectrogram to a tensor of convolutional kernels and biases, for use in location
        variable convolutional layers. Note that the input spectrogram should have shape (batch_size, input_channels,
        seq_length).

        Args:
            spectrogram (`torch.FloatTensor` of shape `(batch_size, input_channels, seq_length)`):
                Tensor containing the log-mel spectrograms.

        Returns:
            tuple[`torch.FloatTensor, `torch.FloatTensor`]: tuple of tensors where the first element is the tensor of
            location variable convolution kernels of shape `(batch_size, self.conv_layers, self.conv_in_channels,
            self.conv_out_channels, self.conv_kernel_size, seq_length)` and the second element is the tensor of
            location variable convolution biases of shape `(batch_size, self.conv_layers. self.conv_out_channels,
            seq_length)`.
        )shaper`   r   r8   r9   r-   rc   rd   re   viewrM   rU   rV   rL   
contiguous)r3   rf   
batch_sizerR   
seq_lengthr5   resblockkernel_hidden_statesbias_hidden_stateskernelsbiasess              r   r;   zUnivNetKernelPredictor.forward   s   " %0$5!
Az4400@UVV 	4 	4H$H]33MM#//>>!^^M:: '++!"!
 
 *,, 	 $(("	
 

 *,, 	 r   c                 .   t           j        j        }t          t           j        j        d          rt           j        j        j        } || j                   | j        D ]}|                                  || j                    || j	                   d S r=   )
r   r?   r>   r@   rA   r`   rc   rC   rd   re   r3   r>   layers      r   rC   z(UnivNetKernelPredictor.apply_weight_norm   s    h*28,m<< 	@(3?KDO$$$^ 	& 	&E##%%%%D$%%%DN#####r   c                    t           j                            | j                   | j        D ]}|                                 t           j                            | j                   t           j                            | j                   d S r7   )r   r?   rE   r`   rc   rd   re   r3   rt   s     r   rE   z)UnivNetKernelPredictor.remove_weight_norm   sw    
##DO444^ 	' 	'E$$&&&&
##D$4555
##DN33333r   )r   rK   r   r   r   r   r   intr&   r   r   r;   rC   rE   rG   rH   s   @r   rJ   rJ   c   s         & !"	$
 $
$
 $
 	$
 $
 $
 $
 $
 $
L,5#4 , , , ,\	$ 	$ 	$4 4 4 4 4 4 4r   rJ   c                        e Zd ZdZdededef fdZddZ	 	 dd	ej	        d
ej	        dej	        dedef
dZ
d Zd Z xZS )UnivNetLvcResidualBlocka  
    Implementation of the location variable convolution (LVC) residual block for the UnivNet residual network.

    Parameters:
        config: (`UnivNetConfig`):
            Config for the `UnivNetModel` model.
        kernel_size (`int`):
            The kernel size for the dilated 1D convolutional layer.
        dilation (`int`):
            The dilation for the dilated 1D convolutional layer.
    r   r*   dilationc                 "   t                                                       |j        | _        || _        || _        |j        | _        | j        | j        dz
  z  dz  }t          j        | j        | j        | j        || j                  | _	        d S )Nr   r!   )r#   r{   )
r%   r&   rT   hidden_channelsr*   r{   r-   r   r0   conv)r3   r   r*   r{   r#   r4   s        r   r&   z UnivNetLvcResidualBlock.__init__   s     	%;&  & 7-4#3a#78A=I  ]
 
 
			r      c                    |}t           j                            || j                  }|                     |          }t           j                            || j                  }|                     ||||          }t          j        |d d d | j        d d f                   t          j	        |d d | j        d d d f                   z  }||z   }|S N)hop_size)
r   r8   r9   r-   r~   location_variable_convolutionr   sigmoidr}   tanh)r3   r5   kernelr$   r   r:   s         r   r;   zUnivNetLvcResidualBlock.forward   s     00@UVV		-0000@UVV::=&RVai:jjmAAA7M9M7Mqqq4P&QRRUZU_!!!T133QQQ67V
 V
 
 !=0r   r   r5   r   r$   r   c                    |j         \  }}}|j         \  }}}	}
}|||z  k    rt          d||z   d| d          |t          |
dz
  dz            z  }t          j                            |||fdd          }|                    d|d|z  z   |          }||k     r$t          j                            |d|fdd          }|                    d||          }|d	d	d	d	d	d	d	d	d	|f         }|                    dd
          }|                    d
|
d          }t          j	        d||          }|
                    t          j                  }|                    d                              d          
                    t          j                  }||z   }|                                                    ||	d          }|S )u  
        Performs location-variable convolution operation on the input sequence (hidden_states) using the local
        convolution kernel. This was introduced in [LVCNet: Efficient Condition-Dependent Modeling Network for Waveform
        Generation](https://huggingface.co/papers/2102.10815) by Zhen Zheng, Jianzong Wang, Ning Cheng, and Jing Xiao.

        Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100.

        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, in_channels, in_length)`):
                The input sequence of shape (batch, in_channels, in_length).
            kernel (`torch.FloatTensor` of shape `(batch_size, in_channels, out_channels, kernel_size, kernel_length)`):
                The local convolution kernel of shape (batch, in_channels, out_channels, kernel_size, kernel_length).
            bias (`torch.FloatTensor` of shape `(batch_size, out_channels, kernel_length)`):
                The bias for the local convolution of shape (batch, out_channels, kernel_length).
            dilation (`int`, *optional*, defaults to 1):
                The dilation of convolution.
            hop_size (`int`, *optional*, defaults to 256):
                The hop_size of the conditioning sequence.
        Returns:
            `torch.FloatTensor`: the output sequence after performing local convolution with shape (batch_size,
            out_channels, in_length).
        z#Dim 2 of `hidden_states` should be z
) but got zX. Please check `hidden_states` or `kernel` and `hop_size` to make sure they are correct.r   r!   constantr   r   NrK   zbildsk,biokl->bolsd)memory_format)rh   
ValueErrorrx   r   r8   padunfold	transposer   einsumtochannels_last_3d	unsqueezerj   ri   )r3   r5   r   r$   r{   r   batchrR   	in_lengthout_channelsr*   kernel_lengthr#   output_hidden_statess                 r   r   z5UnivNetLvcResidualBlock.location_variable_convolution  s   < ,1q)=C\:q,]122]mh6N ] ]Zc ] ] ]  
 S+/Q!6777 ))-'79KZYZ[[%,,Q1w;0FQQhM--ma]JXYZZM%,,Q(CC%aaaAAAqqq)8)&;<%//155%,,QQ??  %|,A=RXYY366UE[6\\~~b!!++B//22AW2XX3d:3>>@@EEe\[]^^##r   c                     t           j        j        }t          t           j        j        d          rt           j        j        j        } || j                   d S r=   )r   r?   r>   r@   rA   r~   rB   s     r   rC   z)UnivNetLvcResidualBlock.apply_weight_normO  sI    h*28,m<< 	@(3?KDIr   c                 N    t           j                            | j                   d S r7   )r   r?   rE   r~   rF   s    r   rE   z*UnivNetLvcResidualBlock.remove_weight_normV  s     
##DI.....r   r   )r   r   )r   r   r   r   r   rx   r&   r;   r   r   r   rC   rE   rG   rH   s   @r   rz   rz      s        
 


 
 	
 
 
 
 
 
,   * ?$ ?$(?$ !?$ 	?$
 ?$ ?$ ?$ ?$ ?$B  / / / / / / /r   rz   c                   d     e Zd ZdZ	 ddededef fdZdej        dej        fd	Z	d
 Z
d Z xZS )UnivNetLvcBlocka#  
    Implementation of the location variable convolution (LVC) residual block of the UnivNet residual block. Includes a
    `UnivNetKernelPredictor` inside to predict the kernels and biases of the LVC layers.

    Based on LVCBlock in
    [maum-ai/univnet](https://github.com/maum-ai/univnet/blob/9bb2b54838bb6d7ce767131cc7b8b61198bc7558/model/lvcnet.py#L98)

    Parameters:
        config (`UnivNetConfig`):
            Config for the `UnivNetModel` model.
        layer_id (`int`):
            An integer corresponding to the index of the current LVC resnet block layer. This should be between 0 and
            `len(config.resblock_stride_sizes) - 1)` inclusive.
        lvc_hop_size (`int`, *optional*, defaults to 256):
            The hop size for the location variable convolutional layers.
    r   r   layer_idlvc_hop_sizec           	      z    t                                                       j         _        j        |          _        j        |          _        j        |          _	        | _
        j         _        t           j	                   _        t          j         j         j        d j        z   j         j        dz   j        dz  z    j        dz             _        t#           j         j                   _        t          j         fdt)           j                  D                        _        d S )Nr!   )strider#   output_paddingc                 R    g | ]#}t          j        j        |                   $S r   )rz   r*   	dilations)rQ   ir   r3   s     r   rS   z,UnivNetLvcBlock.__init__.<locals>.<listcomp>  s1    rrrVW$VT-=t~a?PQQrrrr   )r%   r&   rT   r}   resblock_kernel_sizesr*   resblock_stride_sizesr   resblock_dilation_sizesr   cond_hop_lengthr-   lenr_   r   ConvTranspose1d	convt_prerJ   kernel_predictorra   rb   rc   )r3   r   r   r   r4   s   ``  r   r&   zUnivNetLvcBlock.__init__l  s&    	%;!7A28<7A+ & 7dn--+  O;K1$t{Q6;?
 
 
 !7vt?OQUQ` a arrrrr[`aeap[q[qrrr
 
r   r5   rf   c           	      Z   t           j                            || j                  }|                     |          }|                     |          \  }}t          | j                  D ]C\  }}|d d |d d d d d d d d f         }|d d |d d d d f         } ||||| j                  }D|S r   )	r   r8   r9   r-   r   r   	enumeraterc   r   )	r3   r5   rf   rp   rq   r   rm   r   r$   s	            r   r;   zUnivNetLvcBlock.forward  s     00@UVV}55//<<$T^44 	a 	aKAxQQQ111aaaAAA-.F!!!Q111*%D$H]FD4K_```MMr   c                     t           j        j        }t          t           j        j        d          rt           j        j        j        } || j                   | j                                         | j        D ]}|                                 d S r=   )	r   r?   r>   r@   rA   r   r   rC   rc   rs   s      r   rC   z!UnivNetLvcBlock.apply_weight_norm  s    h*28,m<< 	@(3?KDN###//111^ 	& 	&E##%%%%	& 	&r   c                     t           j                            | j                   | j                                         | j        D ]}|                                 d S r7   )r   r?   rE   r   r   rc   rv   s     r   rE   z"UnivNetLvcBlock.remove_weight_norm  s_    
##DN33300222^ 	' 	'E$$&&&&	' 	'r   r   rw   rH   s   @r   r   r   Z  s         *  	
 

 
 	
 
 
 
 
 
<U%6 UEV    & & &' ' ' ' ' ' 'r   r   c                        e Zd ZU eed<   dZdef fdZe	 	 	 	 ddej	        de
ej	                 de
ej	                 de
ej                 de
e         d	eeej	                 ef         fd
            Zd Zd Zd Z xZS )UnivNetModelr   input_featuresc                 6   t                                                     t          j                  | _        j        | _        t          j        j        j	        dddd          | _
        t          j                  }d}g j        D ]}||z  }                    |           t          j        fdt          |          D                       | _        t          j        j	        dddd          | _        |                                  d S )N   r   r   reflect)r*   r   r#   padding_modec                 @    g | ]}t          ||                    S ))r   r   )r   )rQ   r   r   hop_lengthss     r   rS   z)UnivNetModel.__init__.<locals>.<listcomp>  sG          !,Q    r   )r#   r   )r%   r&   r   r   num_kernelsr-   r   r0   r'   rT   conv_prer   appendra   rb   rc   	conv_post	post_init)r3   r   
num_layers
hop_lengthr   r   r4   s    `   @r   r&   zUnivNetModel.__init__  s<      v;<< & 7	$("
 
 
 566

2 	+ 	+F#f,Jz****     z**  	
 	
 6#?Aq_hiii 	r   Nnoise_sequencepadding_mask	generatorreturn_dictreturnc                    ||n| j         j        }|                                dk    }|s|                    d          }|j        \  }}}	|0|                                dk    }
|
s|                    d          }n1||| j         j        f}t          j        |||j        |j	                  }|j        d         }|dk    r|dk    r|
                    |dd          }n#|dk    r|dk    r|
                    |dd          }||k    rt          d| d| d          |V|                                dk    r|                    d          }|j        d         }||k    rt          d	| d| d          |                    d
d          }|                    d
d          }|                     |          }| j        D ]} |||          }t          j                            || j                  }|                     |          }t          j        |          }|                    d          }d}|t          j        |d          }|s||f}|S t/          ||          S )a  
        noise_sequence (`torch.FloatTensor`, *optional*):
            Tensor containing a noise sequence of standard Gaussian noise. Can be batched and of shape `(batch_size,
            sequence_length, config.model_in_channels)`, or un-batched and of shape (sequence_length,
            config.model_in_channels)`. If not supplied, will be randomly generated.
        padding_mask (`torch.BoolTensor`, *optional*):
            Mask indicating which parts of each sequence are padded. Mask values are selected in `[0, 1]`:

            - 1 for tokens that are **not masked**
            - 0 for tokens that are **masked**

            The mask can be batched and of shape `(batch_size, sequence_length)` or un-batched and of shape
            `(sequence_length,)`.
        generator (`torch.Generator`, *optional*):
            A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
            deterministic.
            return_dict:
            Whether to return a [`~utils.ModelOutput`] subclass instead of a plain tuple.

        Example:

         ```python
         >>> from transformers import UnivNetFeatureExtractor, UnivNetModel
         >>> from datasets import load_dataset, Audio

         >>> model = UnivNetModel.from_pretrained("dg845/univnet-dev")
         >>> feature_extractor = UnivNetFeatureExtractor.from_pretrained("dg845/univnet-dev")

         >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         >>> # Resample the audio to the feature extractor's sampling rate.
         >>> ds = ds.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
         >>> inputs = feature_extractor(
         ...     ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt"
         ... )
         >>> audio = model(**inputs).waveforms
         >>> list(audio.shape)
         [1, 140288]
         ```
        Nr   r   )r   dtypedevicer   z&The batch size of `noise_sequence` is z+ and the batch size of `input_features` is z', but the two are expected to be equal.z$The batch size of `padding_mask` is r!   )dim)r   r   )r   use_return_dictr   r   rh   r'   r   randnr   r   repeatr   r   r   rc   r   r8   r9   r-   r   r   squeezesumr   )r3   r   r   r   r   r   spectrogram_batchedspectrogram_batch_sizespectrogram_lengthrR   noise_sequence_batchednoise_sequence_shapenoise_sequence_batch_sizepadding_mask_batch_sizer5   rm   waveformr   outputss                      r   r;   zUnivNetModel.forward  s   ` &1%<kk$+B] -0022a7" 	9+55a88N8F8L5 2A%%3%7%7%9%9Q%>") =!/!9!9!!<!< %;<NPTP[Pm#n "[$	AU^l^s  N %3$8$;!!A%%*Cq*H*H+223I1aPPNN&**/E/J/J+223LaQRSSN$(>>>h9R h h(>h h h  
 #!!Q&&+55a88&2&8&;#&*@@@ l;R l l,Bl l l   '00A66'11!Q77m44 	D 	DH$H]NCCMM00@UVV}55
=11 !((++  #$y1=== 	!12GN!-
 
 
 	
r   c                    t          |t          j        t          j        t          j        f          rR|j        j                            d| j        j	                   |j
        "|j
        j                                         dS dS dS )zInitialize the weights.g        )meanstdN)
isinstancer   Linearr0   r   weightdatanormal_r   initializer_ranger$   zero_)r3   modules     r   _init_weightszUnivNetModel._init_weightsQ  sz    fry")R5GHII 	)M&&CT[5R&SSS{& &&(((((	) 	)&&r   c                    t           j        j        }t          t           j        j        d          rt           j        j        j        } || j                   | j        D ]}|                                  || j                   d S r=   )	r   r?   r>   r@   rA   r   rc   rC   r   rs   s      r   rC   zUnivNetModel.apply_weight_normX  s    h*28,m<< 	@(3?KDM"""^ 	& 	&E##%%%%DN#####r   c                     t           j                            | j                   | j        D ]}|                                 t           j                            | j                   d S r7   )r   r?   rE   r   rc   r   rv   s     r   rE   zUnivNetModel.remove_weight_normb  s^    
##DM222^ 	' 	'E$$&&&&
##DN33333r   )NNNN)r   r   r   r   r   main_input_namer&   r
   r   r   r   	Generatorboolr   tupler   r;   r   rC   rE   rG   rH   s   @r   r   r     s/        &O%} % % % % % %N  7;48/3&*x
 x
)x
 !!23x
 u01	x

 EO,x
 d^x
 
uU&');;	<x
 x
 x
 ^x
t) ) )$ $ $4 4 4 4 4 4 4r   r   )r   dataclassesr   typingr   r   r   r   modeling_outputsr   modeling_utilsr	   r?   r
   r   configuration_univnetr   
get_loggerr   loggerr   Moduler   rJ   rz   r   r   __all__r   r   r   <module>r      s&   " ! ! ! ! ! ! ! " " " " " " " "        + + + + + + - - - - - - , , , , , , , , 0 0 0 0 0 0 
	H	%	%   	9 	9 	9 	9 	9 	9 	9  	9.0 .0 .0 .0 .0") .0 .0 .0bu4 u4 u4 u4 u4RY u4 u4 u4p|/ |/ |/ |/ |/bi |/ |/ |/~M' M' M' M' M'bi M' M' M'` {4 {4 {4 {4 {4? {4 {4 {4| 
r   