
    Pi7                     T   d dl Z d dlmZ d dlmZ d dlmZ d dlmZ ddl	m
Z
  G d dej                  Z G d	 d
ej                  Z G d de          Zd ZddZ G d dej                  Z G d dej                  Z G d de          Zd Z G d dej                  ZdS )    N)List)RotaryPositionalEmbeddings)ResidualFSQ   TransformerBlockc            	       \     e Zd ZdZ	 ddedededef fdZdej        d	ej        fd
Z	 xZ
S )ISTFTa  
    Custom implementation of ISTFT since torch.istft doesn't allow custom padding (other than `center=True`) with
    windowing. This is because the NOLA (Nonzero Overlap Add) check fails at the edges.
    See issue: https://github.com/pytorch/pytorch/issues/62323
    Specifically, in the context of neural vocoding we are interested in "same" padding analogous to CNNs.
    The NOLA constraint is met as we trim padded samples anyway.

    Args:
        n_fft (int): Size of Fourier transform.
        hop_length (int): The distance between neighboring sliding window frames.
        win_length (int): The size of window frame and STFT filter.
        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
    samen_fft
hop_length
win_lengthpaddingc                     t                                                       |dvrt          d          || _        || _        || _        || _        t          j        |          }| 	                    d|           d S )N)centerr   #Padding must be 'center' or 'same'.window)
super__init__
ValueErrorr   r   r   r   torchhann_windowregister_buffer)selfr   r   r   r   r   	__class__s         p/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/neucodec/codec_decoder_vocos.pyr   zISTFT.__init__   s~     	,,,BCCC
$$":..Xv.....    specreturnc                    | j         dk    r.t          j        || j        | j        | j        | j        d          S | j         dk    r| j        | j        z
  dz  }nt          d          |                                dk    s
J d            |j	        \  }}}t          j
                            || j        d	d
          }|| j        ddddf         z  }|d	z
  | j        z  | j        z   }t          j        j                            |d	|fd	| j        fd	| j        f          dddd|| f         }| j                                                            d	|d                              d	d          }	t          j        j                            |	d	|fd	| j        fd	| j        f                                          ||          }
|
dk                                    sJ ||
z  }|S )a  
        Compute the Inverse Short Time Fourier Transform (ISTFT) of a complex spectrogram.

        Args:
            spec (Tensor): Input complex spectrogram of shape (B, N, T), where B is the batch size,
                            N is the number of frequency bins, and T is the number of time frames.

        Returns:
            Tensor: Reconstructed time-domain signal of shape (B, L), where L is the length of the output signal.
        r   T)r   r      r      zExpected a 3D tensor as inputr   backward)dimnormN)output_sizekernel_sizestrider   gdy=)r   r   istftr   r   r   r   r   r$   shapefftirfftnn
functionalfoldsquareexpand	transposesqueezeall)r   r   padBNTifftr&   y	window_sqwindow_envelopes              r   forwardzISTFT.forward'   s	    <8##;
    \V##?T_4:CCBCCCxxzzQ ?*1a ytTZQZHHdk$4-00 1u/$/AH$$K(DO,t'	 % 
 

 !!Q3t8
 K&&((//1b99CCAqII	(-22K(DO,t'	 3 
 

 '))CH  %',,.....r   r   __name__
__module____qualname____doc__intstrr   r   Tensorr>   __classcell__r   s   @r   r
   r
      s          LR/ //&)/7:/EH/ / / / / /7EL 7U\ 7 7 7 7 7 7 7 7r   r
   c                   6    e Zd ZdZdej        dej        fdZdS )FourierHeadz'Base class for inverse fourier modules.xr   c                      t          d          )aJ  
        Args:
            x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
                        L is the sequence length, and H denotes the model dimension.

        Returns:
            Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
        -Subclasses must implement the forward method.NotImplementedErrorr   rL   s     r   r>   zFourierHead.forwardd   s     ""QRRRr   NrA   rB   rC   rD   r   rG   r>    r   r   rK   rK   a   sH        11	S 	S%, 	S 	S 	S 	S 	S 	Sr   rK   c            	       Z     e Zd ZdZddedededef fdZdej        d	ej        fd
Z	 xZ
S )	ISTFTHeada  
    ISTFT Head module for predicting STFT complex coefficients.

    Args:
        dim (int): Hidden dimension of the model.
        n_fft (int): Size of Fourier transform.
        hop_length (int): The distance between neighboring sliding window frames, which should align with
                          the resolution of the input features.
        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
    r   r$   r   r   r   c                     t                                                       |dz   }t          j                            ||          | _        t          ||||          | _        d S )Nr!   )r   r   r   r   )r   r   r   r.   Linearoutr
   r*   )r   r$   r   r   r   out_dimr   s         r   r   zISTFTHead.__init__|   s\    !)8??300J5'
 
 



r   rL   r   c                    |                      |          }|                    dd          }|                    dd          \  }}t          j        |          }t          j        |d          }t          j        |          }t          j        |          }||d|z  z   z  }|                     |          }|	                    d          |fS )ay  
        Forward pass of the ISTFTHead module.

        Args:
            x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
                        L is the sequence length, and H denotes the model dimension.

        Returns:
            Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
        r   r!   r$   g      Y@)maxy              ?)
rX   r3   chunkr   expclipcossinr*   	unsqueeze)r   rL   x_predmagpr;   Saudios           r   r>   zISTFTHead.forward   s     !!!!Q''aQ''QinnjS
 
 
 IaLLIaLL 1rAv:

1q!!6))r   r?   r@   rI   s   @r   rU   rU   p   s        	 	
 
C 
 
 
s 
 
 
 
 
 
* *%, * * * * * * * *r   rU   c                 0    | t          j        |           z  S N)r   sigmoid)rL   s    r   nonlinearityrk      s    u}Qr       c                 H    t           j                            || dd          S )Nư>T)
num_groupsnum_channelsepsaffine)r   r.   	GroupNorm)in_channelsro   s     r   	Normalizeru      s*    8KT$    r   c                   0     e Zd Zdddd fd
ZddZ xZS )ResnetBlockNFi   )out_channelsconv_shortcuttemb_channelsc                   t                                                       || _        ||n|}|| _        || _        t          |          | _        t          j        	                    ||ddd          | _
        |dk    r%t          j                            ||          | _        t          |          | _        t          j                            |          | _        t          j        	                    ||ddd          | _        | j        | j        k    r]| j        r+t          j        	                    ||ddd          | _        d S t          j        	                    ||ddd          | _        d S d S )Nr"   r   )r'   r(   r   r   )r   r   rt   rx   use_conv_shortcutru   norm1r   r.   Conv1dconv1rW   	temb_projnorm2Dropoutdropoutconv2ry   nin_shortcut)r   rt   rx   ry   r   rz   r   s         r   r   zResnetBlock.__init__   sn    	&&2&:{{(!.{++
X__1Q % 
 

 1"X__]LIIDN|,,
x''00X__,Aa % 
 

 t000% %*X__1QPQ &5 & &""" %*HOO1QPQ %4 % %!!! 10r   c                    |}|                      |          }t          |          }|                     |          }|3||                     t          |                    d d d d d d f         z   }|                     |          }t          |          }|                     |          }|                     |          }| j        | j        k    r2| j	        r| 
                    |          }n|                     |          }||z   S ri   )r}   rk   r   r   r   r   r   rt   rx   r|   ry   r   )r   rL   tembhs       r   r>   zResnetBlock.forward   s    JJqMMOOJJqMMDNN<#5#566qqq!!!T47GHHAJJqMMOOLLOOJJqMMt000% )&&q))%%a((1ur   ri   )rA   rB   rC   r   r>   rH   rI   s   @r   rw   rw      sb        
 " " " " " " "H       r   rw   c                   6    e Zd ZdZdej        dej        fdZdS )BackbonezeBase class for the generator's backbone. It preserves the same temporal resolution across all layers.rL   r   c                      t          d          )ai  
        Args:
            x (Tensor): Input tensor of shape (B, C, L), where B is the batch size,
                        C denotes output features, and L is the sequence length.

        Returns:
            Tensor: Output of shape (B, L, H), where B is the batch size, L is the sequence length,
                    and H denotes the model dimension.
        rN   rO   )r   rL   kwargss      r   r>   zBackbone.forward   s     ""QRRRr   NrR   rS   r   r   r   r      sH        oo
S 
SEL 
S 
S 
S 
S 
S 
Sr   r   c                   H     e Zd ZdZd
 fd	Zdej        dej        fd	Z xZS )VocosBackbonea  
    Vocos backbone module built with ConvNeXt blocks. Supports additional conditioning with Adaptive Layer Normalization

    Args:
        input_channels (int): Number of input features channels.
        dim (int): Hidden dimension of the model.
        intermediate_dim (int): Intermediate dimension used in ConvNeXtBlock.
        num_layers (int): Number of ConvNeXtBlock layers.
        layer_scale_init_value (float, optional): Initial value for layer scaling. Defaults to `1 / num_layers`.
        adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm.
                                                None means non-conditional model. Defaults to None.
             @   c                 d  
 t                                                       t          j        dd          | _        d| _        }d}t          ||| j        |          t          ||| j        |          g}t          j        | | _        |}t          |          

fdt          |          D             }t          j        | | _        t          j        d	
          | _        t          ||| j        |          t          ||| j        |          g}	t          j        |	 | _        d S )N   r"   )r'   r   r   g?)rt   rx   rz   r   r[   c                 4    g | ]}t                     S ))r$   n_headsrotary_embedr   ).0_heads
hidden_dimtime_rotary_embeds     r   
<listcomp>z*VocosBackbone.__init__.<locals>.<listcomp>%  sB     
 
 
  <M  
 
 
r   rn   )rq   )r   r   r.   r~   embedtemb_chrw   
Sequential	prior_netr   rangetransformers	LayerNormfinal_layer_normpost_net)r   r   depthr   pos_meb_dimblock_inr   r   transformer_blocksr   r   r   s    ` `      @r   r   zVocosBackbone.__init__	  sy   Yz:1aPPP
 $%"l	   $%"l	  &
	 	26;GGG
 
 
 
 
 
 5\\	
 
 
 M+=> "ZT B B B$%"l	   $%"l	  %
 x0r   rL   r   c                    |                     dd          }|                     |          }|                     |          }|                     dd          }|                     |          }|                     dd          }|                     |          }|                     dd          }|                     |          }|S )Nr   r!   )r3   r   r   r   r   r   rQ   s     r   r>   zVocosBackbone.forward>  s    KK1JJqMMNN1KK1a  KK1MM!KK1!!!$$r   )r   r   r   r   )	rA   rB   rC   rD   r   r   rG   r>   rH   rI   s   @r   r   r      sn         31 31 31 31 31 31j
 
%, 
 
 
 
 
 
 
 
r   r   c                     t          | t          j                  rMt          j                            | j        d           t          j                            | j        d           d S d S )Ng{Gz?)stdr   )
isinstancer.   r~   inittrunc_normal_weight	constant_biasms    r   init_weightsr   K  s]    !RY %
ahD111
!&!$$$$$% %r   c                   p     e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 d fd	ZddZd Zd Zd Zd Zd Z	d Z
d Zd Z xZS )CodecDecoderVocosr   r   r   r   @  r            ?F @  c                 *   t                                                       || _        t          |g dd          | _        t          ||||          | _        t          || j        dz  | j        d          | _        | 	                                 d S )N)   r   r   r   r   r   r   r   r   )r$   levelsnum_quantizers)r   r   r   r   r   r   )r$   r   r   r   )
r   r   r   r   	quantizerr   backbonerU   headreset_parameters)r   r   r   r   r   r   vq_num_quantizersvq_dimvq_commit_weightvq_weight_initvq_full_commit_losscodebook_sizecodebook_dimr   s                r   r   zCodecDecoderVocos.__init__R  s     	$$777
 
 
 &!e
 
 
 /A%	
 
 
	 	r   Tc                 0   |du rb|                     ddd          }|                     |          \  }}|                     ddd          }|                     ddd          }||d fS |                     |          }|                     |          \  }}||fS )NTr   r!   r   )permuter   r   r   )r   rL   vqqr   s        r   r>   zCodecDecoderVocos.forwardu  s    ::		!Q""A>>!$$DAq		!Q""A		!Q""Aa:MM!yy||1!tr   c                 v    | j                                         | _         | j                             |          }|S ri   )r   evalvq2embr   r   rL   s      r   r   zCodecDecoderVocos.vq2emb  s1    ,,..N!!"%%r   c                 t    | j                                         | _         | j                                         }|S ri   )r   r   get_emb)r   embss     r   r   zCodecDecoderVocos.get_emb  s/    ,,..~%%''r   c                 N    |d d d d d f         }|                      |          }|S ri   modelr   s      r   inference_vqzCodecDecoderVocos.inference_vq  s-    tQQQzNJJqMMr   c                 h    |                      |          \  }}}}|                     |          }|d fS ri   )r   r   )r   rL   r   lossperps        r   inference_0zCodecDecoderVocos.inference_0  s4    >>!,,1dDJJqMM$wr   c                 4    |                      |          }|d fS ri   r   rQ   s     r   	inferencezCodecDecoderVocos.inference  s    JJqMM$wr   c                 6    d }|                      |           dS )z:Remove weight normalization module from all of the layers.c                 r    	 t           j        j                            |            d S # t          $ r Y d S w xY wri   )r   r.   utilsremove_weight_normr   r   s    r   _remove_weight_normzACodecDecoderVocos.remove_weight_norm.<locals>._remove_weight_norm  sG    11!44444   s   $( 
66Napply)r   r   s     r   r   z$CodecDecoderVocos.remove_weight_norm  s,    	 	 	 	

&'''''r   c                 6    d }|                      |           dS )z9Apply weight normalization module from all of the layers.c                     t          | t          j                  st          | t          j                  r&t          j        j                            |            d S d S ri   )r   r.   r~   ConvTranspose1dr   r   weight_normr   s    r   _apply_weight_normz?CodecDecoderVocos.apply_weight_norm.<locals>._apply_weight_norm  sR    !RY'' .:a9K+L+L .**1-----. .r   Nr   )r   r   s     r   apply_weight_normz#CodecDecoderVocos.apply_weight_norm  s,    	. 	. 	. 	

%&&&&&r   c                 :    |                      t                     d S ri   )r   r   )r   s    r   r   z"CodecDecoderVocos.reset_parameters  s    

<     r   )r   r   r   r   r   r   r   r   FFr   r   )T)rA   rB   rC   r   r>   r   r   r   r   r   r   r   r   rH   rI   s   @r   r   r   Q  s         !!  !  !  !  !  ! F     
  
  
  
  	( 	( 	(' ' '! ! ! ! ! ! !r   r   )rl   )r   torch.nnr.   typingr   torchtune.modulesr   vector_quantize_pytorchr   bs_roformer5r   Moduler
   rK   rU   rk   ru   rw   r   r   r   r   rS   r   r   <module>r      s                8 8 8 8 8 8 / / / / / / * * * * * *S S S S SBI S S SlS S S S S") S S S1* 1* 1* 1* 1* 1* 1* 1*h     
   9 9 9 9 9") 9 9 9xS S S S Sry S S S M M M M MH M M M`% % %^! ^! ^! ^! ^!	 ^! ^! ^! ^! ^!r   