
     `i^                        d Z ddlZddlmZ ddlmZmZ ddlZddlm	Z	 ddl
m	c mZ ddlmZ ddlmZmZ dd	lmZ d
dlmZ e G d de                      Ze G d de                      Ze G d de                      Z G d de	j                  Z G d de	j                  Z G d de	j                  Z G d de	j                  Z G d de	j                  Z G d de	j                  Z G d de	j                  Z  G d  d!e	j                  Z!e G d" d#e                      Z" ed$%           G d& d'e"                      Z#d'd#gZ$dS )(zTransformers Xcodec model.    N)	dataclass)OptionalUnion   )PreTrainedAudioTokenizerBase)ModelOutputauto_docstring   )	AutoModel   )XcodecConfigc                   \    e Zd ZU dZdZeej                 ed<   dZ	eej
                 ed<   dS )XcodecOutputao  
    Args:
        audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
            Discrete code indices computed using `model.encode`.
        audio_values (`torch.FloatTensor` of shape `(batch_size, channels, num_samples)`, *optional*)
            Decoded audio values obtained using the decoder part of Xcodec.
    Naudio_codesaudio_values)__name__
__module____qualname____doc__r   r   torch
LongTensor__annotations__r   FloatTensor     ~/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/xcodec/modeling_xcodec.pyr   r      sN           /3K%*+22204L(5,-44444r   r   c                   8    e Zd ZU dZdZeej                 ed<   dS )XcodecEncoderOutputz
    Args:
        audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
            Discrete code indices computed using `model.encode`.
    Nr   )	r   r   r   r   r   r   r   r   r   r   r   r   r   r   -   s6           /3K%*+22222r   r   c                   8    e Zd ZU dZdZeej                 ed<   dS )XcodecDecoderOutputz
    Args:
        audio_values (`torch.FloatTensor`  of shape `(batch_size, channels, num_samples)`, *optional*):
            Decoded audio values obtained using the decoder part of Xcodec.
    Nr   )	r   r   r   r   r   r   r   r   r   r   r   r   r    r    8   s6           15L(5,-44444r   r    c                   X     e Zd ZdZdedededef fdZdej        dej        fd	Z	 xZ
S )
ResidualUnitzFResidual block for SemanticEncoder and SemanticDecoder used in Xcodec.configin_channelsout_channelsdilationc           
         t                                                       t          j                    | _        |j        dz
  dz  |z  }t          j        |||j        d||dd          | _        t          j        ||dd          | _        d S )Nr   r
   F)stridepaddingr&   groupsbias)r$   r%   kernel_sizer+   )	super__init__nnELU
activationunit_kernel_sizeConv1dconv1conv2)selfr#   r$   r%   r&   r)   	__class__s         r   r.   zResidualUnit.__init__F   s    &((+a/A5AY#	
 	
 	

 Y<l`ahmnnn


r   hidden_statereturnc                     |                      |          }|                     |          }|                      |          }|                     |          }||z   S N)r1   r4   r5   )r6   r8   output_tensors      r   forwardzResidualUnit.forwardV   sQ    55

=1166

=11m++r   )r   r   r   r   r   intr.   r   Tensorr=   __classcell__r7   s   @r   r"   r"   C   s        PPo| o# oS o\_ o o o o o o ,EL ,U\ , , , , , , , ,r   r"   c                   T     e Zd Zdedededef fdZdej        dej        fdZ xZ	S )	SemanticEncoderBlockr#   r$   r%   r(   c                 
   t                                                       t          j        fdj        D                       | _        |dk    rdnd|z  }|dz
  dz  }t          j        ||||d          | _        d S )Nc                 4    g | ]}t          |          S r   r"   ).0r&   r#   r$   s     r   
<listcomp>z1SemanticEncoderBlock.__init__.<locals>.<listcomp>b   s'    mmm(\&+{HEEmmmr   r   r   r
   Tr,   r(   r)   r+   )r-   r.   r/   
ModuleListblock_dilations	res_unitsr3   conv)r6   r#   r$   r%   r(   kernelr)   r7   s    ``    r   r.   zSemanticEncoderBlock.__init___   s    mmmmmV\Vlmmm
 

 kkF
A:!#Ik<VTZdkrvwww			r   r8   r9   c                 Z    | j         D ]} ||          }|                     |          }|S r;   )rL   rM   r6   r8   units      r   r=   zSemanticEncoderBlock.forwardj   s;    N 	. 	.D4--LLyy..r   
r   r   r   r   r>   r.   r   r?   r=   r@   rA   s   @r   rC   rC   ^   s        	x| 	x# 	xS 	xZ] 	x 	x 	x 	x 	x 	xEL U\        r   rC   c                   B     e Zd Z fdZdej        dej        fdZ xZS )SemanticEncoderc                    t                                                       t          |j                  t          |j                  k    rt          d          t          j        |j        |j        |j	        d|j	        dz  d          | _
        |j        }g }t          |j                  D ]?\  }}t          |j        |j        |         z            }|t          ||||          gz  }|}@t          j        |          | _        d S )Nz:Number of strides must match the number of channel_ratios.r   r
   Fr+   )r-   r.   lenstrideschannel_ratios
ValueErrorr/   r3   semantic_hidden_sizer,   rM   	enumerater>   rC   rJ   conv_blocks)r6   r#   r$   r]   ir(   r%   r7   s          r   r.   zSemanticEncoder.__init__r   s	   v~#f&;"<"<<<YZZZI''!#
 
 
	 1"6>22 	' 	'IAvv:V=RST=UUVVL0lTZ[[\\K&KK=55r   r8   r9   c                 Z    |                      |          }| j        D ]} ||          }|S r;   )rM   r]   r6   r8   blocks      r   r=   zSemanticEncoder.forward   s<    yy..% 	/ 	/E 5..LLr   r   r   r   r.   r   r?   r=   r@   rA   s   @r   rT   rT   q   s^        6 6 6 6 6,EL U\        r   rT   c                   T     e Zd Zdedededef fdZdej        dej        fdZ xZ	S )	SemanticDecoderBlockr#   r$   r%   r(   c           	      b   t                                                       |dk    r t          j        |dddd          | _        n:d|z  }|dz   dz  }|dz  dk    rdnd}t          j        |||||d          | _        t          j        fd	j        D                       | _        d S )
Nr   r   TrI   r
   r   FrV   c                 4    g | ]}t          |          S r   rF   )rG   r&   r#   r%   s     r   rH   z1SemanticDecoderBlock.__init__.<locals>.<listcomp>   s'    oooH\&,hGGooor   )	r-   r.   r/   r3   rM   ConvTranspose1drJ   rK   rL   )	r6   r#   r$   r%   r(   r,   r)   output_paddingr7   s	    ` `    r   r.   zSemanticDecoderBlock.__init__   s    Q;;	  DII f*Kza'G"(1*//QQqN*\;^c  DI oooooX^Xnooo
 
r   r8   r9   c                 Z    |                      |          }| j        D ]} ||          }|S r;   )rM   rL   rP   s      r   r=   zSemanticDecoderBlock.forward   s;    yy..N 	. 	.D4--LLr   rR   rA   s   @r   rd   rd      s        
| 
# 
S 
Z] 
 
 
 
 
 
.EL U\        r   rd   c                   B     e Zd Z fdZdej        dej        fdZ xZS )SemanticDecoderc                    t                                                       t          j        |j        t          |j        |j        d         z            |j        d|j        dz  d          | _        g }t          |j
                  D ]\  }}t          |j        |j        |         z            }|t          |j                  dz
  k     r&t          |j        |j        |dz            z            }n|j        }|t          ||||          gz  }t          j        |          | _        t          j        |j        |j        |j        d|j        dz  d          | _        d S )Nr   r   r
   F)r$   r%   r,   r(   r)   r+   )r(   r)   r+   )r-   r.   r/   r3   r[   r>   rY   r,   r4   r\   rX   rW   rd   rJ   r]   r5   )r6   r#   r]   r^   r(   r$   r%   r7   s          r   r.   zSemanticDecoder.__init__   sg   Y3V86;PQR;SSTT*&!+
 
 

 "6>22 	] 	]IAvf9F<QRS<TTUUKC-..233"6#>AVWX[\W\A]#]^^%:0lTZ[[\\KK=55Y''&!+
 
 



r   r8   r9   c                     |                      |          }| j        D ]} ||          }|                     |          }|S r;   )r4   r]   r5   r`   s      r   r=   zSemanticDecoder.forward   sM    zz,//% 	/ 	/E 5..LLzz,//r   rb   rA   s   @r   rk   rk      s^        
 
 
 
 
>EL U\        r   rk   c                   4     e Zd ZdZ fdZd Zd Zd Z xZS )XcodecEuclideanCodebookz!Codebook with Euclidean distance.c                    t                                                       t          j        |j        |j                  }|j        | _        |                     dt          j        dg                     |                     dt          j        |j                             |                     d|           |                     d|                                           d S )NinitedTcluster_sizeembed	embed_avg)	r-   r.   r   zeroscodebook_sizecodebook_dimregister_bufferr?   clone)r6   r#   rs   r7   s      r   r.   z XcodecEuclideanCodebook.__init__   s    F0&2EFF#1Xu|TF';';<<<^U[9M-N-NOOOWe,,,[%++--88888r   c                 0   | j                                         }|                    d                              dd          }|d|z  |z  z
  |                    d                              dd          z    }|                    d          j        }|S )Nr
   r   T)keepdimr   dim)rs   tpowsummaxindices)r6   hidden_statesrs   scaled_statesdist	embed_inds         r   quantizez XcodecEuclideanCodebook.quantize   s    
%))!,,00D0AA]!2U!::UYYq\\=M=MaY]=M=^=^^_HHH$$,	r   c                     |j         }|                    d|d         f          }|                     |          } |j        |d d          }|S )Nr|   )shapereshaper   view)r6   r   r   r   s       r   encodezXcodecEuclideanCodebook.encode   sR    #%--r59o>>MM-00	"INE#2#J/	r   c                 :    t          j        || j                  }|S r;   )F	embeddingrs   )r6   r   	quantizeds      r   decodezXcodecEuclideanCodebook.decode   s    K	4:66	r   )	r   r   r   r   r.   r   r   r   r@   rA   s   @r   ro   ro      sk        ++9 9 9 9 9          r   ro   c                   4     e Zd ZdZdef fdZd Zd Z xZS )XcodecVectorQuantizationzY
    Vector quantization implementation. Currently supports only euclidean distance.
    r#   c                 p    t                                                       t          |          | _        d S r;   )r-   r.   ro   codebookr6   r#   r7   s     r   r.   z!XcodecVectorQuantization.__init__   s,    /77r   c                 h    |                     ddd          }| j                            |          }|S Nr   r
   r   )permuter   r   )r6   r   embed_ins      r   r   zXcodecVectorQuantization.encode   s3    %--aA66=''66r   c                 h    | j                             |          }|                    ddd          }|S r   )r   r   r   )r6   r   r   s      r   r   zXcodecVectorQuantization.decode  s3    =''	22##Aq!,,r   )	r   r   r   r   r   r.   r   r   r@   rA   s   @r   r   r      sl         8| 8 8 8 8 8 8
        r   r   c                        e Zd ZdZdef fdZd ZddefdZdde	j
        de	j
        fd	Zd
e	j
        de	j
        fdZ xZS ) XcodecResidualVectorQuantizationzv
    Residual vector quantization implementation. Follows Algorithm 1 in https://huggingface.co/papers/2107.03312
    r#   c                     t                                                       t          j        fdt	          j                  D                       | _        j        | _        j        | _        j        | _        d S )Nc                 .    g | ]}t                    S r   )r   )rG   _r#   s     r   rH   z=XcodecResidualVectorQuantization.__init__.<locals>.<listcomp>  s"    (p(p(pa)A&)I)I(p(p(pr   )	r-   r.   r/   rJ   rangenum_quantizers
quantizers
frame_raterv   r   s    `r   r.   z)XcodecResidualVectorQuantization.__init__  ss    -(p(p(p(pSXY_YnSoSo(p(p(pqq +#1$3r   c                 J    t          j        | j                  | j        z  dz  S )zReturn bandwidth per quantizer.i  )mathlog2rv   r   )r6   s    r   get_bandwidth_per_quantizerz<XcodecResidualVectorQuantization.get_bandwidth_per_quantizer  s!    y+,,t>EEr   Nr9   c           	          |                                  }| j        }|8|dk    r2t          t          dt	          j        ||z                                }|S )z:Return num_quantizers based on specified target bandwidth.N        r   )r   r   r>   r   r   floor)r6   	bandwidthbw_per_qr   s       r    get_num_quantizers_for_bandwidthzAXcodecResidualVectorQuantization.get_num_quantizers_for_bandwidth  sU    3355, Y__ Q
9x3G(H(H!I!IJJNr   
embeddingsc                    |                      |          }|}g }| j        d|         D ]F}|                    |          }|                    |          }||z
  }|                    |           Gt          j        |          }	|	S )a  
        Encode the input tensor into discrete indices using RVQ, with the number of quantizers selected based on the given bandwidth.
        Each quantizer /codebook residually quantizes the input and returns the nearest indices in terms of Euclidian distance.
        N)r   r   r   r   appendr   stack)
r6   r   r   r   residualall_indices	quantizerr   r   out_indicess
             r   r   z'XcodecResidualVectorQuantization.encode$  s    
 >>yII.9 	( 	(I&&x00G!((11I)+Hw''''k+..r   codesc                     t          j        d|j                  }t          |          D ],\  }}| j        |         }|                    |          }||z   }-|S )z9Decode the given codes to their quantized representation.r   )device)r   tensorr   r\   r   r   )r6   r   quantized_outr^   r   r   r   s          r   r   z'XcodecResidualVectorQuantization.decode4  sd    S>>>#E** 	6 	6JAw*I!((11I)I5MMr   r;   )r   r   r   r   r   r.   r   r>   r   r   r?   r   r   r@   rA   s   @r   r   r     s         4| 4 4 4 4 4 4F F F #      %,     EL U\        r   r   c                   0    e Zd ZdZeZdZdZd Zd Z	d Z
dS )XcodecPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    xcodecinput_valuesc                 \   t          |t          j                  rT|j        j                            d| j        j                   |j         |j        j        	                                 dS dS t          |t          j
        t          j        f          r?|j        j        	                                 |j        j                            d           dS t          |t          j                  rt          j                            |j                   |j        Yt!          j        |j        |j        |j        d         z  z            }t          j                            |j        | |           dS dS |j        j        dk    r!|j        j                            d           dS t          |t          j                  r|                                 dS t          |t          j                  r#|j        j                            dd           dS t          |t8                    r|j                                        D ]g}t          |t          j                  rKt          j                            |j        d	           t          j                             |j        d           h|j!                                        D ]i}t          |t          j                  rKt          j                            |j        d	           t          j                             |j        d           hdS dS )
zInitialize the weightsr   )meanstdNg      ?r   )abSnake1dg{Gz?)r   )"
isinstancer/   Linearweightdatanormal_r#   initializer_ranger+   zero_	LayerNorm	GroupNormfill_r3   initkaiming_normal_r   sqrtr*   r$   r,   uniform_r7   r   alpharg   reset_parameters	EmbeddingXcodecModelacoustic_encodermodulestrunc_normal_	constant_acoustic_decoder)r6   modulek	submodules       r   _init_weightsz#XcodecPreTrainedModel._init_weightsI  s   fbi(( 	9M&&CT[5R&SSS{& &&((((( '&r| <== 	9K""$$$M$$S)))))	** 	9G##FM222{&Ifmv/AFDVWXDY/YZ[[  a 88888 '& &)33L##C((((( 233 	9##%%%%%-- 	9M&&CT&:::::,, 
	9 $4<<>> 9 9	i33 9G)))*:)EEEG%%ina888#4<<>> 9 9	i33 9G)))*:)EEEG%%ina888
	9 
	99 9r   c                    t           j        j        j        }t	          t           j        j        j        d          rt           j        j        j        j        } || j        j                    || j        j                   | j        j	        D ]I} ||j                   |j
        |j        |j        fD ]"} ||j                    ||j                   #J || j        j        d            || j        j        d           | j        j	        D ]O} ||j        d           |j
        |j        |j        fD ]&} ||j        d            ||j        d           'PdS )znApply weight norm in the acoustic encoder and decoder because the original checkpoint has weight norm applied.weight_normr   nameN)r   r/   utilsr   hasattrparametrizationsr   r4   r5   ra   	res_unit1	res_unit2	res_unit3r   conv_t1)r6   r   ra   res_units       r   apply_weight_normz'XcodecPreTrainedModel.apply_weight_normi  s   hn058>2MBB 	F(.9EKD)/000D)/000*0 	, 	,EK$$$"_eouO , ,HN+++HN++++, 	D)/h????D)/h????*0 	; 	;EKH5555"_eouO ; ;HN::::HN:::::;	; 	;r   c                 ^   | j         | j        fD ]}|                                D ]}	 t          j        j                            |d           n# t          t          f$ r Y nw xY wt          |d          r5d|j
        v r,t          j        j        j                            |dd           dS )z=Remove the weight norm from the acoustic encoder and decoder.r   r   r   T)leave_parametrizedN)r   r   r   r   r/   r   remove_weight_normrZ   AttributeErrorr   r   parametrizeremove_parametrizations)r6   r   ms      r   r   z(XcodecPreTrainedModel.remove_weight_norm  s    ,d.CD 	m 	mF^^%% m mHN55ah5GGGG"N3   D1011 mh!BT6T6THN.FFq(gkFlllm	m 	ms   &AA#"A#N)r   r   r   r   r   config_classbase_model_prefixmain_input_namer   r   r   r   r   r   r   r   >  sc         
  L $O9 9 9@; ; ;0	m 	m 	m 	m 	mr   r   z$The Xcodec neural audio codec model.)custom_introc                       e Zd Z fdZedej        fd            Zdej	        dej	        fdZ
e	 	 ddej        dee         d	ee         deej        ef         fd
            Ze	 ddej        d	ee         deej        ef         fd            Ze	 	 	 ddej        deej                 dee         d	ee         deeej        ej        f         ef         f
d            Z xZS )r   c                    t                                          |           || _        |j        dz  | _        t          j        |j                  }|j        | _	        |j
        | _        |                     | j                   t          |          | _        t          |          | _        t          j        |j                                                  | _        t)          j        |j        |j                  | _        t)          j        |j        |j        j                  | _        t)          j        |j        |j        j                  | _        t5          |          | _        |                                  d S )Nr
   )r-   r.   r#   
hop_lengthpadr   from_configacoustic_model_configencoderr   decoderr   _adjust_dac_decoderrT   encoder_semanticrk   decoder_semanticsemantic_model_configevalsemantic_modelr/   r   hidden_sizefcfc1fc2r   r   	post_init)r6   r#   acoustic_modelr7   s      r   r.   zXcodecModel.__init__  s+      $)".v/KLL . 6 . 6  !6777 / 7 7 / 7 7'3F4PQQVVXX)F.0BCC9V/1M1YZZ9V/1M1YZZ9&AA 	r   r  c                 p   |                                  D ]U}t          |t          j                  r9t          |j        t
                    r|j        d         n|j        }|dz  f|_        Vt          | d          r9t          | j        t          j	                  rt          j
                    | _        dS dS dS )z
        DAC implemented in Xcodec is slightly different from the HF version.
        DAC in Xcodec adjusts the output padding in every ConvTranspose1d in the decoder and removes
        the final `nn.Tanh` activation function.
        r   r
   tanhN)r   r   r/   rg   r(   tuplerh   r   r  TanhIdentity)r  r   r(   s      r   r  zXcodecModel._adjust_dac_decoder  s     oo'' 	6 	6F&""455 6-7u-M-M`q))SYS`)/!%7F## 	)
7<(I(I 	);==GLLL	) 	) 	) 	)r   r   r9   c                 L   |d d dd d f         }t          j        || j        | j        f          }t          j                    5  |                     |d          }|j        }d d d            n# 1 swxY w Y   t          j        |d          }|                    d          S )Nr   T)output_hidden_statesr   r}   )r   r   r   no_gradr  r   r   r   )r6   r   outputsr   stackeds        r   _extract_semantic_featuresz&XcodecModel._extract_semantic_features  s    #AAAq!!!G,u\DHdh+?@@]__ 	2 	2)),T)RRG#1M	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 +m333|||"""s   A//A36A3Nr   return_dictc           	         ||n| j         j        }|j        d         }|dk    rt          d|           || j         j        d         }n.|| j         j        vr t          d| d| j         j         d          |                     |                                          }|                     |                    dd                    }| 	                    |          }|j        d         |j        d         k    rT| 	                    t          j        |ddd	ddf         | j        | j        f                              d                    }t          j        ||gd
          }|                     |                    dd                                        dd          }| j                            ||          }	|	                    d	d          }	|s|	S t%          |	          S )ac  
        input_values (`torch.FloatTensor` of shape `(batch_size, channels, num_samples)`):
            Float values of the input audio waveform.
        bandwidth (`float`, *optional*):
            The target bandwidth in (kbps) supports only values in `config.target_bandwidths`.
            Defaults to the highest available bandwidth `4.0` kbps.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`].

        Returns:
            `torch.LongTensor` of shape `(batch_size, num_quantizers, codes_length)` containing the discrete encoded audio codes.
        Nr   zAudio must be mono, but got r|   z)This model doesn't support the bandwidth z. Select one of .r
   r   r}   )r#   r  r   rZ   target_bandwidthsr  detachr  	transposer   r   r   	unsqueezer   catr	  r   r   r   )
r6   r   r   r  channelse_semantic_input
e_semantic
e_acousticr   r   s
             r   r   zXcodecModel.encode  s   & &1%<kk$+BY%a(q==FHFFGGG5b9IIdk;;;wIwwW[WbWtwww    ::<HHOOQQ**+;+E+Ea+K+KLL
**<88
A*"21"555..qu\!!!Q'5JTXW[W_L`/a/a/k/klm/n/nooJY
J7Q???
WWZ11!Q7788BB1aHH
n++J	BB!++Aq11 	";///r   r   c                 L   ||n| j         j        }|                    dd          }| j                            |          }|                     |                    dd                                        dd          }|                     |          }|s|S t          |          S )a  
        audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`):
            Discrete code indices computed using `model.encode`.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`]

        Returns:
            Decoded audio values of shape `(batch_size, channels, num_samples)` obtained using the decoder part of
            Xcodec.
        Nr   r   r
   )r#   r  r  r   r   r  r   r    )r6   r   r  r   quantized_acousticr   s         r   r   zXcodecModel.decode  s      &1%<kk$+BY!++Aq11N))+66	!XXi&9&9!Q&?&?@@JJ1aPP,,-?@@ 	 "<000r   c                     ||n| j         j        }|j        d         }||                     ||d          }|                     ||          d         dd|f         }|s||fS t          ||          S )a  
        input_values (`torch.FloatTensor` of shape `(batch_size, channels, num_samples)`):
            The raw float values of the input audio waveform.
        audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`:
            Discrete code indices computed using `model.encode`.
        bandwidth (`float`, *optional*):
            Target bandwidth in kbps. Must be one of `config.target_bandwidths`. Defaults to the highest available bandwidth.
        bandwidth (`float`, *optional*):
            Target bandwidth in kbps. Must be one of `config.target_bandwidths`. Defaults to the highest available bandwidth.
        return_dict (`bool`, *optional*):
            Whether to return a [`XcodecOutput`] instead of a plain tuple.

        Returns:
            `XcodecOutput` or tuple `(audio_codes, audio_values)`:
            - `audio_codes` of shape `(batch_size, num_quantizers, codes_length)`: the quantized discrete codes.
            - `audio_values` of shape `(batch_size, channels, num_samples)`: the reconstructed audio waveform given the codes.

        Example:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoFeatureExtractor, XcodecModel

        >>> model_id = "hf-audio/xcodec-hubert-librispeech"
        >>> model = XcodecModel.from_pretrained(model_id)
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)

        >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> dataset = dataset.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
        >>> audio_sample = dataset[0]['audio']['array']

        >>> inputs = feature_extractor(raw_audio=audio_sample, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> audio_codes = outputs.audio_codes
        >>> audio_values = outputs.audio_values
        ```
        Nr|   F)r  r   .)r   r   )r#   r  r   r   r   r   )r6   r   r   r   r  lengthr   s          r   r=   zXcodecModel.forward  s    \ &1%<kk$+BY#B'++lI5+QQK{{;K{HHKCQXRXQXLY 	/..,OOOOr   )NNr;   )NNN)r   r   r   r.   staticmethodr/   Moduler  r   r   r  r	   r?   r   floatboolr   r   r   r    r   r  r   r=   r@   rA   s   @r   r   r     s           & )RY ) ) ) \)#u7H #UM^ # # # #  &*&*	/0 /0l/0 E?/0 d^	/0
 
u|00	1/0 /0 /0 ^/0b  '+1 1\1 d^1 
u|00	1	1 1 1 ^16  /3%)&*8P 8Pl8P el+8P E?	8P
 d^8P 
uU\5</0,>	?8P 8P 8P ^8P 8P 8P 8P 8Pr   r   )%r   r   dataclassesr   typingr   r   r   torch.nnr/   torch.nn.functional
functionalr   modeling_utilsr   r   r   r	   autor   configuration_xcodecr   r   r   r    r*  r"   rC   rT   rd   rk   ro   r   r   r   r   __all__r   r   r   <module>r6     sz   !    ! ! ! ! ! ! " " " " " " " "                 : : : : : : 0 0 0 0 0 0 0 0       . . . . . . 
5 
5 
5 
5 
5; 
5 
5 
5 3 3 3 3 3+ 3 3 3 5 5 5 5 5+ 5 5 5, , , , ,29 , , ,6    29   &    bi   <    29   >% % % % %bi % % %P    bi   @    ry   ,/ / / / /ry / / /d Km Km Km Km Km8 Km Km Km\ GHHHsP sP sP sP sP' sP sP IHsPl 1
2r   