
    .`ib              	          d dl mZ d dlmZ d dlmZ d dlZd dlmZ	 d dl
Z
d dlmZ 	 d dlZn# e$ r  ed          ZY nw xY w	 d dlmZ n)# e$ r!  ed                              d          ZY nw xY w G d	 d
ee          Ze G d d                      Z edej                  Z ed          Zde	j        ej                 e
j        z  dede	j        ej                 e
j        z  fdZde	j        ej                 dedede	j        ej                 fdZde	j        ej                 dedefdZ G d d          Z dS )    )	dataclass)Enum)LiteralN)PlaceholderModulelibrosascipysignalc                   "    e Zd ZdZdZdZdZdZdS )ChannelReductionz8Method to reduce multi-channel audio to target channels.meanfirstmaxsumN)__name__
__module____qualname____doc__MEANFIRSTMAXSUM     i/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/multimodal/audio.pyr   r      s(        BBDE
C
CCCr   r   c                   h    e Zd ZU dZdZedz  ed<   ej        Z	eed<   e
defd            ZdefdZdS )		AudioSpeca  Specification for target audio format.

    This dataclass defines the expected audio format for a model's feature
    extractor. It is used to normalize audio data before processing.

    Attributes:
        target_channels: Number of output channels. None means passthrough
            (no normalization). 1 = mono, 2 = stereo, etc.
        channel_reduction: Method to reduce channels when input has more
            channels than target. Only used when reducing channels.
       Ntarget_channelschannel_reductionreturnc                     | j         duS )z&Whether audio normalization is needed.Nr   selfs    r   needs_normalizationzAudioSpec.needs_normalization5   s     #4//r   c                 D    | j         dS d| j          d| j        j         dS )NzAudioSpec(passthrough)zAudioSpec(channels=z, reduction=))r   r   valuer#   s    r   __repr__zAudioSpec.__repr__:   sB    '++9$"6 9 9/59 9 9	
r   )r   r   r   r   r   int__annotations__r   r   r   propertyboolr%   strr)   r   r   r   r   r   $   s         
 
 #$OS4Z###*:*?'???0T 0 0 0 X0
# 
 
 
 
 
 
r   r   r   )r   r   r"   audiospecr    c                 :   |j         s| S | j        dk    r%|j        dk    r| S t          d|j         d          | j        dk    rt          d| j         d          | j        d         | j        d         k    r(t          | t          j                  r| j        n| j        } | j        d         }||j        k    r| S ||j        k     rt          d| d	|j                   t          | t          j                  }|j        dk    r|j	        t          j        k    r/|rt          j        | d
          n|                     d          }n|j	        t          j        k    r	| d         }n|j	        t          j        k    r4|rt          j        | d
          n|                     d          j        }n[|j	        t          j        k    r/|rt          j        | d
          n|                     d          }nt          d|j	                   |S | d|j                 S )aK  Normalize audio to the specified format.

    This function handles channel reduction for multi-channel audio,
    supporting both numpy arrays and torch tensors.

    Args:
        audio: Input audio data. Can be:
            - 1D array/tensor: (time,) - already mono
            - 2D array/tensor: (channels, time) - standard format from torchaudio
            - 2D array/tensor: (time, channels) - format from soundfile
              (will be auto-detected and transposed if time > channels)
        spec: AudioSpec defining the target format.

    Returns:
        Normalized audio in the same type as input (numpy or torch).
        For mono output (target_channels=1), returns 1D array/tensor.

    Raises:
        ValueError: If audio has unsupported dimensions or channel expansion
            is requested (e.g., mono to stereo).
    r   zCannot expand mono audio to z	 channels   zUnsupported audio shape: z. Expected 1D or 2D.r   zCannot expand z channels to )axis)dimzUnknown reduction method: N)r%   ndimr   
ValueErrorshape
isinstancenpndarrayTr   r   r   r   r   r   r   valuesr   r   )r/   r0   num_channelsis_numpyresults        r   normalize_audior@   H   s?   2 #  zQ1$$LW8LWWWXXX zQVU[VVVWWW {1~A&&%eRZ88Eeg;q>L t+++ d***N\NN8LNN
 
 	

 %,,Hq  !%5%:::/7NRWU++++UZZAZ=N=NFF#'7'===1XFF#'7';;;.6SRVE****EII!I<L<L<SFF#'7';;;.6LRVE****EII!I<L<LFFR$:PRRSSS +t++,,r   orig_sr	target_src                0    t          j        | ||          S )NrA   rB   )r   resampler/   rA   rB   s      r   resample_audio_librosarG      s     E7iHHHHr   c                    ||k    rt          j        | d||z            S ||k     rt          j        | ||z  d          S | S )Nr   )scipy_signalresample_polyrF   s      r   resample_audio_scipyrK      sS     )%Gy4HIII	9		)%g1EqIIILr   c                       e Zd ZdZ	 	 ddedz  ded         fdZdej        e	j
                 d	ed
ej        e	j
                 fdZdS )AudioResamplerz,Resample audio data to a target sample rate.Nr   rB   method)r   r   c                 "    || _         || _        d S )N)rB   rN   )r$   rB   rN   s      r   __init__zAudioResampler.__init__   s    
 #r   r/   rA   r    c                    | j         t          d          | j        dk    rt          ||| j                   S | j        dk    rt	          ||| j                   S t          d| j         d          )NzBAudio resampling is not supported when `target_sr` is not providedr   rD   r   zInvalid resampling method: z.. Supported methods are 'librosa' and 'scipy'.)rB   RuntimeErrorrN   rG   rK   r6   )r$   r/   rA   s      r   rE   zAudioResampler.resample   s     >!T   ;)##)w$.    [G##'w$.    ?dk ? ? ?  r   )Nr   )r   r   r   r   floatr   rP   nptNDArrayr9   floatingrE   r   r   r   rM   rM      s        66 #'.7 4< *+   {2;' 	
 
R[	!     r   rM   )!dataclassesr   enumr   typingr   numpyr9   numpy.typingrT   torchvllm.utils.import_utilsr   r   ImportErrorscipy.signalr	   rI   placeholder_attrr.   r   r   r   MONO_AUDIO_SPECPASSTHROUGH_AUDIO_SPECrU   rV   Tensorr@   rS   rG   rK   rM   r   r   r   <module>rd      s   " ! ! ! ! !                        5 5 5 5 5 5+NNNN + + +	**GGG+I''''''' I I I$$W-->>xHHLLLI    sD    
 
 
 
 
 
 
 
> )AAQAVWWW"4888 K-;r{#el2K-
K- 	[,K- K- K- K-fI;r{#I I 	I
 	[I I I I
;r{#
 
 	
 
 
 
! ! ! ! ! ! ! ! ! !s    - A A A #A10A1