o
    Qdi                     @   s   d Z ddlZddlZddlZddlmZmZ ddlZddlZ			ddee
ef dedefd	d
Zdd ZdddZdd ZddddedefddZdS )aE  We use the PyAV library to decode the audio: https://github.com/PyAV-Org/PyAV

The advantage of PyAV is that it bundles the FFmpeg libraries so there is no additional
system dependencies. FFmpeg does not need to be installed on the system.

However, the API is quite low-level so we need to manipulate audio frames directly.
    N)BinaryIOUnion>  F
input_filesampling_ratesplit_stereoc                 C   s   t jjjd|s	dnd|d}t }d}t j| ddd-}|jd	d
}t|}t	|d}t
||}|D ]}| }	|	j}||	 q3W d   n1 sLw   Y  ~t  tj| |d}
|
tjd }
|r{|
d	dd }|
ddd }||fS |
S )a}  Decodes the audio.

    Args:
      input_file: Path to the input file or a file-like object.
      sampling_rate: Resample the audio to this sample rate.
      split_stereo: Return separate left and right channels.

    Returns:
      A float32 Numpy array.

      If `split_stereo` is enabled, the function returns a 2-tuple with the
      separated left and right channels.
    Zs16ZmonoZstereo)formatZlayoutrateNrignore)modeZmetadata_errorsr   )audioi  )dtypeg      @      )avr   	resamplerZAudioResamplerioBytesIOopendecode_ignore_invalid_frames_group_frames_resample_framesZ
to_ndarrayr   writegcZcollectnpZ
frombuffer	getbufferZastypeZfloat32)r   r   r   r   Z
raw_bufferr   	containerframesframearrayr   Zleft_channelZright_channel r"   8/home/jaya/work/projects/WHISPER/faster_whisper/audio.pydecode_audio   s6   


r$   c                 c   sF    t | }	 zt|V  W n ty   Y d S  tjjy!   Y qw qN)iternextStopIterationr   errorZInvalidDataError)r   iteratorr"   r"   r#   r   O   s   r   c                 c   s`    t jj }| D ]}d |_|| |d ur!|j|kr!| V  q	|jdkr.| V  d S d S )Nr   )r   r   fifoZ	AudioFifoZptsr   Zsamplesread)r   Znum_samplesr+   r    r"   r"   r#   r   [   s   


r   c                 c   s*    t | d gD ]
}||E d H  qd S r%   )	itertoolschainZresample)r   r   r    r"   r"   r#   r   i   s   r     )axislengthr1   c                C   s`   | j | |kr| jt||d} | j | |k r.dg| j }d|| j |  f||< t| |} | S )zQ
    Pad or trim the Mel features array to 3000, as expected by the encoder.
    )indicesr1   )r   r   r   )shapeZtakerangendimr   Zpad)r!   r2   r1   Z
pad_widthsr"   r"   r#   pad_or_trimo   s   r7   )r   Fr%   )r/   )__doc__r   r   r-   typingr   r   r   Znumpyr   strintboolr$   r   r   r   r7   r"   r"   r"   r#   <module>   s(    

<
