
    *`iN@                     n   d dl Z d dlZd dlmZ d dlmZ d dlZd dlm	Z	 d dl
mZmZmZ  e j        e          ZdZdeded	dfd
Z G d dee          Ze G d d                      Ze G d d                      Ze G d d                      Ze G d d                      Z G d d          ZdS )    N)	dataclass)Enum)Audio)
AudioChunkAudioURLChunkAudioURLType
   num_samplesmult_ofreturnc                 8    | |z  dk    sJ d| d|            d S )Nr   znum_samples= must be a multiple of mult_of= )r
   r   s     z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/mistral_common/tokens/tokenizers/audio.py_check_mult_ofr      s7     A%%%'Y+'Y'Yw'Y'Y%%%%%    c                       e Zd ZdZdZdZdS )TranscriptionFormatzTranscription format.

    Should be set by the tokenizer for correct encoding.

    Attributes:
    - INSTRUCT: The instruct format.
    - STREAMING: The streaming format.
    instruct	streamingN)__name__
__module____qualname____doc__INSTRUCT	STREAMINGr   r   r   r   r      s$          HIIIr   r   c                   :    e Zd ZU dZeed<   eed<   eed<   ddZdS )	AudioSpectrogramConfiga^  Configuration for generating an audio spectrogram.

    Attributes:
        num_mel_bins: Number of mel bins, typically 80 or 128.
        hop_length: Length of the overlapping windows for
            the STFT used to obtain the Mel Frequency coefficients, typically 160.
        window_size: Window size of the Fourier transform, typically 400.
    num_mel_bins
hop_lengthwindow_sizer   Nc                     | j         dk    sJ | j                     | j        dk    sJ | j                    | j        dk    sJ | j                    d S Nr   )r   r    r!   selfs    r   __post_init__z$AudioSpectrogramConfig.__post_init__9   sc     1$$$d&7$$$"""DO"""!###T%5#####r   r   N)r   r   r   r   int__annotations__r&   r   r   r   r   r   '   sW           OOO6 6 6 6 6 6r   r   c                      e Zd ZU dZeed<   eed<   eed<   dZedz  ed<   e	j
        Ze	ed<   dZedz  ed<   dZedz  ed	<   dZedz  ed
<   dZedz  ed<   ddZedefd            ZdedefdZedefd            Zedefd            Zedefd            Zedefd            Zedefd            Zedefd            ZdS )AudioConfiga`  Configuration for audio processing.

    Attributes:
        sampling_rate: Sampling rate of the audio.
        frame_rate: Number of frames per second accepted by the tokenizer model.
        encoding_config: Configuration for audio spectrogram.
        chunk_length_s: Whether to pad an audio into multiples of chunk_length_s seconds (optional).
    sampling_rate
frame_rateencoding_configNchunk_length_stranscription_formattranscription_delay_msstreaming_look_ahead_msstreaming_look_back_msstreaming_n_left_pad_tokensr   c                 r   | j         dk    sJ | j                     | j        dk    sJ | j                    | j        ?| j        dk    sJ | j                    | j        dk    sJ d| j         d| j                     | j        sh| j        J d| j        d            | j        J d| j        d            | j        J d| j        d            | j        J d| j        d            | j        r| j        J d| j        d	            | j        J d| j        d	            | j        J d| j        d	            | j        J d| j        d	            d
| j         z  }| j        dk    s
J d            | j        |z  dk    sJ d| j        d|            | j        J d| j        d            d S d S )Nr   z7chunk_length_s and sampling_rate must both be > 0, got z and zself.transcription_delay_ms=z must be None.zself.streaming_look_ahead_ms=zself.streaming_look_back_ms=!self.streaming_n_left_pad_tokens= must be set.     @@z*{self.transcription_delay_ms=} must be > 0z) must be a multiple of frame_duration_ms=zself.chunk_length_s=z cannot be set in streaming.)	r-   r,   r/   chunk_framesis_streamingr1   r2   r3   r4   )r%   frame_duration_mss     r   r&   zAudioConfig.__post_init__]   s   """DO"""!A%%%t'9%%%*&***D,?***$q(((x$J]xxdhdvxx )((   	r.668g4;V8g8g8g666/779iD<X9i9i9i777.668g4;V8g8g8g6663;;=q@`=q=q=q;;; 	f.::<jt?Z<j<j<j:::/;;=l@\=l=l=l;;;.::<jt?Z<j<j<j:::3??AtDDdAtAtAt??? & 8.2224`222.1BBaGGG\4.\\HY\\ HGG &..0e43F0e0e0e...	f 	f /.r   c                 ,    | j         t          j        k    S N)r0   r   r   r$   s    r   r:   zAudioConfig.is_streaming|   s    (,?,IIIr   	audio_lenc                     || j         j        z  dk    r%t          j        || j         j        z  dz
            }n|| j         j        z  }t          j        || j        z            S Nr      )r.   r    mathceilaudio_length_per_tok)r%   r>   s     r   num_audio_tokenszAudioConfig.num_audio_tokens   sb    t+66!;;	)d.B.M"MPQ"QRRII!T%9%DDIyT%>>???r   c                     | j         sJ d| j         d            | j        J d| j        d            t          | j        dz  | j        z            }|                     |          S )Nz1Can't call num_delay_tokens if self.is_streaming=.z;Can't call num_delay_tokens if self.transcription_delay_ms=r8   )r:   r1   r(   r,   rE   )r%   	delay_lens     r   num_delay_tokenszAudioConfig.num_delay_tokens   s     YY"YDDU"Y"Y"YYY *66Md.IMMM 766 3f<t?QQRR	$$Y///r   c                 n    | j         J d| j         d            t          | j         | j        z            S )z)Calculate the number of frames per chunk.Nz/Can't call chunk_frames if self.chunk_length_s=rG   )r/   r(   r,   r$   s    r   r9   zAudioConfig.chunk_frames   sC     "..0edNa0e0e0e...4&);;<<<r   c                 :    t          | j        | j        z            S r=   )r(   r,   r-   r$   s    r   raw_audio_length_per_tokz$AudioConfig.raw_audio_length_per_tok   s    4%8999r   c                 f    t          | j                  }|| j        j        z  }t	          |          S )z(Calculate the length of audio per token.)floatrL   r.   r    r(   )r%   downsample_factors     r   rD   z AudioConfig.audio_length_per_tok   s5     "$"?@@T1<<$%%%r   c                 Z    | j         sJ d| j         d            | j        dz   t          z   S )Nz3Can't call n_right_pad_tokens if self.is_streaming=rG   rA   )r:   rI   OFFLINE_STREAMING_BUFFER_TOKENSr$   s    r   n_right_pad_tokenszAudioConfig.n_right_pad_tokens   s>     [["[dFW"["["[[[  %)-LLLr   c                 x    | j         sJ d| j         d            | j        J d| j        d            | j        S )Nz2Can't call n_left_pad_tokens if self.is_streaming=rG   r6   r7   )r:   r4   r$   s    r   n_left_pad_tokenszAudioConfig.n_left_pad_tokens   sY     ZZ"ZTEV"Z"Z"ZZZ 
 /;;=p@`=p=p=p;;;//r   r'   )r   r   r   r   r(   r)   rN   r   r/   r   r   r0   r1   r2   r3   r4   r&   propertyboolr:   rE   rI   r9   rL   rD   rR   rT   r   r   r   r+   r+   ?   s%          ++++#'NEDL''' 1D0L-LLL ,0EDL/// -1UT\000+/EDL///.2t222f f f f> Jd J J J XJ@# @# @ @ @ @ 0# 0 0 0 X0 =c = = = X=
 :# : : : X: &c & & & X& MC M M M XM 03 0 0 0 X0 0 0r   r+   c                   4    e Zd ZU dZee         ed<   eed<   dS )AudioEncodingzEncapsulates the tokens and audio data for an audio chunk.

    Attributes:
        tokens: Text tokens corresponding to this audio chunk.
        audio: Original audio waveform data.
    tokensaudioN)r   r   r   r   listr(   r)   r   r   r   r   rX   rX      s5           ILLLLLr   rX   c                   D    e Zd ZU dZedz  ed<   edz  ed<   edz  ed<   dS )SpecialAudioIDsa  Special text tokens corresponding to audio token sequence.

    Attributes:
        audio: Token representing audio.
        begin_audio: Token representing the beginning of audio.
        streaming_pad: Token representing streaming pad of audio. Only relevant for steaming models.
    NrZ   begin_audiostreaming_pad)r   r   r   r   r(   r)   r   r   r   r]   r]      sN           :t:r   r]   c                   \   e Zd ZdZdededdfdZdej        de	d	e
dej        fd
Zde	d	e
dee	e	f         fdZde	de	de	fdZdee	         fdZde	dee	         fdZded	e
defdZdedefdZdedefdZdeez  defdZede	fd            Zede	fd            Zede	fd            ZdS )AudioEncodera	  Encodes audio chunks into a format suitable for further processing.

    Attributes:
        audio_config: Configuration for audio processing.
        encoding_config: Configuration for audio spectrogram.
        special_ids: Special tokens for audio encoding.
    audio_configspecial_idsr   Nc                 :    || _         |j        | _        || _        d S r=   )rb   r.   rc   )r%   rb   rc   s      r   __init__zAudioEncoder.__init__   s#    (+;&r   audio_arrayr,   is_online_streamingc                    | j         j        rG|                     |j        d         |          }t	          j        |d||j        d         z
  f          }n| j         j        r<|                     |j        d         |          \  }}t	          j        |||f          }ndt          | j	        t                    rJ|j        d         | j	        j        k     r/t	          j        |d| j	        j        |j        d         z
  f          }|S )zPad the audio array to the desired length.

        Args:
            audio_array: Audio data as a numpy array.
            sampling_rate: Sampling rate of the audio.

        Returns:
            Padded audio array.
        r   )rb   r/   next_multiple_of_chunk_framesshapenppadr:   _get_streaming_pad
isinstancer.   r   r!   )r%   rf   r,   rg   rj   left_pad	right_pads          r   rm   zAudioEncoder.pad   s    + 	m,0,N,N{O`acOdfs,t,t)&q2OR]RcdfRg2g.hiiKK+ 		m"&"9"9+:KB:OQd"e"eHi&x.CDDKKt+-CDD	m!"%(<(HHH &q$2F2RU`UfgiUj2j.kllKr   r
   c                    | j         j        }|rd}nKt          |||z  z
  |z            }| j         j        }t          ||z            }t	          ||           ||z  }| j         j        }t          ||z            }t	          ||           ||fS r#   )rb   rL   r(   rR   r   rT   )	r%   r
   rg   r   rq   _extra_right_pad_tokens_extra_right_pad_samples_extra_left_pad_tokensrp   s	            r   rn   zAudioEncoder._get_streaming_pad   s    #< 
	2 IIWg(=>'IJJI&*&7&J#'*75L+L'M'M$3W===11I "&!2!Dw!7788x)))""r   audio_array_lenc                     || j         j        k    sJ d|d| j         j                    | j         j        J d| j         j        d            t          j        || j         j        z            | j         j        z  S )zCalculate the next multiple of chunk frames.

        Args:
            audio_array_len: Length of the audio array.
            sampling_rate: Sampling rate of the audio.

        Returns:
            The next multiple of chunk frames.
        zExpected sampling_rate=z' to be self.audio_config.sampling_rate=NzMCan't call next_multiple_of_chunk_frames if self.audio_config.chunk_length_s=rG   )rb   r,   r/   rB   rC   r9   )r%   rv   r,   s      r   rj   z*AudioEncoder.next_multiple_of_chunk_frames  s      1 ????QQQt/@/NQQ @??  /;;_4;L;[___ <;; y4+<+IIJJTM^Mkkkr   c                     t          | j        j        t                    sJ d| j        j                    | j        j        J | j        j        | j        j        z   }| j        g|z  }|S )NzQAudio encoder must be spectrogram encoder, got self.audio_config.encoding_config=)ro   rb   r.   r   r1   rT   rI   r_   )r%   stream_pad_prefix_lenrY   s      r   _encode_streaming_tokensz%AudioEncoder._encode_streaming_tokens,  s    $+;=STT 	
 	
bd>O>_bb	
 	
T  7CCC !% 1 CdFWFh h$%(==r   signal_lengthc                     || j         j        z  dk    r%t          j        || j         j        z  dz
            }n|| j         j        z  }t          j        || j        j        z            }| j        g| j        g|z  z   }|S r@   )r.   r    rB   rC   rb   rD   begin_audio_tokenaudio_token)r%   r{   rE   rY   s       r   _encode_audio_tokensz!AudioEncoder._encode_audio_tokens8  s    4/::a?? Imd6J6U&UXY&YZZMM)T-A-LLM9]T5F5[%[\\()T-=,>AQ,QQr   rZ   c                 <   |                     | j        j                   |rlt          | j        j        j        dz  | j        j        j        z
            }|j        j        d         |z  dk    s J d|j        j        d         d|            | 	                    |j        | j        j        |          |_        | j        j
        t          j        k    r|                                 }n%|                     |j        j        d                   }t          ||          S )N   ri   r   zaudio.audio_array.shape[-1]=r   )rY   rZ   )resamplerb   r,   absr.   r!   r    rf   rk   rm   r0   r   r   rz   r   rX   )r%   rZ   rg   r   rY   s        r   encode_audiozAudioEncoder.encode_audioD  s"   t(6777 		 !1=ADDUDeDpp G $*2.8A===R5$*2.RRRR >== !HHU%68I8WYlmm15H5RRR2244FF..u/@/Fq/IJJF
 
 
 	
r   contentc                 b    t          j        |j                  }|                     |d          S NF)rg   )r   from_raw_audioinput_audior   )r%   r   rZ   s      r   _encode_audio_chunkz AudioEncoder._encode_audio_chunk^  s.    $W%899  E BBBr   c                 F   |                                 }|t          j        t          j        hv rt	          j        |j                  }nC|t          j        k    rt	          j        |j                  }nt	          j        |j                  }| 	                    |d          S r   )
get_url_typer   filefile_urir   	from_fileurlfrom_urlfrom_base64r   )r%   r   url_typerZ   s       r   _encode_audio_url_chunkz$AudioEncoder._encode_audio_url_chunkb  s    '')))<+@AAAOGK00EE)))N7;//EE%gk22E  E BBBr   c                     t          |t                    r|                     |          S t          |t                    r|                     |          S t          dt          |                     )zCall the encoder on an audio chunk or URL chunk.

        Args:
            content: Audio or URL chunk to encode.

        Returns:
            Encoded audio data and tokens.
        zUnsupported content type: )ro   r   r   r   r   
ValueErrortype)r%   r   s     r   __call__zAudioEncoder.__call__n  so     g}-- 	K//888,, 	K++G444I$w--IIJJJr   c                 b    | j         j        J d| j         j        d            | j         j        S )zGet the audio token.Nzself.special_ids.audio=r7   )rc   rZ   r$   s    r   r~   zAudioEncoder.audio_token~  s<     %113\d6F6L3\3\3\111%%r   c                 b    | j         j        J d| j         j        d            | j         j        S )zGet the begin audio token.Nzself.special_ids.begin_audio=r7   )rc   r^   r$   s    r   r}   zAudioEncoder.begin_audio_token  s<     +779hD<L<X9h9h9h777++r   c                 b    | j         j        J d| j         j        d            | j         j        S )zGet the streaming pad token.Nzself.special_ids.streaming_pad=r7   )rc   r_   r$   s    r   r_   zAudioEncoder.streaming_pad  s<     -99;ld>N>\;l;l;l999--r   )r   r   r   r   r+   r]   re   rl   ndarrayr(   rV   rm   tuplern   rj   r[   rz   r   r   rX   r   r   r   r   r   r   rU   r~   r}   r_   r   r   r   ra   ra      sG        '[ ' 'SW ' ' ' '
rz # TX ]_]g    4#c # #QVWZ\_W_Q` # # # #4lS lQT lY\ l l l l&
$s) 
 
 
 

# 
$s) 
 
 
 

% 
d 
} 
 
 
 
4C: C- C C C C
C} 
C 
C 
C 
C 
CK
] : K} K K K K  &S & & & X&
 ,3 , , , X,
 .s . . . X. . .r   ra   )loggingrB   dataclassesr   enumr   numpyrl   mistral_common.audior   &mistral_common.protocol.instruct.chunkr   r   r   	getLoggerr   loggerrQ   r(   r   strr   r   r+   rX   r]   ra   r   r   r   <module>r      s     ! ! ! ! ! !           & & & & & & Z Z Z Z Z Z Z Z Z Z		8	$	$ #% Z Zc Zd Z Z Z Z    #t    6 6 6 6 6 6 6 6. v0 v0 v0 v0 v0 v0 v0 v0r                w. w. w. w. w. w. w. w. w. w.r   