o
    Qdi                      @   sJ   d dl Z d dlmZ d dlmZmZmZ d dlZG dd dZdZ	dZ
dS )    N)cached_property)ListOptionalTuplec                
   @   s  e Zd ZdZ		d.dejdedee dee fddZ	e
d	efd
dZe
d	efddZe
d	efddZe
d	efddZe
d	efddZe
d	efddZe
d	efddZe
d	efddZed	efddZed	ee fddZded	ee fdd Zd!ee d	efd"d#Zd!ee d	efd$d%Ze
d	ee fd&d'Zd!ee d	eee eee  f fd(d)Zd!ee d	eee eee  f fd*d+Zd!ee d	eee eee  f fd,d-ZdS )/	Tokenizerz-Simple wrapper around a tokenizers.Tokenizer.N	tokenizermultilingualtasklanguagec                 C   s   || _ |r:|tvrtd|dtf |tvr#td|dtf | j d| | _| j d| | _|| _d S d | _d | _d| _d S )Nz-'%s' is not a valid task (accepted tasks: %s)z, z?'%s' is not a valid language code (accepted language codes: %s)z<|%s|>en)	r   _TASKS
ValueErrorjoin_LANGUAGE_CODEStoken_to_idr	   r
   language_code)selfr   r   r	   r
    r   </home/jaya/work/projects/WHISPER/faster_whisper/tokenizer.py__init__   s(   

zTokenizer.__init__returnc                 C      | j dS )Nz<|transcribe|>r   r   r   r   r   r   
transcribe*      zTokenizer.transcribec                 C   r   )Nz<|translate|>r   r   r   r   r   	translate.   r   zTokenizer.translatec                 C   r   )Nz<|startoftranscript|>r   r   r   r   r   sot2   r   zTokenizer.sotc                 C   r   )Nz<|startoflm|>r   r   r   r   r   sot_lm6   r   zTokenizer.sot_lmc                 C   r   )Nz<|startofprev|>r   r   r   r   r   sot_prev:   r   zTokenizer.sot_prevc                 C   r   )Nz<|endoftext|>r   r   r   r   r   eot>   r   zTokenizer.eotc                 C   r   )Nz<|notimestamps|>r   r   r   r   r   no_timestampsB   r   zTokenizer.no_timestampsc                 C   s   | j dp| j dS )Nz<|nospeech|>z<|nocaptions|>r   r   r   r   r   	no_speechF   s   zTokenizer.no_speechc                 C   s
   | j d S )N   )r!   r   r   r   r   timestamp_beginL   s   
zTokenizer.timestamp_beginc                 C   s8   | j g}| jd ur|| j | jd ur|| j |S )N)r   r
   appendr	   )r   sequencer   r   r   sot_sequenceP   s   

zTokenizer.sot_sequencetextc                 C   s   | j j|ddjS )NF)add_special_tokens)r   encodeids)r   r(   r   r   r   r*   \   s   zTokenizer.encodetokensc                    s    fdd|D } j |S )Nc                    s   g | ]	}| j k r|qS r   )r    ).0tokenr   r   r   
<listcomp>`   s    z$Tokenizer.decode.<locals>.<listcomp>)r   decode)r   r,   text_tokensr   r   r   r0   _   s   zTokenizer.decodec                    sn   g g}|D ]%}| j kr#d| j  d dd}|| |g  q|d | qd fdd|D S )	Nz<|g{Gz?z.2fz|> c                    s&   g | ]}t |tr|n j|qS r   )
isinstancestrr   r0   )r-   sr   r   r   r/   o   s   & z4Tokenizer.decode_with_timestamps.<locals>.<listcomp>)r$   r%   r   )r   r,   outputsr.   	timestampr   r   r   decode_with_timestampsc   s   

z Tokenizer.decode_with_timestampsc                 C   s   t d}|d 7 }td}tdd |D sJ | dd | dd h}|t | D ]"}| || d	| fD ]}t|d
ksG||v rN||d  q;q-tt|S )u  
        Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
        annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.

        - ♪♪♪
        - ( SPEAKING FOREIGN LANGUAGE )
        - [DAVID] Hey there,

        keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
        u#   "#()*+/:;<=>@[\]^_`{|}~「」『』uK   << >> <<< >>> -- --- -( -[ (' (" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪u   ♩♪♫♬♭♮♯c                 s   s,    | ]}d t |  kodkn  V  qdS )i@&  i&  N)ord)r-   cr   r   r   	<genexpr>   s   * z.Tokenizer.non_speech_tokens.<locals>.<genexpr>z -r   z ' r#   )	listsplitsetallr*   lenaddtuplesorted)r   symbolsZmiscellaneousresultsymbolr,   r   r   r   non_speech_tokensr   s    zTokenizer.non_speech_tokensc                 C   s   | j dv r
| |S | |S )N>   jathmyyuezhlo)r   split_tokens_on_unicodesplit_tokens_on_spaces)r   r,   r   r   r   split_to_word_tokens   s   


zTokenizer.split_to_word_tokensc              	   C   s   |  |}d}g }g }g }d}|D ]E}|| |  |}	z|	|}
|
|7 }
W n ty3   d }
Y nw |
d u sD|
t|k rV||
 |krV||	 || g }|t|	7 }q||fS )Nu   �r   )r9   r%   indexr   rB   )r   r,   Zdecoded_fullZreplacement_charwordsword_tokensZcurrent_tokensZunicode_offsetr.   decodedZreplacement_char_indexr   r   r   rP      s0   





z!Tokenizer.split_tokens_on_unicodec                 C   s   |  |\}}g }g }t||D ]=\}}|d | jk}|d}	| tjv }
|s3|	s3|
s3t|dkr>|| || q|d | |d< |d 	| q||fS )Nr   r=   r2   )
rP   zipr    
startswithstripstringpunctuationrB   r%   extend)r   r,   ZsubwordsZsubword_tokens_listrT   rU   ZsubwordZsubword_tokensspecialZ
with_spacer[   r   r   r   rQ      s   

z Tokenizer.split_tokens_on_spaces)NN) __name__
__module____qualname____doc__
tokenizersr   boolr   r5   r   r   intr   r   r   r   r   r    r!   r"   propertyr$   r   r'   r*   r0   r9   r   rI   rR   rP   rQ   r   r   r   r   r   	   sj    
#

 r   )r   r   )dafZamarasZazbabeZbgZbnZbobrbsZcaZcsZcydaZdeZelr   ZesZetZeuZfafiZfofrZglZguZhaZhawZhehiZhrZhtZhuZhyidisitrJ   ZjwZkakkZkmZknZkoZlaZlblnrO   ltZlvZmgmiZmkZmlmnZmrZmsZmtrL   nenlnnnoZocZpaplZpsZptZroZrusaZsdsiZskslZsnsoZsqsrsuZsvswZtateZtgrK   ZtkZtltrttZukZurZuzZviZyiZyorN   rM   )rZ   	functoolsr   typingr   r   r   rb   r   r   r   r   r   r   r   <module>   s     N