
    ri                      P    d dl Z d dlmZ d dlmZmZmZ d dlZ G d d      ZdZ	dZ
y)    N)cached_property)ListOptionalTuplec            
       $   e Zd ZdZ	 	 ddej
                  dedee   dee   fdZ	e
defd	       Ze
defd
       Ze
defd       Ze
defd       Ze
defd       Ze
defd       Ze
defd       Ze
defd       Zedefd       Zedee   fd       Zdedee   fdZdee   defdZdee   defdZe
dee   fd       Zdee   deee   eee      f   fdZdee   deee   eee      f   fdZdee   deee   eee      f   fdZy)	Tokenizerz-Simple wrapper around a tokenizers.Tokenizer.N	tokenizermultilingualtasklanguagec                    || _         |r|t        vr%t        d|ddj                  t              d      |t        vr%t        d|ddj                  t              d      | j                   j                  d|z        | _        | j                   j                  d|z        | _        || _        y d | _        d | _        d| _        y )N'z'' is not a valid task (accepted tasks: z, )z9' is not a valid language code (accepted language codes: z<|%s|>en)	r	   _TASKS
ValueErrorjoin_LANGUAGE_CODEStoken_to_idr   r   language_code)selfr	   r
   r   r   s        N/home/jaya/work/projects/FASTER-ASR-WHISPER/faster-asr/faster_asr/tokenizer.py__init__zTokenizer.__init__   s     #6! TYYv.0 
 . ?!;= 
 228d?CDI NN66x(7JKDM!)DDI DM!%D    returnc                 8    | j                   j                  d      S )Nz<|transcribe|>r	   r   r   s    r   
transcribezTokenizer.transcribe*   s    ~~))*:;;r   c                 8    | j                   j                  d      S )Nz<|translate|>r   r   s    r   	translatezTokenizer.translate.       ~~))/::r   c                 8    | j                   j                  d      S )Nz<|startoftranscript|>r   r   s    r   sotzTokenizer.sot2   s    ~~))*ABBr   c                 8    | j                   j                  d      S )Nz<|startoflm|>r   r   s    r   sot_lmzTokenizer.sot_lm6   r"   r   c                 8    | j                   j                  d      S )Nz<|startofprev|>r   r   s    r   sot_prevzTokenizer.sot_prev:   s    ~~))*;<<r   c                 8    | j                   j                  d      S )Nz<|endoftext|>r   r   s    r   eotzTokenizer.eot>   r"   r   c                 8    | j                   j                  d      S )Nz<|notimestamps|>r   r   s    r   no_timestampszTokenizer.no_timestampsB   s    ~~))*<==r   c                 r    | j                   j                  d      xs | j                   j                  d      S )Nz<|nospeech|>z<|nocaptions|>r   r   s    r   	no_speechzTokenizer.no_speechF   s2    ~~)).9 
T^^=W=W>
 	
r   c                      | j                   dz   S )N   )r,   r   s    r   timestamp_beginzTokenizer.timestamp_beginL   s    !!A%%r   c                     | j                   g}| j                  |j                  | j                         | j                  |j                  | j                         |S N)r$   r   appendr   )r   sequences     r   sot_sequencezTokenizer.sot_sequenceP   sF    HH:==$OODMM*99 OODII&r   textc                 P    | j                   j                  |d      j                  S )NF)add_special_tokens)r	   encodeids)r   r7   s     r   r:   zTokenizer.encode\   s"    ~~$$Te$DHHHr   tokensc                     |D cg c]  }|| j                   k  s| }}| j                  j                  |      S c c}w r3   )r*   r	   decode)r   r<   tokentext_tokenss       r   r>   zTokenizer.decode_   s;    *0EEDHH4DuEE~~$$[11 Fs   ;;c           	      n   g g}|D ]_  }|| j                   k\  r:d|| j                   z
  dz  dd}|j                  |       |j                  g        L|d   j                  |       a dj                  |D cg c]/  }t        |t              r|n| j
                  j                  |      1 c}      S c c}w )Nz<|g{Gz?z.2fz|> )r1   r4   r   
isinstancestrr	   r>   )r   r<   outputsr?   	timestampss         r   decode_with_timestampsz Tokenizer.decode_with_timestampsc   s    $ 	*E,,, %$*>*>">$!Fs K2N	y)r"""5)	* wwLSTq*Q$Q$..*?*?*BBT
 	
Ts   74B2c                    t        d      }|dj                         z  }t        d      }t        d |D              sJ | j	                  d      d   | j	                  d      d   h}|t        |      z   D ]S  }| j	                  |      | j	                  d|z         fD ])  }t        |      d	k(  s||v s|j                  |d          + U t        t        |            S )
u  
        Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
        annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.

        - ♪♪♪
        - ( SPEAKING FOREIGN LANGUAGE )
        - [DAVID] Hey there,

        keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
        u#   "#()*+/:;<=>@[\]^_`{|}~「」『』uK   << >> <<< >>> -- --- -( -[ (' (" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪u   ♩♪♫♬♭♮♯c              3   N   K   | ]  }d t        |      cxk  xr dk  nc   yw)i@&  i&  N)ord).0cs     r   	<genexpr>z.Tokenizer.non_speech_tokens.<locals>.<genexpr>   s!     E!6SV-v--Es   #%z -r   z ' r0   )	listsplitsetallr:   lenaddtuplesorted)r   symbolsmiscellaneousresultsymbolr<   s         r   non_speech_tokenszTokenizer.non_speech_tokensr   s     =>Z``b	
 34E}EEEE ++d#A&D(9!(<=] 33 	*FF#C&L) * v;!#v'>JJvay)*	* VF^$$r   c                 b    | j                   dv r| j                  |      S | j                  |      S )N>   jalomythzhyue)r   split_tokens_on_unicodesplit_tokens_on_spaces)r   r<   s     r   split_to_word_tokenszTokenizer.split_to_word_tokens   s7     !FF //77**622r   c                 z   | j                  |      }d}g }g }g }d}|D ]  }|j                  |       | j                  |      }		 |	j                  |      }
|
|z  }
|
|
t	        |      k  sM||
   |k(  sV|j                  |	       |j                  |       g }|t	        |	      z  } ||fS # t        $ r d }
Y _w xY w)Nu   �r   )rI   r4   indexr   rU   )r   r<   decoded_fullreplacement_charwordsword_tokenscurrent_tokensunicode_offsetr?   decodedreplacement_char_indexs              r   re   z!Tokenizer.split_tokens_on_unicode   s     226:# 	/E!!%(11.AG.)07G)H&&.8& &-&\):: !78<LLW%"">2!##g,.#	/& k!!  .)-&.s   B,,B:9B:c                    | j                  |      \  }}g }g }t        ||      D ]  \  }}|d   | j                  k\  }|j                  d      }	|j	                         t
        j                  v }
|s|	s|
st        |      dk(  r#|j                  |       |j                  |       |d   |z   |d<   |d   j                  |        ||fS )Nr   rP   rB   )
re   zipr*   
startswithstripstringpunctuationrU   r4   extend)r   r<   subwordssubword_tokens_listrl   rm   subwordsubword_tokensspecial
with_spacerw   s              r   rf   z Tokenizer.split_tokens_on_spaces   s     )-(D(DV(L%%'*85H'I 		7#G^$Q'4883G ++C0J!--/V-?-??K*s5zQW%"">2!"I/b	B&&~6		7 k!!r   )NN) __name__
__module____qualname____doc__
tokenizersr   boolr   rE   r   r   intr   r!   r$   r&   r(   r*   r,   r.   propertyr1   r   r6   r:   r>   rI   r   r]   rg   re   rf    r   r   r   r   	   sQ   7 #"&&''& & sm	&
 3-&< <C < < ;3 ; ; CS C C ; ; ; =# = = ;S ; ; >s > > 
3 
 

 & & & 	d3i 	 	I3 I49 I2T#Y 23 2
T#Y 
3 
 !%5: !% !%F	33i	3	tCy$tCy/)	*	3"3i"	tCy$tCy/)	*"@"3i"	tCy$tCy/)	*"r   r   )r   r!   )dafamarasazbabebgbnbobrbscacscydadeelr   eseteufafifofrglguhahawhehihrhthuhyidisitr_   jwkakkkmknkolalblnr`   ltlvmgmimkmlmnmrmsmtra   nenlnnnoocpaplpsptrorusasdsiskslsnsosqsrsusvswtatetgrb   tktltrttukuruzviyiyorc   rd   )rv   	functoolsr   typingr   r   r   r   r   r   r   r   r   r   <module>r      s0     % ( ( J" J"Z

er   