
    .`i7                        d Z ddlZddlZddlmZmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ  ee          ZdZ dZ!dZ"d  e#dd          D             Z$d  e#dd          D             Z%e e"e!gZ&e e"e!dZ'dZ(dZ)dede*dz  de*dz  de*dz  de+e*e	f         f
dZ,dede-e	e+e*e.f         f         fd Z/ G d! d"e          Z0dS )#z&Tokenizer for Grok-2 .tok.json format.    N)
CollectionSet)Path)AnyLiteraloverload)hf_hub_download)EntryNotFoundErrorHfHubHTTPErrorRepositoryNotFoundErrorRevisionNotFoundError)BatchEncoding)chat_template_utils)ChatCompletionMessageParam)init_logger   )TokenizerLikez<|pad|>z<|eos|>z<|separator|>c                     g | ]}d | d	S )z<|reserved_|> .0is     i/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/tokenizers/grok2.py
<listcomp>r      s$    CCC+a+++CCC          c                     g | ]}d | d	S )z	<|controlr   r   r   s     r   r   r       s$    @@@Q(1(((@@@r   i  )padsepeosa  {% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}zn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
model_pathrepo_idrevisiondownload_dirreturnc                    | dz  }|                                 rC|                    dd          5 }t          j        |          cd d d            S # 1 swxY w Y   |i S 	 t	          |d||          }nO# t
          t          t          f$ r i cY S t          $ r(}t          
                    d||           i cY d }~S d }~ww xY w	 t          |                              dd          5 }t          j        |          cd d d            S # 1 swxY w Y   d S # t          j        $ r'}t          
                    d|           i cY d }~S d }~wt          $ r'}t          
                    d|           i cY d }~S d }~ww xY w)	Nztokenizer_config.jsonrutf-8)encodingr$   filenamer%   	cache_dirzFailed to download tokenizer_config.json from %s. This may be due to a network or authentication issue. The default chat template will be used. Error: %szXFailed to parse tokenizer_config.json. The default chat template will be used. Error: %szWFailed to open tokenizer_config.json. The default chat template will be used. Error: %s)is_fileopenjsonloadr	   r   r   r
   r   loggerwarningr   JSONDecodeErrorOSError)r#   r$   r%   r&   config_pathfconfig_fileexcs           r   _maybe_load_tokenizer_configr;   9   su    66K  cG44 	 9Q<<	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	%,"	
 
 
 $%:<NO   			   @ 	
 	
 	
 						+##C'#:: 	 a9Q<<	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	    @	
 	
 	

 						   @	
 	
 	

 						s   AAA"A6 6C	CB=7C=C$D *D>D DD DD E='E	E=	E=E82E=8E=
vocab_filec           
         	 dd l n"# t          $ r}t          d          |d }~ww xY w|                     d          5 }t          j        |          }d d d            n# 1 swxY w Y   d |                    dg           D             }d |                    dg           D             }|                    d          d	k    rt          }n%t          d
|                    d                    |                    d|          }t          |           |||d}d|v r|d         |d<    j	        di |}d }	d|v rd |d         D             }	|	pt                      |_        t          |_        t                      dddt          dt          d         t          t                   z  dt          d         t           t                   z  dt"          t$                   ffd}
t'          j        |
|          |_        |xj        t          t                                                    z  c_        |xj        t          t.          t0          z             z  c_        ||fS )Nr   z1Grok-2 tokenizer requires the `tiktoken` package.rbc                 F    i | ]}t          |d                    |d         S )bytestoken)r@   r   items     r   
<dictcomp>z+_load_tiktoken_encoding.<locals>.<dictcomp>z   s:        	d7md7m  r   regular_tokensc                 p    i | ]3}t          |d                                        dd          |d         4S )r@   r*   replaceerrorsrA   r@   decoderB   s     r   rD   z+_load_tiktoken_encoding.<locals>.<dictcomp>~   sM        	d7m##GI#>>W  r   special_tokens
word_splitV1zUnknown word_split: pat_str)namerO   mergeable_ranksrL   
vocab_sizeexplicit_n_vocabdefault_allowed_specialc                 V    h | ]&}t          |                              d d          'S )r*   rG   rH   rJ   )r   
bytes_lists     r   	<setcomp>z*_load_tiktoken_encoding.<locals>.<setcomp>   sC     #
 #
 #
 *$$WY$??#
 #
 #
r   allallowed_specialdisallowed_specialtextrZ   r[   r'   c                    ~t          |t                    r
|| j        z  }j                            | ||d          S )Nr   rY   )
isinstanceset_default_allowed_specialEncodingencode)selfr\   rZ   r[   tiktokens       r   encode_patchedz/_load_tiktoken_encoding.<locals>.encode_patched   sS     os++ 	=t<<O ''+!	 ( 
 
 	
r   r   )rd   ImportErrorr0   r1   r2   get	PAT_STR_B
ValueErrorstrra   r_   r`   DEFAULT_CONTROL_TOKENS_control_tokensr   r   r   listint	functoolspartialrb   valuesCONTROL_TOKEN_TEXTSRESERVED_TOKEN_TEXTS)r<   r:   r8   	xtok_dictrQ   rL   rO   kwargs	tokenizerrT   re   rd   s              @r   _load_tiktoken_encodingrw   o   s   X X X XMNNTWWX 
		 !!IaLL	! ! ! ! ! ! ! ! ! ! ! ! ! ! ! MM"2B77  O MM"2B77  N
 }}\""d**O	l0K0KOOPPPmmIw//G J*(	 F y  %.|%<!"!!++F++I/3 I--#
 #
'(AB#
 #
 #

 *A)ICEEI& 6I 69UU?D
 
 

 !#c(2	

 $ENZ_<
 
c
 
 
 
 
 
" !(CCI&&#.D.K.K.M.M*N*NN&&&&#22+ + && n$$s!    
'"'A  A$'A$c                   `    e Zd Zedddddeez  dededz  dedz  dd f
d	            Zdd
dededededz  deee	f         dz  ddf fdZ
defdZedee         fd            Zedee         fd            Zedefd            Zedefd            Zedefd            Zedefd            Zedefd            Zedefd            Zedefd            Zdeeef         fdZdeeef         fdZdee         dedz  dee         fdZ	 	 	 d4d!ed"edz  dedz  d#edee         f
d$Zd5d%ee         ez  d&edefd'Zededefd(            Zedee         dee         fd)            Zdeee         z  deee         z  fd*Z	 d5d%ee         d&edee         fd+Zdee         defd,Z 	 	 	 	 d6d!eee         z  d-edz  d#ed"ededz  de!fd.Z"	 d7dedz  d/eeee	f                  dz  dedz  fd0Z#	 	 	 d8d1ee$         d/eeee	f                  dz  dedz  d2edeee         z  f
d3Z% xZ&S )9Grok2TokenizerFN)trust_remote_coder%   r&   path_or_repo_idrz   r%   r&   r'   c                V   |rt                               d           t          |          }|                                r|}|j        }	d }
na|                                r
|dz  }|}	d }
nCt          t          t          |          d||                    }|j        }	t          |          }
|                                st          d| d          t          |	|
||          } | |t          |          |
                    dd          |
                    d	          |
          S )Nz2Ignoring extra positional args for Grok2Tokenizer.ztokenizer.tok.jsonr,   z tokenizer.tok.json not found at .)r$   r%   r&   truncation_sideleftchat_template)r<   name_or_pathr~   r   init_kwargs)r3   
debug_oncer   r/   parentis_dirr	   rj   FileNotFoundErrorr;   rg   )clsr{   rz   r%   r&   argsru   pathr<   r#   r$   configs               r   from_pretrainedzGrok2Tokenizer.from_pretrained   sa     	TRSSSO$$<<>> 	+JJGG[[]] 	+ 44JJGG001%*	   J $*J/**G!!## 	V#$Tz$T$T$TUUU-%	
 
 
 s!_--"JJ'8&AA **_55
 
 
 	
r   )r   r<   r   r~   r   r   c                   t                                                       || _        || _        |pi | _        |pt
          | _        t          |          \  | _        | _	        i | _
        i | _        | j        j                                        D ]0\  }}|                    dd          }|| j
        |<   || j        |<   1| j	                                        D ]\  }}|| j
        |<   || j        |<   | j	                            t                     }	|	| j	                            t"                    }	|	| j	                            t$                    }	|	d}	|	| _        | j	                            t$          | j                  | _        | j	                            t"          | j                  | _        | j        | _        d S )Nr*   rG   rH   r   )super__init__r   _truncation_sider   DEFAULT_CHAT_TEMPLATE_chat_templaterw   
_tokenizer_special_tokens_token_to_id_id_to_token_mergeable_ranksitemsrK   rg   SEPPADEOS_bos_token_id_eos_token_id_pad_token_id_unk_token_id)rc   r<   r   r~   r   r   rA   token_id	token_strbos_token_id	__class__s             r   r   zGrok2Tokenizer.__init__   s    	( /&,"+D/D0G
0S0S--,.,.#?EEGG 	4 	4OE8WY??I+3Di(*3Dh''#399;; 	0 	0OE8'/De$*/Dh''+//44/33C88L/33C88LL)!155c4;MNN!155c4;MNN!/r   c                     dS )Nr   r   rc   s    r   num_special_tokens_to_addz(Grok2Tokenizer.num_special_tokens_to_add  s    qr   c                 N    t          | j                                                  S N)rm   r   keysr   s    r   all_special_tokensz!Grok2Tokenizer.all_special_tokens  s    D(--//000r   c                 N    t          | j                                                  S r   )rm   r   rq   r   s    r   all_special_idszGrok2Tokenizer.all_special_ids  s    D(//11222r   c                     | j         S r   )r   r   s    r   r   zGrok2Tokenizer.bos_token_id#      !!r   c                     | j         S r   )r   r   s    r   eos_token_idzGrok2Tokenizer.eos_token_id'  r   r   c                     | j         S r   )r   r   s    r   pad_token_idzGrok2Tokenizer.pad_token_id+  r   r   c                     dS )NFr   r   s    r   is_fastzGrok2Tokenizer.is_fast/  s    ur   c                     | j         j        S r   r   n_vocabr   s    r   rR   zGrok2Tokenizer.vocab_size3  s    &&r   c                      | j         j        dz
  S )Nr   r   r   s    r   max_token_idzGrok2Tokenizer.max_token_id7  s    &**r   c                     | j         S r   )r   r   s    r   r~   zGrok2Tokenizer.truncation_side;  s    $$r   c                 *    t          | j                  S r   )dictr   r   s    r   	get_vocabzGrok2Tokenizer.get_vocab?  s    D%&&&r   c                 *    t          | j                  S r   )r   r   r   s    r   get_added_vocabzGrok2Tokenizer.get_added_vocabB  s    D()))r   tokens
max_lengthc                 p    |t          |          |k    r|S | j        dk    r|| d          S |d |         S )Nr   )lenr~   )rc   r   r   s      r   _maybe_truncatezGrok2Tokenizer._maybe_truncateE  sK    V
!:!:M6)):+,,''kzk""r   Tr\   
truncationadd_special_tokensc                 l    ~| j                             |          }|r|                     ||          }|S r   )r   rb   r   )rc   r\   r   r   r   r   s         r   rb   zGrok2Tokenizer.encodeL  s>     ''-- 	>))&*==Fr   idsskip_special_tokensc                      t          |t                    r|g}|r fd|D             } j                            |          S )Nc                 J    g | ]}|j                                         v| S r   )r   rq   )r   r   rc   s     r   r   z)Grok2Tokenizer.decode.<locals>.<listcomp>]  s=       4#7#>#>#@#@@@ @@@r   )r^   rn   r   rK   )rc   r   r   s   `  r   rK   zGrok2Tokenizer.decodeY  sd    c3 	%C 	    #  C
 %%c***r   c                     d S r   r   rc   r   s     r   convert_tokens_to_idsz$Grok2Tokenizer.convert_tokens_to_idsd  s    9<r   c                     d S r   r   r   s     r   r   z$Grok2Tokenizer.convert_tokens_to_idsg  s    EHSr   c                      t          |t                    r  j                            | j                  S  fd|D             S )Nc                 P    g | ]"}j                             |j                  #S r   )r   rg   r   )r   rA   rc   s     r   r   z8Grok2Tokenizer.convert_tokens_to_ids.<locals>.<listcomp>m  s/    UUUU!%%eT-?@@UUUr   )r^   rj   r   rg   r   r   s   ` r   r   z$Grok2Tokenizer.convert_tokens_to_idsj  sL    fc"" 	E$((1CDDDUUUUfUUUUr   c                     g }|D ]N}|r|| j                                         v r |                    | j                            |d                     O|S )Nz<|unk|>)r   rq   appendr   rg   )rc   r   r   r   r   s        r   convert_ids_to_tokensz$Grok2Tokenizer.convert_ids_to_tokenso  sk      	F 	FH" x43G3N3N3P3P'P'PMM$+//)DDEEEEr   c                 Z    |                      |          }|                     |d          S )NF)r   )r   rK   )rc   r   	token_idss      r   convert_tokens_to_stringz'Grok2Tokenizer.convert_tokens_to_stringy  s+    ..v66	{{9%{@@@r   	text_pairc                 0    |t          d          t          |t                    r/ fd|D             }d |D             }t          ||d          S                      |          }dgt          |          z  }	t          ||	d          S )Nz.text_pair is not supported for Grok2Tokenizer.c                 B    g | ]}                     |           S )r   r   r   )rb   )r   rC   r   r   rc   r   s     r   r   z+Grok2Tokenizer.__call__.<locals>.<listcomp>  sJ     0 0 0  ))'9	   0 0 0r   c                 4    g | ]}d gt          |          z  S )r   )r   )r   r   s     r   r   z+Grok2Tokenizer.__call__.<locals>.<listcomp>  s$    #N#N#NsQC#c((N#N#N#Nr   )	input_idsattention_maskr   r   )NotImplementedErrorr^   rm   r   rb   r   )
rc   r\   r   r   r   r   input_ids_batchattention_mask_batchr   r   s
   `  ```    r   __call__zGrok2Tokenizer.__call__}  s      %&VWWWdD!! 	0 0 0 0 0 0 0 !0 0 0O $O#No#N#N#N  -AUVV   KK!!1	   
 
	 s9~~-9WWXXXr   toolsc                     ~|p| j         S r   )r   )rc   r   r   s      r   get_chat_templatez Grok2Tokenizer.get_chat_template  s     3 33r   messagestokenizec                     |                      ||          }|t          d          t          j        d|||d|}|r|                     |d          S |S )N)r   z?No chat template available. Provide `chat_template` explicitly.)conversationr   r   F)r   r   )r   ri   hf_chat_utilsapply_chat_templaterb   )rc   r   r   r   r   ru   templateprompts           r   r   z"Grok2Tokenizer.apply_chat_template  s     ))-u)EEQ   2 
!"
 
 	
 
  	A;;v%;@@@r   )NNT)F)NTFNr   )NNF)'__name__
__module____qualname__classmethodrj   r   boolr   r   r   r   rn   r   propertyrm   r   r   r   r   r   r   rR   r   r~   r   r   r   rb   rK   r   r   r   r   r   r   r   r   r   __classcell__)r   s   @r   ry   ry      s       
 #(##'1
 1
 1
t1
  	1

 *1
 Dj1
 
1
 1
 1
 [1
t .2'0 '0 '0 '0 	'0
 '0 Tz'0 #s(^d*'0 
'0 '0 '0 '0 '0 '0R3     1DI 1 1 1 X1 3c 3 3 3 X3 "c " " " X" "c " " " X" "c " " " X"     X 'C ' ' ' X' +c + + + X+ % % % % X%'4S> ' ' ' '*c3h * * * *#d3i #S4Z #DQTI # # # # #'!%#'  4K $J	
 ! 
c   	+ 	+$s)c/ 	+ 	+QT 	+ 	+ 	+ 	+ <C<C<<< X<HDIH$s)HHH XHVC$s)O Vd3i V V V V ;@ 937	c   AtCy AS A A A A !%#' !%!Y !YDIo!Y :!Y !	!Y
 !Y $J!Y 
!Y !Y !Y !YH OS4 4 4Z404T#s(^0Dt0K4	t4 4 4 4 .2$( 12 DcN#d* Tz	
  
tCy       r   ry   )1__doc__ro   r1   collections.abcr   r   pathlibr   typingr   r   r   huggingface_hubr	   huggingface_hub.utilsr
   r   r   r   transformersr   transformers.utilsr   r   vllm.entrypoints.chat_utilsr   vllm.loggerr   protocolr   r   r3   r   r   r   rangers   rr   DEFAULT_SPECIAL_TOKENSrk   r   rh   rj   r   r;   tuplern   rw   ry   r   r   r   <module>r     sr   - ,      + + + + + + + +       ) ) ) ) ) ) ) ) ) ) + + + + + +            ' & & & & & C C C C C C B B B B B B # # # # # # # # # # # #	X		CCUU1c]]CCC @@%%3--@@@ sC !$S==  "> 
33 4Z3 Dj	3
 *3 
#s(^3 3 3 3lH%H%
3S#XH% H% H% H%VA A A A A] A A A A Ar   