
    Pi                         d dl mZmZmZmZmZmZ d dlmZm	Z	m
Z
 d dlmZ d dlmZ d dlmZ dddd	d
ddddddddZdZ edd          D ]Zedk    r	eede d<   edz  ZdZ G d dee          ZdS )    )AnyDictListMappingOptionalTuple)MessagePromptTemplatetruncate)ModelTokenizer)	Transform)GPT2BaseTokenizer頇 i i i i i i i i i i i )z<|dummy_0|><|endoftext|>z<|fim_prefix|>z<|fim_middle|>z<|fim_suffix|>z<|dummy_1|>z<|dummy_2|>z<|dummy_3|><|im_start|>
<|im_end|>
<|im_sep|>z<|endofprompt|>   i i  z<|dummy_z|>   zs(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+c                   b   e Zd ZdZ	 	 	 	 	 ddededeeeef                  dee         dee         f
dZ	e
d	             Z	 ddedededee         fdZddee         dedefdZdedefdZddddee         dededeee         ee         f         fdZdeeef         deeef         fdZdS )Phi4TokenizeraQ  
    TikToken tokenizer configured with Phi4 (14B) special tokens.

    Args:
        merges_path (str): Path to merges.txt file.
        vocab_path (str): Path to vocab.json file.
        special_tokens (Optional[Dict[str, int]]): Mapping containing special text tokens and
            their registered token IDs. If left as None, this will be set to the canonical
            Phi4 special tokens.
        max_seq_len (Optional[int]): Max sequence length to truncate tokens to.
        prompt_template (Optional[PromptTemplate]): Template used to format the messages based on their role.
    Nmerges_path
vocab_pathspecial_tokensmax_seq_lenprompt_templatec                    |pt           | _        | j        d         | _        | j        d         | _        | j        d         | _        | j        g| _        || _        || _        t          ||| j        | j        | j        | j                  | _	        d S )Nr   r   z<|dummy_85|>)
PHI4_SPECIAL_TOKENSr   eos_idbos_idpad_idstop_tokensr   r   r   tokenizer_model)selfr   r   r   r   r   s         t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/models/phi4/_tokenizer.py__init__zPhi4Tokenizer.__init__6   s     -C0C ),7)/:).9 K=&.0KKKK 
  
    c                     | j         j        S )N)r#   
vocab_size)r$   s    r%   r)   zPhi4Tokenizer.vocab_sizeR   s    #..r'   Ttextadd_bosadd_eosreturnc                 <    | j                             |||          S )N)r*   r+   r,   )r#   encode)r$   r*   r+   r,   s       r%   r/   zPhi4Tokenizer.encodeV   s"     #**gw*WWWr'   idsskip_special_tokensc                 T    fd|D             }| j                             |          S )zDecode token IDs to strings.c                 6    g | ]}rd |cxk    rdk    n |S )r   i  ).0token_idr1   s     r%   
<listcomp>z(Phi4Tokenizer.decode.<locals>.<listcomp>]   sL     
 
 
'
 -4x,J,J,J,J7,J,J,J,J ,J,J,Jr'   )r#   decode)r$   r0   r1   ids_for_decodes     ` r%   r8   zPhi4Tokenizer.decode[   sD    
 
 
 

 
 

 #**>:::r'   rolec                     | j         d         g}|                    |                     |dd                     |                    | j         d                    |S )Nr   Fr+   r,   r   )r   extendr/   append)r$   r:   tokenized_messagess      r%   _tokenize_headerzPhi4Tokenizer._tokenize_headerd   s[    "1.AB!!$++dE5+"Q"QRRR!!$"5l"CDDD!!r'   F)r,   ignore_system_promptmessagesrA   c                "   | j         r|                      |          n|}g }g }|D ]}|r|j        dk    r|                     |j                  }|                    |           |                    |j        gt          |          z             g }	|j        D ]Z}
|
d         dk    r5|	|                     |
d                             d          dd          z  }	Ct          d|
d                    |r,|j        d	k    r!|	
                    | j        d
                    n+|j        d	k    r |	
                    | j        d
                    |                    |	           |                    |j        gt          |	          z             | j        rt          |          | j        k    r n| j        rVt          |          | j        k    r>t          || j        |r| j        nd           }t          || j        |r|j        nd           }||fS )Nsystemtyper*   content Fr<   z"Unsupported message content type: 	assistantr   )r   r:   r@   r=   maskedlenrF   r/   rstripRuntimeErrorr>   r   r   r   r   )r$   rB   r,   rA   templated_messagesr?   maskmessagetokenized_headertokensitems              r%   tokenize_messageszPhi4Tokenizer.tokenize_messagesj   s[    /3.BPD  *** 	  ) 	 	G# (@(@#44W\BB%%&6777KK(3/?+@+@@AAAF  <6))dkkY..s33UE *   FF 'KT&\KK    A7<;66d1,?@@@@,,d1,?@@@%%f---KK(3v;;6777 C(:$;$;t?O$O$O  	Y$6 7 74;K K K!)"D$4W6VdkkRV" " D$"2g4WGNNSWXXD!4''r'   samplec                 t    |                     d          }|                     |          \  }}||d<   ||d<   |S )zR
        Apply `tokenize_messages` to the "messages" field in the sample.
        rB   rQ   rN   )poprS   )r$   rT   rB   rQ   rN   s        r%   __call__zPhi4Tokenizer.__call__   sD     ::j))--h77!xvr'   )NNNNN)TT)T)__name__
__module____qualname____doc__strr   r   intr
   r&   propertyr)   boolr   r/   r8   listr@   r	   r   rS   r   r   rW   r4   r'   r%   r   r   (   s          37%)48
 

 
 !c3h0	

 c]
 ".1
 
 
 
8 / / X/ @DX XX"&X8<X	cX X X X
; ;$s) ;$ ;# ; ; ; ;"S "T " " " " %*3( 3( 3(w-3( 	3(
 #3( 
tCy$t*$	%3( 3( 3( 3(jwsCx0 WS#X5F      r'   r   N)typingr   r   r   r   r   r   torchtune.datar	   r
   r   torchtune.modules.tokenizersr   torchtune.modules.transformsr   'torchtune.modules.transforms.tokenizersr   r   current_dummy_indexranger6   CL100K_PATTERNr   r4   r'   r%   <module>ri      s]   = < < < < < < < < < < < < < < < < < < < < < < < < < 7 7 7 7 7 7 2 2 2 2 2 2 E E E E E E      ff%%  H6>F:#6:::;1 L    NI     r'   