
    *`iVH                     (   d Z ddlZddlmZ ddlmZmZmZmZm	Z	 	 ddl
Z
n# e$ r dZ
Y nw xY w	 ddlZn# e$ r dZY nw xY wddlmZmZ ddlmZmZ ddlmZ  ej                      ej        e          Z G d	 d
e          Z G d de          ZdS )zRThis module provides the tokenizer info class to handle the tokenizer information.    N)Enum)AnyDictListOptionalUnion)PreTrainedTokenizerBasePreTrainedTokenizerFast   )	XGRObject_core)loggingc                   "    e Zd ZdZdZ	 dZ	 dZdS )	VocabTypezThe type of the vocabulary. Used in TokenizerInfo. XGrammar supports three types of
    vocabularies: RAW, BYTE_FALLBACK, BYTE_LEVEL.
    r   r      N)__name__
__module____qualname____doc__RAWBYTE_FALLBACK
BYTE_LEVEL     k/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/xgrammar/tokenizer_info.pyr   r      s=          C M J r   r   c                      e Zd ZdZej        fdddddeee         ee	         f         dede
e         de
eee         ef                  d	ed
dfdZeded
efd            Zeded
efd            Zeded
efd            Zeddddede
e         de
eee         ef                  d
d fd            Zed
efd            Zed
efd            Zed
efd            Zed
efd            Zed
ee         fd            Zed
ee         fd            Zed
ee         fd            Zd
e	fdZedeeee	f                  de	d
d fd            Zede	d
ee	ef         fd            Z d
e	fdZ!ede	d
d fd             Z"dS )!TokenizerInfou  The tokenizer info contains the vocabulary, the type of the vocabulary, and necessary
    information for the grammar-guided generation.

    Note that although some tokenizers will encode the tokens in a special format, e.g.
    "<0x1B>" for "" in the ByteFallback tokenizer, and "Ġ" for " " in the Byte-Level BPE
    tokenizer, TokenizerInfo always decodes the vocabulary to the original format (e.g. ""
    and " ").

    Also note that some models (e.g. Phi-3 and Deepseek-V2) may pad the vocabulary to a multiple
    of 32. In this case, the model's vocab_size is larger than the tokenizer's vocabulary size.
    Please pass the model's vocab_size to the vocab_size parameter in the constructor, because
    this information is used to determine the size of the token mask.
    NF
vocab_sizestop_token_idsadd_prefix_spaceencoded_vocab
vocab_typer   r    r!   returnc          	          t          |t                    r|g}|                     t          j        ||j        |||                     dS )a  Construct the tokenizer info.

        Parameters
        ----------
        encoded_vocab : Union[List[bytes], List[str]]
            The encoded vocabulary of the tokenizer.

        vocab_type : VocabType, default: VocabType.RAW
            The type of the vocabulary. See also VocabType.

        vocab_size : Optional[int], default: None
            The size of the vocabulary. If not provided, the vocabulary size will be len(encoded_vocab).

        stop_token_ids : Optional[List[int]], default: None
            The stop token ids. If not provided, the stop token ids will be auto detected (but may not
            be correct).

        add_prefix_space : bool, default: False
            Whether the tokenizer will prepend a space before the text in the tokenization process.
        N)
isinstanceint_init_handler   r   value)selfr"   r#   r   r    r!   s         r   __init__zTokenizerInfo.__init__H   s_    : nc** 	.,-Nz/^M] 	
 	
 	
 	
 	
r   	tokenizerc                     t           dS t          | d          ot          | j        t           j                  }t          | d          od| j        v od| j        d         v }|p|S )NFr,   vocab_files_names
vocab_filetiktoken)r0   hasattrr&   r,   Encodingr.   )r,   has_tiktoken_encodingfilename_patterns      r   _is_tiktoken_tokenizerz$TokenizerInfo._is_tiktoken_tokenizerm   s    5 !(	; ? ? !
J!2E
 E

 I233 H	 ;;Hi9,GG 	 %8(88r   c                     t           dS |                     d          }|                                dk     rdS |                     |          }|d         }|dk    S )aE  Checking whether the tokenizer has byte-level whitespace conversion.

        Parameters
        ----------
        tokenizer : PreTrainedTokenizerBase
            The huggingface tokenizer.

        Returns
        -------
        is_byte_level : bool
            The tokenizer has byte-level whitespace conversion.
        NF r   r   u   Ġ)r0   encode__len__convert_ids_to_tokens)r,   new_ids
new_tokenstokens       r   _is_byte_level_tokenizerz&TokenizerInfo._is_byte_level_tokenizer   sd     5 ""4((??q  544W==
1}r   c                 j   t           dS t          | d          ot          | j        t           j                  }t          | d          o8t          | j        d          o#t          | j        j        t           j                  p.t          | d          ot          | j        t           j                  }|p|S )NFsp_modelr,   tok)sentencepiecer1   r&   r@   SentencePieceProcessorr,   rA   )r,   has_sp_model_attrhas_nested_sp_model_attrs      r   _is_sentencepiece_tokenizerz)TokenizerInfo._is_sentencepiece_tokenizer   s     5 $Iz:: 
z D@
 @

 I{++ _	+Z88_9.79]^^$
 Iu%% P9=-*NOO 	! !<$<<r   )r   r    c                   t          |t                    r|g}t          |t                    r"t          |          dk    rt	          d          	 |                                 }n5# t          $ r(}dt          |            d}t	          |          |d}~ww xY wt          |	                                          }t          t          |          |dz             }|p|}dg|z  }|
                                D ]\  }	}
|
|k     r|	||
<   t          | t                    r| j                                        }|:t          | d          r| j        	| j        g}nt                               d	           t$                              |          }t%          ||d
         |||d                   S t$                              |           r|:t          | d          r| j        	| j        g}nt                               d	           t*          j        }t$                              |           rt*          j        }t%          ||||d          S t$                              |           rt          | d          r| j        }nIt          | d          r"t          | j        d          r| j        j        }nt          | d          r| j        }|Xt          | d          r| j        	| j        g}n8|                                }|dk    r|g}nt                               d	           d|v rt*          j        }nt*          j        }t%          ||||d          S t	          dt          |                      )aj  Construct the tokenizer info from the huggingface tokenizer. This constructor supports
        various tokenizer backends, including the huggingface fast tokenizer and tiktoken tokenizer.
        Necessary information is automatically detected from the tokenizer.

        The vocab_size parameter is introduced to handle the misalignment between the model's
        vocab_size and the tokenizer's vocabulary size. User should pass the model's vocab_size
        (could be defined in the model config) here. See docs of vocab_size for more details.

        The stop token ids is by default the eos_token_id of the tokenizer. If there are other
        stop tokens, you can specify them manually.

        Parameters
        ----------
        tokenizer : PreTrainedTokenizerBase
            The huggingface tokenizer.

        vocab_size : Optional[int], default: None
            The vocabulary size **defined by the model** (**not the tokenizer**). This equals to the
            vocab dimention of the model's lm_head. This is the size of the token mask.

            It can be:

            1. the same as the tokenizer's vocabulary size. This is the most common case.
            2. larger than the tokenizer's vocabulary size. This happens when the model has padding
               to lm_head, possibly due to aligning lm_head to the power of 2.
               E.g. Phi-3 and Deepseek-V2.
            3. smaller than the tokenizer's vocabulary size. This happens when the tokenizer has
               some added tokens that will not supported by the model. E.g.
               Llama-3.2 Vision and Molmo-72B-0924 has padded `<|image|>` tokens, but they will not
               be considered in lm_head or generated by the model.

            model_vocab_size need to be provided for case 2 and 3. If not provided, it will be
            set to the tokenizer's vocabulary size.

        stop_token_ids : Optional[List[int]], default: None
            The stop token ids. If not provided, the eos_token_id of the tokenizer will be used.

        Returns
        -------
        tokenizer_info : TokenizerInfo
            The tokenizer info.
        r   zstop_token_ids cannot be emptyz+Cannot get the vocabulary of the tokenizer z/. The tokenizer should have a get_vocab method.Nr    eos_token_idzWhen constructing TokenizerInfo from a huggingface tokenizer, stop_token_ids is neither provided by user nor found from the tokenizer. It will be automatically detected.r#   r!   )r#   r   r    r!   Fr   r@   r,   rA   z<0x0A>TzUnsupported tokenizer type: )r&   r'   listlen
ValueError	get_vocabAttributeErrortypemaxvaluesitemsr
   backend_tokenizerto_strr1   rI   loggerwarningr   _detect_metadata_from_hfr5   r   r   r>   r   rF   r@   r,   rA   eos_idr   )r,   r   r    
vocab_dictemsgmax_idtokenizer_vocab_sizer"   r=   idxbackend_strmetadatar#   r@   rY   s                   r   from_huggingfacezTokenizerInfo.from_huggingface   s   b nc** 	.,-Nnd++ 	?N0C0Cq0H0H=>>>	)",,..JJ 	) 	) 	)2d9oo 2 2 2  S//q(	) Z&&(())"3z??FQJ??7#7
 z)$**,, 	+ 	+JE3Z%*c"i!899 ]	O $5<<>>K%9n55 ):P:\&/&<%=NNNN=  
 %==kJJH #L1%-!)*<!=    11)<< A	O %9n55 ):P:\&/&<%=NNNN=  
 #J55i@@ 2 '1
 %-!&    66yAA '	O y*-- )$-K00 )WY=PR\5]5] )$.7E** )$=%9n55 ):P:\&/&<%=NN%__..F||*0A   :%%&4

&]
 %%-!%    MDOOMMNNNs   A& &
B0#BBc                 4    t          | j        j                  S )zThe type of the vocabulary.)r   _handler#   r*   s    r   r#   zTokenizerInfo.vocab_typeZ  s     0111r   c                     | j         j        S )zThe size of the vocabulary.)rd   r   re   s    r   r   zTokenizerInfo.vocab_size_  s     |&&r   c                     | j         j        S )z_Whether the tokenizer will prepend a space before the text in the tokenization
        process.)rd   r!   re   s    r   r!   zTokenizerInfo.add_prefix_spaced  s     |,,r   c                 D    t                               d           | j        S )zWhether the tokenizer will prepend a space before the text in the tokenization
        process.

        This property is deprecated. Use add_prefix_space instead.
        zJprepend_space_in_tokenization is deprecated. Use add_prefix_space instead.)rV   rW   r!   re   s    r   prepend_space_in_tokenizationz+TokenizerInfo.prepend_space_in_tokenizationj  s      	cddd$$r   c                     | j         j        S )zThe decoded vocabulary of the tokenizer. This converts the tokens in the LLM's
        vocabulary back to the original format of the input text. E.g. for type ByteFallback,
        the token <0x1B> is converted back to "".
        )rd   decoded_vocabre   s    r   rk   zTokenizerInfo.decoded_vocabt  s     |))r   c                     | j         j        S )zThe stop token ids.)rd   r    re   s    r   r    zTokenizerInfo.stop_token_ids|  s     |**r   c                     | j         j        S )zThe special token ids. Special tokens include control tokens, reserved tokens,
        padded tokens, etc. Now it is automatically detected from the vocabulary.)rd   special_token_idsre   s    r   rn   zTokenizerInfo.special_token_ids  s     |--r   c                 4    | j                                         S )zDump the metadata of the tokenizer to a json string. It can be used to construct the
        tokenizer info from the vocabulary and the metadata string.)rd   dump_metadatare   s    r   rp   zTokenizerInfo.dump_metadata  s     |))+++r   ra   c                 r    t                               t          j                             | |                    S )a=  Construct the tokenizer info from the vocabulary and the metadata string in json
        format.

        Parameters
        ----------
        encoded_vocab : List[Union[bytes, str]]
            The encoded vocabulary of the tokenizer.

        metadata : str
            The metadata string in json format.
        )r   _create_from_handler   from_vocab_and_metadata)r"   ra   s     r   rs   z%TokenizerInfo.from_vocab_and_metadata  s2     0077xPP
 
 	
r   r`   c                     t           j                            |           }t          j        |          }t          |d                   |d         dS )zDetect the metadata from the huggingface tokenizer backend string. For implementation
        use only.

        It returns {"vocab_type": VocabType, "add_prefix_space": bool}.
        r#   r!   )r#   r!   )r   r   rX   jsonloadsr   )r`   metadata_strra   s      r   rX   z&TokenizerInfo._detect_metadata_from_hf  sQ     *CCKPP:l++#H\$:;; (); <
 
 	
r   c                 4    | j                                         S )zSerialize the tokenizer info to a JSON string.

        Returns
        -------
        json_string : str
            The JSON string.
        )rd   serialize_jsonre   s    r   ry   zTokenizerInfo.serialize_json  s     |**,,,r   json_stringc                 p    t                               t          j                             |                     S )af  Deserialize a tokenizer info from a JSON string.

        Parameters
        ----------
        json_string : str
            The JSON string.

        Returns
        -------
        tokenizer_info : TokenizerInfo
            The tokenizer info.

        Raises
        ------
        InvalidJSONError
            When the JSON string is invalid.
        DeserializeFormatError
            When the JSON string does not follow the serialization format of the tokenizer info.
        DeserializeVersionError
            When the __VERSION__ field in the JSON string is not the same as the current version.
        )r   rr   r   deserialize_json)rz   s    r   r|   zTokenizerInfo.deserialize_json  s*    . 001D1U1UVa1b1bcccr   )#r   r   r   r   r   r   r   r   bytesstrr   r'   boolr+   staticmethodr	   r5   r>   rF   rb   propertyr#   r   r!   ri   rk   r    rn   rp   rs   r   r   rX   ry   r|   r   r   r   r   r   9   s        " !*#

 %):>!&#
 #
 #
T%[$s)34#
 #

 SM#
 !tCy#~!67#
 #
 
#
 #
 #
 #
J 9*A 9d 9 9 9 \9" ,C     \2 =/F =4 = = = \=*  %):>	hO hO hO*hO SMhO !tCy#~!67	hO
 
hO hO hO \hOT 2I 2 2 2 X2 'C ' ' ' X' -$ - - - X-
 %t % % % X% *tE{ * * * X* +S	 + + + X+ .49 . . . X.
,s , , , ,
 
E%*-.
:=
	
 
 
 \
$ 
c 
d38n 
 
 
 \
- - - - - dc do d d d \d d dr   r   )r   ru   enumr   typingr   r   r   r   r   rB   ImportErrorr0   transformersr	   r
   baser   r   supportr   enable_logging	getLoggerr   rV   r   r   r   r   r   <module>r      s   X X        3 3 3 3 3 3 3 3 3 3 3 3 3 3   MMMOOOO   HHH J I I I I I I I " " " " " " " "           		8	$	$       @Vd Vd Vd Vd VdI Vd Vd Vd Vd Vds   ! ++4 >>