
     `i                         d dl mZ d dlmZmZmZmZ d dlZddlm	Z	  G d d          Z
 G d d	          Z G d
 d          ZdS )    )OrderedDict)DictListSetTupleN   )	TokenListc                       e Zd ZddZdS )TokenizerPrefixTreeNodereturnNc                 "    g | _         i | _        d S N)tokenschildrenselfs    x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/lmformatenforcer/tokenizerprefixtree.py__init__z TokenizerPrefixTreeNode.__init__   s    !#<>    r   N)__name__
__module____qualname__r    r   r   r   r      s(        ? ? ? ? ? ?r   r   c                   l    e Zd ZdZ G d d          ZdededdfdZd	ed
efdZ	dedede
fdZddZdS )JsonFreetextTokenCachea  
    JSON string can contain almost any unicode character, so creating a list of allowed tokens is very expensive.
    The list can be cached, but JSON Schema also allows 'minLength' and 'maxLength' constraint on the string,
    that make some tokens illegal depending on how long the generated string is already. This class precalculates
    a separate allowlist for all possible constraint states up to maximum token length (16 in Llama, for example).
    After deduplication, this results in about ~75 lists for the Llama tokenizer.
    c                   Z    e Zd ZdZd Zdeeeef                  fdZ	d	dee         fdZ
dS )
.JsonFreetextTokenCache._StringLengthTokenCachezThis is an internal data structure, that given a list of string+token pairs,
        can quickly return all token ids of strings between certain lengthsc                 $    g | _         dg| _        d S Nr   )r   first_index_geq_than_lengthr   s    r   r   z7JsonFreetextTokenCache._StringLengthTokenCache.__init__   s    %'DK;<#D,,,r   token_strs_to_idxc                 p   t          |d           }d |D             | _        d |D             }t          |          D ]O\  }}t          | j                  |k    r2| j                            |           t          | j                  |k    2P| j                            t          |                     d S )Nc                 ,    t          | d                   S r    len)ps    r   <lambda>zFJsonFreetextTokenCache._StringLengthTokenCache.build.<locals>.<lambda>   s    s1Q4yy r   )keyc                     g | ]
}|d          S )r   r   .0pairs     r   
<listcomp>zHJsonFreetextTokenCache._StringLengthTokenCache.build.<locals>.<listcomp>   s    AAAt47AAAr   c                 8    g | ]}t          |d                    S )r   r%   r+   s     r   r.   zHJsonFreetextTokenCache._StringLengthTokenCache.build.<locals>.<listcomp>!   s"    HHHdSa\\HHHr   )sortedr   	enumerater&   r!   append)r   r"   token_lengthsidxtoken_lengths        r   buildz4JsonFreetextTokenCache._StringLengthTokenCache.build   s     &'8>P>P Q Q QAA/@AAADKHH6GHHHM%.}%=%= A A!\$:;;|KK4;;C@@@ $:;;|KK,33C4F4FGGGGGr   r   c                    |t          | j                  k    rg S |dk    r| j        |         nd}|dk    rd}n@|dz   t          | j                  k     r| j        |dz            }nt          | j                  }| j        ||         S )Nr   r   )r&   r!   r   )r   
min_length
max_lengthstart_index	end_indexs        r   get_indices_between_lengthzIJsonFreetextTokenCache._StringLengthTokenCache.get_indices_between_length'   s    S!ABBBB	JTWX..$::FF^_KQ		a#d&F"G"GGG <Z!^L		,,	;{9455r   N)r7   r7   )r   r   r   __doc__r   r   r   strintr6   r=   r   r   r   _StringLengthTokenCacher      s        	O 	O	> 	> 	>		H4c3h+@ 		H 		H 		H 		H
	6 
	6dSVi 
	6 
	6 
	6 
	6 
	6 
	6r   rA   use_bitmask
vocab_sizer   Nc                     i | _         i | _        d| _        t                                          | _        t                                          | _        || _        || _        d S r    )	token_num_to_strallowlist_cachemax_token_lenr   rA   regular_tokens_length_cachequote_tokens_length_cacherB   rC   )r   rB   rC   s      r   r   zJsonFreetextTokenCache.__init__3   sU    02AC+A+Y+Y+[+[()?)W)W)Y)Y&&$r   	token_str	token_intc                    | j         r
J d            d|d d         v }d|dd         v }d|v pd|v }|s|s|r5	 t          j        d| d           n# t          j        j        $ r Y d S w xY wt          |          dk    rd S || j        |<   d S )Nz:Cannot add more tokens after allowlists were precalculated\r7   "r   
)rF   jsonloadsdecoderJSONDecodeErrorr&   rE   )r   rJ   rK   has_non_trailing_backslashhas_quote_before_endhas_newlines         r   	add_tokenz JsonFreetextTokenCache.add_token<   s    'ee)eee'%)Yss^%;""i"o5i'<49+<% 	)= 	 	
+y+++,,,,</    y>>Q F+4i(((s   A A*)A*min_remainingmax_lenc                 *   ||f}|| j         vrz| j                            |dz   |dz             }| j                            d|          }||z   }t	          | j        | j                  }|                    |           || j         |<   | j         |         S )a-  
        Get the list of tokens that are allowed within a JSON string, such that:
        1. all candidate tokens are at most `max_len` characters long (excluding the trailing quote), and
        2. if a token ends with a quote, it's at least `min_remaining` chars long (excluding the quote).
        r   r7   )rF   rI   r=   rH   r	   rB   rC   extend)r   rY   rZ   	cache_keytokens_with_quotetokens_without_quotecombinednew_tokenlists           r   lookup_allowed_tokensz,JsonFreetextTokenCache.lookup_allowed_tokensO   s     #G,	D000 $ > Y YZgjkZkmtwxmx y y#'#C#^#^_acj#k#k (+??H%d&6HHM   ***.;D +#I..r   c                 T   t          d | j                                        D                       }|s
J d            t          d |D                       r
J d            g }g }|D ]H}|d                             d          r|                    |           3|                    |           I| j                            |           | j                            |           t          t          | j        j                  t          | j        j                            | _        | `dS )z
        Precalculate token allowlists for all valid combinations of `min_remaining` and `max_len`
        based on the tokens that were added with `add_token()`.
        c              3   $   K   | ]\  }}||fV  d S r   r   )r,   nss      r   	<genexpr>z0JsonFreetextTokenCache.freeze.<locals>.<genexpr>f   s*      0a0aCAa!Q0a0a0a0a0a0ar   z6Cannot precalculate allowlists for an empty token listc              3   .   K   | ]}|d          dk    V  dS )r    Nr   r+   s     r   rg   z0JsonFreetextTokenCache.freeze.<locals>.<genexpr>h   s*      <<tAw"}<<<<<<r   z'Tokenizer must not contain empty tokensr   rN   N)listrE   itemsanyendswithr2   rH   r6   rI   maxr&   r!   rG   )r   
all_tokensregular_tokensquote_tokensr-   s        r   freezezJsonFreetextTokenCache.freezea   s?   
 -10a0a4CXC^C^C`C`0a0a0a,a,a
SSSSSz<<<<<<<gg>ggg<02.0 	, 	,DAw$$ ,##D))))%%d++++(..~>>>&,,\::: T%E%a!b!b!$T%C%_!`!`b b!!!r   r   )r   r   r   r>   rA   boolr@   r   r?   rX   r	   rb   rr   r   r   r   r   r      s         6 6 6 6 6 6 6 6<%D %c %d % % % %53 53 5 5 5 5&/3 / / / / / /$" " " " " "r   r   c                   T    e Zd Zdeeeeef                  dedefdZdedede	fdZ
d	S )
TokenizerPrefixTreerp   rB   rC   c                    t                      | _        t          ||          | _        t	                      | _        d |D             | _        |D ]Y\  }}}|                     ||| j                   | j                            ||           |r| j        	                    |           Z| j        
                                 d S )Nc                     i | ]	\  }}}||
S r   r   )r,   	token_idxrJ   _s       r   
<dictcomp>z0TokenizerPrefixTree.__init__.<locals>.<dictcomp>~   s!    bbb8O	9ay)bbbr   )r   rootr   json_freetext_tokenssetnew_word_tokenstokens_to_strs_add_token_to_treerX   addrr   )r   rp   rB   rC   rx   decodedis_new_words          r   r   zTokenizerPrefixTree.__init__z   s    +--	$:;
$S$S!),bbSabbb/= 	4 	4+Iw##GY	BBB%//CCC 4$((333!((*****r   rJ   rx   nodec                     |D ].}||j         vrt                      |j         |<   |j         |         }/|j                            |           d S r   )r   r   r   r2   )r   rJ   rx   r   	characters        r   r   z&TokenizerPrefixTree._add_token_to_tree   sZ    " 	, 	,I--+B+D+Di(=+DD9%%%%%r   N)r   r   r   r   r   r@   r?   rs   r   r   r   r   r   r   ru   ru   y   sy        +tE#sD.,A'B +QU +cf + + + +&C &C &G^ & & & & & &r   ru   )collectionsr   typingr   r   r   r   rQ   	tokenlistr	   r   r   ru   r   r   r   <module>r      s    # # # # # # ) ) ) ) ) ) ) ) ) ) ) )             ? ? ? ? ? ? ? ?i" i" i" i" i" i" i" i"X& & & & & & & & & &r   