
     `i+                         d dl mZmZ d dlZd dlmZmZmZmZm	Z	m
Z
mZmZ d dlZddlmZ ddlmZmZmZ ddlmZmZ ddlmZ  G d	 d
          Z G d d          ZdS )    )	dataclassfieldN)AnyCallableDictHashableListOptionalTupleUnion   )LMFormatEnforcerException)CharacterLevelParserForceStopParserCharacterLevelParserConfig)TokenizerPrefixTreeTokenizerPrefixTreeNode)	TokenListc            
           e Zd ZdZdeeeeef                  de	ee         gef         de
eee         f         dedef
dZdS )	TokenEnforcerTokenizerDatazTokenEnforcerTokenizerData contains all of the preprocessing for preparing the TokenEnforcer to work with a 
    specific tokenizer. It does some calculations, so it is recommended to reuse it for multiple TokenEnforcersregular_tokensdecodereos_token_iduse_bitmask
vocab_sizec                 *   fd|D             }|| _         t          | j         |          | _        || _        || _        d                    d | j        j        j                                        D                       | _	        | _
        || _        dS )aA  
        Create the tokenizer data that the TokenEnforcer needs. This can be reused for multiple TokenEnforcers if they work with the same tokenizer.
        :param regular_tokens: A list of tuples (token_id, token_string, is_new_word_token) for all the regular (not special) tokens in the tokenizer vocabulary.
        Note that token_string is expected to include leading / trailing whitespaces if relevant.
        :param decoder: A function that decodes a list of token ids into a string.
        :param eos_token_id: The token id(s) of the end-of-string token(s).
        c                 ,    g | ]}|d          k    |S )r    ).0token_tupler   s     r/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/lmformatenforcer/tokenenforcer.py
<listcomp>z7TokenEnforcerTokenizerData.__init__.<locals>.<listcomp>   s+    "q"q"q;T_`aTbfpTpTp;TpTpTp     c              3   @   K   | ]}t          |          d k    |V  dS )r   N)len)r   	token_strs     r!   	<genexpr>z6TokenEnforcerTokenizerData.__init__.<locals>.<genexpr>!   s_        *F  *F	qtu~qq  DE  rE  rE)  rE  rE  rE  rE  *F  *Fr#   N)r   r   tokenizer_treer   r   joinrootchildrenkeystokenizer_alphabetr   r   )selfr   r   r   r   r   filtered_regular_tokenss        ` r!   __init__z#TokenEnforcerTokenizerData.__init__   s     #r"q"q"q."q"q"q51$2E{T^__("$''  *F  *FTEXE]EfEkEkEmEm  *F  *F  *F  #F  #F$&r#   N)__name__
__module____qualname____doc__r	   r   intstrboolr   r   r1   r   r#   r!   r   r      s        s s'!%eCdN&;!<'"DI;#34'  %S$s)^4' #	'
 !' ' ' ' ' 'r#   r   c            	           e Zd ZdZe G d d                      ZdedefdZde	e
         defd	Zd
eddfdZdedededee         fdZddde	e
         fdZdS )TokenEnforcerzTokenEnforcer provides a token filtering mechanism, given a CharacterLevelParser and some information about the tokenizer.
    It is the main entry point for extending lm-format-enforcer to new inference libraries. See __init__() and get_allowed_tokens()c                   p    e Zd ZU eed<    ed          Zedz  ed<    ee          Z	e
e         ed<   dS )TokenEnforcer.OutputTensorStateparserN)defaultallowed_tokens)default_factorycurrent_word_tokens)r2   r3   r4   r   __annotations__r   r?   r   listrA   r	   r6   r   r#   r!   OutputTensorStater<   )   sb         $$$$+05+>+>+>	D(>>>).t)D)D)DT#YDDDDDr#   rD   tokenizer_datar=   c                     i | _         || _        |j        | _        |j        | _        |j        | _        |j        | _        i | _        |j        | _        |j        | _        t          |j
                  }||_        dS )z
        Create a new TokenEnforcer.
        :param tokenizer_data: Per tokenizer data that the token enforcer needs in order to operate.
        :param parser: A CharacterLevelParser that defines the allowed strings.
        )alphabetN)prefix_statesroot_parserr)   r   r   r   allowed_token_cacher   r   r   r.   config)r/   rE   r=   rK   s       r!   r1   zTokenEnforcer.__init__0   s~     LN!,;%-*7,;8: )5(3 ,^5VWWWr#   token_sequencereturnc                    t          |          }|dd         }|| j        v r| j        |         j        S || j        vrGt                              | j                  }|| j        |<   |                     ||           |j        S | j        |         }|                     ||          }|| j        |<   |                     ||           |j        S )a  
        Get a list of allowed tokens, given a list of tokens that were already generated.
        :param token_sequence: The tokens that were already generated, and the next token will be generated for.
        :return: A list of token ids that are allowed to be selected next.
        Nr=   )tuplerH   r?   r:   rD   rI   _compute_allowed_tokens_apply_new_characters)r/   rL   
sent_tupleprev_step_tuplestateprev_step_state	new_states          r!   get_allowed_tokensz TokenEnforcer.get_allowed_tokensD   s     >**
$SbS/+++%j1@@D$666 "334;K3LLE-2Dz*((U;;;'' #0AO22?NSSI-6Dz*((Y???++r#   state_tokensrV   r<   c                 $   	 t          | j        | j                  }|j                                        }||| j        v r| j        |         |_        d S |j                                        }|                     |j        | j	        j
        ||           |j                                        rOt          | j        t                    r|                    | j                   n|                    | j                   |st#          d          ||_        ||| j        |<   d S d S # t$          $ r  t&          $ r t)          j        t(          j                   |                     t          |                    }t)          j        d| d           t          | j        | j                  |_        t          | j        t                    r"|j                            | j                   Y d S |j                            | j                   Y d S w xY w)Nz+Parser reached state with no allowed tokens)levelz+Unknown LMFormatEnforcer Problem. Prefix: 'z'
Terminating the parser. Please open an issue at 
https://github.com/noamgat/lm-format-enforcer/issues with the prefix and CharacterLevelParser parameters)r   r   r   r=   	cache_keyrJ   r?   shortcut_key_collect_allowed_tokensr)   r+   can_end
isinstancer   rC   extendappend
ValueErrorr   	ExceptionloggingbasicConfigERRORr   	exception)r/   rZ   rV   r?   r]   r^   prefixs          r!   rR   z%TokenEnforcer._compute_allowed_tokensb   s3   %	?(1$2BDO(T(TN..00I$d6N)N)N'+'?	'J$ <4466L((t7J7OQ_amnnn|##%% =d/66 ="))$*;<<<<"))$*;<<<! Q !OPPP $2E $6D(333 %$( 	 	 	  	? 	? 	?gm4444\\$|"4"455F @F @ @ @ A A A $-T-=t#O#OE $+T22 ?$++D,=>>>>>>$++D,=>>>>>>	?s    AD$ CD$ $CH,HH	tree_noder?   r^   c                    |                     |j                   |                                }|j                                        }t          |                              |          }t          |t                    r|d         dk    rt          |          dk    sJ |\  }}	}
}| j
        j        }t          |j        t          d|
|	z
                      }t          |j        ||	z
            }|                     |                    ||          j                   |                    dg          }|D ]<}|                    |          }|j        |         }|                     |||d            =d S )Nr   json_freetext   ")rb   tokensget_allowed_charactersr,   r-   setintersectionra   rQ   r&   r)   json_freetext_tokensminmax_token_lenmaxlookup_allowed_tokensr?   add_characterr_   )r/   r=   rk   r?   r^   allowed_charactersrelevant_characterscharacters_to_explore_cur_lenmin_lenmax_lencachemin_remainingmax_allowed_len	characternext_parsernext_tree_nodes                     r!   r_   z%TokenEnforcer._collect_allowed_tokens   s   i.///#::<<'05577 #$7 8 8 E EFX Y Y
 lE** 		N|A//Q/Q|$$))))+7(Aw'<E 3SGg<M5N5NOOM!%"5w7HIIO!!%"="=m_"]"]"lmmm$9$F$Fu$M$M!. 	\ 	\I ..y99K&/	:N((nnVZ[[[[	\ 	\r#   c           	      2   t                               |j                  }|d         }|| j        j        v r|g|_        | j        j        |         }n[|j        |gz   |_        |                     |j                  }|                     |j                  }|t          |          d          }|D ]g}	 |j        	                    |          |_        ## t          $ r8}	t          j        d| d|	 d           t                      |_        Y d }	~	`d }	~	ww xY w|S )NrP   rO   zReceived an invalid character 'z+', switching to ForceStopParser (Exception:))r:   rD   r=   r)   new_word_tokensrA   tokens_to_strsr   r&   ry   re   rf   debugr   )
r/   rV   rL   rX   	new_tokennew_charactersprev_decodednew_decodedr   es
             r!   rS   z#TokenEnforcer._apply_new_characters   s<   !335<3HH	"2&	+;;;-6KI)!0?	JNN,1,E,SI)<<(ABBL,,y'DEEK(\):):););<N' 	5 	5I5#,#3#A#A)#L#L	   5 5 5z	zzvwzzz{{{#2#4#4	      5 s   2C
D.DDN)r2   r3   r4   r5   r   rD   r   r   r1   r	   r6   r   rY   r   rR   r   r
   r   r_   rS   r   r#   r!   r:   r:   &   sD       G GE E E E E E E YE'A K_    (,c ,y , , , ,<&?E &?Bc &? &? &? &?P\.B \Of \  yB \  RZ  [c  Rd \ \ \ \4+L ^bcf^g      r#   r:   )dataclassesr   r   systypingr   r   r   r   r	   r
   r   r   rf   
exceptionsr   characterlevelparserr   r   r   tokenizerprefixtreer   r   	tokenlistr   r   r:   r   r#   r!   <module>r      s)   ( ( ( ( ( ( ( ( 



 N N N N N N N N N N N N N N N N N N N N  1 1 1 1 1 1 c c c c c c c c c c M M M M M M M M            ' ' ' ' ' ' ' '4P P P P P P P P P Pr#   