
    *`iI                        d Z ddlZddlZddlmZmZmZmZmZ ddl	Z	ddl
mZ ddlmZmZ ddlmZ e	j        Z	 ded	ed
eeef         fdZ e	j        de          Zded	ed
e	j        fdZde	j        d
dfdZdddde	j        de	j        d	ee         deee                  d
df
dZ G d de          Z G d de          ZdS )zaMatch the output of the LLM to the specified grammar, then generate the mask for the next
token.
    N)ListLiteralOptionalTupleUnion)	ArrayLike   )	XGRObject_core)CompiledGrammar
batch_size
vocab_sizereturnc                 4    | t          j        |dz            fS )zEReturn the shape of the bitmask: (batch_size, ceil(vocab_size / 32)).    )mathceilr   r   s     d/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/xgrammar/matcher.pyget_bitmask_shaper      s    	*r/2233    dtypec                 `    t          j        t          | |          t          t                    S )av  Allocate the bitmask for the next token prediction. The bitmask is an int32 tensor on
    CPU with shape (batch_size, ceil(vocab_size / 32)). Users who have their own needs to
    manage CUDA memory can construct the tensor with get_bitmask_shape and bitmask_dtype
    themselves.

    The reason why we use int32 instead of uint32 is that old versions of PyTorch do not support
    uint32.

    Parameters
    ----------
    batch_size : int
        The batch size of the bitmask.

    vocab_size : int
        The size of the vocabulary.

    Returns
    -------
    bitmask : torch.Tensor
        The shape of the bitmask.
    r   )torchfullr   
_FULL_MASKbitmask_dtyper   s     r   allocate_token_bitmaskr       s'    . :'
J??S`aaaar   bitmaskc                 :    |                      t                     dS )z#Reset the bitmask to the full mask.N)fill_r   )r!   s    r   reset_token_bitmaskr$   5   s    MM*r   )r   indiceslogitsr%   c                *   |j         | j         k    r"t          dd| j          d|j          z             | j         j        dk    rddlm}  || |||           d
S | j         j        dk    rddlm}  || |||           d
S dd	lm}  || |||           d
S )a  Apply the bitmask to the logits in-place. The bitmask is a 01 bitwise compressed tensor,
    where 0 means the token is masked and 1 means the token is not masked. It can be generated by
    allocate_token_bitmask and filled by fill_next_token_bitmask. After applying the bitmask, the
    masked logits will be set to -inf.

    The shape of logits and bitmask should be (batch_size, vocab_size) and
    (batch_size, bitmask_size) respectively. bitmask_size = ceil(vocab_size / 32). The operation is:

    .. code:: python

        for i in range(batch_size):
            for j in range(vocab_size):
                if get_bitmask_value(bitmask, i, j) == 0:
                    logits[i, j] = -inf

    get_bitmask_value(bitmask, i, j) gets the j-th bit of the i-th row of the bitmask.

    Notes
    -----
    Padding:
        This method allows additional padding on the vocabulary dimension of logits or bitmask. If
        padding exists, provide the real vocab size to the vocab_size parameter, and the operation
        will be applied to logits[..., :vocab_size] and bitmask[..., :ceil(vocab_size / 32)].

        If vocab_size is not provided, the vocab size will be detected as min(logits.shape[-1],
        bitmask.shape[-1] * 32).

    Indices:
        Indices can be used to specify which logits in the batch to apply the bitmask to. It is
        especially useful when there are structured requests and unstructured requests mixed in the
        same batch by skipping masking the logits in the unstructured requests. When specified, the
        operation will be

        .. code:: python

            for batch_id in indices:
                for j in range(vocab_size):
                    if get_bitmask_value(bitmask, batch_id, j) == 0:
                        logits[batch_id, j] = -inf

        When indices is specified, the batch sizes of logits and bitmask do not need to be the same.
        As long as the indices are valid, the operation will be performed.

    Device:
        The logits and bitmask should be on the same device. If both them are on GPU, we launch a GPU
        kernel to apply bitmask. If both them are on CPU, we use a CPU implementation. The GPU kernel
        is optimized and should be preferred.

        In practice, the bitmask is allocated on CPU, and the logits is usually on GPU, so users should
        manually copy the bitmask to GPU before calling this function.

    Parameters
    ----------
    logits : torch.Tensor
        The tensor to apply the bitmask to.

    bitmask : torch.Tensor
        The bitmask to apply.

    vocab_size : Optional[int], default: None
        The size of the vocabulary. If not provided, the vocab size will be detected as
        min(logits.shape[-1], bitmask.shape[-1] * 32).

    indices : Optional[List[int]], default: None
        A list of indices to specify which logits in the batch to apply the bitmask to. Should be
        unique. If None, apply the bitmask to all logits in the batch.
    z1logits and bitmask should be on the same device. zBut got logits.device: z, bitmask.device: cpur	   )apply_token_bitmask_inplace_cpucuda)"apply_token_bitmask_inplace_triton))apply_token_bitmask_inplace_torch_compileN)	device
ValueErrortype'kernels.apply_token_bitmask_inplace_cpur)   *kernels.apply_token_bitmask_inplace_tritonr+   1kernels.apply_token_bitmask_inplace_torch_compiler,   )r&   r!   r   r%   r)   r+   r,   s          r   apply_token_bitmask_inplacer3   :   s   T ~&&?YYYYYZ
 
 	
 }U""\\\\\\''WMMMMM		v	%	%bbbbbb**67JPPPPP	
 	
 	
 	
 	
 	
 	21&':wWWWWWr   c                   H   e Zd ZdZdddddedeeeee         f                  de	d	ed
df
dZ
dddede	d
e	fdZdddeeef         de	d
e	fdZ	 ddddedede	d
e	fdZd
efdZd ded
dfdZd
e	fdZd!dZed
efd            Zed
ee         fd            Zd
efdZdS )"GrammarMatchera  Match the output of the LLM to the specified grammar, then generate the mask for the next
    token. This is the core class in the grammar-guided generation.

    This class maintains a stateful matcher that can accept tokens and strings, then match them
    to the specified grammar. The matcher can provide a bitmask for the next token prediction,
    so that the output of the LLM follows the specified grammar. Its state can be reset and
    rolled back by tokens. It also provides utilities for jump-forward decoding.

    After matching the whole grammar, the matcher will accept a stop token. The token mask at
    this time will only allow stop tokens. After accepting the stop token, the matcher will
    terminate, then it cannot accept any new token or generate a new token mask, meaning the
    generation is finished.

    Under the hood, it utilizes a pushdown automaton with backtracking to match the grammar,
    with optimizations specific to LLM token mask generation.
    NFr   )override_stop_tokensterminate_without_stop_tokenmax_rollback_tokenscompiled_grammarr6   r7   r8   r   c                   t          |t                    st          d          |dk    st          j        dt
                     t          |t                    r|g}|                     t          j	        |j
        |||                     dS )a|  Construct the grammar matcher.

        Parameters
        ----------
        compiled_grammar : CompiledGrammar
            The initialization context for the grammar matcher.

        override_stop_tokens : Optional[Union[int, List[int]]], default: None
            If not None, the stop tokens to override the ones in the grammar.

        terminate_without_stop_token : bool, default: False
            Whether to terminate the matcher without accepting a stop token.

        max_rollback_tokens : int, default: -1
            Deprecated. You don't need to set it and it's always unlimited (-1).
            The new Earley parser significantly reduces the number of states, so we can allow
            unlimited rollback.

            The maximum number of rollback tokens allowed. The rollback operation is useful for
            jump-forward decoding and speculative decoding.
        zCThe grammar should be compiled before passing it to GrammarMatcher.r   z[max_rollback_tokens is deprecated. You don't need to set it and it's always unlimited (-1).N)
isinstancer   r.   warningswarnDeprecationWarningint_init_handler   r5   _handle)selfr9   r6   r7   r8   s        r   __init__zGrammarMatcher.__init__   s    : *O<< 	dbccc"b((M""   *C00 	:$8#9   ($,#	 	
 	
 	
 	
 	
r   )debug_printtoken_idrD   c                8    | j                             ||          S )ao  Accept one token and update the state of the matcher.

        In the following cases, the matcher will not accept the token and return False:

        1. The token does not match the grammar.
        2. The matcher has terminated after accepting the stop token, but is trying to accept a
           new token.
        3. The token id is out of range.
        4. The token is a special token.

        The user should capture the return value and handle the cases where the token is not
        accepted.

        Parameters
        ----------
        token_id : int
            The id of the token to accept.

        debug_print : bool, default: False
            Whether to print information about the internal state of the matcher. Helpful
            for debugging.

        Returns
        -------
        accepted : bool
            Whether the token is accepted.
        )rA   accept_token)rB   rE   rD   s      r   rG   zGrammarMatcher.accept_token   s    8 |((;???r   	input_strc                8    | j                             ||          S )at  Accept a string and update the state of the matcher. The whole string is considered
        as one step in rollback. It is used to complement the functionality of accept_token, and
        accept_token should always be used to accept tokens.

        Parameters
        ----------
        input_str : Union[str, bytes]
            The string to be accepted.

        debug_print : bool, default: False
            Whether to print information about the internal state of the matcher. Helpful for
            debugging.

        Returns
        -------
        accepted : bool
            Whether the string is accepted.
        )rA   accept_string)rB   rH   rD   s      r   rJ   zGrammarMatcher.accept_string   s    & |)))[AAAr   r   r!   indexc                :    | j                             |||          S )a2  Fill the bitmask for the next token prediction. The input bitmask can be generated
        by allocate_token_bitmask, and must be on CPU. bitmask[index] will be filled with the
        next token bitmask.

        This method does not change the matcher state.

        Parameters
        ----------
        bitmask : ArrayLike
            The bitmask for the next token prediction. It supports torch.Tensor and other
            array-like objects, as long as they support the DLPack protocol.

        index : int, default: 0
            The batch id of the bitmask.

        debug_print : bool, default: False
            Whether to print information about generated bitmask. Helpful for debugging.

        Returns
        -------
        need_apply : bool
            Whether the bitmask need to be applied (not all-true). An optimization: if False,
            this means the bitmask is already all-true, so no need to apply it.

        Raises
        ------
        RuntimeError
            If the bitmask is invalid (not on CPU, not int32, shape mismatch).
        )rA   fill_next_token_bitmask)rB   r!   rK   rD   s       r   rM   z&GrammarMatcher.fill_next_token_bitmask  s    @ |33GUKPPPr   c                 4    | j                                         S )a  Find the jump-forward string for jump-forward decoding. This is the longest string that
        certainly conforms with the current grammar from the current matcher state. This string
        can become the output of the LLM without requiring LLM decoding.

        This method does not change the matcher state.

        Returns
        -------
        jump_forward_string : str
            The jump-forward string.
        )rA   find_jump_forward_stringrB   s    r   rO   z'GrammarMatcher.find_jump_forward_string6  s     |44666r   r	   
num_tokensc                 :    | j                             |           dS )a;  Rollback the matcher to a previous state by several tokens.

        Parameters
        ----------
        num_tokens : int, default: 1
            The number of tokens to rollback. It cannot exceed the current number of steps, nor can
            it exceed the specified maximum number of rollback tokens.
        N)rA   rollback)rB   rQ   s     r   rS   zGrammarMatcher.rollbackD  s      	j)))))r   c                 4    | j                                         S )aX  Check if the matcher has terminated. If terminate_without_stop_token is False, the
        matcher will terminate if it has accepted the stop token. Otherwise, the matcher will
        terminate after matching the whole grammar.

        Returns
        -------
        terminated : bool
            Whether the matcher has terminated.
        )rA   is_terminatedrP   s    r   rU   zGrammarMatcher.is_terminatedO  s     |))+++r   c                 4    | j                                         S )z'Reset the matcher to the initial state.)rA   resetrP   s    r   rW   zGrammarMatcher.reset[  s    |!!###r   c                     dS )zDepracated. Now max_rollback_tokens is always unlimited (-1).

        Get the maximum number of rollback tokens allowed.

        Returns
        -------
        max_rollback_tokens : int
            The maximum number of rollback tokens.
        r    rP   s    r   r8   z"GrammarMatcher.max_rollback_tokens_  s	     rr   c                     | j         j        S )a"  The ids of the stop tokens used in the matcher. If specified, the provided stop tokens
        will be used. Otherwise, the stop tokens will be detected from the vocabulary.

        Returns
        -------
        stop_token_ids : List[int]
            The ids of the stop tokens.
        )rA   stop_token_idsrP   s    r   r[   zGrammarMatcher.stop_token_idsl  s     |**r   c                 4    | j                                         S )a  Print the internal state of the matcher. This is used for debugging. The
        representation of the internal state is subject to change.

        Returns
        -------
        internal_state : str
            The internal state of the matcher.
        )rA   _debug_print_internal_staterP   s    r   r]   z*GrammarMatcher._debug_print_internal_statex  s     |77999r   )r   )r	   )r   N)__name__
__module____qualname____doc__r   r   r   r?   r   boolrC   rG   strbytesrJ   r   rM   rO   rS   rU   rW   propertyr8   r[   r]   rY   r   r   r5   r5      sQ        * AE-2#%1
 1
 1
)1
 'uS$s)^'<=	1

 '+1
 !1
 
1
 1
 1
 1
f BG @ @ @S @$ @4 @ @ @ @< RW B B BuS%Z'8 B$ B[_ B B B B, 01 QIN Q  Q  Q  Q), QBF Q	 Q  Q  Q  QD7# 7 7 7 7	* 	*3 	*t 	* 	* 	* 	*
,t 
, 
, 
, 
,$ $ $ $ 
S 
 
 
 X
 	+S	 	+ 	+ 	+ X	+	:S 	: 	: 	: 	: 	: 	:r   r5   c                   6   e Zd ZdZddeeed         f         ddfdZ	 	 dded	         d
e	de
ee                  deddf
dZe	 dded	         dee         dedee         fd            Ze	 dded	         deeeef                  dedee         fd            ZdS )BatchGrammarMatcherzA batch version of GrammarMatcher that can fill the next token bitmask for multiple
    matchers in parallel. It utilizes multiple threads to speed up the computation. It is
    especially useful when the batch size is large.
    automax_threadsr   Nc                 T    |                      t          j        |                     dS )aG  Construct the batch grammar matcher.

        Parameters
        ----------
        max_threads : Union[int, Literal["auto"]], default: "auto"
            The maximum number of threads to use for parallel processing. If set to "auto", the
            max_threads will be set to std::thread::hardware_concurrency() / 2.
        N)r@   r   rg   )rB   ri   s     r   rC   zBatchGrammarMatcher.__init__  s)     	%3K@@AAAAAr   Fmatchersr5   r!   r%   rD   c                 X    d |D             }| j                             ||||           dS )a  Fill the next token bitmask for multiple matchers.

        Parameters
        ----------
        matchers : List[GrammarMatcher]
            The list of matchers to fill the bitmask for.

        bitmask : ArrayLike
            Must be a 2-dimensional int32 tensor with shape (bitmask_batch_size, bitmask_size).
            Bitmask_batch_size could be larger than the actual batch size to allow padding.
            Bitmask_size equals to ceil(vocab_size/32), and could be computed through
            xgrammar.allocate_token_bitmask.

        indices : Optional[List[int]], default: None
            A list of indices to specify which rows in the bitmask to fill. If None, fill
            the bitmask [0:len(matchers))].

        debug_print : bool, default: False
            Whether to print information about generated bitmask. Helpful for debugging.

        Raises
        ------
        RuntimeError
            If the bitmask is invalid (not on CPU, not int32, shape mismatch).
        c                     g | ]	}|j         
S rY   rA   .0matchers     r   
<listcomp>zEBatchGrammarMatcher.batch_fill_next_token_bitmask.<locals>.<listcomp>      CCCw7?CCCr   N)rA   batch_fill_next_token_bitmask)rB   rk   r!   r%   rD   matcher_handless         r   rt   z1BatchGrammarMatcher.batch_fill_next_token_bitmask  s<    @ DC(CCC22?GWVabbbbbr   tokensc                 \    d | D             }t           j                            |||          S )a  Accept a batch of tokens for multiple matchers.

        Parameters
        ----------
        matchers : List[GrammarMatcher]
            The list of matchers to accept tokens for.

        tokens : List[int]
            The list of tokens to accept.

        debug_print : bool, default: False
            Whether to print information about generated bitmask. Helpful for debugging.

        Returns
        -------
        accepted : List[bool]
            A list of booleans indicating whether each token was accepted by its corresponding matcher.

        Raises
        ------
        RuntimeError
            If the sizes of matchers and tokens do not match.
        c                     g | ]	}|j         
S rY   rn   ro   s     r   rr   z:BatchGrammarMatcher.batch_accept_token.<locals>.<listcomp>  rs   r   )r   rg   batch_accept_token)rk   rv   rD   ru   s       r   ry   z&BatchGrammarMatcher.batch_accept_token  s4    6 DC(CCC(;;OVU`aaar   stringsc                 \    d | D             }t           j                            |||          S )a  Accept a batch of strings for multiple matchers.

        Parameters
        ----------
        matchers : List[GrammarMatcher]
            The list of matchers to accept tokens for.

        strings : List[Union[str, bytes]]
            The list of strings to accept.

        debug_print : bool, default: False
            Whether to print information about generated bitmask. Helpful for debugging.

        Returns
        -------
        accepted : List[bool]
            A list of booleans indicating whether each string was accepted by its corresponding matcher.

        Raises
        ------
        RuntimeError
            If the sizes of matchers and strings do not match.
        c                     g | ]	}|j         
S rY   rn   ro   s     r   rr   z;BatchGrammarMatcher.batch_accept_string.<locals>.<listcomp>  rs   r   )r   rg   batch_accept_string)rk   rz   rD   ru   s       r   r}   z'BatchGrammarMatcher.batch_accept_string  s4    : DC(CCC(<<_gWbcccr   )rh   )NF)F)r^   r_   r`   ra   r   r?   r   rC   r   r   r   rb   rt   staticmethodry   rc   rd   r}   rY   r   r   rg   rg     s        

B 
BE#wv*>$? 
BT 
B 
B 
B 
B  (,!"c "c'("c "c $s)$	"c
 "c 
"c "c "c "cH QVb b'(b26s)bJNb	db b b \b:  "d d'(deCJ'(d d 
d	d d d \d d dr   rg   )ra   r   r<   typingr   r   r   r   r   r   numpy.typingr   baser
   r   compilerr   int32r   r?   r   tensorr   Tensorr    r$   r3   r5   rg   rY   r   r   <module>r      s2      8 8 8 8 8 8 8 8 8 8 8 8 8 8  " " " " " " " " " " " " " " % % % % % % &4# 43 45c? 4 4 4 4
 U\"M222
bs b b b b b b4 $     !%#'_X _X _XL_X\_X 	_X
 d3i _X 
_X _X _X _XDe: e: e: e: e:Y e: e: e:Prd rd rd rd rd) rd rd rd rd rdr   