
    *`i                         d dl mZmZ d dlZ	 d dlZd dlmZ n# e$ rZ	 ed          e	dZ	[	ww xY wej
        dej        dej        fd            Z	 	 ddej        dej        d	ee         d
eee                  fdZdS )    )ListOptionalNzTriton is not installedNUM_SMS
BLOCK_SIZEc	                    t          j        d          }	t          j        ||          }
t          j        |	||
z  |          D ]}||
z  }||
z  |z  }||nt          j        ||z             }|t          j        d|          z   }|dz  t          j        d|dz            z   }||k     }||k     }t          j        |||z  z   |z   |          }|dddf         t          j        dd          dddf         z	  dz  dk    }|                    |          }t          j        | ||z  z   |z   t          d           ||z             dS )a  Apply a bitmask to logits in-place using Triton. The bitmask is a 01 bitwise compressed tensor,
    where 0 means the token is masked and 1 means the token is not masked. After applying the bitmask,
    the masked logits will be set to -inf.

    Parameters
    ----------
    logits_ptr : tl.tensor
        Pointer to the logits tensor to apply the bitmask to.

    bitmask_ptr : tl.tensor
        Pointer to the bitmask tensor to apply.

    indices_ptr : Optional[tl.tensor]
        Optional pointer to indices tensor specifying which rows to apply the mask to.

    num_rows : int
        Number of rows to process. If indices_ptr is provided, this is the number of unique indices.

    vocab_size : int
        Size of the vocabulary dimension. If the logits does not have a vocab padding, this is the
        same as the logits's second dimension. Otherwise, this is the actual size of the vocabulary.

    logits_strides : int
        Stride between rows in the logits tensor.

    bitmask_strides : int
        Stride between rows in the bitmask tensor.

    NUM_SMS : int
        Number of streaming multiprocessors to use.

    BLOCK_SIZE : int
        Size of processing blocks.
    r   N       inf)	tl
program_idcdivrangeloadarangereshapestorefloat)
logits_ptrbitmask_ptrindices_ptrnum_rows
vocab_sizelogits_stridesbitmask_stridesr   r   pid
num_blockswork_idrow_idblock_offsetbatch_idoffsetsbitmask_offsets
vocab_maskpacked_bitmask_maskpacked_bitmaskbitmasks                        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/xgrammar/kernels/apply_token_bitmask_inplace_triton.py"apply_token_bitmask_inplace_kernelr(      s   ^ -

CZ00J8CJ!6@@ 
 
J&*,
:(066bgkF>R6S6S1j!9!99&",ryJ"<L/M/MMz)
-?(_44FH[
 
 #111d7+	!R0@0@qqq0IJaOTUU//*--
N22W<uU||mZZaMa	
 	
 	
 	

 
    logitsr&   r   indicesc                 B   t           j                            d          j        }d}t           j                            d          j        }t           j        j        d|vrd}nd}|j        t           j        k    s
J d            t          | j
        d         |j
        d         dz            }||}n||k    sJ d	| d
|             |t          |          n| j        dk    r| j
        d         nd}	|<t          j        |t           j                  }
|
                    | j        d          }|f}t!          |         | |||	||                                 d         |                                d         ||||z  d|                                 z  z  d           d S )Ncudai   r   gfx1@   r   zbitmask must be of type int32zvocab_size z( is larger than the detected vocab_size    r	   )dtypeT)devicenon_blocking      )	num_warps
num_stages)torchr-   get_device_propertiesmulti_processor_countgcnArchNameversionhipr2   int32minshapelenndimtensortor3   r(   strideelement_size)r*   r&   r   r+   r   r   arch	WARP_SIZEdetected_vocab_sizer   indices_cpugrids               r'   "apply_token_bitmask_inplace_tritonrM   P   s    j..v66LGJ:++A..:D}$t););			=EK''')H'''fl2.b0AB0FGG(

 ----bbbM`bb .--  '2s7|||6;Z[K[K[QabHl7%+>>>..D.II:D&t,	)bF4G4G4I4I.IJ     r)   )NN)typingr   r   r9   tritontriton.languagelanguager   ImportErrorerrjit	constexprr(   TensorintrM    r)   r'   <module>rY      s   ! ! ! ! ! ! ! ! :MMM        : : :
+/
0
0c9: @
 \@
 @
 @
 @
 @
L !%#'	. .L.\. . d3i 	. . . . . .s   
 /*/