
    )`i              
          d Z ddlZddlmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZmZmZmZ dd	lmZ ej        d
efd            Zdej        dej        ddfdZe	 ddej        dej        dee         dej        fd            Ze	 ddej        dej        dee         dej        fd            Ze	 ddej        dej        dee         dej        fd            Zed             ZdS )a3  
Copyright (c) 2024 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)SimpleNamespace)Optional   )flashinfer_api)gen_act_and_mul_module)device_support_pdlregister_custom_opregister_fake_opget_compute_capability)get_fp4_quantization_moduleact_func_namec           
         t          |                                           }|  d}t          ||          t          d| d          	 ddt          j        dt          j        dt          t                   dd ffd	            }t          d|           	 ddt          j        dt          j        dt          t                   dd fd
            }t          di ||iS )N_and_mulzflashinfer::)out)mutates_argsr   input
enable_pdlreturnc                 N    |t          |j                  } | ||           d S N)r   device)r   r   r   fns      i/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/activation.py_act_and_mulz,get_act_and_mul_module.<locals>._act_and_mul*   s4     +EL99J
3z"""""    c                     d S r    )r   r   r   s      r   _fake_act_and_mulz1get_act_and_mul_module.<locals>._fake_act_and_mul2   s	     	r   r   r   )
r   build_and_loadgetattrr	   torchTensorr   boolr
   r   )r   modulefnamer   r   r   s        @r   get_act_and_mul_moduler&   "   s(   #M22AACCF &&&E			B.u..XFFFMQ# #\#"',#<DTN#	# # # # # GF# ,U,,--MQ \"',<DTN	   .- 33e\2333r   r   outputr   c                 x   | j         |j         k    sJ | j          d|j                      | j        d d         |j        d d         k    s)J | j        d d          d|j        d d                      | j        d         d|j        d         z  k    s(J | j        d          dd|j        d         z               d S )Nz !=    )ndimshape)r   r'   s     r   _check_shaper-   <   s    :$$$&F&F&F&F$$$;ssv|CRC0000;ss44crc!244 100 ;r?a&,r"22222;r?66FL$4 466 32222r   r   r   c                    |t          | j                  }| j        d         | j        j        z  dz  dk    rt          d          |t          | |           n@t          j        | j        dd         | j        d         dz  fz   | j        | j                  }t          d          
                    || |           |S )	aw  Fused SiLU and Mul operation.

    ``silu(input[..., :hidden_size]) * input[..., hidden_size:]``

    Parameters
    ----------
    input: torch.Tensor
        Input tensor, shape (..., 2 * hidden_size).

    out: Optional[torch.Tensor]
        The output tensor, if specified, the kernel will update this tensor inplace.

    enable_pdl: bool
        Whether to enable `programmatic dependent launch
        <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization>`_

    Returns
    -------
    output: torch.Tensor
        Output tensor, shape (..., hidden_size).
    Nr)      r   *The pointers must be multiple of 16 bytes.r*   r   dtypesilu)r   r   r,   r2   itemsize
ValueErrorr-   r!   emptyr&   silu_and_mulr   r   r   s      r   r7   r7   F   s    2 '55
{2--2a77EFFF
UC    kKB1 466<+
 
 

 6""//  
 Jr   c                    |t          | j                  }| j        d         | j        j        z  dz  dk    rt          d          |t          | |           n@t          j        | j        dd         | j        d         dz  fz   | j        | j                  }t          d          
                    || |           |S )	a  Fused GeLU Tanh and Mul operation.

    ``gelu(tanh(input[..., :hidden_size])) * input[..., hidden_size:]``

    Parameters
    ----------
    input: torch.Tensor
        Input tensor, shape (..., 2 * hidden_size).

    out: Optional[torch.Tensor]
        The output tensor, if specified, the kernel will update this tensor inplace.

    enable_pdl: bool
        Whether to enable `programmatic dependent launch
        <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization>`_

    Returns
    -------
    output: torch.Tensor
        Output tensor, shape (..., hidden_size).
    Nr)   r/   r   r0   r*   r1   	gelu_tanh)r   r   r,   r2   r4   r5   r-   r!   r6   r&   gelu_tanh_and_mulr8   s      r   r;   r;   s   s    2 '55
{2--2a77EFFF
UC    kKB1 466<+
 
 

 ;''99#ujQQQJr   c                    |t          | j                  }| j        d         | j        j        z  dz  dk    rt          d          |t          | |           n@t          j        | j        dd         | j        d         dz  fz   | j        | j                  }t          d          
                    || |           |S )	aw  Fused GeLU and Mul operation.

    ``gelu(input[..., :hidden_size]) * input[..., hidden_size:]``

    Parameters
    ----------
    input: torch.Tensor
        Input tensor, shape (..., 2 * hidden_size).

    out: Optional[torch.Tensor]
        The output tensor, if specified, the kernel will update this tensor inplace.

    enable_pdl: bool
        Whether to enable `programmatic dependent launch
        <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization>`_

    Returns
    -------
    output: torch.Tensor
        Output tensor, shape (..., hidden_size).
    Nr)   r/   r   r0   r*   r1   gelu)r   r   r,   r2   r4   r5   r-   r!   r6   r&   gelu_and_mulr8   s      r   r>   r>      s    2 '55
{2--2a77EFFF
UC    kKB1 466<+
 
 

 6""//UJGGGJr   c                     t          | j                  \  }}|dz  |z    }t          |                              | ||          \  }}||fS )a  
    Silu and multiply and quantize batched input tensor to NVFP4 format with mask.
    Parameters:
        a (torch.Tensor): Input tensor of shape [B, M, K] with dtype fp16/bf16.
        a_global_sf (torch.Tensor): Global scale factor of shape [1] with dtype float32.
        mask (torch.Tensor): Mask tensor to apply before quantization.
        sf_vec_size (int, optional): Scale factor vector size. Defaults to 16.
    Returns:
        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
            - Quantized tensor of shape [B, M, K/2] with dtype FLOAT4_E2M1X2
            - Scale factors tensor with shape determined by layout and sf_vec_size
    
   )r   r   r   0silu_and_mul_scaled_nvfp4_experts_quantize_sm100)amaska_global_sfmajorminordevice_archa_fp4a_sfs           r   *silu_and_mul_scaled_nvfp4_experts_quantizerJ      sf    $ *!(33LE5RZ%')K- 66	  E4 $;r   )NN)__doc__	functoolstypesr   typingr   r!   api_loggingr   jitr   utilsr   r	   r
   r   fp4_quantizationr   cachestrr&   r"   r-   r#   r7   r;   r>   rJ   r   r   r   <module>rU      s2         ! ! ! ! ! !        ' ' ' ' ' ' ' ' ' ' ' '            : 9 9 9 9 9 4# 4 4 4 42 el t     PT) )<)#l)?G~)
\) ) ) )X PT% %<%#l%?G~%
\% % % %P PT% %<%#l%?G~%
\% % % %P     r   