
    )`i.                     D   d Z ddlZddlmZ ddlZddlmZ ddlmZ ddl	m
Z
mZmZ ej        d             Ze	 	 	 d1d
ej        dej        dedeej                 dee         dej        fd            Z edd          dej        d
ej        dej        dedee         ddfd            Z ed          dej        d
ej        dej        dedee         ddfd            Ze edd          	 	 d2dej        d
ej        dej        dededee         dej        fd                        Z ed          dej        d
ej        dej        dededee         ddfd            Ze edd          	 	 d2d
ej        dej        dej        dedee         ddfd                        Z ed          	 	 d2d
ej        dej        dej        dedee         ddfd            Ze edd           	 	 d2dej        d
ej        dej        dej        dededee         ddfd!                        Z ed          	 	 d2dej        d
ej        dej        dej        dededee         ddfd"            Ze	 	 	 d1d
ej        dej        dedeej                 dee         dej        fd#            Z ed$d          dej        d
ej        dej        dedee         ddfd%            Z ed$          dej        d
ej        dej        dedee         ddfd&            Ze ed'd          	 	 d2d
ej        dej        dej        dedee         ddfd(                        Z ed'          	 	 d2d
ej        dej        dej        dedee         ddfd)            Ze ed*d+          	 d3d
ej        d,ej        d-ej        dedej        f
d.                        Z  ed*          	 d3d
ej        d,ej        d-ej        dedej        f
d/            Z!	 dd0l"m#Z#m$Z$ dS # e%$ r dZ#dZ$Y dS w xY w)4a3  
Copyright (c) 2024 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)Optional   )flashinfer_api)gen_norm_module)device_support_pdlregister_custom_opregister_fake_opc                  B    t                                                      S N)r   build_and_load     c/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/norm.pyget_norm_moduler      s    ++---r   ư>inputweightepsout
enable_pdlreturnc                     |t          | j                  }|t          j        |           }t	          || |||           |S )a^  Root mean square normalization.

    ``out[i] = (input[i] / RMS(input)) * weight[i]``

    Parameters
    ----------
    input: torch.Tensor
        Input tensor, 2D shape (batch_size, hidden_size) or 3D shape (batch_size, num_heads, hidden_size).
    weight: torch.Tensor
        Weight tensor, shape (hidden_size,).
    eps: float
        Epsilon for numerical stability.
    out: Optional[torch.Tensor]
        The output tensor, if specified, the kernel will update this tensor inplace.
    enable_pdl: bool
        Whether to enable `programmatic dependent launch
        <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization>`_

    Returns
    -------
    output: torch.Tensor
        Normalized tensor, 2D shape (batch_size, hidden_size) or 3D shape (batch_size, num_heads, hidden_size).
    )r   devicetorch
empty_like_rmsnormr   r   r   r   r   s        r   rmsnormr       sI    > '55

{u%%S%j111Jr   zflashinfer::rmsnorm)r   )mutates_argsc                 |    |t          |j                  }t                                          | ||||           d S r   )r   r   r   r   r   r   r   r   r   s        r   r   r   G   sA     '55
c5&#zBBBBBr   c                     d S r   r   r!   s        r   _rmsnorm_faker#   T   	     	Dr   zflashinfer::rmsnorm_quantscalec                 ~    |t          |j                  }t                                          | |||||           dS )a&  Root mean square normalization.

    ``out[i] = (input[i] / RMS(input)) * weight[i]``

    Parameters
    ----------
    out: torch.Tensor
        The output tensor, will quantize the output to the dtype of this tensor.
    input: torch.Tensor
        Input tensor, 2D shape (batch_size, hidden_size).
    weight: torch.Tensor
        Weight tensor, shape (hidden_size,).
    scale: float
        Scale factor for quantization.
    eps: float
        Epsilon for numerical stability.
    enable_pdl: bool
        Whether to enable `programmatic dependent launch
        <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization>`_

    Returns
    -------
    output: torch.Tensor
        Normalized tensor, 2D shape (batch_size, hidden_size).
    N)r   r   r   rmsnorm_quantr   r   r   r%   r   r   s         r   r'   r'   _   sD    F '55
##CsJOOOOOr   c                     d S r   r   r(   s         r   _rmsnorm_quant_faker*      s	     	Dr   zflashinfer::fused_add_rmsnorm)r   residualr+   c                 |    |t          | j                  }t                                          | ||||           dS )a  Fused add root mean square normalization.

    Step 1:
    ``residual[i] += input[i]``

    Step 2:
    ``input[i] = (residual[i] / RMS(residual)) * weight[i]``

    Parameters
    ----------
    input: torch.Tensor
        Input tensor, shape (batch_size, hidden_size).
    residual: torch.Tensor
        Residual tensor, shape (batch_size, hidden_size).
    weight: torch.Tensor
        Weight tensor, shape (hidden_size,).
    eps: float
        Epsilon for numerical stability.
    enable_pdl: bool
        Whether to enable `programmatic dependent launch
        <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization>`_
    N)r   r   r   fused_add_rmsnormr   r+   r   r   r   s        r   r-   r-      sA    > '55
''xjQQQQQr   c                     d S r   r   r.   s        r   _fused_add_rmsnorm_faker0      r$   r   z#flashinfer::fused_add_rmsnorm_quant)r   r+   c           	          |t          |j                  }t                                          | ||||||           dS )aR  Fused add root mean square normalization.

    Step 1:
    ``residual[i] += input[i]``

    Step 2:
    ``input[i] = (residual[i] / RMS(residual)) * weight[i]``

    Parameters
    ----------
    out: torch.Tensor
        The output tensor, will quantize the output to the dtype of this tensor.
    input: torch.Tensor
        Input tensor, shape (batch_size, hidden_size).
    residual: torch.Tensor
        Residual tensor, shape (batch_size, hidden_size).
    weight: torch.Tensor
        Weight tensor, shape (hidden_size,).
    scale: float
        Scale factor for quantization.
    eps: float
        Epsilon for numerical stability.
    enable_pdl: bool
        Whether to enable `programmatic dependent launch
        <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization>`_
    N)r   r   r   fused_add_rmsnorm_quantr   r   r+   r   r%   r   r   s          r   r2   r2      sQ    N '55
--UHfeS*    r   c                     d S r   r   r3   s          r   _fused_add_rmsnorm_quant_faker5      s	     	Dr   c                     |t          | j                  }|t          j        |           }t	          || |||           |S )a  Gemma-style root mean square normalization.

    ``out[i] = (input[i] / RMS(input)) * (weight[i] + 1)``

    Parameters
    ----------
    input: torch.Tensor
        Input tensor, shape (batch_size, hidden_size).
    weight: torch.Tensor
        Weight tensor, shape (hidden_size,).
    eps: float
        Epsilon for numerical stability.
    out: Optional[torch.Tensor]
        The output tensor, if specified, the kernel will update this tensor inplace.
    enable_pdl: bool
        Whether to enable `programmatic dependent launch
        <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization>`_

    Returns
    -------
    output: torch.Tensor
        Gemma Normalized tensor, shape (batch_size, hidden_size).
    )r   r   r   r   _gemma_rmsnormr   s        r   gemma_rmsnormr8      sI    > '55

{u%%3vsJ777Jr   zflashinfer::gemma_rmsnormc                 |    |t          |j                  }t                                          | ||||           d S r   )r   r   r   r8   r!   s        r   r7   r7   $  sA     '55
##CZHHHHHr   c                     d S r   r   r!   s        r   _gemma_rmsnorm_faker;   1  r$   r   z#flashinfer::gemma_fused_add_rmsnormc                 |    |t          | j                  }t                                          | ||||           dS )a  Gemma-style fused add root mean square normalization.

    Step 1:
    ``residual[i] += input[i]``

    Step 2:
    ``input[i] = (residual[i] / RMS(residual)) * (weight + 1)``

    Parameters
    ----------
    input: torch.Tensor
        Input tensor, shape (batch_size, hidden_size).
    residual: torch.Tensor
        Residual tensor, shape (batch_size, hidden_size).
    weight: torch.Tensor
        Weight tensor, shape (hidden_size,).
    eps: float
        Epsilon for numerical stability.
    enable_pdl: bool
        Whether to enable `programmatic dependent launch
        <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization>`_
    N)r   r   r   gemma_fused_add_rmsnormr.   s        r   r=   r=   <  sB    B '55
--eXvsJWWWWWr   c                     d S r   r   r.   s        r   _gemma_fused_add_rmsnorm_faker?   b  r$   r   zflashinfer::layernormr   gemmabetac                 x    t          j        |           }t                                          || |||           |S )a
  Layer normalization.
    Parameters
    ----------
    input: torch.Tensor
        Input tensor, shape (batch_size, hidden_size). Need to be bfloat16.
    gemma: torch.Tensor
        Gemma tensor, shape (hidden_size,). Need to be float32.
    beta: torch.Tensor
        Beta tensor, shape (hidden_size,). Need to be float32.
    eps: float
        Epsilon for numerical stability.

    Returns
    -------
    output: torch.Tensor
        Layer Normalized tensor, shape (batch_size, hidden_size). Same dtype as input.
    )r   r   r   	layernorm)r   r@   rA   r   r   s        r   rC   rC   m  s;    2 
5
!
!CUE4===Jr   c                 D    | j         \  }}|                     ||g          S r   )shape	new_empty)r   r@   rA   r   bks         r   _layernorm_fakerI     s%     ;DAq??Aq6"""r   )rmsnorm_fp4quantadd_rmsnorm_fp4quant)r   NN)r   N)r   )&__doc__	functoolstypingr   r   api_loggingr   jit.normr   utilsr   r   r	   cacher   Tensorfloatboolr   r   r#   r'   r*   r-   r0   r2   r5   r8   r7   r;   r=   r?   rC   rI   cute_dslrJ   rK   ImportErrorr   r   r   <module>rX      s/	                ' ' ' ' ' ' % % % % % % K K K K K K K K K K . . .  "&!%# #<#L# 
# 
%,		#
 # \# # # #L )AAA	C		C<	C L	C 
		C
 	C 
	C 	C 	C BA	C '((			<	 L	 
		
 	 
	 	 	 )(	 /hGGG !%#P #P	#P<#P L#P 	#P
 
#P #P \#P #P #P HG #PL -..			<	 L	 		
 
	 	 
	 	 	 /.	 3BWXXX
 !%R R<RlR LR 
	R
 R 
R R R YX RD 122
 !%	 	<	l	 L	 
		
 	 
	 	 	 32	 )8K   !%' '	'<' l' L	'
 ' 
' ' 
' ' '  'T 788 !%		 					<		 l		 L			
 		 
		 		 
		 		 		 98		  "&!%# #<#L# 
# 
%,		#
 # \# # # #L /hGGG	I		I<	I L	I 
		I
 	I 
	I 	I 	I HG	I -..			<	 L	 
		
 	 
	 	 	 /.	 )8M   !%X X<XlX LX 
	X
 X 
X X X  XD 788
 !%	 	<	l	 L	 
		
 	 
	 	 	 98	 +"===
 	 << , 
	
 \   >= 8 )**
 	# #<#<# ,# 
	#
 \# # # +*# @@@@@@@@@@       s   P 	PP