
    )`iN                     r
   U d Z ddlZddlZddlmZ ddlmZmZmZm	Z	m
Z
mZmZ ddlZddlZddlZddlmZ ddlmZ ddlZddlmZ  G d	 d
e          Z G d de          Z G d de          ZdZ G d de          Z G d de          Z G d de          Zdej        de dej        fdZ!dej        de dej        fdZ"de#de#fdZ$	 dde#de#de#d e#de#f
d!Z%d"e ddfd#Z&de ddfd$Z'dej        de(fd%Z)dej        dej        fd&Z*d'eej        eej        ej        f         f         de deej        ej        f         fd(Z+d)e#dej        fd*Z,i Z-eee ej.        f         ej        f         e/d+<   	 dd-e d.e#d/ej.        d0e(dej        f
d1Z0de#de#fd2Z1d3e#d/ej.        dej        fd4Z2d5e#d/ej.        dej        fd6Z3d7eej4        e f         dej4        fd8Z5ej6        d/ej.        dee#e#f         fd9            Z7ej6        d/ej.        de8fd:            Z9d;ej        d<ej        d=ej4        d>ej4        ddf
d?Z: ee           ed@          k     rW	 ddddAd-e dBe	e         dCee ee          f         dDe	ee e
e          f                  dEe	e          defdFZ;	 dd-e dBe	e         defdGZ<nV	 ddddAd-e dBe	e         dCee ee          f         dDe	ee e
e          f                  dEe	e          defdHZ;	 dd-e dBe	e         defdIZ<d/ej.        de fdJZ=d"e#dKe(dLe(d=ej4        d>ej4        de(fdMZ>d"e#dKe(dLe(d=ej4        d>ej4        de(fdNZ?d/ej.        d"e#dKe(dLe(d=ej4        d>ej4        de fdOZ@dPe dQe de(fdRZAde(fdSZBddTlCmDZDmEZE de fdUZFd/ej.        de(fdVZGd/ej.        de(fdWZHd/ej.        de(fdXZId/ej.        de(fdYZJd/ej.        de(fdZZKd/ej.        de(fd[ZLd/ej.        de fd\ZMdej        d]e	e
e#                  d^e	ej4                 d_e	ej.                 d-e ddfd`ZNej6        da             ZO G db dce          ZPePjQ        ePjR        ePjS        ePjT        ePjU        ePjV        ddZWdee ddfdfZXej6        d/ej.        de(fdg            ZYde#dhe#de#fdiZZde#dhe#de#fdjZ[ej6        d/ej.        de#fdk            Z\ G dl dm          Z]g dnZ^g doZ_dpe#de#fdqZ`drej        dpe#dej        fdsZa	 ddrej        dpe#due#dej        fdvZbdw Zcdxee#         defdyZd	 	 ddzee ef         d{e	e         d|e	e         defd}Zeej6        d/ej.        fd~            ZfdS )a3  
Copyright (c) 2023 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)Enum)CallableDictIterableOptionalSequenceTupleUnion)TorchVersion)__version__   )gen_spdlog_modulec                       e Zd ZdZdZdZdS )PosEncodingModer   r      N)__name__
__module____qualname__NONE
ROPE_LLAMAALIBI     d/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/utils.pyr   r       s        DJEEEr   r   c                       e Zd ZdZdZdZdZdS )MaskModer   r   r      N)r   r   r   
NON_CAUSALCAUSALCUSTOMMULTIITEMSCORINGr   r   r   r   r   &   s%        JFFr   r   c                       e Zd ZdZdZdS )TensorLayoutr   r   N)r   r   r   NHDHNDr   r   r   r#   r#   -   s        
C
CCCr   r#   g+eG?c                       e Zd ZdZdS )GPUArchitectureErrorz5Custom exception for GPU architecture-related errors.Nr   r   r   __doc__r   r   r   r'   r'   5   s        ??Dr   r'   c                       e Zd ZdZdS )LibraryErrorz,Custom exception for library-related errors.Nr(   r   r   r   r+   r+   ;           66Dr   r+   c                       e Zd ZdZdS )BackendSupportedErrorz,Custom exception for backend-related errors.Nr(   r   r   r   r.   r.   A   r,   r   r.   x	kv_layoutreturnc                     | j         dvrt          d          | j         dk    rX|dk    r|                     d          S |dk    r|                     d          S t          d                    |                    | S )	N)      zx must be 4D or 5Dr3   r$   r%   Invalid kv_layout {}ndim
ValueError	unsqueezeKeyErrorformatr/   r0   s     r   
_expand_5dr?   G       vV-...v{{ ;;r??"% ;;r??"188CCDDDHr   c                     | j         dvrt          d          | j         dk    rX|dk    r|                     d          S |dk    r|                     d          S t          d                    |                    | S )	N)r   r3   zx must be 3D or 4Dr   r$   r5   r%   r6   r7   r8   r>   s     r   
_expand_4drB   Y   r@   r   c                     | dk     rdS | dz
  }||dz	  z  }||dz	  z  }||dz	  z  }||dz	  z  }||dz	  z  }||dz	  z  }|dz   S )Nr   r   r3             r   )r/   ns     r   next_positive_power_of_2rH   k   sr    1uuq
 	
AAaKAaKAaKAaKAbLAbLAq5Lr      
num_tokensnum_expertstop_kmax_tile_tokens_dimc                     d}| |z  |z  }t          ||z            }t          |          }t          t          |d          |          }|S )Ng?rD   )intrH   minmax)rJ   rK   rL   rM   imbalance_factornum_tokens_per_experttile_tokens_dims          r   calculate_tile_tokens_dimrU   |   sa      (%/K? 58H HII./DEEO #oq113FGGOr   pos_encoding_modec                 t    t          t          |           s"t          d                    |                     d S )NzInvalid pos_encoding_mode {})hasattrr   r<   r=   )rV   s    r   _check_pos_encoding_moderY      s@    ?$566 Q5<<=NOOPPPQ Qr   c                 t    t          t          |           s"t          d                    |                     d S )Nr7   )rX   r#   r<   r=   )r0   s    r   _check_kv_layoutr[      s>    <++ A-44Y??@@@A Ar   c                 @    | j         t          j        t          j        fv S N)dtypetorchfloat8_e4m3fnfloat8_e5m2r/   s    r   	is_float8rc      s    7u*E,=>>>r   c                     |                      t          j                  } t          j        | j        d         dz   | j        | j                  }|                     d          |dd <   |S )Nr   r   r^   device)tor_   int64zerosshaper^   rf   cumsum)r/   rets     r   
get_indptrrm      sU    	U[A
+agaj1nAGAH
E
E
EChhqkkCGJr   paged_kv_cachec                 V   t          | t                    r%| \  }}t          ||          t          ||          fS t          j        |           r-t          | |          } |                     d          \  }}||fS t          d                    t          |                               )Nr   )dimzQUnrecognized paged_kv_cache type {}, expect a single tensor or a tuple of tensor.)

isinstancetuplerB   r_   	is_tensorr?   unbindr<   r=   type)rn   r0   paged_k_cachepaged_v_caches       r   _unpack_paged_kv_cacherx      s     .%(( 
'5$}}i00}i00
 	
 
	(	( 

#NI>>'5'<'<'<'C'C$}m++_ff^$$ 
 
 	
r   n_headsc           
         dt          j        t          j        |                     z  }dd|z  z  }t          j        |t          j        dd|z                       }|| k     rPdd|z  z  }t          j        |t          j        ddd| |z
  z  z   d                    }t          j        ||g          }|                                S )Nr   g       @g       r   g      )mathfloorlog2r_   powarangecatfloat)ry   rG   m_0mm_hat_0m_hats         r   get_alibi_slopesr      s    	TZ	'**+++A
$(
C	#u|Aq1u--..A7{{$(#	'5<1qGaK7H3H!#L#LMMIq%j!!7799r   
_cache_bufFnamebytesrf   	zero_initc                    | |f}t                               |          }||                    d          |k     rO|r"t          j        |t          j        |          }n!t          j        |t          j        |          }|t           |<   |S )Nr   re   )r   getsizer_   ri   uint8empty)r   r   rf   r   keybufs         r   _get_cache_bufr      s     .C
..

C
{chhqkkE)) 	G+e5;vFFFCC+e5;vFFFC
3Jr   c                 6    d| dz
                                   z  S )Nr   )
bit_lengthrb   s    r   
_ceil_pow2r      s    Q""$$$$r   seq_lenc                     t          |           }d| |f}t                              |          }|+t          j        ||t          j                  }|t          |<   |d |          S )Nrange_)rf   r^   )r   r   r   r_   r   int32)r   rf   seq_len_pow2r   r   s        r   _get_range_bufr      sf    g&&L"L""F
+C
..

C
{l<ekJJJ
3xx=r   num_qo_headsc                     d|  |f}t                               |          }|,t          |                               |          }|t           |<   |S )Nalibi_slopes_)r   r   r   rg   )r   rf   r   r   s       r   _get_cache_alibi_slopes_bufr      sU     *<))6
2C
..

C
{|,,//77
3Jr   r^   c                     t          | t                    rt          t          |           S t          | t          j                  r| S t          d                    t          |                               )Nz-dtype must be a string or torch.dtype, got {})rq   strgetattrr_   r^   	TypeErrorr=   ru   r^   s    r   canonicalize_torch_dtyper      sf    % 
ue$$$	E5;	'	' 
;BB4;;OO
 
 	
r   c                 ~    | j         dk    rt          d          t          j                            | j                  S )Ncudazdevice must be a cuda device)ru   r:   r_   r   get_device_capabilityindexrf   s    r   get_compute_capabilityr     s6    {f7888:++FL999r   c                    t          | t                    rt          j        |           } | j        dk    rt          d|            | j        | j        nd}t          j                     	 t          j	        |          }t          j
        |          }t          j        |t          j                  }||z  dz  dz  dz  }|t          j                     S # t          j                     w xY w)a  
    Get GPU memory bandwidth in GB/s for the specified CUDA device.

    Args:
        device: torch.device object, e.g., torch.device('cuda:0')

    Returns:
        float: GPU memory bandwidth (GB/s)

    Raises:
        ValueError: If device is not a CUDA device
    r   z"Device must be a CUDA device, got Nr   r   rD   i  )rq   r   r_   rf   ru   r:   r   pynvmlnvmlInitnvmlDeviceGetHandleByIndexnvmlDeviceGetMemoryBusWidthnvmlDeviceGetClockInfoNVML_CLOCK_MEMnvmlShutdown)rf   device_indexhandle	bus_width	mem_clock	bandwidths         r   get_gpu_memory_bandwidthr     s     &# &f%% {fFfFFGGG $*<#;6<<L O
2<@@6v>>	1&&:OPP	 *Q.!3d:	s   +AC C*qkdtype_qdtype_kvc                     | j         |k    rt          d| j          d| d          |j         |k    rt          d|j          d| d          d S )NzThe dtype of q z  does not match the q_data_type z specified in plan function.zThe dtype of k z! does not match the kv_data_type )r^   r:   )r   r   r   r   s       r   _check_cached_qkv_data_typer   0  s|     	w'lagllwlll
 
 	
 	w(nagnnnnn
 
 	
 r   z2.4)device_typesschemafnmutates_argsr   r   c                   d S )Nc                     | S r]   r   rb   s    r   <lambda>z$register_custom_op.<locals>.<lambda>H       r   r   r   r   r   r   r   s        r   register_custom_opr   ?  s     {r   c                     d S )Nc                     | S r]   r   rb   s    r   r   z"register_fake_op.<locals>.<lambda>N  r   r   r   r   r   s     r   register_fake_opr   J  s     {r   c                   d S )Nc                     | S r]   r   rb   s    r   r   z$register_custom_op.<locals>.<lambda>e  r   r   r   r   s        r   r   r   R  s    & {r   c                     d S )Nc                     | S r]   r   rb   s    r   r   z"register_fake_op.<locals>.<lambda>l  r   r   r   r   s     r   r   r   g  s    
 {r   c                 d    t          |           \  }}|dk    rt          j        j        dk    rdS dS )N	   12.3sm90sm80)r   r_   versionr   rf   major_s      r   determine_gemm_backendr   o  s5    %f--HE1zzem(F22vvr   use_fp16_qk_reductionsuse_custom_maskc                 D    |rdS | t           j        j        k    rdS |rdS dS )a  
    Check if the FA3 backend is supported based on the given parameters.
    NOTE(Zihao): this function is a workaround for the lack of support for certain features in
    our FA3 backend, and will be removed once the backend is fully supported.

    Parameters
    ----------
    pos_encoding_mode : int
        The positional encoding mode.
    use_fp16_qk_reductions : bool
        Whether FP16 QK reductions are allowed.
    use_custom_mask : bool
        Whether a custom mask is used.
    dtype_q : torch.dtype
        The data type of the query tensor.
    dtype_kv : torch.dtype
        The data type of the key-value tensor.

    Returns
    -------
    bool
        True if the FA3 backend is supported, False otherwise.
    FT)r   r   valuerV   r   r   r   r   s        r   is_fa3_backend_supportedr   w  s9    <  uO0666u u4r   c                     |rdS | t           j        j        k    rdS |rdS |t          j        t          j        fv rdS |t          j        t          j        fv rdS dS )a;  
    Check if the cutlass backend is supported based on the given parameters.

    Parameters
    ----------
    pos_encoding_mode : int
        The positional encoding mode.
    use_fp16_qk_reductions : bool
        Whether FP16 QK reductions are allowed.
    use_custom_mask : bool
        Whether a custom mask is used.
    dtype_q : torch.dtype
        The data type of the query tensor.
    dtype_kv : torch.dtype
        The data type of the key-value tensor.

    Returns
    -------
    bool
        True if the cutlass backend is supported, False otherwise.
    FT)r   r   r   r_   r`   ra   r   s        r   is_cutlass_backend_supportedr     so    8  uO0666u u5&(9:::uE'):;;;u4r   c                 N    t          |           rt          |||||          rdS dS )a  
    Determine the appropriate attention backend based on the device and parameters.

    Parameters
    ----------
    device : torch.device
        The device to be used.
    mask_mode : int
        The mask mode.
    pos_encoding_mode : int
        The positional encoding mode.
    use_fp16_qk_reductions : bool
        Whether FP16 QK reductions are allowed.
    use_custom_mask : bool
        Whether a custom mask is used.
    dtype_q : torch.dtype
        The data type of the query tensor.
    dtype_kv : torch.dtype
        The data type of the key-value tensor.

    Returns
    -------
    str
        The name of the attention backend to be used.
    fa3fa2)is_sm90a_supportedr   )rf   rV   r   r   r   r   s         r   determine_attention_backendr     sB    B &!! 	&>' ' 	 uur   r   base_versionc                 f    ddl m} |                    |           |                    |          k    S )Nr   )r   )	packagingr   parse)r   r   pkg_versions      r   version_at_leastr     s;    000000W%%):):<)H)HHHr   c                  B    ddl } | j                            d          duS )z
    Check if cuda.cudart module is available (cuda-python <= 12.9).

    Returns:
        True if cuda.cudart exists, False otherwise
    r   Nzcuda.cudart)importlib.utilutil	find_spec)	importlibs    r   has_cuda_cudartr     s+     >##M22$>>r   )has_flashinfer_jit_cachehas_flashinfer_cubinc                      dd l } | j        S )Nr   )r   r   )r   s    r   get_cuda_python_versionr     s    KKKr   c                 p    t          |           \  }}|dk    ot          t          j        j        d          S )Nr   r   r   r   r_   r   r   r   s      r   r   r     s2    %f--HE1A:F*5=+=vFFFr   c                 p    t          |           \  }}|dk    ot          t          j        j        d          S )N
   12.8r   r   s      r   is_sm100a_supportedr     2    %f--HE1B;G+EM,>GGGr   c                 p    t          |           \  }}|dk    ot          t          j        j        d          S )Nr   12.9r   r   s      r   is_sm100f_supportedr    r   r   c                 p    t          |           \  }}|dk    ot          t          j        j        d          S )N   z13.0r   r   s      r   is_sm110a_supportedr  "  r   r   c                 |    t          |           \  }}|dk    o$|dk    ot          t          j        j        d          S )N   r   r   r   rf   r   minors      r   is_sm120a_supportedr
  '  ;    )&11LE5B;V5A:V*:5=;Mv*V*VVr   c                 |    t          |           \  }}|dk    o$|dk    ot          t          j        j        d          S )Nr  r   r  r   r  s      r   is_sm121a_supportedr  ,  r  r   c                 (    t          |           rdndS )Nr   r   )r   r   s    r   determine_mla_backendr  1  s    &v..955E9r   expected_shapeexpected_dtypeexpected_devicec                 *   |r:| j         t          j        |          k    rt          d| d| d| j                    |r(| j        |k    rt          d| d| d| j                   |r(| j        |k    rt          d| d| d| j                   d S d S )NzInvalid shape of z: expected z, got zInvalid dtype of zInvalid device of )rj   r_   Sizer:   r^   rf   )r/   r  r  r  r   s        r   check_shape_dtype_devicer  5  s      
!'UZ%?%???PPPPPqwPP
 
 	
  
!'^33PPPPPqwPP
 
 	
  
1866SSS/SSSS
 
 	

 
66r   c                  B    t                                                      S r]   )r   build_and_loadr   r   r   get_logging_moduler  J  s    --///r   c                   &    e Zd ZdZdZdZdZdZdZdS )LogLevelr   r   r   r   r3   r4   N)	r   r   r   TRACEDEBUGINFOWARNERRORCRITICALr   r   r   r  r  O  s,        EEDDEHHHr   r  )tracedebuginfowarnerrorcriticallvl_strc                 h    t                                          t          |          j                   d S r]   )r  set_log_levellog_level_mapr   )r'  s    r   r)  r)  b  s*    &&}W'='CDDDDDr   c                 L    | j         dk    rdS t          |           \  }}|dk    S )Nr   Fr   )ru   r   r   s      r   device_support_pdlr,  f  s/    {fu%f--HE1A:r   yc                     | |z   dz
  |z  S )z
    Perform ceiling division of two integers.

    Args:
        x: the dividend.
        y: the divisor.

    Returns:
        The result of the ceiling division.
    r   r   r/   r-  s     r   ceil_divr0  n  s     EAI!r   c                 (    t          | |          |z  S )z'Round up x to the nearest multiple of y)r0  r/  s     r   round_upr2  |  s    Aq>>Ar   c                 J    t           j                            |           j        S r]   )r_   r   get_device_propertiesmulti_processor_countr   s    r   get_device_sm_countr6    s    :++F33IIr   c                   `    e Zd ZdZ	 	 d
dej        dej        dedeeedf                  fd	Z	dS )	FP4Tensora@  Wrapper class for FP4 tensors.

    Since PyTorch doesn't natively support FP4, this wrapper contains:
    - data: uint8 tensor storing the compressed FP4 data, the size of innermost dimension is ceil(original_dim / 2) since each uint8 stores 2 FP4 values
    - scale: float8_e4m3fn tensor storing the scale factors
    r   Ndatascalescale_start_indexoriginal_shape.c           	         |j         t          j        k    rt          d|j                    |j         t          j        k    rt          d|j                    |j        d         dz  dk    rt          d|j        d                    |dk     s||j        d         k    r t          d| d|j        d                    ||j        d         z   |j        d         k    r.t          d| d	|j        d          d|j        d                    ||j        d
d         |d
d         k    rt          d|j         d|           t          j        |d         dz            }|j        d         |k    r)t          d|j        d          d|d          d|           || _        || _	        || _
        || _        d| _         d
S )a3  Initialize FP4Tensor.

        Parameters
        ----------
        data : torch.Tensor
            uint8 tensor storing the compressed FP4 data
        scale : torch.Tensor
            float8_e4m3fn tensor storing the scale factors
        scale_start_index : int
            The start token index of the scale factors. This is needed when two kernels (like prefill and decode kernels) are reusing the same scale factor tensor with different offsets.
        original_shape : Optional[Tuple[int, ...]]
            The original shape before compression.
        zdata must be uint8 tensor, got z(scale must be float8_e4m3fn tensor, got r   rI   z.scale.shape[0] must be a multiple of 128, got zNscale start index must be in the range [0, scale.shape[0]). scale_start_index=z, scale.shape[0]=zTscale start index + data.shape[0] must not exceed scale.shape[0]. scale_start_index=z, data.shape[0]=NzVdata and original_shape must have the same dimensions except the last one. data.shape=z, original_shape=r   zIdata last dimension must be ceil(original_shape[-1] / 2). data.shape[-1]=z, original_shape[-1]=z, expected=nvfp4)r^   r_   r   r:   r`   rj   r{   ceilr9  r:  r;  r<  )selfr9  r:  r;  r<  expected_data_dims         r   __init__zFP4Tensor.__init__  s_   ( :$$KtzKKLLL ;%---UUUVVV;q>C1$$QQQQ   q  $5Q$G$GZ%6Z ZINUVZ Z   tz!},u{1~==y%6y yHL
STy yhmhstuhvy y   %z#2#.""555 P"&*P P?MP P   !%	.*<q*@ A Az"~!222 4&*jn4 4KYZ\K]4 4 14 4   	
!2,


r   )r   N)
r   r   r   r)   r_   TensorrO   r   r	   rC  r   r   r   r8  r8    sv          "#48> >l> |> 	>
 !sCx1> > > > > >r   r8  )r   rD   r   r   r   r   r   r  r3   r  r4                  ) r   rD   rE      r   r         r   r         r   r        r3   r        r4   rE        rF  rG        rH  rI        epilogue_tile_mc                      d}| dz  dk    rd}|S )NrE   rI   r   rF   r   )rY  shuffle_block_sizes     r   get_shuffle_block_sizer\    s$    !!r   input_tensorc                    |                                  dk    sJ d|                                               | j        \  }}t          |          }|dk    rt          nt          }||z  dk    sJ d|             t          j        |t
          j                  }t          |          D ]!}||z  }||z  }	||	         }
||z  |
z   }|||<   "|S )z
    Higher-level PyTorch approach to reorder the rows in blocks of size 16 or 32.
    - We do NOT try to handle custom e2m1 memory usage (i.e. no 'K/2' bytes).
    - Instead, we purely reorder rows in a standard PyTorch shape [M, K].
    r   (input_tensor should be a 2D tensor, not rE   r   z+input_tensor.shape[0] must be multiples of r   )	rp   rj   r\  srcToDstBlk16RowMapsrcToDstBlk32RowMapr_   r   longrange)r]  rY  MKr[  row_maprow_indicesold_row	block_idxrow_in_blockmapped_row_in_blocknew_rows               r    get_shuffle_matrix_a_row_indicesrm    s    """G<3C3C3E3EGG #""
 DAq 0@@%72%=%=!!CVG!!Q&&&J6HJJ '&& +auz222K88 ' '11	!33%l3003FF&Gr   rE   num_elts_per_sfc                 H   | j         t          j        k    s| j         t          j        k    sJ |dk    s|dk    sJ |                                 dk    sJ d|                                              | j        \  }}|dz  dk    sJ |dz  dk    sJ t          | |          }|S )NrE   rF   r   r_  rI   r   r3   )r^   r_   r   bfloat16rp   rj   rm  )r]  rY  rn  rd  re  rg  s         r   #get_shuffle_matrix_sf_a_row_indicesrq    s     ,,0Ben0T0T0TTb  Or$9$9$99"""G<3C3C3E3EGG #""
 DAqs7a<<<<q5A::::2<QQKr   c                  \    t          t          d          rt          j        S t          j        S )zFget native fp4 datatype if supported in Torch, otherwise return uint8.float4_e2m1fn_x2)rX   r_   rs  r   r   r   r   get_native_fp4_dtypert  +  s&    u()) %%{r   supported_ccsc           	         	 t          |           }n3# t          $ r& t          dt          |           j                   dw xY wg t	          |          D ]\  }}t          |t                    rt          d| d|           t          |t                    s*t          d| dt          |          j         d|                               |           fd}|S )a  
    Decorator to mark functions with their supported CUDA compute capabilities.

    This decorator annotates a function with metadata about which CUDA compute
    capabilities (CC) it supports. It adds a `_supported_ccs` attribute containing
    the set of supported compute capabilities and an `is_compute_capability_supported`
    method to check if a specific compute capability is supported.

    Parameters
    ----------
    supported_ccs : list or iterable of int
        A list of supported CUDA compute capability versions as integers
        (e.g., [75, 80, 86, 89, 90, 100, 103, 110, 120]).
        These are computed as major * 10 + minor (e.g., SM 8.0 = 80, SM 9.0 = 90).

    Returns
    -------
    decorator : callable
        A decorator function that adds compute capability metadata to the decorated function.

    Attributes Added to Decorated Function
    ---------------------------------------
    _supported_ccs : set of int
        A set of integers representing the supported compute capabilities.
    is_compute_capability_supported : callable
        A method that takes a compute capability (int) and returns True if it's
        supported, False otherwise.

    Examples
    --------
    >>> @supported_compute_capability([80, 86, 89, 90])
    ... def my_kernel_function():
    ...     pass
    ...
    >>> my_kernel_function._supported_ccs
    {80, 86, 89, 90}
    >>> my_kernel_function.is_compute_capability_supported(80)
    True
    >>> my_kernel_function.is_compute_capability_supported(75)
    False

    Notes
    -----
    This decorator is useful in conjunction with the backend_requirement decorator to mark functions with their supported CUDA compute capabilities.

    Raises
    ------
    TypeError
        If supported_ccs is not iterable or contains non-integer values.
    z'supported_ccs must be an iterable, got Nzsupported_ccs[z ] must be an integer, got bool: z] must be an integer, got z: c                 J     t                     _         fd}| _         S )Nc                     | j         v S r]   )_supported_ccs)ccfuncs    r   is_cc_supportedzHsupported_compute_capability.<locals>.decorator.<locals>.is_cc_supported}  s    ,,,r   )setry  is_compute_capability_supported)r{  r|  validated_ccss   ` r   	decoratorz/supported_compute_capability.<locals>.decoratorz  s<    !-00	- 	- 	- 	- 	- 0?,r   )	listr   ru   r   	enumeraterq   boolrO   append)ru  ccs_listirz  r  r  s        @r   supported_compute_capabilityr  3  s6   h&&   Td=6I6I6RTT
 
	 M8$$ ! !2b$ 	VTQTTPRTTUUU"c"" 	WWWd2hh>OWWSUWW   	R         s	    0Abackend_checkscommon_checkheuristic_funcc                       fd}|S )a  
    Decorator to enforce backend and problem size requirements for kernel functions.

    This decorator validates that a function is called with a supported backend and
    compute capability, and optionally validates problem size constraints. It performs
    runtime checks before executing the function and raises appropriate errors if
    requirements are not met. If checking overheads are a concern, you can pass a
    `skip_check` keyword argument to the function to bypass the validation.

    Parameters
    ----------
    backend_checks : dict
        A dictionary mapping backend names (str) to requirement checker functions.
        Each checker function should accept the same arguments as the decorated function
        and return True if the problem size is supported, False otherwise.
        Checkers can be decorated with @supported_compute_capability to specify
        which compute capabilities they support.
    common_check : callable, optional
        An optional function that performs additional validation checks common to all
        backends. Should accept the same arguments as the decorated function and return
        True if requirements are met, False otherwise.
        In the case where the kernel function does not have any specific backends, this can be decorated with @supported_compute_capability to specify the function's supported compute capabilities.
    heuristic_func : callable, optional
        A function that performs heuristic backend selection when backend is "auto".
        Must be provided if backend is "auto". Does not do anything if backend is not "auto".
        Should accept the same arguments as the decorated function.
        Should return an ordered list of runnable backends with the most preferred backend first.
        When decorated function is not autotuned, the first backend in the heuristic list will be run.
        When decorated function is autotuned, the backends in the heuristic list will be autotuned over to find the best backend.

    Returns
    -------
    decorator : callable
        A decorator function that wraps the target function with validation logic, and inserts
        the "skip_check" keyword argument to the function.

    Attributes Added to Decorated Function
    ---------------------------------------
    is_backend_supported : callable
        Method with signature `is_backend_supported(backend, cc=None)` that returns
        True if the specified backend is supported, optionally for a specific compute
        capability (cc).
    is_compute_capability_supported : callable
        Method with signature `is_compute_capability_supported(cc)` that returns True
        if any backend supports the given compute capability.

    Keyword Arguments Added to Decorated Function
    ---------------------------------------------
    skip_check : bool
        (Defaults to False)
        If True, the function will not be validated. This is useful for performance-critical code paths.

    Raises
    ------
    BackendSupportedError
        If the function is called with an unsupported backend or compute capability.
    ValueError
        If the problem size is not supported for the given backend.

    Examples
    --------
    >>> @supported_compute_capability([80, 86, 89, 90])
    ... def _cutlass_check(q, k, v, backend):
    ...     # Validate problem size constraints for CUTLASS backend
    ...     return q.shape[-1] <= 256
    ...
    >>> @supported_compute_capability([75, 80, 86, 89, 90])
    ... def _cudnn_check(q, k, v, backend):
    ...     # Validate problem size constraints for cuDNN backend
    ...     return True
    ...
    >>> @backend_requirement({
    ...     "cutlass": _cutlass_check,
    ...     "cudnn": _cudnn_check
    ... })
    ... def my_attention_kernel(q, k, v, backend="cutlass"):
    ...     # Backend invocation
    ...     pass
    ...
    >>> # Example with kernel function with no backend requirements
    >>> @supported_compute_capability([80, 86, 89, 90])
    ... def _common_size_check(q, k, v):
    ...     return True
    ...
    >>> @backend_requirement(
    ...     backend_checks={}, # Empty backend_checks
    ...     common_check=_common_size_check
    ... )
    ... def backend_agnostic_kernel(q, k, v):
    ...     pass

    Notes
    -----
    - The decorator automatically extracts compute capability from tensor arguments
      by finding the first torch.Tensor in args or kwargs.
    - A `skip_check=True` keyword argument can be passed to bypass validation for
      performance-critical code paths.
    - All validation is performed before the wrapped function executes.
    - Works in conjunction with the @supported_compute_capability decorator to
      provide fine-grained control over backend and architecture support.
    c                 P   	 t          j                   d
 fd	
fd
 fddt          f
fddt          dt          f
fd}
	fdd	 t	          j                    f
d
            		_        	_        |	_        	_	        	S )Nc                                  st          dj                   | vrdS |          }|dS t          |d          r|                    |          S dS )Nz:Invalid is_backend_supported call: no backend choices for FTr~  )r:   r   rX   r~  )backendrz  req_checkerr  r{  has_backend_choicess      r   is_backend_supportedzDbackend_requirement.<locals>.decorator.<locals>.is_backend_supported  s    &&((  `QUQ^``  
 .00 5,W5:4[*KLL K&FFrJJJur   c                                   s=t          d          st          dj         d                                         S t	           fd                                D                       S )Nr~  z.Invalid is_compute_capability_supported call: z8 does not have is_compute_capability_supported decoratorc              3   b   K   | ])}t          |d           o|                              V  *dS )r~  N)rX   r~  ).0checkerrz  s     r   	<genexpr>zbbackend_requirement.<locals>.decorator.<locals>.is_compute_capability_supported.<locals>.<genexpr>  s[           G%FGG D??CC     r   )rX   r:   r   r~  anyvalues)rz  r  r  r  s   `r   r~  zObackend_requirement.<locals>.decorator.<locals>.is_compute_capability_supported  s    &&(( |-NOO $ YI^  Y  Y  Y   $CCBGGG      $2#8#8#:#:     r   c                      |                     d          }             s | i |S |vrt          d| dj                   |         } | i |o || i |S  || i |S )Nr  z	Backend 'z' is not supported for )r   r.   r   )argskwargsr  r  r  r  r{  r  s       r   _is_problem_size_supportedzJbackend_requirement.<locals>.decorator.<locals>._is_problem_size_supported  s    jj++G '&(( 5#|T4V444n,,+OOOOO   )1K'#|T4V44Ud9Uf9U9UU"{D3F333r   r1   c                  "    t                     S r]   )r  )r  s   r   r  zCbackend_requirement.<locals>.decorator.<locals>.has_backend_choices,  s    '''r   r  c                     | v S r]   r   )r  r  s    r   has_backendz;backend_requirement.<locals>.decorator.<locals>.has_backend0  s    n,,r   c                 
   
 |i |sdS g }D ]M}|         }	  ||i |r*|                     |           r|                    |           ># t          $ r Y Jw xY w
J d             |g|R i |}|sdS |	_        dS )NFz#Heuristic function must be providedT)r~  r  r:   suitable_auto_backends)
rz  r  r  suitable_backendsr  r  r  r  r  wrappers
         r   r  zFbackend_requirement.<locals>.decorator.<locals>.suitable_auto_backends4  s   'd0Mf0M0M'u ")  ,W5"{!'  :%EEbII: *00999!   H "--/T--- ./@ R4 R R R6 R R$ u->G*4s   2A
AAc                      d }d }| t          |                                          z   }|D ] }t          |t          j                  r|} n!|t          |j                  \  }}|dz  |z   }|S )Nr   )rr   r  rq   r_   rD  r   rf   )r  r  
capability
tensor_argall_argsr   r   r	  s           r   _get_capabilityz?backend_requirement.<locals>.decorator.<locals>._get_capabilityJ  s    J JeFMMOO444H!  eU\22 !&JE %  6j6GHHu"RZ%/
r   c                    
 |                     dd          }|s; j        | i |}|                                 t          |j                  }|                    d          } | i |}             s
t          dj                                rx|dk    r! |fi |st          dj                   n ||          s%|rd| nd}t          j         d	| d
|            	di |st          dj                   nu |          st          j         d|            	di |st          dj                   n1|r/-|                    d          dk    r | i |} |g| R i |  | i |S )N
skip_checkFr  zYInvalid @backend_requirement decorator usage: no backend choices and no common_check for autoz$No suitable auto backends found for z with capability  z does not support backend ''z"Problem size is not supported for z% does not support compute capability r   )	popbindapply_defaultsdict	argumentsr   r:   r   r.   )r  r  r  
bound_argskwargs_with_defaultsr  r  extrar  r  r  r{  r  r  r  r~  sigr  s           r   r  z7backend_requirement.<locals>.decorator.<locals>.wrapper_  s     L%88J 1H &SXt6v66
))+++'+J,@'A'A$.229==,_d=f==
**,, 1E$ Dtx  uB  D  D   '&(( &((55&   *>     #8 Vt} V V# #   43GZHH DN V @J @ @ @TV " #8#'= ^ ^W ^ ^W\ ^ ^# #   :9QQ<PQQ ", TT] T T# #  ;::FF 3#}__S]__   65MM8LMM (PPP    H :::i((F22!0$!A&!A!AJ**:GGGGGGG4((((r   r]   )
inspect	signaturer  r   	functoolswrapsr  r~  r  r  )r{  r  r  r  r  r  r~  r  r  r  r  r  r  s   ` @@@@@@@@r   r  z&backend_requirement.<locals>.decorator  s   %%	 	 	 	 	 	 	 	&	 	 	 	 	 	 	$	4 	4 	4 	4 	4 	4 	4 	4$	(T 	( 	( 	( 	( 	( 	(	- 	- 	- 	- 	- 	- 	- 	-	 	 	 	 	 	 	 	,	 	 	* 
		8	) 8	) 8	) 8	) 8	) 8	) 8	) 8	) 8	) 8	) 8	) 8	) 8	) 
	8	)t (<$2Q/)&9#r   r   )r  r  r  r  s   ``` r   backend_requirementr    s9    Vm m m m m m m^ r   c                 v    t           j                                         t           j        j        | j                 S r]   )r_   r   initdefault_generatorsr   r   s    r   get_default_generatorsr    s&    	JOO:(66r   )rI   )Fr]   )rE   )NN)gr)   r  r{   enumr   typingr   r   r   r   r   r	   r
   r_   torch.versionr   torch.torch_versionr   r   torch_versionr  
jit.spdlogr   r   r   r#   log2e	Exceptionr'   r+   r.   rD  r   r?   rB   rO   rH   rU   rY   r[   r  rc   rm   rx   r   r   rf   __annotations__r   r   r   r   r^   r   cacher   r   r   r   r   r   r   r   r   r   r   r   jit.envr   r   r   r   r   r  r  r
  r  r  r  r  r  r  r  r  r  r  r   r*  r)  r,  r0  r2  r6  r8  r`  ra  r\  rm  rq  rt  r  r  r  r   r   r   <module>r     s                 M M M M M M M M M M M M M M M M M M       , , , , , , < < < < < <  ) ) ) ) ) )    d       t       4   
 		 	 	 	 	9 	 	 		 	 	 	 	9 	 	 		 	 	 	 	I 	 	 	%, 3 5<    $%, 3 5<    $     $ OR "%.1HK   0Q Q Q Q Q Q
A A A A A A
? ?$ ? ? ? ?%, 5<    
%,elEL.H(IIJ

 5<%&
 
 
 
.c el     <>
DsEL()5<78 = = = DI 
#(<<@
\   %# %# % % % %C  %,    $|
\   
E%+s*:$; 
 
 
 
 
 :5< :E#s(O : : : : $U\ $e $ $ $ $N

|



/4{

FKk

	

 

 

 

 <e!4!444 "&	 =A $	 	 		X	
 C#./	 uS(3-%789	 	 
	 	 	 	 "& X 
     "& =A $  X
 C#./ uS(3-%789  
   . "& X 
   5< C    $$ $ $ [	$
 k$ 
$ $ $ $N&& & & [	&
 k& 
& & & &R*L** !* 	*
 [* k* 	* * * *ZIc I I I I I I	? 	? 	? 	? 	?           Gu| G G G G G
H H H H H H
H H H H H H
H H H H H H
W W W W W W
W W W W W W
:%, :3 : : : :
|
Xc]+
 U[)
 el+	

 
 

 
 
 
* 0 0 0    t    ^^MM^! E3 E4 E E E E u|                 
 J J J J J JF F F F F F F FT	 	 	 	 	 	 C C    %,%14%
\% % % %R NP ,14GJ
\   (  P P( P P P Pj (,)-Z Zh'Z8$Z X&Z 	Z Z Z Zz 75< 7 7 7 7 7 7r   