
    )`i                     n    d dl mZ d dlZd dlZddlmZmZmZ ddlm	Z	m
Z
mZ ddZdd	Z	 	 	 	 	 	 ddZdS )    )OptionalN   )gemm_kernel!gemm_kernel_descriptor_persistentgemm_kernel_persistent)check_device	check_dimcheck_input      ?        c                    t          |            t          | |g           t          d|            t          d|           |/t          |           t          |g           t          d|           | j        d         |j        d         k    s
J d            | j        |j        k    s
J d            |L| j        d         |j        d         k    s
J d            |j        d         |j        d         k    s
J d            | j        \  }|j        \  }| j        }|r|n|t
          j        k    r|nt
          j        }||j        |k    s
J d	            |t          j        f| j	        |
          n|}t
          j
                            d          j        }	|	nt          |	          fd}
t          |
         | ||||                     d          |                     d          |                    d          |                    d          |                    d          |                    d          ||           |S )a  
    GEMM operation with SM constraint by Triton.
    C = alpha * (a @ b.T) + beta * C

    Args:
        a: The first input matrix. Shape: (M, K)
        b: The second input matrix. Shape: (K, N)
        c: The output matrix. Shape: (M, N). In-place epilogue is supported. Expected to be out_dtype (if not specified, same as a.dtype, but fp8 --> bf16).
        alpha: The scaling factor for the product of a and b.
        beta: The scaling factor for the output matrix c.
        out_dtype: The dtype of the output matrix. Default: fp8 --> bf16. Otherwise, same as a.dtype.
        num_sms: The number of SMs to use for the computation.
       Nr   r   'Incompatible dimensions between a and b#Incompatible dtypes between a and b'Incompatible dimensions between a and c'Incompatible dimensions between b and c+Incompatible dtypes between c and out_dtypedevicedtypecudac           	          t          t          j        | d                   t          j        | d                   z            fS NBLOCK_SIZE_MBLOCK_SIZE_NmintritoncdivMETAMNnum_smss    x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/triton/sm_constraint_gemm.py<lambda>z!gemm_persistent.<locals>.<lambda>G   D    K4/006;q$~BV3W3WW	
 	
     )alphabetaNUM_SMS)r
   r   r	   shaper   torchfloat8_e4m3fnbfloat16emptyr   r   get_device_propertiesmulti_processor_countr   r   stride)abcr)   r*   	out_dtyper$   Kr   r+   gridr"   r#   s         `    @@r%   gemm_persistentr:      sy     NNN!QaOOOaOOO}AaS!Q71:###%N###7agD}wqzQWQZ''')R'''wqzQWQZ''')R'''7DAq7DAqGE 			 E''' U^  99,,,5 -,,
 BCQF189====PQA j..v66LG ggc'7.C.CG     D 4 												   " Hr(   c                 *  	
 t          |            t          | |g           t          d|            t          d|           |/t          |           t          |g           t          d|           | j        d         |j        d         k    s
J d            | j        |j        k    s
J d            |L| j        d         |j        d         k    s
J d            |j        d         |j        d         k    s
J d            | j        \  	}|j        \  }
| j        }|r|n|t
          j        k    r|nt
          j        }||j        |k    s
J d	            |t          j        	
f| j	        |
          n|}	
fd}t          |         | ||	
||                     d          |                     d          |                    d          |                    d          |                    d          |                    d          ||           |S )a  
    GEMM operation without SM constraint by Triton.
    C = alpha * (a @ b.T) + beta * C

    Args:
        a: The first input matrix. Shape: (M, K)
        b: The second input matrix. Shape: (K, N)
        c: The output matrix. Shape: (M, N). In-place epilogue is supported. Expected to be out_dtype (if not specified, same as a.dtype, but fp8 --> bf16).
        alpha: The scaling factor for the product of a and b.
        beta: The scaling factor for the output matrix c.
        out_dtype: The dtype of the output matrix. Default: fp8 --> bf16. Otherwise, same as a.dtype.
        num_sms: The number of SMs to use for the computation.
    r   Nr   r   r   r   r   r   r   r   c                 t    t          j        | d                   t          j        | d                   z  fS r   )r   r   )r!   r"   r#   s    r%   r&   zgemm.<locals>.<lambda>   s4    AtN+,,v{1d>>R/S/SS r(   )r)   r*   )r
   r   r	   r,   r   r-   r.   r/   r0   r   r   r3   )r4   r5   r6   r)   r*   r7   r8   r   r9   r"   r#   s            @@r%   gemmr=   b   s?    NNN!QaOOOaOOO}AaS!Q71:###%N###7agD}wqzQWQZ''')R'''wqzQWQZ''')R'''7DAq7DAqGE 			 E''' U^  99,,,5 -,,
 BCQF189====PQA    D 												     Hr(   Fc                    t          |            t          |           t          | |g           t          d|            t          d|           |/t          |           t          |g           t          d|           | j        d         |j        d         k    s
J d            | j        |j        k    s
J d            |L| j        d         |j        d         k    s
J d            |j        d         |j        d         k    s
J d            | j        \  }|j        \  }| j        }	|r|n|	t
          j        k    r|	nt
          j        }|	t
          j        k    r!|d	k    s
J d
            d	k    s
J d
            n |dk    s
J d
            dk    s
J d
            |t          j        f| j	        |          n|}t
          j
                            d          j        }
|
nt          |
          dt          dt          dt          t                   fd}t!          j        |           fd}t%          |         | |||||d|	t
          j        k    rdnddddd|           |S )a  
    GEMM operation with SM constraint by Triton.
    Requires TMA support and descriptor creation.
    C = alpha * (a @ b.T) + beta * C

    Note:
        - K and N must be greater than 16B.
        - Support float16, float8_e4m3fn, bfloat16.
        - float32 is not supported due to performance issues.

    Args:
        a: The first input matrix. Shape: (M, K)
        b: The second input matrix. Shape: (N, K)
        c: The output matrix. Shape: (M, N). In-place epilogue is supported. Expected to be out_dtype (if not specified, same as a.dtype, but fp8 --> bf16).
        alpha: The scaling factor for the product of a and b.
        beta: The scaling factor for the output matrix c.
        out_dtype: The dtype of the output matrix. Default: fp8 --> bf16. Otherwise, same as a.dtype.
        num_sms: The number of SMs to use for the computation.
        EPILOGUE_SUBTILE: Whether to use the epilogue subtile optimization.
    r   Nr   r   r   r   r   r      zLeast chunk size must be 16B   r   r   size	alignmentstreamc                 D    t          j        | dt           j                  S )Nr   r   )r-   r0   int8)rA   rB   rC   s      r%   alloc_fnz,gemm_descriptor_persistent.<locals>.alloc_fn   s    {4ejAAAAr(   c           	          t          t          j        | d                   t          j        | d                   z            fS r   r   r    s    r%   r&   z,gemm_descriptor_persistent.<locals>.<lambda>   r'   r(      @      )r+   r   r   BLOCK_SIZE_KGROUP_SIZE_M
num_stages	num_warpsEPILOGUE_SUBTILE)r
   r   r	   r,   r   r-   r.   r/   r0   r   r   r1   r2   r   intr   r   set_allocatorr   float32)r4   r5   r6   r)   r*   r7   r$   rO   r8   r   r+   rF   r9   r"   r#   s         `      @@r%   gemm_descriptor_persistentrS      s   > NNNNNN!QaOOOaOOO}AaS!Q71:###%N###7agD}wqzQWQZ''')R'''wqzQWQZ''')R'''7DAq7DAqGE 			 E''' U^  ###Bwww6wwwBwww6wwwwAvvv5vvvAvvv5vvvABQF189====PQAj..v66LG ggc'7.C.CGBs Bs BHSM B B B B """     D &d+						!U]22SS)!   $ Hr(   )Nr   r   NN)Nr   r   N)Nr   r   NNF)typingr   r-   r   kernels.sm_constraint_gemmr   r   r   utilsr   r	   r
   r:   r=   rS    r(   r%   <module>rX      s                    
 8 7 7 7 7 7 7 7 7 7Q Q Q QhH H H H\ 
	i i i i i ir(   