
    .`icQ                     
   d Z ddlZddlZddlZddlZddlZddlZddlmZ ddl	m
Z
mZ ddlZddlZddlmZ ddlmZ ddlmZ ddlmZ  ee          Zej                            dd	          Zej        d
efd            Zej        d
efd            Zde
de
d
efdZde d
e
dz  fdZ!efde de dede
f         fdZ" e"dd          Z# e"dd          Z$ e"dd          Z% e"dd          Z& e"dd          Z' e"dd          Z( e"dd          Z) e"dd           Z* e"dd!          Z+ e"dd"          Z, e"d#d$d% &          Z-ej        d
efd'            Z.ej        d
efd(            Z/ej        d
efd)            Z0ej        d
efd*            Z1ej        d
efd+            Z2ej        d
efd,            Z3ej        d
efd-            Z4ej        d
efd.            Z5ej        d
efd/            Z6d
edz  fd0Z7d1e8d2e8d
efd3Z9	 	 	 did1e8d2e8d5e8d6e8d7e8d8e d9ej:        d:ed;edz  d<ed=ed
efd>Z; e            rej<        =                    d?g d@A          dBej>        dCej>        dDej>        dEej>        dFej>        dGej:        dHedIe d
ej>        fdJ            Z?ej<        @                    d?          dBej>        dCej>        dDej>        dEej>        dFej>        dGej:        dHedIe d
ej>        fdK            ZAej<        =                    dLg d@A          dBej>        dCej>        dDej>        dEej>        dGej:        dIe d
ej>        fdM            ZBej<        @                    dL          dBej>        dCej>        dDej>        dEej>        dGej:        dIe d
ej>        fdN            ZCej<        =                    dOg d@A          dPej>        dQej>        d
eDej>        ej>        f         fdR            ZEej<        @                    dO          dPej>        dQej>        d
eDej>        ej>        f         fdS            ZFdPej>        dTej>        dUej>        dVej>        dWej>        dXej:        dIe d
ej>        fdYZG	 djdPej>        dTej>        dZej>        d[ej>        dXej:        d\ej>        dz  d
ej>        fd]ZHdPej>        dQej>        d
eDej>        ej>        f         fd^ZI e"d_d`          ZJej        d
efda            ZKej        d
efdb            ZLdceddej:        deej>        dfej>        fdgZMg dhZNdS )kzoCompatibility wrapper for FlashInfer API changes.

Users of vLLM should always import **only** these wrappers.
    N)Callable)AnyNoReturn)init_logger)vllm_is_batch_invariant)current_platformFLASHINFER_CUBINS_REPOSITORYzWhttps://edge.urm.nvidia.com/artifactory/sw-kernelinferencelibrary-public-generic-local/returnc                      t           j        rdS t          j                            d          dS t
                              d           dS )z7Return `True` if flashinfer-cubin package is available.Tflashinfer_cubinNz&flashinfer-cubin package was not foundF)envsVLLM_HAS_FLASHINFER_CUBIN	importlibutil	find_speclogger
debug_once     i/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/utils/flashinfer.pyhas_flashinfer_cubinr   &   sJ     % t~ 233?t
>???5r   c                      t           j                            d          t                              d           dS t                      s0t          j        d          t                              d           dS dS )z8Return `True` if flashinfer-python package is available.
flashinferNz2FlashInfer unavailable since package was not foundFnvcczSFlashInfer unavailable since nvcc was not found and not using pre-downloaded cubinsT)r   r   r   r   r   r   shutilwhichr   r   r   has_flashinferr   1   s|    
 ~--5NOOOu  !! fl6&:&:&B2	
 	
 	
 u4r   ___c                       t          d          )z/Placeholder for unavailable FlashInfer backend.zFlashInfer backend is not available. Please install the package to enable FlashInfer kernels: https://github.com/flashinfer-ai/flashinfer)RuntimeError)r   r   s     r   _missingr"   D   s    
	6  r   module_namec                 \    	 t          j        |           S # t          t          f$ r Y dS w xY w)zBSafely import a submodule and return it, or None if not available.N)r   import_moduleImportErrorModuleNotFoundError)r#   s    r   _get_submoduler(   M   s@    &{333,-   tts    ++	attr_namefallback_fn.c                 J     t           j         fd            fd}|S )z5Create a lazy import wrapper for a specific function.c                  l    t                      sd S t                    } | rt          | d           nd S N)r   r(   getattr)modr)   r#   s    r   	_get_implz'_lazy_import_wrapper.<locals>._get_impl[   s?     	4[))03=wsIt,,,=r   c                  <                 }| | i |S  || i |S r-   r   )argskwargsimplr0   r*   s      r   wrapperz%_lazy_import_wrapper.<locals>.wrapperb   s=    y{{<;////tT$V$$$r   )	functoolscache)r#   r)   r*   r5   r0   s   ``` @r   _lazy_import_wrapperr8   V   sW    
 _> > > > > _>% % % % % % Nr   flashinfer.fused_moetrtllm_fp8_block_scale_moetrtllm_fp8_per_tensor_scale_moecutlass_fused_moe$flashinfer.cute_dsl.blockscaled_gemmgrouped_gemm_nt_maskedr   fp4_quantizenvfp4_batched_quantize*silu_and_mul_scaled_nvfp4_experts_quantizescaled_fp4_grouped_quantizenvfp4_block_scale_interleavetrtllm_fp4_block_scale_moezflashinfer.autotunerautotunec                  (    t          j                    S r-   )
contextlibnullcontext)r2   r3   s     r   <lambda>rI      s    
(>(@(@ r   )r*   c                  `    t                      o t          j                            d          duS )z5Return `True` if FlashInfer comm module is available.flashinfer.commNr   r   r   r   r   r   r   has_flashinfer_commrM      s,     W	 8 89J K KSW WWr   c                      t                      sdS g d} | D ])\  }}t          |          }|rt          ||          s dS *dS )z7Return `True` if FlashInfer mnnvl all2all is available.F))rK   Mapping)zflashinfer.comm.mnnvlMnnvlMemory)flashinfer.comm.trtllm_alltoallMnnvlMoe)rQ   MoEAlltoallInfoT)rM   r(   hasattrrequired_functionsr#   r)   r/   s       r   has_flashinfer_all2allrW      sv        u   #5  Y[)) 	'#y11 	55	4r   c                  `    t                      o t          j                            d          duS )z4Return `True` if FlashInfer MoE module is available.r9   NrL   r   r   r   has_flashinfer_moerY      s1     	 	IN$$%;<<DHr   c                  `    t                      o t          j                            d          duS )z:Return ``True`` if FlashInfer cutedsl module is available.zflashinfer.cute_dslNrL   r   r   r   has_flashinfer_cutedslr[      s.     	XY^556KLLTXXr   c                      t                      sdS g d} | D ])\  }}t          |          }|rt          ||          s dS *dS )z:Return `True` if FlashInfer TRTLLM fused MoE is available.F))r9   r:   )r9   r;   r9   rD   TrY   r(   rT   rU   s       r   has_flashinfer_trtllm_fused_moer_      sv      u  
 #5  Y[)) 	'#y11 	55	4r   c                      t                      sdS g d} | D ])\  }}t          |          }|rt          ||          s dS *dS )z;Return `True` if FlashInfer CUTLASS fused MoE is available.F))r9   r<   )r   r?   )r   rC   r]   Tr^   rU   s       r    has_flashinfer_cutlass_fused_moera      sv      u   #5  Y[)) 	'#y11 	55	4r   c                      t                      sdS g d} | D ])\  }}t          |          }|rt          ||          s dS *dS )z=Return ``True`` if FlashInfer CUTLASS fused MoE is available.F))r=   r>   )r   rB   )r   &silu_and_scaled_nvfp4_experts_quantizeT)r[   r(   rT   rU   s       r   -has_flashinfer_cutedsl_grouped_gemm_nt_maskedrd      sv     "## u   #5  Y[)) 	'#y11 	55	4r   c                  T   t                      rdS 	 t          j        t          d          } | j        dk    }|rt
                              d           n t
                              d| j                   |S # t          $ r&}t
                              d|           Y d}~d	S d}~ww xY w)
zReturn `True` if NVIDIA's artifactory is accessible.

    This checks connectivity to the kernel inference library artifactory
    which is required for downloading certain cubin kernels like TRTLLM FHMA.
    T   )timeout   z NVIDIA artifactory is accessiblez2NVIDIA artifactory returned failed status code: %dz+Failed to connect to NVIDIA artifactory: %sNF)	r   requestsgetr	   status_coder   r   warning_once	Exception)response
accessiblees      r   has_nvidia_artifactoryrq      s      t< <aHHH)S0
 	@AAAAD$      I1MMMuuuuus   A$A7 7
B'B""B'c                  f    t                      rdS t          j        d          ot                      S )z
    TRTLLM attention is supported if the platform is SM100,
    NVIDIA artifactory is accessible, and batch-invariant mode is not enabled.
    Fd   )r   r   is_device_capability_familyrq   r   r   r   supports_trtllm_attentionru     s8        u 	4S99V>T>V>Vr   c                  :    ddl m}   |             }|j        j        S )a,  
    This function should only be called during initialization stage when vllm config
    is set.
    Return `None` if --attention-config.use_trtllm_attention is not set,
    return `True` if TRTLLM attention is forced to be used,
    return `False` if TRTLLM attention is forced to be not used.
    r   )get_current_vllm_config)vllm.configrw   attention_configuse_trtllm_attention)rw   vllm_configs     r   force_use_trtllm_attentionr|   !  s0     433333))++K'<<r   num_qo_headsnum_kv_headsc                 X    t                      du rdS t                      }|o| |z  dk    S )z=Check if the current configuration supports TRTLLM attention.Fr   )r|   ru   )r}   r~   
has_trtllms      r   can_use_trtllm_attentionr   /  s8    !##u,,u*,,J<<,6!;<r   F
num_tokensmax_seq_lendcp_world_sizekv_cache_dtypeq_dtype
is_prefillforce_use_trtllm	has_sinkshas_specc                    ||sdS |dk    rt                               d           dS t                      s|rt                               d           dS | |z  dk    r|rt                               d           dS |
r|st                               d           d	S |t	          j                    k    rt                               d
           d	S |	rt                               d           d	S |O|r#|dk    }|rt                               d           n(|dk    o|dk    }|rt                               d           |S t                               d           d	S )z*Return `True` if TRTLLM attention is used.NF   zcTrtllm does not support returning LSE and as a result does not support DCP, reverting to FlashInferzkTRTLLM attention is not supported on this platform, but --attention-config.use_trtllm_attention is set to 1r   zTRTLLM attention is not supported for this combination of query and key heads, but --attention-config.use_trtllm_attention is set to 1z:Using TRTLLM attention (enabled for speculative decoding).Tz,Using TRTLLM attention (query is quantized).z6Using TRTLLM attention (required for attention sinks).autoz/Using TRTLLM prefill attention (auto-detected).   z.Using TRTLLM decode attention (auto-detected).zLUsing TRTLLM attention (--attention-config.use_trtllm_attention is set to 1))r   rl   ru   	info_oncer   	fp8_dtype)r}   r~   r   r   r   r   r   r   r   r   r   
use_trtllms               r   rz   rz   7  s   " #,<#u <	
 	
 	
 u %&&  	J   u l"a'' 	  
 u 
 UVVVt ",....GHHHt  QRRRt 		V'61J W##$UVVV $s*G~/GJ V##$TUUU V   4r   zvllm::flashinfer_mm_fp4cuda)mutates_argsdevice_typesABA_scaleB_scaleg_scaledtypeuse_8x4_sf_layoutbackendc                 6    ddl m}  || |||||d||	  	        S )Nr   )mm_fp4   )
block_sizer   r   )r   r   )	r   r   r   r   r   r   r   r   flashinfer_mm_fp4_s	            r   flashinfer_mm_fp4r     sK     	<;;;;;!!/

 

 

 
	
r   c                 h    t          j        | j        d         |j        d         || j                  S )Nr   r   r   devicetorchemptyshaper   )r   r   r   r   r   r   r   r   s           r   flashinfer_mm_fp4_faker     s+     {171:qwqzqxPPPPr   zvllm::bmm_fp8c           	      0    ddl m}  || ||||d |          S )Nr   )bmm_fp8)r   r   )r   r   r   r   r   r   bmm_fp8_s          r   r   r     s2     	322222x1gwtWEEEr   c                     t          j        | j        d         | j        d         |j        d         || j                  S )Nr   r      r   r   )r   r   r   r   r   r   s         r   bmm_fp8_faker     s<     {GAJ
AGAJeAH
 
 
 	
r   zvllm::flashinfer_nvfp4_quantizeaa_global_sfc                 B    ddl m} ddl m}  || ||j        d          S )Nr   )SfLayout)nvfp4_quantizeF)sfLayout
do_shuffle)r   r   r   
layout_8x4)r   r   r   nvfp4_quantize_s       r   flashinfer_nvfp4_quantizer     sM     	('''''@@@@@@{X%8U
 
 
 	
r   c                     | j         \  }}d } ||d          }|dz  } ||d          }t          j        ||dz  t          j        | j                  t          j        ||t          j        | j                  fS )Nc                     | |z   dz
  |z  |z  S )Nr   r   )xys     r   rI   z0flashinfer_nvfp4_quantize_fake.<locals>.<lambda>  s    Qq 01 4 r      r      r   r   )r   r   r   uint8r   )r   r   mnround_up	rounded_mscale_n	rounded_ns           r   flashinfer_nvfp4_quantize_faker     s     w144HQNN	r'HWa((	{1a1fEKIII5;yAHL
 L
 L
 
 	
r   bblock_scale_ablock_scale_balpha	out_dtypec           
      :   | j         dk    r|j         dk    sJ |j         dk    r|j         dk    sJ |                     d          dk    r|                    d          dk    sJ | j        d         |j        d         k    sJ |dk    r>|                    t          j                  }|                    t          j                  }|dk    r| j        d         dk    rdnd	}t          | |                                ||                                ||||
          S )Nr   r   cutlasstrtllmr       TF)r   r   )ndimstrider   viewr   r   r   t)r   r   r   r   r   r   r   r   s           r   flashinfer_scaled_fp4_mmr     s%    6Q;;16Q;;;&""}'9Q'>'>'>>88B<<1"!2!2!2271:####)%**5;77%**5;77 '8 3 3
b8H8He		+	 	 	 	r   scale_ascale_bbiasc                    | j         dk    r|j         dk    sJ | j        d         |j        d         k    sJ |                                dk    r|                                dk    sJ | j        t          j        k    r|j        t          j        k    sJ | j        j        dk    r|j        j        dk    sJ |j        t          j        k    r|j        t          j        k    sJ |j        j        dk    r|j        j        dk    sJ t          | 
                    d          |
                    d          |||d                              | j        d         |j        d                   }|||z   }|S )Nr   r   r   r   r   )r   r   numelr   r   float8_e4m3fnr   typefloat32r   	unsqueezer   )r   r   r   r   r   r   outputs          r   flashinfer_scaled_fp8_mmr     sa    6Q;;16Q;;;&71:####==??aGMMOOq$8$8$887e)))ag9L.L.L.LL8=F""qx}'>'>'>>=EM))gmu}.L.L.LL>&((W^-@F-J-J-JJ	A	A  d171:qwqz""  $Mr   c                 "    t          | |          S r-   )r   )r   r   s     r   $flashinfer_quant_nvfp4_8x4_sf_layoutr   9  s     %Q444r   flashinfer.gemmfp8_blockscale_gemm_sm90c                      t                      o0t          j        d          ot          t	          d          d          S )z>Return `True` if FlashInfer block-scale FP8 GEMM is available.Z   r   r   )r   r   is_device_capabilityrT   r(   r   r   r   "has_flashinfer_fp8_blockscale_gemmr   D  sC     	 	S1"55	SN#4557QRRr   c                  6    t           j        ot                      S )z>Return `True` if FlashInfer block-scale FP8 GEMM is supported.)r   #VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFERr   r   r   r   +is_flashinfer_fp8_blockscale_gemm_supportedr   N  s     	0 	1.00r   is_flashinfer_supportedoutput_dtypeinputweightc                     | sdS d}d}|j         }|j         }|t          j        k    oG|t          j        k    o7|t          j        k    o'|j        d         |z  dk    o|j        d         |z  dk    }|S )NF@      r   r   )r   r   bfloat16r   r   )	r   r   r   r   
N_MULTIPLE
K_MULTIPLEweight_dtypeinput_dtypeshould_use_flashinfers	            r   -should_use_flashinfer_for_blockscale_fp8_gemmr   W  s     # u
 JJ<L+K 	& 	.5>)	.E//	. LOj(A-	. LOj(A-  ! r   )r   %flashinfer_trtllm_fp8_block_scale_moeflashinfer_cutlass_fused_moe)flashinfer_cutedsl_grouped_gemm_nt_maskedflashinfer_fp4_quantizerA   rB   rC   rD   rE   rY   rM   rW   ra   rd   r   rq   ru   r   rz   r   r   r   flashinfer_fp8_blockscale_gemmr   r   )NFFr-   )O__doc__rG   r6   r   importlib.utilosr   collections.abcr   typingr   r   ri   r   	vllm.envsr   vllm.loggerr   *vllm.model_executor.layers.batch_invariantr   vllm.platformsr   __name__r   environrj   r	   r7   boolr   r   r"   strr(   r8   r   *flashinfer_trtllm_fp8_per_tensor_scale_moer   r   r   r@   rA   rB   rC   rD   rE   rM   rW   rY   r[   r_   ra   rd   rq   ru   r|   intr   r   rz   library	custom_opTensorr   register_faker   r   r   tupler   r   r   r   r   r   r   r   r   __all__r   r   r   <module>r     s
   
                 				  $ $ $ $ $ $                         # # # # # #      , + + + + +	X		
  "z~~"]     d         $ C H     d
     IQ !$3;CH3E   , )=(<8) ) % .B-A=. . *  43/     -A,@*,D- - ) /.|^LL --l<TUU -A-A>. . * 32/    430     21.  
  @@   XT X X X X
     ( D                $    ( t    &     6 4    =D4K = = = ==3 =c =d = = = =$ %)Q QQQ Q 	Q
 Q Q [Q Q TkQ Q Q 
Q Q Q Qh > m

]!   

<
<
 
 	

 
 {
  
 
 

 
 
 

0 ]  ! 
Q<
Q<
Q 
Q 	
Q
 
Q {
Q  
Q 
Q 

Q 
Q 
Q 
Q ]   

F<
F<
F 
F 	
F
 {
F 
F 

F 
F 
F 

F ]   

<

<

 

 	


 {

 

 


 

 

 

 ])   

<
&+l
	u|U\)	*
 
 
 

 ]  ) 
<
&+l
	u|U\)	*
 
 
 
 || < <	
 < {  \   L !% || \ \	
 { ,
 \   <5|5"',5
5<%&5 5 5 5 "6!51" " 
 D     T    !!!+! <! L	! ! ! !:  r   