
    .`i9                         U d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
 ddlZddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ  G d de          Zej        defd            Zej        defd            Zde	de	de
fdZdaede	f         dz  ed<   da ede	f         dz  ed<   da!ede	f         dz  ed<   da"ede	f         dz  ed<   da#ede	f         dz  ed<   da$ede	f         dz  ed<   da%ede	f         dz  ed<   da&ede	f         dz  ed<   da'ede	f         dz  ed<   dGdZ(de)fdZ*ej        de+e)         fd            Z,d ej-        dej-        fd!Z.d" Z/d# Z0d$ Z1d% Z2d&ej-        d'e3ej-        ej-        f         d(ej-        d)ej-        d*ej-        dej-        fd+Z4d,ej-        d-e)d.e)dej-        fd/Z5d0ej-        d1ej-        d(ej-        d,ej-        d2ej-        d3ej-        d4e)dej-        fd5Z6d ej-        fd6Z7d e)d7e)de)fd8Z8d e)d9e)fd:Z9d;d;gZ: ej;        d<ej<        =          e:d>fd ej-        d-e+e)         d?ede3ej-        ej-        f         fd@            Z=d ej-        d7ej-        fdAZ>	 dHdBej?        dCej-        dDedz  fdEZ@g dFZAdS )IzmCompatibility wrapper for DeepGEMM API changes.

Users of vLLM should always import **only** these wrappers.
    N)Callable)Enum)AnyNoReturn)logger)get_fp8_min_max)current_platform)has_deep_gemmcdivc                   J    e Zd ZdZdZdZedd            Zed	d            ZdS )
DeepGemmQuantScaleFMTr         returnNc                     t          | dd          }|dS t          j        ot                      ot          du}|s| j        | _        dS t          j        d          r| j	        n| j
        | _        dS )z>Initialize the oracle decision and store it in the class cache_oracle_cacheNd   )getattrenvsVLLM_USE_DEEP_GEMM_E8M0is_deep_gemm_supported_fp8_gemm_nt_implFLOAT32r   r	   is_device_capability_familyUE8M0FLOAT32_CEIL_UE8M0)clscacheduse_e8m0s      h/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/utils/deep_gemm.pyinit_oracle_cachez'DeepGemmQuantScaleFMT.init_oracle_cache&   s     ot44F ( 0&((0"$. 	
  	 #CF  ;C@@(CII' 	    c                 @    t          | dd          }|
J d            |S )z*Return the pre-initialized oracle decisionr   Nz2DeepGemmQuantScaleFMT oracle cache not initialized)r   )r   r   s     r!   from_oraclez!DeepGemmQuantScaleFMT.from_oracle<   s/     ot44!!#W!!!r#   r   N)r   r   )	__name__
__module____qualname__r   r   r   classmethodr"   r%    r#   r!   r   r      sf        G  E
 
 
 [
*    [  r#   r   r   c                      t          j                    o't          j        d          pt          j        d          } t          j        ot                      o| S )zReturn `True` if DeepGEMM is supported on the current platform.
    Currently, only Hopper and Blackwell GPUs are supported.
    Z   r   )r	   is_cudais_device_capabilityr   r   VLLM_USE_DEEP_GEMMr
   )is_supported_archs    r!   r   r   D   sU    
 )022 -b11 	=7<<  "L}L;LLr#   c                     t                      st          j        d           dS t                       t          t          j        d           dS t          j        rt          j        d           dS t          j        d           dS )znReturn `True` if vLLM is configured to use DeepGEMM "
    "E8M0 scale on a Hopper or Blackwell-class GPU.
    z>DeepGEMM E8M0 disabled: DeepGEMM not supported on this system.FNz3DeepGEMM E8M0 disabled: _fp8_gemm_nt_impl not foundz*DeepGEMM E8M0 enabled on current platform.Tz0DeepGEMM E8M0 disabled on current configuration.)r   r   
debug_once
_lazy_initr   	info_oncer   r   r+   r#   r!   is_deep_gemm_e8m0_usedr6   P   s    
 "## L	
 	
 	
 uLLL NOOOu# EFFFt
GHHH5r#   ___c                       t          d          )z-Placeholder for unavailable DeepGEMM backend.zDeepGEMM backend is not available or outdated. Please install or update the `deep_gemm` to a newer version to enable FP8 kernels.)RuntimeError)r7   r8   s     r!   _missingr;   i   s    
	K  r#   .r   _grouped_impl_grouped_masked_impl_fp8_mqa_logits_impl_fp8_paged_mqa_logits_impl#_get_paged_mqa_logits_metadata_impl%_get_mn_major_tma_aligned_tensor_impl,_get_mk_alignment_for_contiguous_layout_impl'_transform_sf_into_required_layout_implc                     t           1t          *t          #t          t          t
          t          t          dS t                      sdS d} t          j
                            | d          s7t          j                            t          j        d          t          j
        | <   t!          j        d          }t%          |dd          a t%          |dd          at%          |dd          at%          |dd          at%          |dd          at%          |d	d          at%          |d
d          at%          |dd          at%          |dd          at(                                           dS )z2Import deep_gemm and resolve symbols on first use.NDG_JIT_CACHE_DIR	deep_gemmfp8_gemm_nt m_grouped_fp8_gemm_nt_contiguousfp8_m_grouped_gemm_nt_maskedfp8_mqa_logitsfp8_paged_mqa_logitsget_paged_mqa_logits_metadataget_mn_major_tma_aligned_tensor&get_mk_alignment_for_contiguous_layout!transform_sf_into_required_layout)r   r<   r=   r>   r?   r@   rB   rC   r
   osenvirongetpathjoinr   VLLM_CACHE_ROOT	importlibimport_moduler   rA   r   r"   )DEEP_GEMM_JIT_CACHE_ENV_NAME_dgs     r!   r4   r4   |   s    	%$++%1.:7C2>??  $6 :>>6== 
357<< +4
 4

/0 
!+
.
.C]D99C!CTJJM"3(FMM"3(8$??!(.Dd!K!K*1,d+ +' -4.- -) 4;5t4 40 /60$/ /+ ++-----r#   c                      t                       t          j        d          } t          |                                           S )NrF   )r4   rV   rW   intget_num_sms)rY   s    r!   r\   r\      s3    LLL

!+
.
.Cs  !!!r#   c                  l    t                       t          t                      S t                      } | | gS N)r4   rB   r;   )mk_align_sizes    r!   rN   rN      s1    LLL3;zz@BBM=))r#   xc                 f    t                       t          t                      S t          |           S )z6Wrapper for DeepGEMM's get_mn_major_tma_aligned_tensor)r4   rA   r;   r`   s    r!    get_col_major_tma_aligned_tensorrc      s)    LLL,4zz0333r#   c                      t                       t          t          | i |S d|v r|d         }|d= nt                      }t          | d| i|S )Nr6   disable_ue8m0_cast)r4   r   r;   r6   )argskwargs	use_ue8m0s      r!   rG   rG      sk    LLL ((((6))34	+,,*,,	dO9}OOOOr#   c                  z    t                       t          t          | i |S t          | dt                       i|S Nre   )r4   r<   r;   r6   rf   rg   s     r!   rH   rH      sR    LLL((((	&<&>&>">BH  r#   c                  z    t                       t          t          | i |S t          | dt                       i|S rj   )r4   r=   r;   r6   rk   s     r!   rI   rI      sR    LLL#((((	&<&>&>">BH  r#   c                  z    t                       t          t          | i |S t          | dt                       i|S rj   )r4   rC   r;   r6   rk   s     r!   rO   rO      sR    LLL.6((((2	&<&>&>">BH  r#   qkvweightscu_seqlen_kscu_seqlen_kec                 n    t                       t          t                      S t          | ||||          S )a  Compute FP8 MQA logits for a single sequence without KV paging.

    Args:
        q: Query tensor of shape [M, H, D]. Casted to
            `torch.float8_e4m3fn` by caller.
        kv: Tuple `(k_fp8, k_scales)` where `k_fp8` has shape [N, D] with
            dtype `torch.float8_e4m3fn` and `k_scales` has shape [N])
            with dtype `torch.float32`.
        weights: weights of shape [M, H], dtype `torch.float32`.
        cu_seqlen_ks: Start indices (inclusive) for valid K per query position,
            shape [M], dtype int32.
        cu_seqlen_ke: End indices (exclusive) for valid K per query position,
            shape [M], dtype int32.

    Returns:
        Logits tensor of shape [M, N], dtype `torch.float32`.
    )r4   r>   r;   )rn   ro   rp   rq   rr   s        r!   rJ   rJ      s3    0 LLL#zz2wlKKKr#   context_lens
block_sizenum_smsc                 j    t                       t          t                      S t          | ||          S )a  Build scheduling metadata for paged MQA logits.

    Args:
        context_lens: Tensor of shape [B], dtype int32; effective context length
            per batch element.
        block_size: KV-cache block size in tokens (e.g., 64).
        num_sms: Number of SMs available. 132 for Hopper

    Returns:
        Backend-specific tensor consumed by `fp8_paged_mqa_logits` to
        schedule work across SMs.
    )r4   r@   r;   )rt   ru   rv   s      r!   rL   rL     s/     LLL*2zz.|ZQQQr#   q_fp8kv_cache_fp8block_tablesschedule_metadatamax_model_lenc           
      v    t                       t          t                      S t          | ||||||d          S )a  Compute FP8 MQA logits using paged KV-cache.

    Args:
        q_fp8: Query tensor of shape [B, next_n, H, D]. Casted to
            `torch.float8_e4m3fn` by caller.
        kv_cache_fp8: Paged KV-cache in packed FP8+scale layout with shape
            [num_blocks, block_size, 1, D+4], dtype `torch.uint8`. The last
            4 bytes per (block,pos) store the `float` dequant scale.
        weights: Tensor of shape [B * next_n, H], dtype `torch.float32`.
        context_lens: Tensor of shape [B], dtype int32; effective context length
            for each batch element.
        block_tables: Tensor of shape [B, max_blocks], dtype int32; maps logical
            block indices to physical blocks in the paged cache.
        schedule_metadata: Returned by `get_paged_mqa_logits_metadata`;
            used to distribute work across SMs.
        max_model_len: Maximum sequence length used to size the logits output.

    Returns:
        Logits tensor of shape [B * next_n, max_model_len], dtype
        `torch.float32`.
    NT)clean_logits)r4   r?   r;   )rx   ry   rp   rt   rz   r{   r|   s          r!   rK   rK   "  sL    < LLL!)zz%	 	 	 	r#   c           	          t          j        dt          j        t          j        |                                                               S )Ng       @)torchpowceillog2absrb   s    r!   _ceil_to_ue8m0r   O  s0    9S%*UZ%8%899:::r#   yc                 (    t          | |          |z  S r^   r   )r`   r   s     r!   _alignr   S  s    1::>r#   element_sizec                 (    t          | d|z            S )N   )r   )r`   r   s     r!   get_tma_aligned_sizer   X  s    !R<'(((r#      T)dynamicbackendFrh   c                 l   t          j                    }|                                 dk    sJ | j        \  }}|\  }}t	          j        t          ||          t          ||          f| j        | j                  }| |d |d |f<   |	                    d||
                    d          |z  |          }	|	                                                                                    dd                              d          }
t                      \  }}|
|z  }|rt!          |          n|}|	d	|z  z                      |          }|                    |          d |d |f                                         |	                    |	
                    d
          |	
                    d                    fS )Nr   )dtypedevicer   )r      T)dimkeepdimg-C6?g      ?r   )r	   	fp8_dtyper   shaper   zerosr   r   r   viewsizer   floatamaxclampr   r   toview_as
contiguous)r`   ru   rh   r   mnblock_mblock_nx_paddedx_viewx_amaxr7   fp8_maxsfx_scaleds                  r!   per_block_cast_to_fp8r   `  s    !*,,I5577a<<<<7DAq!GW{	7		VAw//0  H HRaR!V]]2wa(8(8G(CWMMFZZ\\!!&&64&@@FFtLLF ""JAw	'	B(	0			bB#(#''	22HH%%bqb"1"f-88::BGGAA= =  r#   c                     |                                  |                                 }} | | z  ||z  z                                   }d| |z                                  z  |z  }d|z
  S )a|  Return a global difference metric for unit tests.

    DeepGEMM kernels on Blackwell/B200 currently exhibit noticeable per-element
    error, causing `torch.testing.assert_close` to fail.  Instead of checking
    every element, we compute a cosine-style similarity over the whole tensor
    and report `1 - sim`.  Once kernel accuracy improves this helper can be
    removed.
    r   r   )doublesum)r`   r   denominatorsims       r!   	calc_diffr   w  s^     88::qxxzzqAq51q5=%%''K
q1ukkmm
k
)Cs7Nr#   output_dtypeweightsupports_deep_gemmc                     |t                      }d}d}|o7| t          j        k    o'|j        d         |z  dk    o|j        d         |z  dk    S )N@   r   r   r   )r   r   bfloat16r   )r   r   r   
N_MULTIPLE
K_MULTIPLEs        r!   "should_use_deepgemm_for_fp8_linearr     sm    
 !355
 JJ 	 	.EN*	.LOj(A-	. LOj(A-	r#   )r   r   rG   rH   rI   rJ   rK   rL   r   r6   r   r\   r   rc   rN   r&   r^   )B__doc__	functoolsrV   rP   collections.abcr   enumr   typingr   r   r   	vllm.envsr   vllm.loggerr   9vllm.model_executor.layers.quantization.utils.quant_utilsr   vllm.platformsr	   vllm.utils.import_utilsr
   vllm.utils.math_utilsr   r   cacheboolr   r6   r;   r   __annotations__r<   r=   r>   r?   r@   rA   rB   rC   r4   r[   r\   listrN   Tensorrc   rG   rH   rI   rO   tuplerJ   rL   rK   r   r   r   DEFAULT_BLOCK_SIZEcompilesimple_compile_backendr   r   r   r   __all__r+   r#   r!   <module>r      s    
         				 $ $ $ $ $ $                                         , + + + + + 1 1 1 1 1 1 & & & & & && & & & &D & & &R M M M M M     0 C H     04 8CH%, 3 3 3+/xS!D( / / /26 hsCx(4/ 6 6 626 hsCx(4/ 6 6 68< HS#X.5 < < <AE #Xc3h%7$%> E E ECG %xS'9D'@ G G GJN ,hsCx.@4.G N N NEI '#s();d)B I I I2. 2. 2. 2.j"S " " " " *S	 * * * *4 4 4 4 4 4	P 	P 	P      L|LelEL()L \L ,	L
 ,L \L L L L<R,R,/R:=R
\R R R R**<*,* \* ,	*
 ,* |* * \* * * *Z;el ; ; ; ;c c c    
)C )s ) ) ) ) 3Z  t%5%LMMM-?SX |!%cLP
5<%&   NM, %,    & '+ +L t   ,  r#   