
    ,`i|             =       G   d dl mZmZ d dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ  ee          Z e	j                     erd Zn	 d d	lmZ n# e$ r	 d d
lmZ Y nw xY w	 	 	 	 	 dEdej        dej        dej        dej        dededej        dej        dededej        dz  dedej        dej        dedededededdf(d Z	 	 	 	 	 dEdej        d!ej        d"ej        d#ej        dej        dej        dej        dededej        dej        dededej        dz  dedej        dej        dedededededdf.d$Zdej        rd%nd&fdej        d!ej        d"ej        d#ej        dej        dej        dej        dededej        dej        d'ej        dz  dededej        dz  dedej        dej        d(ej        dz  d)eddf*d*Zdej        dej        d+ej        dedej        dej        ddfd,Z	 dFd-ej        d.ej        d/ej        d0ej        d1ej        d2ej        dz  ddfd3Z 	 dGd5ej        d6ej        d7ej        d8ej        d9ed:ed;ed<e!de"ej        ej        ej        ej        f         fd=Z#	 dGd5ej        d6ej        d7ej        d8ej        d>ej        d?ej        d9ed:ed;ed<e!de"ej        ej        ej        ej        f         fd@Z$dAej        dej        dBej        dz  dCedDej        dEe!ddfdFZ%dej        dGej        dHej        dIeddf
dJZ&dGej        dKej        dHej        dIeddf
dLZ'dMej        dNedOedPedQedRedSej        dTej        dDej        dEe!dUej        ddfdVZ(dWej        dXej        dYej        dZej        ddf
d[Z)dWej        dXej        dYej        dZej        ddf
d\Z*dWej        dXej        dYej        dZej        ddf
d]Z+	 	 dHdGej        dHej        dIed^ej,        d_ej        dz  dKej        dz  de"ej        ej        f         fd`Z-	 	 	 dIdGej        dHej        dIed^ej,        dbe.e         d_ej        dz  dKej        dz  dce!de"ej        ej        f         fddZ/deej        dfej        dgej        dhediedjedej        fdkZ0dGej        deej        dfej        dlej        dhedej        fdmZ1dnej        doej        dpej        dqej        drej        dse!dte!duedej        fdvZ2 e3ej4        j5        dw          rE edx          dnej        doej        dpej        dqej        drej        dse!dte!duedej        fdy            Z6dSej        dzej        dueddfd{Z7dnej        doej        d|ej        d}ej        d~ej        dededededej        fdZ8 e3ej4        j5        d          r ed          dnej        doej        d|ej        d}ej        d~ej        dedej9        dej9        dej9        dej        fd            Z: ed          	 	 	 	 dJdnej        dej        dz  doej        dej        dz  d}ej        dej        dz  dej        dz  dej        dz  dej        dz  dej        dz  d~ej        dedej9        dej9        dej9        de!de!de!de!dej        f(d            Z; ed          deej        dfej        dgej        dhej9        diedjedej        fd            Z< ed          dGej        deej        dfej        dlej        dhej9        dej        fd            Z= ed          	 	 	 	 	 	 	 dKdnej        dej        dedej,        dz  dej        dz  dej        dz  dedz  dej        dz  dej        dz  dedz  dej        fd            Z> ed          doej        dej,        dedej,        dz  dej        f
d            Z? ed          	 	 dHdnej        dej        dej        dedej        dej        dej,        dz  dedz  dej        fd            Z@ ed          dfej        dej        fd            ZA ed          dej        dej        fd            ZB ed          dej        dej        fd            ZC e3ej4        j5        d          rb ed          dnej        dej        d}ej        dej        dz  dej9        dbej9        dej9        dej9        dej9        de!de!dej        fd            ZD e3ej4        j5        d          r ed          	 dFdej        dedej9        dej9        dej,        dz  dej        fdÄ            ZE edĦ          dej        dej        dedej9        dej        f
dǄ            ZF edȦ          dej        dej        dedej9        dej        f
dɄ            ZG edʦ          dej        dej        dej        dej        dej        dedej9        dej9        dej9        dej        fdЄ            ZH e3ej4        j5        dѦ          rC edҦ          dej        dej        dej        dededej9        dej9        dej        fdԄ            ZIdede!fdքZJdnej        dej        dej        dej        dej        dej,        dej        fdۄZKdede!fd܄ZLdede!fd݄ZM	 dFdnej        dej        dej        dej        dej,        dej        dz  dej        fdZN	 	 dHdnej        dej        dej        dej        dej,        dej        dej        dz  dej        dz  dej        fdZOdede!fdZPdede!fdZQdnej        de"ej        ej        f         fdZR	 dFdnej        dej        dej        dej        dej        dej,        dej        dz  dej        fdZS	 dFdej        dej        dej        dej        dej        dej        dedededej        dz  fdZTdej        dej        dej        dedede!fdZUdej        dej        fdZVdej        dej        dej        dej        dedededefdZWdej        dej        d ej        dej        d}ej        dej        dej        dej        dej        dej        de!de!fdZXdej        dej        d ej        dej        d}ej        dej        dej        dej        d	ej        fd
ZY	 dLdoej        dej        dededede!dej        fdZZ e3ej4        j5        d          rC ed          	 dLdoej        dej        dej9        dej9        dede!dej        fd            Z[	 dLdoej        dededede!dej        fdZ\ e3ej4        j5        d          r< ed          	 dLdoej        dej9        dej9        dede!dej        fd            Z]	 dLdoej        dej        dededede!dej        fdZ^	 dLdoej        dej        dededede!dej        fdZ_	 	 dMdeej        dej        dz  de!fdZ`	 	 	 	 dJdnej        dej        dz  doej        dej        dz  d}ej        dej        dz  dej        dz  dej        dz  dej        dz  dej        dz  d~ej        dedededede!de!de!de!dej        f(dZa	 	 	 	 dNdej,        dedej,        dz  dej,        dz  dej,        dz  dej,        dz  dej,        dz  de.e         fdZb	 	 	 	 	 	 	 dKdnej        dej        dedej,        dz  dej        dz  dej        dz  dedz  dej        dz  dej        dz  dedz  dej        fdZcdoej        dej,        dedej,        dz  dej        f
d Zd	 	 dHdnej        dej        dej        dedej        dej        dej,        dz  dedz  dej        fd!Zedfej        dej        fd"Zfdej        dej        fd#Zg	 dFdej        dej        d ej        dej        d}ej        dej        dedej        dej        dej        dej        dej        d$ej        dedz  fd%Zhd ej        de"ej        ej        f         fd&Zi e3ej4        j5        d'          r, ed(          dnej        dej        dej        fd)            Zjdnej        dej        dej        fd*Zk	 	 dOdGej        d,ej        d-e!d.ede"ej        ej        f         f
d/Zldej        d,ej        dej        dej        d0ede"ej        ej        f         fd1Zmdej        d,ej        dej        dej        d0ede"ej        ej        f         fd2Zn	 	 	 	 	 	 dPdGej        dej        dz  d3edz  d_ej        dz  d4e!d-ej        dz  d5e"eef         dz  de"ej        ej        f         fd6Zo	 	 dMdeej        dej        d7ej        dz  de!de"ej        ej        ej        f         f
d8Zpdnej        dej        d}ej        dej        dz  dedbedededede!de!dej        fd9Zq	 	 	 dQdGej        dej        dz  dej        dz  d:e!de"ej        ej        ej        dz  f         f
d;Zrdej        dedededej,        dz  dej        fd<Zsdej        dej        dededej        f
d=Ztdej        dej        dededej        f
d>Zudej        dej        dej        dej        dej        dededededej        fd?Zvdej        dej        dej        dededej9        dej9        dej        fd@ZwdedefdAZx	 	 	 	 dRdCej        dDej        dEej        dFej        dGej        dHej        dz  dIej        dz  dJej        dz  dKe!d'ej        dz  dLej        dz  dMej        dz  dNej        dOededPej        dz  dQej        dz  dRej        dz  f$dSZydnej        dej        de"ej        ej        f         fdTZzdnej        dej        dUedej        fdVZ{	 dFdnej        dej        dWedej        dej        f
dXZ|	 dFdnej        dej        dWedej        dej        f
dYZ}	 dFdnej        dej        dej,        dej        dej        dWedej        dej        fdZZ~dGej        d-ej        fd[Z	 dFdej        dededej        d\ej        d]ej        d^ej        dz  ddfd_Zd`ededej        daej        dej        d]ej        ddfdbZ	 dFdej        dcej        dededdedeedfedej        d\ej        d]ej        dgej        dhej        d^ej        dz  ddfdiZdGej        d-ej        dej        d}ej        dej        dz  djej        dz  dej        d\ej        d]ej        dedkedledmeduedej        fdnZ	 	 dSdjej        dej        doej        dpej        dqe!drej        dz  ddfdsZ	 	 dSdjej        dej        doej        dpej        dqe!drej        dz  ddfdtZ	 dTduej        dvedwed0edqe!dxedej        dyefdzZ	 	 	 dUdGej        d-ej        dz  dej        dej        dz  d}ej        dej        dz  dej        dz  dej        dz  dej        dz  dej        dz  d~ej        dej        dej        d|ej        djej        d}eded~e!dedededede!de!de!de!dedededej        f<dZ e3ej4        d          rV e3ej4        j        d          r> ed          dnej        dej        daej        djej        dej        d}ej        dej        dej        dej        d~ej        dedej9        dej9        dej9        de!ded0ed}ede!de!dej        f*d            Z ed          dGej        d-ej        dz  dej        dej        dz  d}ej        dej        dz  dej        dz  dej        dz  dej        dz  dej        dz  d~ej        dej        dej        d|ej        djej        d}eded~e!dedededede!de!de!de!f4d            ZdBej        dej        dej        dej        dej        dedej        dej        ddfdZdBej        dej        dej        dej        dej        dedej        dej        ddfdZdej        dej        d+ej        dej        dedej        ddfdZdAej        dej        dej        dej        dDej        dEe!dej        d+ej        dedej        ddfdZdej        dej        dedej        ddf
dZ	 dVd-ej        dGej        dededdf
dZ	 dFdej        dej        dej        dej        dej        dededej        dej        dz  ddfdZ	 dFdej        dej        dej        dej        dedej        dz  ddfdZdej        dej        dej        dej        dej        deddfdZdej        d+ej        dej        dededdfdZd+ej        dej        dej        dej        dej        ddfdZdededefdZdedefdZde.ej                 dej        dede!def
dZdedej        dej        dededdfdZdeddfdZdefdZdede.e         ddfdZdede"e.e         e.e         f         fdZdede.e.e                  de.e.e                  ddfdÄZdede"eej        f         fdńZdej        fdǄZdeddfdɄZdFdedededz  defd̄Zdeddfd̈́Z	 dLdedej        dej        dede!ddfdЄZdedej        fdфZdede.ej                 ddfd҄ZdefdӄZdej        dedOede"ej        ej        f         fdքZ	 	 dMdej        dej        dej        dej        dedej        dej        dedz  d<e!de"ej        ej        f         fd݄Zdej        dej        dej        dej        dej        dej        dej        d~ej        dededej        fdZdededededef
dZ e3ej4        j5        d          r; ed          dej        dej        dej        dz  de!dej        f
d            Z e3ej4        j5        d          r ed          dej        dej        dej        djej        dej        de!de!de!dej        dz  dej        dz  de.e         dz  dej        dz  dej        dz  de!dej        fd            Z e3ej4        j5        d          rJ ed          dej        dej        dej        dej        dz  dej,        de!dej        fd            Z G d d          Z e! e3ej4        j5        d                    Zd Z	 dWdHej        dedefdZdedej        dej        dz  dej        fdZ	 dWdHej        dej        dej,        de!d	e!dedefd
Z	 	 	 dQdGej        dej        dz  dej        dz  d:e!fdZdedej        d-ej        dej        dz  dej        dz  dej        dz  dej        dz  dej        fdZdedededQedej        dej,        d'ej        d<e!dedede!dej        fdZdBej        dej        dej        dej        dej        deddfdZdej        dej        dej        d-ej        d'ej        dej        ded<e!dej        dz  de"eef         dej        dedej        dej        dz  ddfdZdGej        dSej        dfej        dgej        dz  dej        dz  dej        dz  dededej        fdZdHej        dedej        fdZdGej        d ej        dej        d!ej        dz  d"ej        dz  djej        dej        d#ededej        fd$Z e3ej4        j        d%          r< ed&          dnej        dej        d'ej        d(ej        dej        f
d)            Zdnej        dej        d'ej        d(ej        dej        dej        fd*Z e3ej4        j        d+          r< ed,          dnej        dej        d'ej        d(ej        dej        f
d-            Zdnej        dej        d'ej        d(ej        dej        dej        fd.Z e3ej4        j        d/          r5 ed0          dnej        dej        d1ej        d2ej        fd3            Z e3ej4        j        d4          r5 ed5          dnej        dej        d1ej        d2ej        fd6            ZÐd7d8dnej        dej        d9ed:         de"ej        ej        f         fd;Z e3ej4        j        d<          r< ed=          dnej        dej        d1ej        d>ej        dej        f
d?            Zdnej        dej        dej        de"ej        ej        f         fd@ZƐdGdej        de!dej        fdAZ e3ej4        j5        dB          r+ edC          dej        de!dej        fdD            ZdS dS (X      )TYPE_CHECKINGLiteralN)init_logger)current_platform)
ScalarType)$flashinfer_quant_nvfp4_8x4_sf_layout)cdivc                       fdS )Nc                     S N )namefns    d/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/_custom_ops.py<lambda>zregister_fake.<locals>.<lambda>   s    B     r   )r   s   `r   register_faker      s    r   )r   )impl_abstract@   outquery	key_cachevalue_cachenum_kv_headsscaleblock_tablesseq_lens
block_sizemax_seq_lenalibi_slopeskv_cache_dtypek_scalev_scaletp_rankblocksparse_local_blocksblocksparse_vert_strideblocksparse_block_sizeblocksparse_head_sliding_stepreturnc                 r    t           j        j                            | |||||||||	|
||||||||           d S r   )torchops_Cpaged_attention_v1)r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   s                      r   r.   r.   !   sc    * 
IL## %'    r   exp_sum
max_logitstmp_outc                 x    t           j        j                            | |||||||||	|
|||||||||||           d S r   )r+   r,   r-   paged_attention_v2)r   r/   r0   r1   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   s                         r   r3   r3   M   sl    0 
IL## %-    r   fp8f16query_start_locfp8_out_scale	mfma_typec                 t    t           j        j                            | |||||||||	|
|||||||||           d S r   )r+   r,   _rocm_Cpaged_attention)r   r/   r0   r1   r   r   r   r   r   r   r   r6   r   r   r    r!   r"   r#   r7   r8   s                       r   paged_attention_rocmr<      sg    , 
I%%)    r   kv_cachec                 X    t           j        j                            | |||||           d S r   )r+   r,   _C_cpumla_decode_kvcache)r   r   r=   r   r   r   s         r   mla_decode_kvcache_cpurA      s8     
I''UHe\8    r   outputprefix_output
prefix_lsesuffix_output
suffix_lse
output_lsec                 X    t           j        j                            | |||||           d S r   )r+   r,   r-   merge_attn_states)rB   rC   rD   rE   rF   rG   s         r   rI   rI      s7     
IL""
M:}j    r   T	q_seqlens
kv_seqlensvertical_indexesslash_indexescontext_sizeblock_size_Mblock_size_Ncausalc                 F   |                     d          }|                     d          }	|                     d          }
|                     d          }||z   dz
  |z  }t          j        ||	|| j        | j                  }t          j        ||	||
| j        | j                  }t          j        ||	|| j        | j                  }t          j        ||	||| j        | j                  }t          j        j                            ||||| |||||||           ||||fS Nr         dtypedevice)sizer+   zerosrW   rX   r,   r-   convert_vertical_slash_indexes)rJ   rK   rL   rM   rN   rO   rP   rQ   
batch_size	num_heads	nnz_slashnnz_verticalnum_rowsblock_countblock_offsetcolumn_countcolumn_indexs                    r   r[   r[      sc    ##A&&J""1%%I""1%%I#((++L|+a/L@H+IxyyGW  K ;o  L ;IxyyGW  L ;o  L 
IL//   lL@@r   vertical_indices_countslash_indices_countc
                 J   |                     d          }
|                     d          }|                     d          }|                     d          }||z   dz
  |z  }t          j        |
||| j        | j                  }t          j        |
|||| j        | j                  }t          j        |
||| j        | j                  }t          j        |
|||| j        | j                  }t          j        j                            ||||| |||||||||	           ||||fS rS   )rY   r+   emptyrW   rX   r,   r-   (convert_vertical_slash_indexes_mergehead)rJ   rK   rL   rM   re   rf   rN   rO   rP   rQ   r\   r]   r^   r_   r`   ra   rb   rc   rd   s                      r   ri   ri      si    ##A&&J""1%%I""1%%I#((++L|+a/L@H+IxyyGW  K ;o  L ;IxyyGW  L ;o  L 
IL99    lL@@r   	positionskey	head_sizecos_sin_cacheis_neoxc                 X    t           j        j                            | |||||           d S r   )r+   r,   r-   rotary_embedding)rj   r   rk   rl   rm   rn   s         r   rp   rp   >  s7     
IL!!5#y-    r   inputweightepsilonc                 T    t           j        j                            | |||           d S r   )r+   r,   r-   rms_norm)r   rq   rr   rs   s       r   ru   ru   L  s(     
IL#ufg66666r   residualc                 T    t           j        j                            | |||           d S r   )r+   r,   r-   fused_add_rms_norm)rq   rv   rr   rs   s       r   rx   rx   R  s(     
IL##E8VWEEEEEr   qkvnum_heads_qnum_heads_knum_heads_vhead_dimepsq_weightk_weightposition_idsc                 b    t           j        j                            | |||||||||	|
           d S r   )r+   r,   r-   fused_qk_norm_rope)ry   rz   r{   r|   r}   r~   r   r   rm   rn   r   s              r   r   r   X  sK     
IL##    r   logitsprompt_maskoutput_maskrepetition_penaltiesc                     |                     d                              d|                     d                    }t          j        ||z  |d          }t          j        | dk    d|z  |          }| |z  } d S )NrT   )dim      ?r   )	unsqueezerepeatrY   r+   where)r   r   r   r   	penaltiesscalings         r    apply_repetition_penalties_torchr   t  s~     099a9@@GG	6;;q>>  K+57KSQQIk&1*cIoyAAG
gFFFr   c                 T    t           j        j                            | |||           d S r   )r+   r,   r-   apply_repetition_penalties_r   r   r   r   s       r   apply_repetition_penalties_cudar     s4     
IL,,[*>    r   c                     | j         r(|                                 rt          | |||           dS t          | |||           dS )aw  Apply repetition penalties to logits in-place.

    Args:
        logits: The logits tensor of shape [num_seqs, vocab_size].
        prompt_mask: A boolean tensor indicating which tokens appear in the prompt.
        output_mask: A boolean tensor indicating which tokens appear in the output.
        repetition_penalties: The repetition penalties of shape (num_seqs, ).
    N)is_cudais_contiguousr   r   r   s       r   apply_repetition_penaltiesr     so     ~ 
&..00 
'K.B	
 	
 	
 	
 	
 	)K.B	
 	
 	
 	
 	
r   quant_dtypescale_ubc           	         t          j        | |          }t          j        |                                 | j        d         z  df| j        t           j                  }t           j        j        	                    || |||||           ||fS )NrW   rT   rX   rW   )
r+   
empty_likerh   numelshaperX   float32r,   r-    rms_norm_dynamic_per_token_quant)rq   rr   rs   r   r   rv   rB   scaless           r   r   r     s     e;777F[	%+b/	)1-el%-  F 
IL11vvw(   6>r   F
group_sizeis_scale_transposedc                 b   t          |          dk    sJ t          j        | |          }|rqt          j        | j        d         |d         z  |                                 | j        d         z  f| j        t          j                                      dd          }	n\t          j        |                                 | j        d         z  | j        d         |d         z  f| j        t          j                  }	t          j	        j
                            || ||	||||d         |	  	         ||	fS )NrU   r   r   rT   r   r   )lenr+   r   rh   r   r   rX   r   	transposer,   r-   rms_norm_per_block_quant)
rq   rr   rs   r   r   r   rv   r   rB   r   s
             r   r   r     s+    z??ae;777F 
[_
1-u{{}}B/OP<-
 
 
 )Aq//	 	 [[]]ek"o-u{2*Q-/OP<-
 
 
 
IL))1
 
 
 6>r   qweightr   rZ   split_k_itersthxthyc                     t           j        rddlm}  || ||          S t          j        j                            | |||||          S )Nr   )awq_dequantize_triton)envsVLLM_USE_TRITON_AWQ2vllm.model_executor.layers.quantization.awq_tritonr   r+   r,   r-   awq_dequantize)r   r   rZ   r   r   r   r   s          r   r   r     sg      =	
 	
 	
 	
 	
 	
 %$Wfe<<<9<&&w}cSVWWWr   qzerosc                     t           j        rddlm}  || ||||          S t          j        j                            | ||||          S )Nr   )awq_gemm_triton)r   r   r   r   r+   r,   r-   awq_gemm)rq   r   r   r   r   r   s         r   r   r     sZ      NVVVVVVugvv}MMM9<  OOOr   a
b_q_weightb_gptq_qzerosb_gptq_scalesb_g_idxuse_exllamause_v2_formatbitc           
      X    t           j        j                            | |||||||          S r   )r+   r,   r-   	gptq_gemmr   r   r   r   r   r   r   r   s           r   r   r     s8     9<!!		 	 	r   r   z_C::gptq_gemmc                     t          j        |                     d          |                    d          f| j        | j                  S )Nr   rT   rV   r+   rh   rY   rW   rX   r   s           r   _gptq_gemm_faker     sC     {VVAYY
**+1718
 
 
 	
r   q_permc                 R    t           j        j                            | ||           d S r   )r+   r,   r-   gptq_shuffle)r   r   r   s      r   r   r   -  s$    	ILh44444r   b_metab_scales	workspaceb_q_typesize_msize_nsize_kc	                 d    t           j        j                            | |||||j        |||	  	        S r   )r+   r,   r-   gptq_marlin_24_gemmid	r   r   r   r   r   r   r   r   r   s	            r   r   r   2  s6     9<++	:vxHKQW  r   r   z_C::gptq_marlin_24_gemmc	                 H    t          j        ||f| j        | j                  S Nr   )r+   rh   rX   rW   r   s	            r   _gptq_marlin_24_gemm_faker   D  s$     {FF+AHAGLLLLr   z_C::marlin_gemmcb_biasa_scalesglobal_scaleb_zerosg_idxpermb_q_type_id	is_k_fulluse_atomic_adduse_fp32_reduceis_zp_floatc                     | j         }|t          j        t          j        fvr|j         }t          j        ||f| j        |          S r   )rW   r+   halfbfloat16rh   rX   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rW   s                       r   _marlin_gemm_faker   R  sD    , U^444NE{FF+AHEJJJJr   z_C::awq_dequantizec                     |                      d          }|                      d          }|dz  }t          j        ||f|j        |j                  S Nr   rT      rV   rY   r+   rh   rW   rX   )	r   r   rZ   r   r   r   in_cqout_cout_cs	            r   _awq_dequantize_faker   m  sJ     ||Aa
{D%=V]SSSSr   z_C::awq_gemmc                     |                      d          }t          j        |||                     d          dz  f| j        | j                                      d          S r   )rY   r+   rh   rW   rX   sum)rq   r   r   r   r   num_in_featss         r   _awq_gemm_faker   {  s\     zz!}}{L',,q//A*=>+<
 
 
 #a&&		r   z_C::machete_mmb_qb_typeout_typeb_group_scalesb_group_zerosb_group_sizeb_channel_scalesa_token_scalesschedulec
                     |                      d          }
|                     d          }t          j        |
|f| j        | j                  S Nr   rT   r   rY   r+   rh   rX   rW   )r   r   r   r   r   r   r   r   r   r   mns               r   machete_mm_faker    sA     FF1IIHHQKK{Aq6!(!'BBBBr   z_C::machete_prepack_Ba_typegroup_scales_typec                 B    t          j        | t           j                  S N)memory_formatr+   r   contiguous_formatr   r  r   r  s       r   machete_prepack_B_faker    s     
%:QRRRRr   z_C::cutlass_w4a8_mmmaybe_schedulec                     |                      d          }|                     d          }	||nt          j        }
t          j        ||	f| j        |
          S r   )rY   r+   r   rh   rX   )r   r   r   r   r   r   r   r  r   r  	out_dtypes              r   cutlass_w4a8_mm_faker    sO     FF1IIHHQKK ( 4HH%.	{Aq6!()DDDDr   z_C::cutlass_pack_scale_fp8c                 B    t          j        | t           j                  S r  r  r   s    r   cutlass_pack_scale_fp8_faker    s    e6MNNNNr   z$_C::cutlass_encode_and_reorder_int4bbc                 B    t          j        | t           j                  S r  r  r  s    r   %cutlass_encode_and_reorder_int4b_faker        1HIIIIr   z,_C::cutlass_encode_and_reorder_int4b_groupedc                 B    t          j        | t           j                  S r  r  r  s    r   -cutlass_encode_and_reorder_int4b_grouped_faker    r  r   allspark_w8a16_gemmz_C::allspark_w8a16_gemm	b_qweightb_qzerosr  sm_count
sm_versionCUBLAS_M_THRESHOLDhas_zpn32k16_reorderc                 r    |                      d          }t          j        ||f| j        | j                  S )Nr   r   r   )r   r  r   r  r  r   r  r  r  r   r!  r   s               r   _allspark_w8a16_gemm_faker#    s2     FF1II{Aq6!(!'BBBBr   ggml_dequantizez_C::ggml_dequantizeW
quant_typer   rW   c                 R    t          j        ||ft           j        | j                  S NrV   )r+   rh   float16rX   r%  r&  r   r  rW   s        r   _ggml_dequantize_faker+    s#     {Aq6qxHHHHr   z_C::ggml_mul_mat_vec_a8Xrowc                 ^    t          j        |j        d         |f|j        | j                  S Nr   rV   )r+   rh   r   rW   rX   r%  r,  r&  r-  s       r   _ggml_mul_mat_vec_a8_faker1    s*     {AGAJ,AGAHMMMMr   z_C::ggml_mul_mat_a8c                 r    |                     d          }t          j        ||f|j        | j                  S r/  r   )r%  r,  r&  r-  batchs        r   _ggml_mul_mat_a8_faker4    s2     q		{E3<qwqxHHHHr   z_C::ggml_moe_a8sorted_token_ids
expert_idsnum_tokens_post_paddedtop_ktokensc	                     |                      d          }t          j        ||z  |ft          j        |j                  S r/  )rY   r+   rh   r)  rX   	r,  r%  r5  r6  r7  r&  r-  r8  r9  s	            r   _ggml_moe_a8_faker<    s7     {FUNC0ahWWWWr   ggml_moe_a8_vecz_C::ggml_moe_a8_vectopk_idsc                 x    |                      d          }t          j        ||z  |f| j        |j                  S r/  r   r,  r%  r>  r8  r&  r-  r9  s          r   _ggml_moe_a8_vec_fakerA    s7     {FUNC0QQQQr   cuda_device_capabilityc                 J    t           j        j                            |           S r   )r+   r,   r-   cutlass_scaled_mm_supports_fp4rB  s    r   rD  rD        9<667MNNNr   block_scale_ablock_scale_balphar  c                     | j         dk    r|j         dk    sJ | j        d         |j        d         }}t          j        ||f|| j                  }t          j        j                            || ||||           |S )NrU   r   rV   )ndimr   r+   rh   rX   r,   r-   cutlass_scaled_fp4_mm)	r   r  rG  rH  rI  r  r   r  r   s	            r   rL  rL    sw     6Q;;16Q;;;&71:qwqzqA
+q!fIah
?
?
?C	IL&&sAq-PUVVVJr   c                 J    t           j        j                            |           S r   )r+   r,   r-   cutlass_scaled_mm_supports_fp8rE  s    r   rN  rN  *  rF  r   c                 J    t           j        j                            |           S r   )r+   r,   r-   $cutlass_scaled_mm_supports_block_fp8rE  s    r   rP  rP  .  s    9<<<=STTTr   scale_ascale_bbiasc                    |t           j        u s|t           j        u sJ |0|                                |j        d         k    r|j        |k    sJ g | j        dd         |j        d         R }|                     d| j        d                   } |j        d         dz  dk    o|j        d         dz  dk    }t          j                    s|sddl	m
}  || |||||          }	n]t          j        | j        d         |j        d         f|| j                  }	t           j        j                            |	| ||||            |	j        | S )a  
    `cutlass_scaled_mm` implements a fused version of
        `output = torch.mm((scale_a * a), (scale_b * b)).to(out_dtype)`
    where scale_a * a and scale_b * b are implemented using numpy-style
    broadcasting.

    In order to support blockwise scaling like found in DeepSeek V3 we also
    support extended "group" broadcast rules. We extend the numpy-style
    broadcasting rules with the following rule:
        "if the extent of a dimension in the source shape is between 1 and
        corresponding extent in the target shape we repeat each element along
        that dimension  src_shape[dim] // target_shape[dim] times consecutively"
    example if we have:
          a = [[1, 2], and target_shape = (2, 4)
               [3, 4]]
    then we would expand a to:
          a = [[1, 1, 2, 2],
               [3, 3, 4, 4]]
    currently we only support the case:
        scale_a.shape * [1, 128] == a.shape
        scale_b.shape * [128, 128] == b.shape
    NrT   r   r      )triton_scaled_mmrV   )r+   r   r)  r   r   rW   viewr   is_rocmKvllm.model_executor.layers.quantization.compressed_tensors.triton_scaled_mmrV  rh   rX   r,   r-   cutlass_scaled_mm)
r   r  rQ  rR  r  rS  target_shapecutlass_compatible_brV  r   s
             r   rZ  rZ  2  si   < &&)u}*D*D*DD<4::<<171:55$*	:Q:Q:QQ /QWSbS\.171:..L	r172;A71:?a/HAGAJOq4H!! J)= J	
 	
 	
 	
 	
 	
 q!Wgy$GGk171:qwqz2)AHUUU	&&sAq'7DIII38\""r   azp_adjazpc           
         |j         d         dz  dk    r|j         d         dz  dk    sJ |t          j        u s|t          j        u sJ |0|                                |j         d         k    r|j        |k    sJ g | j         dd         |j         d         R }|                     d| j         d                   } |%|                                | j         d         k    sJ t          j        | j         d         |j         d         f|| j                  }	t          j	        j
                            |	| ||||||            |	j        | S )z
    :param azp_adj: In the per-tensor case, this should include the azp.
    Always per-channel.
    :param azp: Only set in the per-token case. Per-token if set.
    r   rU  rT   Nr   rV   )r   r+   r   r)  r   rW   rW  rh   rX   r,   r-   cutlass_scaled_mm_azp)
r   r  rQ  rR  r  r]  r^  rS  r[  r   s
             r   r`  r`  e  s@    71:?aAGAJOq$8$8$88&&)u}*D*D*DD<4::<<171:55$*	:Q:Q:QQ /QWSbS\.171:..L	r172;A;#))++3333
+qwqz171:.i
Q
Q
QC	IL&&sAq'7GSRVWWW38\""r   c                 J    t           j        j                            |           S r   )r+   r,   r-   "cutlass_sparse_scaled_mm_supportedrE  s    r   rb  rb    s    9<::;QRRRr   c                 n    	 t           j        j                            |           S # t          $ r Y dS w xY wNF)r+   r,   r-   cutlass_group_gemm_supportedAttributeErrorrE  s    r   re  re    sB    y|889OPPP   uus   #& 
44c                    | j         t          j        t          j        t          j        t          j        fv sJ |                                 sJ d}| j        d         d|z  z  dk    sJ t          j        j	        
                    |           S )a  
    Compresses a sparse matrix for use with Cutlass sparse operations.

    This function takes a dense tensor and compresses it into two components:
    non-zero elements and metadata. The compressed representation is compatible
    with Cutlass sparse kernels.

    Args:
        a (torch.Tensor):
            The input tensor to be compressed. Must have one of the following data types:
            - `torch.int8`
            - `torch.float8_e4m3fn`
            - `torch.bfloat16`
            - `torch.float16`

    Returns:
        tuple[torch.Tensor, torch.Tensor]:
            A tuple containing:
            - `a_nzs` (torch.Tensor): A tensor containing non-zero elements of `a`.
            - `a_meta` (torch.Tensor): A tensor containing metadata for the sparse representation.

    Raises:
        ValueError: If the compression operation fails.

    Notes:
        - The `a_meta` tensor has a data type of `torch.uint8`.
        - Each metadata element encodes the sparsity of 4 non-zero elements (i.e., `elemsPerMetaElem = 4`).
        - The shape of `a_nzs` is `(m, k // 2)`, where `m` and `k` are the dimensions of the input tensor.
        - The shape of `a_meta` is `(m, k // 2 // elemsPerMetaElem)`.
       rT   rU   r   )rW   r+   int8float8_e4m3fnr   r)  r   r   r,   r-   cutlass_sparse_compress)r   elemsPerMetaElems     r   rk  rk    s    > 7uz5#6VVVVV?? 71:--.!33339<//222r   bt_nzsbt_metac           	         |j         d         dz  dk    r|j         d         dz  dk    sJ |t          j        u s|t          j        u sJ |)|j         d         |j         d         k    r|j        |k    sJ | j         d         }|j         d         }t          j        ||f|| j                  }	t          j        j        	                    |	| |||||           |	S )aG  
    Performs a scaled sparse matrix multiplication using Cutlass.

    Steps:
    1. Create a dense matrix `a` of shape (m, k) on the CUDA device:
    `a = torch.randn((m, k), device='cuda')`.

    2. Create a dense matrix `b` of shape (k, n) on the CUDA device:
    `b = torch.randn((k, n), device='cuda')`.

    3. Prune matrix `b` to 2:4 sparsity along the specified dimension:
    `b = prune_to_2_4(b, dim=0)`.

    4. Compress the transposed sparse matrix `b.t()`:
    `bt_nzs, bt_meta = cutlass_sparse_compress(b.t())`.

    5. Perform sparse matrix multiplication using the compressed matrix,
    applying scaling factors for `a` and `b`, and the output data type:
    `out = cutlass_scaled_sparse_mm(a, bt_nzs, bt_meta, scale_a, scale_b, out_dtype)`.

    Returns:
    - The result of the scaled sparse matrix multiplication.
    r   rU  rT   NrV   )
r   r+   r   r)  rW   rh   rX   r,   r-   cutlass_scaled_sparse_mm)
r   rm  rn  rQ  rR  r  rS  r   r  r   s
             r   rp  rp    s    @ <?R1$$a2)=)B)B)BB&&)u}*D*D*DD<4:a=FLO;;
i@W@W@WW	
AQA
+q!fIah
?
?
?C	IL))Q'4   Jr   expert_offsetsproblem_sizes1problem_sizes2input_permutationoutput_permutationnum_expertskblockscale_offsetsc
                 \    t           j        j                            | |||||||||	
  
        S )a1  
    Prepare data necessary to perform CUTLASS grouped matrix multiplications
    used in CUTLASS-based fused MoE.

    The function takes in topk_ids (token-expert mapping) and uses it to
    compute:
    - expert_offsets: Indices that mark at which token index each expert begins
                      its computation after the input is sorted with
                      input_permutation. The number of tokens computed with
                      expert E is expert_offsets[E + 1] - expert_offsets[E]
    - problem_sizes1, problem_sizes2: MxNxK sizes of each expert's
                                      multiplication in two grouped MMs used in
                                      the fused MoE operation.
    - input_permutation: Permutation that must be used to shuffle the input
                         before executing the MMs.
    - output_permutation: Permutation that must be used to shuffle the output
                          after executing the MMs.
    - blockscale_offsets: Optional argument passed for fp4 moe. Indices that
                          mark at which block scale index each expert begins
                          its computation. The number of block scale rows
                          computed with expert E is blockscale_offsets[E + 1] -
                          blockscale_offsets[E]
    )r+   r,   r-   get_cutlass_moe_mm_data)
r>  rq  rr  rs  rt  ru  rv  r  rw  rx  s
             r   rz  rz    s?    F 9<//		  r   expert_first_token_offsetswap_abc                 T    t           j        j                            | |||||          S )zICompute per-expert (M, N, K) problem sizes from expert_first_token_offset)r+   r,   r-   4get_cutlass_moe_mm_problem_sizes_from_expert_offsets)r{  rr  rs  r  rw  r|  s         r   r~  r~    s2     9<LL!		  r   input_tensordst2src_mapc                     |j         d         }t          j        || j         d         f| j        | j                  }t          j        j                            | ||           |S )z
    Shuffle and expand the input tensor according to the dst2src_map and store the result in output_tensor.
    This is used in MoE to permute the input tensor before performing grouped matrix multiplications.
    r   rT   r   )r   r+   rh   rX   rW   r,   _moe_Cshuffle_rows)r  r  num_tokens_permutedoutput_tensors       r   r  r  *  si    
 &+A.K	l034"   M
 
I!!,]KKKr   expert_num_tokensnum_local_expertspadded_mc           
      X    t           j        j                            | |||||||          S )a}  
    Prepare data necessary to perform CUTLASS grouped matrix multiplications
    used in CUTLASS-based fused MoE.

    The function takes in expert_num_tokens (token count per expert) and
    non_zero_expert_idxs (consecutive indices of experts with non-zero token
    counts) and uses them to compute:
    - expert_offsets: Indices that mark at which token index each expert begins
                      its computation.
    - problem_sizes1, problem_sizes2: MxNxK sizes of each expert's
                                      multiplication in two grouped MMs used in
                                      the fused MoE operation.
    )r+   r,   r-   get_cutlass_pplx_moe_mm_data)rq  rr  rs  r  r  r  r  rw  s           r   r  r  9  s8    . 9<44			 	 	r   out_tensors	a_tensors	b_tensorsproblem_sizes	a_strides	b_strides	c_stridesper_act_token
per_out_chc                 `    t           j        j                            | |||||||||	|
|          S )aY  
    A single grouped matrix multiplication used in CUTLASS-based fused MoE.
    The function executes fp8-quantized OUT = AB matrix multiplication.

    - expert_offsets: Indices that mark at which token index each expert begins
                      its computation. The number of tokens computed with
                      expert E is expert_offsets[E + 1] - expert_offsets[E]
    - problem_sizes: MxNxK sizes of each expert's multiplication in two grouped
                     MMs used in the fused MoE operation.
    - a/b/c_strides: The data strides passed to grouped matrix multiplication.
    )r+   r,   r-   cutlass_moe_mm)r  r  r  r   r   rq  r  r  r  r  r  r  s               r   r  r  \  sD    2 9<&&  r   alphas
sf_offsetsc	                 Z    t           j        j                            | ||||||||	  	        S )av  
    An FP4 Blockscaled Group Gemm that takes in  a_tensors, b_tensors and runs
    the gemms for each combination based on the specified problem sizes.

    This is used as the MoE gemm during NVFP4 Quantized FusedMoE forward.
    - a/b_tensors: the NVFP4 a_ptrs and b_ptrs tensors which are quantized
                     input and expert weights.
    - a_/b_scales: The blockscales in FP8-E4M3 precision
    - expert_offsets/sf_offsets: Indices that mark at which token index
                    each expert begins its computation. The number of tokens
                    computed with expert E is expert_offsets[E + 1] -
                    expert_offsets[E] And the sf_size per expert is
                    sf_offset[E+1] - sf_offset[E]
    - problem_sizes: MxNxK sizes of each expert's multiplication in two grouped
                     MMs used in the fused MoE operation.
    )r+   r,   r-   cutlass_fp4_group_mm)	r  r  r  r   r   r  r  rq  r  s	            r   cutlass_fp4_moe_mmr    s;    6 9<,,
 
 
r   num_bits	is_a_8bitc                 T    t           j        j                            | |||||          S r   )r+   r,   r-   gptq_marlin_repack)r   r   r   r   r  r  s         r   r  r    s-     9<**D&&(I  r   r  z_C::gptq_marlin_repackc                 h    d|z  }d}t          j        ||z  ||z  |z  f| j        | j                  S N    rU  rV   r+   rh   rW   rX   )r   r   r   r   r  r  pack_factormarlin_tile_sizes           r   _gptq_marlin_repack_faker    sQ     Hn{''2B)Bk)QR"$
 
 
 	
r   c                 R    t           j        j                            | ||||          S r   )r+   r,   r-   awq_marlin_repack)r   r   r   r  r  s        r   r  r    s+     9<))FFHi  r   r  z_C::awq_marlin_repackc                 h    d|z  }d}t          j        ||z  ||z  |z  f| j        | j                  S r  r  )r   r   r   r  r  r  r  s          r   _awq_marlin_repack_faker    sQ     Hn{''2B)Bk)QR"$
 
 
 	
r   c           	      $   | j         d         }|dz  dk    sJ t          j        ||dz  ||dz  z  f| j        | j                  }t          |          D ]:}t          j        j                            | |         ||         ||||          ||<   ;|S Nr   rU  rU   r   )	r   r+   rh   rX   rW   ranger,   r-   r  	r   r   r   r   r  r  rv  rB   es	            r   gptq_marlin_moe_repackr    s     "1%KB;![	flFh!m$<=   F
 ; 
 
IL33qM47FFHi
 
q		 Mr   c                    | j         d         }|dz  dk    sJ t          j        ||dz  ||dz  z  f| j        | j                  }t          |          D ]3}t          j        j                            | |         ||||          ||<   4|S r  )	r   r+   rh   rX   rW   r  r,   r-   r  r  s	            r   awq_marlin_moe_repackr    s     "1%KB;![	flFh!m$<=   F
 ; 
 
IL22qM668Y
 
q		 Mr   qzeros_or_noneinplacec                 N    t           j        j                            | ||          S r   )r+   r,   r-   marlin_int4_fp8_preprocess)r   r  r  s      r   r  r    s     
 9<227NGTTTr   c                 x    t           j        j                            | |||||||||	|
|j        |||||||          S r   )r+   r,   r-   marlin_gemmr   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   s                      r   r  r  #  s[    * 9<##		'  r   group_zeros_typechannel_scales_typetoken_scales_typec           	      `    t           j        j                            | |j        |||||          S r   )r+   r,   r-   machete_supported_schedulesr   )r  r   r  r  r  r  r   s          r   r  r  P  s7     9<33	  r   c
                 f    t           j        j                            | ||j        |||||||	
  
        S r   )r+   r,   r-   
machete_mmr   )
r   r   r   r   r   r   r   r   r   r   s
             r   r  r  d  s@     9<""		  r   c                 Z    t           j        j                            | ||j        |          S r   )r+   r,   r-   machete_prepack_Br   r
  s       r   r  r    s,     9<))FFI'8  r   c           
      X    t           j        j                            | |||||||          S r   )r+   r,   r-   cutlass_w4a8_mm)r   r   r   r   r   r   r   r  s           r   r  r    s8     9<''		 	 	r   c                 J    t           j        j                            |           S r   )r+   r,   r-   cutlass_pack_scale_fp8r  s    r   r  r    s    9<..v666r   c                 J    t           j        j                            |           S r   )r+   r,   r-    cutlass_encode_and_reorder_int4br  s    r   r  r    s    9<88;;;r   group_scale_stridesc                 d    t           j        j                            | |||||||||	|
|||          S )a.  
    Executes the CUTLASS-based fused-MoE grouped matrix multiplication for the
    W4A8 quantization scheme. Uses group-wise quantization (INT4 -> FP8)
    and both per-channel + per-token scaling in the epilogue.

    Args:
        out_tensors:
            Output buffer for all experts (updated in-place).
        a_tensors:
            FP8 (E4M3FN) activations for all experts.
        b_tensors:
            INT4-packed weight matrix for all experts, packed to INT32
        a_scales:
            Per-token FP8 activation scales, applied in the epilogue.
        b_scales:
            Per-channel FP8 weight scales for each expert, applied in the epilogue.
        b_group_scales:
            FP8 scale values for group-wise INT4 weight blocks.
        b_group_size:
            Number of elements grouped under each entry of b_group_scales.
        expert_offsets:
            Cumulative token offsets
        problem_sizes:
            Per-expert (M, N, K) GEMM sizes used by the grouped GEMM launcher.
        a/b/c/group_scale_strides:
            Strides describing the memory layout of the input tensors.
        maybe_schedule:
            Optional override to choose a specific kernel or epilogue schedule.

    Returns:
        out_tensors updated in-place with the dequantized INT4xFP8 grouped GEMM result.
    )r+   r,   r-   cutlass_w4a8_moe_mm)r  r  r  r   r   r   r   rq  r  r  r  r  r  r  s                 r   r  r    sK    ` 9<++  r   c                 J    t           j        j                            |           S r   )r+   r,   r-   (cutlass_encode_and_reorder_int4b_grouped)r  s    r   r  r    s     9<@@KKKr   permute_colsz_C::permute_colsc                 *    t          j        |           S r   r+   r   r   r   s     r   _permute_cols_faker    s    """r   c                 L    t           j        j                            | |          S r   )r+   r,   r-   r  r  s     r   r  r    s    9<$$Q---r   noneinput_global_scaleis_sf_swizzled_layoutbackendc                    t          j                    rJ | j        dk    sJ d| j         d            | j        dk    rdnd}|                     || j        d                   } | j        \  }}d}| j        }||z  dk    sJ d| d            | j        t          j        t          j	        fv sJ d| j         d            d	|v r|d
k    rdnd}	|	rt          | |          \  }
}nt          j        ||dz  f|t          j                  }
|rGd } ||d          }||z  } ||d          }t          j        ||dz  f|t          j                  }n&t          j        ||dz  f|t          j                  }t          j        j                            |
| |||           |                    t          j                  }|
|fS )ay  
    Quantize input tensor to FP4 and return quantized tensor and scale.

    This function quantizes the last dimension of the given tensor `input`. For
    every 16 consecutive elements, a single dynamically computed scaling factor
    is shared. This scaling factor is quantized using the `input_global_scale`
    and is stored in a swizzled layout (see
    https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-scale-factor-b-layout-4x).

    Args:
        input: The input tensor to be quantized to FP4
        input_global_scale: A scalar scaling factor for the entire tensor.
        use_8x4_sf_layout: Whether to use the 8x4 or 128x4 layout for the scaling

    Returns:
        tuple[torch.Tensor, torch.Tensor]: The output tensor in FP4 but every
            two values are packed into a uint8 and float8_e4m3 scaling factors
            in the sizzled layout.
    rT   z%input.ndim needs to be >= 1, but got .r   rU  r   z+last dim has to be multiple of 16, but got z-input.dtype needs to be fp16 or bf16 but got trtllmr  TFrU   r   c                     | |z   dz
  |z  |z  S )NrT   r   )xys     r   r   z"scaled_fp4_quant.<locals>.<lambda>3  s    QUQY1$4q$8 r      rh  )r   rX  rK  reshaper   rX   rW   r+   r)  r   r   rh   uint8int32r,   r-   scaled_fp4_quantrW  rj  )rq   r  r  r  
other_dimsr   r  r   rX   use_8x4_sf_layoutrB   output_scaleround_up	rounded_mscale_n	rounded_ns                   r   r  r    s   2  '))))):???QEJQQQ???jAoo2JMM*ek"o66E;DAqJ\Fz>Q Ra R R R;5=%.9999FFFF :99 !)G 3 3RU 
C% 
  

 aa[u{KKK  	W 98H C((I:oG !,,I ;IN+F%+  LL !;17|F%+VVVL	%%E<);=R	
 	
 	
  $$U%899L<r   topkc                 "   t          j                    rJ | j        dk    sJ d| j         d            t          j        }| j        \  }}|||z  k    sJ d| d| d            |dz  }|dz   d	z  }	t          j        ||dz  | j        t          j	        
          }
t          j        ||z  |	t          j
        | j                  }t          j        j                            |
|| |||           |                    t          j                  }|
|fS )a  
    Quantize input tensor to NVFP4 and return quantized tensor and scale, for
    packed MoE Inputs.
    Args:
        input_tensor: The input tensor to be quantized to NVFP4
        input_global_scale: A scalar scaling factor for the entire tensor.
        expert_offsets: The expert offsets tensor
        blockscale_offsets: The blockscale offsets tensor
    Outputs:
        output: The quantized tensor in NVFP4
        output_scales: The blockscale tensor in FP8-E4M3
    rU   %input.ndim needs to be == 2, but got r  2m_numtopk must be less than MAX_TOKENS_PER_EXPERT(,) for cutlass_moe_fp4, observed m_numtopk = ;. Use VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE to set this value.rU     rh  r   rV   )r   rX  rK  r   "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOEr   r+   rh   rX   r  r  r,   r-   scaled_fp4_experts_quantrW  rj  )r  r  rq  rx  r  MAX_TOKENS_PER_EXPERT	m_numtopkrw  scales_kpadded_krB   output_scaless               r   r  r  E  sk   &  ')))))!!!D0ADDD "!! !C%LIq-4444	B 	B 	B6?	B 	B 	B 544 BwHE"q(H [16,"5U[  F K$k"	  M 
IL))   "&&u':;;M=  r   c                 R   t          j                    rJ | j        dk    sJ d| j         d            t          j        }| j        \  }}|dz  dk    s
J d            |dz  }|||z  k    sJ d| d| d            |d	z  }	|	d
z   dz  }
t          j        ||dz  | j        t          j	                  }t          j        ||z  |
t          j
        | j                  }t          j        j                            ||| |||           |                    t          j                  }||fS )a!  
    Fused SiLU+Mul+NVFP4 quantization for MoE intermediate activations.

    Args:
        input_tensor: The input tensor with gate || up layout [m_topk, k*2]
        input_global_scale: A per-expert scaling factor [n_experts]
        expert_offsets: The expert offsets tensor [n_experts+1]
        blockscale_offsets: The blockscale offsets tensor [n_experts+1]
        topk: Number of top-k experts selected
    Outputs:
        output: The quantized tensor in NVFP4 [m_topk, k/2]
        output_scales: The blockscale tensor in FP8-E4M3
    rU   r  r  r   z,input width must be even (gate || up layout)r  r  r  rU  r  rh  r   rV   )r   rX  rK  r   r  r   r+   rh   rX   r  r  r,   r-   %silu_and_mul_scaled_fp4_experts_quantrW  rj  )r  r  rq  rx  r  r  r  	k_times_2rw  r  r  rB   r  s                r   r  r    s   (  ')))))!!!D0ADDD "!! !C'-Iyq=AMQA-4444	B 	B 	B6?	B 	B 	B 544 BwHE"q(H [16,"5U[  F K$k"	  M 
IL66   "&&u':;;M=  r   num_token_paddinguse_per_token_if_dynamicgroup_shapec                    | j         dk    sJ | j        }t          j                    }|r#t	          || j        d                   |d         f}|t          j        || j        |          }n|
J d            |j        |k    sJ ||rVt          j        |d         df| j        t
          j	                  }t
          j
        j                            || ||           ntt          j        d| j        t
          j	                  }t
          j
        j                            || |           n't
          j
        j                            || ||           ||fS )a_  
    Quantize input tensor to FP8 and return quantized tensor and scale.

    This function supports both static and dynamic quantization: If you
    provide the scale, it will use static scaling and if you omit it,
    the scale will be determined dynamically. The function also allows
    optional padding of the output tensors for downstream kernels that
    will benefit from padding.

    Args:
        input: The input tensor to be quantized to FP8 (must be 2D: [M, N])
        scale: Optional scaling factor for the FP8 quantization. Supports:
            - 0D or [1]: per-tensor scaling
            - 1D: requires explicit group_shape to disambiguate per-channel
              vs per-token (use (-1, 1) for per-channel, (1, -1) for per-token)
            - 2D [M/group_m, N/group_n]: group scaling (e.g. [M, N/128] for
              DeepSeek-style (1,128) groups, or [M/128, N/128] for (128,128))
        scale_ub: Optional upper bound for scaling factor in dynamic
            per token case
        num_token_padding: If specified, pad the first dimension
            of the output to at least this value.
        use_per_token_if_dynamic: Whether to do per_tensor or per_token
            in the dynamic quantization case.
        group_shape: Optional tuple (group_m, group_n) specifying the group
            shape for static quantization. Use -1 for "full extent" (e.g.,
            (-1, -1) for per-tensor, (-1, 1) for per-channel, etc.)
            Required for 1D scales; optional for 2D scales.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
            scaling factor.
    rU   r   rT   Nr   z)padding not supported if output passed in)rK  r   r   	fp8_dtypemaxr+   rh   rX   rW   r   r,   r-   "dynamic_per_token_scaled_fp8_quantdynamic_scaled_fp8_quantstatic_scaled_fp8_quant)	rq   r   r  r   r  rB   r  r   r  s	            r   scaled_fp8_quantr    sU   T :????*/+E-799I C&A77qB~U5<yIII ((*U(((|y((((}# 	HKq1el%-XXXEIL;;uh    K%,emLLLEIL11&%GGGG	,,VUE;OOO5=r   
zero_pointc                    | j         d         }| j         d         }|dz   dz
  dz  dz  }t          j        ||f| j        | j                  }t          j        d|f|j        |j                  }d}	|r/|
J d            t          j        d|f|j        |j                  }	t          j        j                            | ||||||	|||
  
         |||	fS )a  
    Rearrange qweight, scale, and zero_point(if asymmetric) to n32k16 format
    for Ampere W8A16 Fused Gemm kernel

    Args:
        qweight: uint8 weight tensor, original k x n format.
        scale: fp16/bf16 weight scale tensor, 1 x n format.
        zero_point: fp16/bf16 weight zero_point tensor, 1 x n format.
            Must be provided for asymmetric quantization.
        has_zp: if use symmetric quantization, has_zp = False.
            if use asymmetric quantization, has_zp = True.

    Returns:
        tuple[torch.Tensor, torch.Tensor, torch.Tensor | None] :
            rearranged weight, scale, and optionally zero_point.
    r   rT   r  r   Nz8zero_point must be provided for asymmetric quantization.)r   r+   rh   rX   rW   r,   r-   #rearrange_kn_weight_as_n32k16_order)
r   r   r  r   KN	N_32alignqweight_reorderscale_reorderzero_point_reorders
             r   allspark_repack_weightr    s   , 	aAaAR!"R'Ik	Aw~W]  O KIu|5;WWWM 
%%F &%% #[	N:#4J<L
 
 
 
IL44		   M+===r   c                 ^    t           j        j                            | |||||||||	|
          S r   )r+   r,   r-   r  )r   r  r   r  r  r   r  r  r  r   r!  s              r   r  r  B  sA     9<++		  r   	symmetricc                    t          j        | t           j                  }|>||du k    s
J d            t           j        j                            || ||           |||fS t          j        |                                 | j        d         z  df| j	        t           j
                  }|rdnt          j        |t           j                  }t           j        j                            ||                                 ||           |||fS )  
    Quantize the input tensor to int8 and return the quantized tensor and scale, and maybe azp.

    Args:
        input: The input tensor to be quantized to int8.
        scale: Optional scaling factor for the int8 quantization.
            When not provided, we invoke dynamic-per-token quantization.
        azp: Optional zero-point for the int8 quantization.
            Must be provided for asymmetric quantization if `scale` is provided.
        symmetric: Whether to use symmetric quantization (scale only, azp ignored).

    Returns:
      tuple[torch.Tensor, torch.Tensor, torch.Tensor | None] : Output int8 tensor, scales, and optionally azp.
    r   N6azp must only be provided for asymmetric quantization.r   rT   r   )r+   r   ri  r,   r-   static_scaled_int8_quantrh   r   r   rX   r   r  dynamic_scaled_int8_quant
contiguous)rq   r   r^  r  rB   input_scales	input_azps          r   scaled_int8_quantr  _  s   ( e5:666FSD[)))D *)) 		--feUCHHHuc!! ;	%+b/	)1-el%-  L "Xu'7EK'X'X'XI	IL**  ""L)   <**r   c                 R    t           j        j                            | ||||          S r   )r+   r,   r-   r$  r*  s        r   r$  r$    s$     9<'':q!UCCCr   c                 P    t           j        j                            | |||          S r   )r+   r,   r-   ggml_mul_mat_vec_a8r0  s       r   r  r    s"     9<++Aq*cBBBr   c                 P    t           j        j                            | |||          S r   )r+   r,   r-   ggml_mul_mat_a8r0  s       r   r  r    s"     9<''1j#>>>r   c	                 Z    t           j        j                            | ||||||||	  	        S r   )r+   r,   r-   ggml_moe_a8r;  s	            r   r  r    s;     9<##		
 
 
r   c           	      V    t           j        j                            | ||||||          S r   )r+   r,   r-   r=  r@  s          r   r=  r=    s)     9<''1hz3PVWWWr   c                 J    t           j        j                            |           S r   )r+   r,   r-   ggml_moe_get_block_size)r&  s    r   r!  r!    s    9<//
;;;r      udeltaABCD_z_delta_bias_delta_softpluscache_indiceshas_initial_state
ssm_statespad_slot_idblock_idx_first_scheduled_tokenblock_idx_last_scheduled_tokeninitial_state_idxc                 p    t           j        j                            | |||||||||	|
|||||||           d S r   )r+   r,   r-   selective_scan_fwd)r#  r$  r%  r&  r'  r(  r)  r*  r+  r6   r,  r-  r.  r/  r   r0  r1  r2  s                     r   r4  r4    s`    ( 
IL##				

'&%    r   c                 Z    |                                  } |                                 }| |fS r   )r  )r   r  s     r   *rocm_enforce_contiguous_skinny_gemm_inputsr6    s'     	
A	Aa4Kr   rows_per_blockc                 t    t          | |          \  } }t          j        j                            | ||          S r   )r6  r+   r,   r:   LLMM1)r   r  r7  s      r   r9  r9     s2    5a;;DAq9""1a888r   cu_countc                 v    t          | |          \  } }t          j        j                            | |||          S r   )r6  r+   r,   r:   wvSplitKr   r  r:  rS  s       r   r<  r<    s6     6a;;DAq9%%aD(;;;r   c                 v    t          | |          \  } }t          j        j                            | |||          S r   )r6  r+   r,   r:   
wvSplitKrcr=  s       r   r?  r?    s6     6a;;DAq9''1dH===r   c           	          t          | |          \  } }t          j        |j        d         | j        d         f||j                  }t          j        j                            | ||||||           |S r/  )r6  r+   rh   r   rX   r,   r:   	wvSplitKQ)r   r  r  rQ  rR  r:  rS  r   s           r   rA  rA    sj     6a;;DAq
+qwqz171:.i
Q
Q
QC	I1dC'8LLLJr   c                 P    t           j        j                            | |           d S r   )r+   r,   r  moe_sum)rq   rB   s     r   rC  rC  #  s#    	IUF+++++r   experts_idsnum_tokens_post_pad
expert_mapc           	      Z    t           j        j                            | ||||||           d S r   )r+   r,   r  moe_align_block_size)r>  rv  r   r5  rD  rE  rF  s          r   rH  rH  '  s@     
I))    r   max_tokens_per_batch
sorted_idsc                 X    t           j        j                            | |||||           d S r   )r+   r,   r  batched_moe_align_block_size)rI  r   r  rJ  r6  rE  s         r   rL  rL  ;  s=     
I11    r   token_lora_mapping	max_lorasmax_num_tokens_paddedmax_num_m_blocksadapter_enabledlora_idsc                 f    t           j        j                            | |||||||||	|
||           d S r   )r+   r,   r  moe_lora_align_block_size)r>  rM  rv  r   rN  rO  rP  r5  rD  rE  rQ  rR  rF  s                r   rT  rT  M  sR     
I..    r   topk_weightsBLOCK_SIZE_MBLOCK_SIZE_NBLOCK_SIZE_Kc                     t          j                    st          d          t          j        j                            | |||||||||	|
|||           d S )NzGThe optimized moe_wna16_gemm kernel is only available on CUDA platforms)r   r   NotImplementedErrorr+   r,   r  moe_wna16_gemm)rq   rB   r  r   r  rU  r5  rD  rE  r8  rV  rW  rX  r   s                 r   r[  r[  m  s}      #%% 
!U
 
 	
 
I##    r   token_expert_indicesgating_outputrenormalizee_score_correction_biasc                 X    t           j        j                            | |||||           d S r   )r+   r,   r  topk_softmaxrU  r>  r\  r]  r^  r_  s         r   ra  ra    =     
I!!    r   c                 X    t           j        j                            | |||||           d S r   )r+   r,   r  topk_sigmoidrb  s         r   re  re    rc  r   scoresnum_expert_group
topk_grouprouted_scaling_factorscoring_funcc           
          t          j                    st          d          t          j        j                            | |||||||          S )a5  
    Perform grouped top-k routing for mixture of experts.

    Args:
        scores: Raw inputs (logits if scoring_func=1, scores if scoring_func=0)
        num_expert_group: Number of expert groups
        topk_group: Number of groups to select
        topk: Number of experts to select per token
        renormalize: Whether to renormalize the output weights
        routed_scaling_factor: Scaling factor for routing weights
        bias: Bias tensor (e_score_correction_bias). Always fused in kernel.
        scoring_func: 0=none (no activation), 1=sigmoid
    zAThe fused grouped_topk kernel is only available on CUDA platforms)r   r   rZ  r+   r,   r  grouped_topk)rf  rg  rh  r  r^  ri  rS  rj  s           r   rl  rl    sa    . #%% 
!O
 
 	
 9((	 	 	r   r   num_tokens_past_paddedmoe_block_sizemul_topk_weightsthread_kthread_nblocks_per_smc                     t           j        j                            | |||||||||	|
||||||||j        ||||||||||          S r   )r+   r,   r  moe_wna16_marlin_gemmr   )rq   rB   r  r   r   r   r   r  r   r   r   r5  r6  rm  rU  rn  r8  ro  r   r   r   r   r   r   r   r   rp  rq  rr  s                                r   rt  rt    sz    > 911;  r   r  marlin_gemm_moez_moe_C::marlin_gemm_moeb_q_weightsb_zero_pointsreplicate_inputapply_weightsc                 J    t          j        |||f| j        | j                  S r(  r  )r   rv  rJ  rU  r>  r   rw  r   r   r   r   r   r   r   r   rv  r  rn  rx  ry  s                       r   marlin_gemm_moe_faker{   	  s&    . {FD&1RRRRr   z_moe_C::moe_wna16_marlin_gemmc                 N    t          j        ||z  |f| j        | j                  S r(  r  )rq   rB   r  r   r   r   r   r  r   r   r   r5  r6  rm  rU  rn  r8  ro  r   r   r   r   r   r   r   r   s                             r   moe_wna16_marlin_gemm_faker}  9	  s1    : {e^V$EK
 
 
 	
r   valueslot_mappingc           
      \    t           j        j                            | |||||||           d S r   )r+   r,   _C_cache_opsreshape_and_cacherk   r~  r   r   r  r!   r"   r#   s           r   r  r  [	  sC     
I,,	 	 	 	 	r   c           
      \    t           j        j                            | |||||||           d S r   )r+   r,   r  reshape_and_cache_flashr  s           r   r  r  q	  sC     
I22	 	 	 	 	r   kv_ck_pec                 X    t           j        j                            | |||||           d S r   )r+   r,   r  concat_and_cache_mla)r  r  r=   r  r!   r   s         r   r  r  	  s8     
I//dHlNE    r   q_pekv_cache_scalec
                 `    t           j        j                            | |||||||||	
  
         d S r   )r+   r,   r  concat_and_cache_mla_rope_fused)
rj   r  r  r  rm   rn   r  r=   r!   r  s
             r   r  r  	  sI     
I::    r   srcdstblock_size_in_bytesblock_mappingc                 T    t           j        j                            | |||           dS )as  
    Copy specific blocks from one tensor to another.

    This method assumes each of the two input tensors is composed of
    consecutive contiguous blocks, of size block_size_in_bytes.
    i.e. the memory layout for each tensor is:
    [block0] [block1] ... [block N]

    block_mapping determines the subset of blocks to copy of the source tensor,
    and their matching destination block number on the destination tensor.
    block_mapping is expected to be a tensor of shape (num_blocks_to_copy, 2)
    where each block_mapping[i] represents a single copy operation, copying
    block #block_mapping[i][0] from the source tensor
    to block #block_mapping[i][1] on the destination tensor.
    block_mapping should have dtype int64.

    The source and the destination tensors can be either on cpu or gpu,
    but not both on cpu.
    the block mapping tensor must on cpu.
    N)r+   r,   r  swap_blocks)r  r  r  r  s       r   r  r  	  s*    4 
I&&sC1DmTTTTTr   r   kv_dtypec                 T    t           j        j                            | |||           d S r   )r+   r,   r  convert_fp8)rB   rq   r   r  s       r   r  r  	  s)     
I&&vueXFFFFFr   	src_cacheblock_tablecu_seq_lenstoken_to_seq
num_tokens
seq_startsc	                 ^    t           j        j                            | ||||||||	  	         d S r   )r+   r,   r  gather_and_maybe_dequant_cache)	r  r  r  r  r  r  r!   r   r  s	            r   r  r  	  sF     
I99
 
 
 
 
r   r\   c                 X    t           j        j                            | |||||           d S r   )r+   r,   r  cp_gather_cache)r  r  r  r  r\   r  s         r   r  r  	  s8     
I**3[*j    r   workspace_startsc                 X    t           j        j                            | |||||           dS )a  Gather and upconvert FP8 KV cache to BF16 workspace.

    Args:
        src_cache: FP8 KV cache [num_blocks, block_size, 656]
        dst: BF16 output workspace [total_tokens, 576]
        block_table: Block indices [num_reqs, max_blocks]
        seq_lens: Sequence lengths [num_reqs]
        workspace_starts: Workspace start offsets [num_reqs]
        batch_size: Number of requests
    N)r+   r,   r  $cp_gather_and_upconvert_fp8_kv_cache)r  r  r  r   r  r\   s         r   r  r  	  s9    $ 
I??3X/?    r   quant_block_sizec                 V    t           j        j                            | ||||           d S r   )r+   r,   r  indexer_k_quant_and_cache)rw  r=   r  r  r!   s        r   r  r  
  s7     
I44	8\#3^    r   dst_k	dst_scalec                 V    t           j        j                            | ||||           d S r   )r+   r,   r  cp_gather_indexer_k_quant_cache)r=   r  r  r  r  s        r   r  r  
  s6     
I::%K    r   	attributerX   c                 L    t           j        j                            | |          S r   )r+   r,   _C_cuda_utilsget_device_attribute)r  rX   s     r   r  r  %
  s    9"77	6JJJr   c                 J    t           j        j                            |           S r   )r+   r,   r  0get_max_shared_memory_per_block_device_attribute)rX   s    r   r  r  )
  s"    9"SS  r   ipc_tensors	rank_datarankfully_connectedc                 P    t           j        j                            | |||          S r   )r+   r,   _C_custom_arinit_custom_ar)r  r  r  r  s       r   r  r  1
  s*     9!00Yo  r   fainp
reg_bufferreg_buffer_sz_bytesc                 V    t           j        j                            | ||||           d S r   )r+   r,   r  
all_reduce)r  r  r   r  r  s        r   r  r  <
  s,     
I%%b#sJ@STTTTTr   c                 N    t           j        j                            |            d S r   )r+   r,   r  disposer  s    r   r  r  F
  s!    	I""2&&&&&r   c                  H    t           j        j                                        S r   )r+   r,   r  	meta_sizer   r   r   r  r  J
  s    9!++---r   c                 L    t           j        j                            | |          S r   )r+   r,   r  register_buffer)r  r  s     r   r  r  N
  s    9!11"kBBBr   c                 J    t           j        j                            |           S r   )r+   r,   r  get_graph_buffer_ipc_metar  s    r   r  r  R
  s    9!;;B???r   handlesoffsetsc                 R    t           j        j                            | ||           d S r   )r+   r,   r  register_graph_buffers)r  r  r  s      r   r  r  V
  s'     
I11"gwGGGGGr   rY   c                 J    t           j        j                            |           S r   )r+   r,   r  !allocate_shared_buffer_and_handle)rY   s    r   r  r  \
  s    9!CCDIIIr   
mem_handlec                 J    t           j        j                            |           S r   )r+   r,   r  open_mem_handle)r  s    r   r  r  `
  s    9!11*===r   ptrc                 N    t           j        j                            |            d S r   )r+   r,   r  free_shared_buffer)r  s    r   r  r  d
  s!    	I--c22222r   
world_sizeqr_max_sizec                 N    t           j        j                            | ||          S r   )r+   r,   r  init_custom_qr)r  r  r  s      r   r  r  i
  s    9!00z;OOOr   c                 N    t           j        j                            |            d S r   )r+   r,   r  
qr_destroyr  s    r   r  r  m
  s!    	I%%b)))))r   quant_levelcast_bf2halfc                 V    t           j        j                            | ||||           d S r   )r+   r,   r  qr_all_reduce)r  r  r   r  r  s        r   r  r  q
  s+     
I((S#{LQQQQQr   c                 J    t           j        j                            |           S r   )r+   r,   r  qr_get_handler  s    r   r  r  {
  s    9!//333r   c                 L    t           j        j                            | |          S r   )r+   r,   r  qr_open_handles)r  r  s     r   r  r  
  s    9!11"g>>>r   c                  H    t           j        j                                        S r   )r+   r,   r  r  r   r   r   r  r  
  s    9!--///r   cache_seqlensnum_heads_per_head_kc                 N    t           j        j                            | ||          S )ac  
    Arguments:
        cache_seqlens: (batch_size), dtype torch.int32.
        num_heads_per_head_k: Equals to seq_len_q * num_heads_q // num_heads_k.
        num_heads_k: num_heads_k.

    Return:
        tile_scheduler_metadata: (num_sm_parts, TileSchedulerMetaDataSize), dtype torch.int32.
        num_splits: (batch_size + 1), dtype torch.int32.
    )r+   r,   r-   get_flash_mla_metadata)r  r  r{   s      r   r  r  
  s(     9<..+[  r   qk_cache
head_dim_vtile_scheduler_metadata
num_splitssoftmax_scalec	                     || j         d         dz  }t          j        j                            | |d|||||||
  
        \  }	}
|	|
fS )a5  
    Arguments:
        q: (batch_size, seq_len_q, num_heads_q, head_dim).
        k_cache: (num_blocks, page_block_size, num_heads_k, head_dim).
        block_table: (batch_size, max_num_blocks_per_seq), torch.int32.
        cache_seqlens: (batch_size), torch.int32.
        head_dim_v: Head_dim of v.
        tile_scheduler_metadata: (num_sm_parts, TileSchedulerMetaDataSize), torch.int32, return by get_mla_metadata.
        num_splits: (batch_size + 1), torch.int32, return by get_mla_metadata.
        softmax_scale: float. The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim).
        causal: bool. Whether to apply causal attention mask.

    Return:
        out: (batch_size, seq_len_q, num_heads_q, head_dim_v).
        softmax_lse: (batch_size, num_heads_q, seq_len_q), torch.float32.
    Nr   g      )r   r+   r,   r-   flash_mla_fwd_kvcache)r  r  r  r  r  r  r  r  rQ   r   softmax_lses              r   flash_mla_with_kvcacher  
  sc    6 -y|99	 C r   lseq_nopekv_c_and_k_pe_cache
page_tablenum_kv_splitsc
                 `    t           j        j                            | |||||||||	
  
         | S r   )r+   r,   r-   sm100_cutlass_mla_decode)
r   r  r  r  r  r   r  r   r   r  s
             r   r  r  
  sE     
IL))   Jr   num_batchesc                 P    t           j        j                            | |||          S r   )r+   r,   r-   $sm100_cutlass_mla_get_workspace_size)r   r  r  r  s       r   r  r  
  s)     9<<<[(M  r   weight_packed_linearz_C::weight_packed_linearmat1mat2is_vnnic                     t          j        |                     d          |                    d          f| j        |j                  S r/  r   )r  r  rS  r  s       r   weight_packed_linear_faker  
  sA     {YYq\\499Q<<(
4;
 
 
 	
r   fused_experts_cpuz_C::fused_experts_cpuhidden_statesw1w2use_int8_w8a8use_fp8_w8a16w1_scalew2_scalea1_scalea2_scalec                 *    t          j        |           S r   r  )r  r  r  rU  r>  r  r  r  r  r  r   r  r	  r  s                 r   fused_experts_cpu_faker  
  s    " ...r   int8_scaled_mm_with_quantz_C::int8_scaled_mm_with_quantscales2c                     |                      d          }|                     d          }t          j        ||f|          S )Nr   r   )rY   r+   rh   )r  r  r  rS  r  r  Mr  s           r   int8_scaled_mm_with_quant_faker    s;     IIaLLIIaLL{Aq63333r   c                       e Zd ZddZd ZdS )CPUDNNLGEMMHandlerr)   Nc                 0    d | _         d| _        d| _        d S )Nr   )handlerr  rw  selfs    r   __init__zCPUDNNLGEMMHandler.__init__  s    #'r   c                 j    | j         +t          j        j                            | j                    d S d S r   )r  r+   r,   r-   release_dnnl_matmul_handlerr  s    r   __del__zCPUDNNLGEMMHandler.__del__$  s2    <#IL44T\BBBBB $#r   )r)   N)__name__
__module____qualname__r  r  r   r   r   r  r    s<           
C C C C Cr   r  create_onednn_mm_handlerc                  H    t           j        j                                        S r   )r+   r,   r-   is_onednn_acl_supportedr   r   r   r   r   ,  s    9<//111r   r  primitive_cache_sizec                     t                      }|                                 \  |_        |_        t          j        j                            | |          |_        |S r   )	r  rY   rw  r  r+   r,   r-   r  r  )rr   r!  r  s      r   create_onednn_mmr#  0  sM     !""G!;;==GIwyil;;$ GO Nr   dnnl_handlerr  c                     t          j        g |j        dd         | j        R |j                  }t           j        j                            ||                    d| j	                  || j
                   |S )Nr   r   r   )r+   rh   r   r  rW   r,   r-   	onednn_mmr  rw  r  )r$  r  rS  rB   s       r   r&  r&  <  ss    
 [9171R4=9,.99IIIF	IL		"ln--t\5I   Mr   weight_scalesoutput_typedynamic_quantuse_azpc                     t                      }|                                 \  |_        |_        t          j        j                            | |||||          |_        |S r   )	r  rY   rw  r  r+   r,   r-   create_onednn_scaled_mm_handlerr  )rr   r'  r(  r)  r*  r!  r  s          r   create_onednn_scaled_mmr-  I  sU     !""G!;;==GIwyilBB{M7DX GO Nr   c                 :   t          j        | t           j                  }|                                 | j        d         z  }|                     || j        d         f          } |>||du k    s
J d            t           j        j                            || ||           |||fS t          j	        |df| j
        t           j                  }|rdnt          j        |t           j                  }t           j        j                            || ||           |||fS )r  r   r   Nr  rT   r   )r+   r   ri  r   r   rW  r,   r-   r  rh   rX   r   r  r  )rq   r   r^  r  rB   	token_numr  r  s           r   onednn_scaled_int8_quantr0  Y  s   ( e5:666FR0IJJ	5;r?344ESD[)))D *)) 		--feUCHHHuc!! ;	1~el%-XXXL!Xu'7EK'X'X'XI	IL**65,	RRR<**r   input_scaleinput_zpinput_zp_adjc           	      d    t           j        j                            ||||||| j                   |S r   )r+   r,   r-   onednn_scaled_mmr  )r$  r  rB   r1  r2  r3  rS  s          r   r5  r5    s9     
IL!!;,l>R   Mr   num_reqsr]   sliding_window_sizeisaenable_kv_splitc                 b    t           j        j                            | |||||||||	|
          }|S r   )r+   r,   r-   get_scheduler_metadata)r6  r]   r   r}   r   rW   r6   rQ   r7  r8  r9  sheduler_metadatas               r   cpu_attn_get_scheduler_metadatar=    sG     	;;  r   c                 X    t           j        j                            | |||||           d S r   )r+   r,   r-   cpu_attn_reshape_and_cache)rk   r~  r   r   r  r8  s         r   r?  r?    s<     
IL++    r   sliding_windowsoftcapscheduler_metadatas_auxc                     t           j        j                            | |||||||||	d         |	d         |
|||           d S )Nr   rT   )r+   r,   r-   cpu_attention_with_kv_cache)r   r   r   rB   r6   r   r   rQ   r    r@  r  rA  rB  rC  s                 r   rE  rE    sa      
IL,,qq    r   r  isa_hintc                     t          j        |                     d          |                    d          f| j                  }t           j        j                            | ||||||||	  	         |S )Nr   rT   r   )r+   rh   rY   rW   r,   r-   cpu_gemm_wna16)	rq   r   r   rZ   r   rS  r  rF  rB   s	            r   rH  rH    sq     [%**Q--Q8LLLF	IL
 
 
 Mr   c                 z    t          j        |           }t           j        j                            | ||           |S r   )r+   r   r,   r-   prepack_moe_weight)rr   r8  rB   s      r   cpu_prepack_moe_weightrK    s5     f%%F	IL##FFC888Mr   w13w13_biasw2_biasactc	                     t          j        |           }	t           j        j                            |	| ||||||||
  
         |	S r   )r+   r   r,   r-   cpu_fused_moe)
rq   rL  r  rM  rN  rU  r>  rO  r8  rB   s
             r   rQ  rQ    sU     e$$F	IL
   Mr   matmul_mxf4_bf16_tnz_qutlass_C::matmul_mxf4_bf16_tna_sfb_sfc                 j     | j         g | j        d d         |j        d         R dt          j        iS Nr   r   rW   	new_emptyr   r+   r   r   r  rS  rT  rI  s        r   _fake_matmul_mxf4_bf16_tnrZ     ;     q{KAGCRCLK!'!*KKKENKKKr   c                 R    t           j        j                            | ||||          S r   )r+   r,   
_qutlass_CrR  rY  s        r   rR  rR  +  s%     933Aq$eLLLr   matmul_ada_mxf4_bf16_tnz#_qutlass_C::matmul_ada_mxf4_bf16_tnc                 j     | j         g | j        d d         |j        d         R dt          j        iS rV  rW  rY  s        r   _fake_matmul_ada_mxf4_bf16_tnr`  7  r[  r   c                 R    t           j        j                            | ||||          S r   )r+   r,   r]  r^  rY  s        r   r^  r^  B  s%     9771dD%PPPr   fusedQuantizeMxQuestz _qutlass_C::fusedQuantizeMxQuestxh_e2m1xh_e8m0c                 
    ||fS r   r   r   r  rc  rd  s       r   _fake_fused_quantize_mx_questrg  N       r   fusedQuantizeMxAbsMaxz!_qutlass_C::fusedQuantizeMxAbsMaxc                 
    ||fS r   r   rf  s       r   _fake_fused_quantize_mx_absmaxrk  W  rh  r   quest)methodrm  )rl  abs_maxc                   |                                  dk    rt          d          |                     d          dz  dk    r&t          d|                     d           d          |j        | j        k    rt          d          t	          j        g | j        d d         |                     d          dz  R t          j        | j        d	}|                                 |                     d          z  |                     d          dz  }}t          |d
          }t          |d          }|d
z  }|dz  }	t	          j        ||	t          j
        | j        	          }
t          t          j        d          st          d          |dk    r't          j        j                            | |||
          S |dk    r't          j        j                            | |||
          S t          d|d          )Nr   z#`a` must have at least 1 dimension.r   r  z-last dim of `a` must be divisible by 32, got r  z'`a` and `b` must be on the same device.rU   rV   r  rh  r]  zvThe `_qutlass_C` extension is not loaded. Make sure your custom op library is imported before calling fusedQuantizeMx.rl  rn  zinvalid method z, must be 'quest' or 'abs_max')r   
ValueErrorrY   rX   r+   rh   r   r  r   r	   float8_e8m0fnuhasattrr,   RuntimeErrorr]  rb  ri  )r   r  rm  rc  rowscolsn_row_blocksn_col_blockspadded_rowspadded_colsrd  s              r   fusedQuantizeMxrz  ^  s    	uuww!||>???vvbzzB!VPRVVVWWWx18BCCCk 	
"vvbzzQ .3k!(  G affRjj(!&&***:$Dc??La==L$K"Kk[(<QX  G 59l++ 
[
 
 	

 y#88AwPPP	9		y#99!QQQQS6SSSTTTr   fusedQuantizeNvz_qutlass_C::fusedQuantizeNvxh_e4m3c                 
    ||fS r   r   )r   r  rc  r|  r   s        r   _fake_fused_quantize_nvr~    s     r   c                 
   t          j        g | j        d d         |                     d          dz  R t           j        | j        d}|                                 |                     d          z  |                     d          dz  }}t          |d          }t          |d          }|dz  }|dz  }	t          j        ||	t           j        | j                  }
t           j	        j
                            | |||
|          S )Nr   rU   rV   rU  r  rh  )r+   rh   r   rY   r  rX   r   r	   rj  r,   r]  r{  )r   r  r   rc  rt  ru  rv  rw  rx  ry  r|  s              r   r{  r{    s     k 	
"vvbzzQ .3k!(  G affRjj(!&&***:$Dc??La==L$K"Kk[(;AH  G 9//1gwUUUr   c                 L    t           j        j                            | |          S )a	  
    Perform Hadamard transforms using [Hadacore](https://arxiv.org/abs/2412.08832)
    kernels. Note that these kernels exploit the recursive properties of
    Sylvester Hadamards, and therefore do not require transform weight data

    Note that sylvester hadamard transforms are also symmetric, which means that
    this function is also applies the (transpose <=> inverse) transform.

    :param x: value to be transformed inplace
    :param inplace: modify value in place
    :return: value after transformation
    )r+   r,   r-   hadacore_transformr  r  s     r   r  r    s     9<**1g666r   r  z_C::hadacore_transformc                 2    |st          j        |           n| S r   r  r  s     r   _hadacore_transform_faker    s    *18u"""q8r   )r   r   r   r   r   r   )T)NN)NNF)TFFF)NNNNNNN)Frd  )NNNN)Tr  )NNNFNN)NNT)r"  NNN)FN)r   )r   r   r   )r   r4   )r  )typingr   r   r+   	vllm.envsr   vllm.loggerr   vllm.platformsr   vllm.scalar_typer   vllm.utils.flashinferr   vllm.utils.math_utilsr	   r  loggerimport_kernelsr   torch.libraryImportErrorr   Tensorintfloatstrr.   r3   VLLM_ROCM_FP8_MFMA_PAGE_ATTNr<   rA   rI   booltupler[   ri   rp   ru   rx   r   r   r   r   rW   r   listr   r   r   r   rr  r,   r-   r   r   r   SymIntr   r   r   r   r  r  r  r  r  r  r#  r+  r1  r4  r<  rA  rD  rL  rN  rP  rZ  r`  rb  re  rk  rp  rz  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r$  r  r  r  r=  r!  r4  r6  r9  r<  r?  rA  rC  rH  rL  rT  r[  ra  re  rl  rt  r  r{  r}  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  _supports_onednnr   r#  r&  r-  r0  r5  r=  r?  rE  rH  rK  rQ  r]  rZ  rR  r`  r^  rg  rk  rz  r~  r{  r  r  r   r   r   <module>r     s6J   * ) ) ) ) ) ) )        # # # # # # + + + + + + ' ' ' ' ' '      ' & & & & &	X		    ! ! ! A   A/////// A A A@@@@@@@@A( $%#$"$)*') )	)<) |) 	)
 ) ) ,) l) ) ) ,%) ) \) \) )  "!)" !#)$  %)& $'')( 
)) ) ) )| $%#$"$)*-/ /	/\/ / \	/
 </ |/ / / / ,/ l/ / / ,%/ /  \!/" \#/$ %/& "'/( !)/*  +/, $'-/. 
// / / /J *."?JUUU)+ +	+\+ + \	+
 <+ |+ + + + ,+ l+ \D(+ + + ,%+  !+" \#+$ \%+& <$&'+( )+* 
++ + + +\
	
<
 l
 	

 ,
 l
 

 
 
 
( '+
 
L
<
 
 <	

 
 t#
 

 
 
 
* 5A 5A|5A5A l5A <	5A
 5A 5A 5A 5A 5<u|U\AB5A 5A 5A 5AF :A :A|:A:A l:A <	:A "L:A :A :A :A :A :A 5<u|U\AB:A :A :A :A|
|
<
 
	
 	

 <
 
 

 
 
 
7	7#l749L7KP7	7 7 7 7F<F#(<F9>FPUF	F F F F	  	
  
 l l <  , 
   8L   ,	
 
    L   ,	
 
   
L

 
  ,	

 

 
 
 
< %)$( <L  	
 lT! lT! 5<%&   4 %)$( %$ $<$L$ $ 	$
 S	$ lT!$ lT!$ $ 5<%&$ $ $ $RX\XLX <X 	X
 
X 
X \X X X X"P<P\P LP L	P
 P \P P P P| < <	
 \   
 \   , 759<%% 
]?##
<
L
 |
 |	

 
 
 
 
 

 
 
 $#
55< 5 5C 5D 5 5 5 5
| L l	
 |     \     759<.// ~J],--M<MLM M ,	M
 <M M M M M 
M M M .-M ]$%%" $ %!'K K<K<$K LK t#	K
 ,K ,%K lT)K $K |d"K lT!K <K K K K K  !K" #K$ %K& 'K( 
)K K K &%K4 ]'((TTT |T |	T
 T T 
T T T )(T ]>""|  	
 | 
   #" ]#$$ (,.2-1#'04.2#C C<C \C 	C
 +$C t+C |d*C DjC  ,-C t+C *C 
C C C %$C" ]*++SLSS S !;-	S
 
S S S ,+S ]()) (,%)E E<E \E 	E
 E  ,E E +$E d
E 
E E E *)E  ]/00OEL OU\ O O O 10O ]9::J J%, J J J ;:J ]ABBJ J%, J J J CBJ 759<.// C],--C<C<C ,C ,%	C
 <C LC ,C LC "LC C C 
C C C .-C" 759<*++ ,X]()) %)I I<II <I <	I
 {T!I 
I I I *)I ],--N<N<N N \	N
 
N N N .-N ]())I<I<I I \	I
 
I I I *)I ]$%%X<X<X  ,X L	X
 !&X X \X |X X 
X X X &%X 759<*++ R]())
R<
R<
R ,
R 	
R
 
R \
R 
R 

R 
R 
R *)
RO3 O4 O O O O|| < <	
 < { \   O3 O4 O O O OU U U U U U !%0# 0#|0#|0# \0# \	0#
 {0# ,
0# \0# 0# 0# 0#t  $ $# #|#|# \# \	#
 {# \# 
	# ,
# \# # # #:Ss St S S S S     &3u| &3elEL6P0Q &3 &3 &3 &3` !%, ,|,L, \, \	,
 \, {, ,
, \, , , ,r /3. .l.L. L. L	.
 |. . . . . t+. . . .b$|L L 	
     &u| %,     L L  L  |	 
              F&&|& |& l	&
 l& L& <& |& |& |& & & & & &R%%|% |% l	%
 l% L% <% L% % % % %^ 
 


,
 
 	

 
 
 \
 
 
 
 759<-.. 
]+,,  
 
L
l
 
 	

 
 
 

 
 
 -,
. 	 			 	 		
 	 \	 	 	 	 759<,-- 
]*++  
 
L

 
 	

 
 

 
 
 ,+
,  
,  	
   \   8  
,  	
   \   0 +/U U\UL4'U U U U U0  !') )|)|d) ) L4	)
 l) lT!) ,%) \D ) <$) ,
) |) ) ) ) )  !)" #)$ %)& ')( \)) ) ) )b ,0.2,0#' K {T) kD(	
 t+ {T) kD  
#Y   2 $(*.)-#,0*. | 
 	
 kD  L4' <$& * lT) L4' Dj \   6K  {T)	
 \   ( $(!% | 
 L	
  l L kD  $J \   .75< 7EL 7 7 7 7< < < < < <$ "&? ??|? |? l	?
 l? L? ? L? <? |? |? |? ? $J? ? ? ?DL|L
5<%&L L L L 759<(( #]%&&#el #%, #5< # # # '&#.EL . . . . . . #'	D  D <D D   D  	D 
 5<%&D  D  D  D N;!,;!;! L;! 	;!
 ;! 5<%&;! ;! ;! ;!|>!,>!>! L>! 	>!
 >! 5<%&>! >! >! >!H "&$($(%*"&*.B B<B<$B TzB lT!	B
 #B L4B sCx4'B 5<%&B B B BR '+	4> 4>\4><4> t#4> 	4>
 5<u|344> 4> 4> 4>n|| l lT!	
        \   > "&#	%+ %+<%+<$%+ 
	%+ 	%+
 5<u|d'::;%+ %+ %+ %+RD|D!$D),D14D=B[4=OD
\D D D DC|C|C C 
	C
 \C C C C?|?|? ? 
	?
 \? ? ? ?|| l 	
 "L  
   \   0	X|	X|	X l	X 		X
 	X 
	X L	X \	X 	X 	X 	X< < < < < <( ;?:>-1%' '|'<' |' |	'
 |' 	t' 	t' $' ' \D(' <$&' |d*' ' ' '  &+\D%8!'" %*L4$7#'$ |d*%' ' ' '\|
5<%&   9U\ 9el 9C 9EL 9 9 9 9 KO< <|<<03<;@<<
\< < < < KO> >|>>03>;@<>
\> > > >  || { \	
 \  , \    ,5< , , , , , '+ l  l	
   t# 
   ( | 	
   
   > '+ l  	
    l   \ l t# 
   @#<#L# |# l	#
 lT!# ,%# l# # # # # # # 
# \# # # #V 37 ,l  , <	
  #\D0 
   . 37 ,l  , <	
  #\D0 
   4 $ $L$$ $ 	$
 $ !$ ,$ $ $ $ $D ;= =<=L4= |= L4	=
 l= lT!= ,%= lT!= <$= ,
= |= l= = "L= ,=  !=" #=$ %=& '=( )=* +=, -=. /=0 1=2 3=4 5=6 7=8 9=: ;=< \== = = =@ 759h :
GGEI,<>O$P$P :
],--S<S\S LS l	S
 ,S ,S |S |S lS <S S S S S S  !S" #S$ %S& 'S( )S* 
+S S S .-S0 ]233
|
t#
 <
 t#	

 ,
 ,%
 lT)
 ,%
 |d"
 lT!
 <
  ,
 L
 !&
 l
  !
" #
$ %
& '
( )
* +
, -
. /
0 1
2 3
4 5
 
 
 43
B	< | 	
 ,  \ \ 
   ,	< | 	
 ,  \ \ 
   ,

,

,
 l
 ,	

 
 <
 

 
 
 
|
, , ,	
 <  , l  L 
   4U	U	U U <	U
 
U U U U< TYG GLG!&G6;GMPG	G G G G '+ |	  	
 ,   < t# 
   < '+
 
|
	
 
 	

 
 t#
 

 
 
 
|	  l	
 l  
   .	|	l	 ,	 		
 	 
	 	 	 		l	<	 |	 		
 	 
	 	 	 	KC K K K K K KS S    el#|  	
 	   UU	U 
U 	U
 U 
U U U U' ' ' ' ' '.3 . . . .C C$s) C C C C C@# @%S	490D*E @ @ @ @HH49oH04T#YH	H H H HJC JE#u|:K4L J J J J> > > > >3C 3D 3 3 3 3
P P P# PC$J PRU P P P P*3 *4 * * * * R RR	R 
R 	R
 R 
R R R R4c 4el 4 4 4 4? ?d5<&8 ?T ? ? ? ?0S 0 0 0 0<  5<%&	   8 #') )|)\) ) <	)
 ) #\) ) 4<) ) 5<%&) ) ) )X		 L ,	
  l  |   \   6#&25FI    759</00 
]-..
l
l
 lT!
 	

 

 
 
 /.
 759<,-- /]*++/|/L/ L/ l	/
 ,/ / / / ,%/ ,%/ I$/ ,%/ ,%/ / 
/ / / ,+/& 759<455 4]233
4l
4l
4 
4 lT!	
4
 ;
4 
4 

4 
4 
4 43
4C C C C C C C C 4	.HIIJJ 2 2 2 !$	 	L		 	 	 	 	
$
|
 ,

 \	
 
 
 
& !$ L<  	
      $ "&#	#+ #+<#+<$#+ 
	#+ 	#+ #+ #+ #+L$| L $	
 lT! ,% ,
 \      	
 l ; \   
  \   :	< | 	
 , 
 
   $ < |    L	 
 \  l      ,%  #s(O        <$  
       F<l L <$	
 <$ ,
   \   2L	 \   <	 	 lT!	
 \D  , l 
 
 \   6 759!677 
L]455L<L<L lL l	L
 |L L L 65LM|M|M ,M ,	M
 <M \M M M M 759!:;; 
L]899L<L<L lL l	L
 |L L L :9LQ|Q|Q ,Q ,	Q
 <Q \Q Q Q Q 759!788  ]566 < !L 38< JO,      76  759!899  ]677 < !L 38< JO,      87  PW#U #U #U|#U#U189K1L#U
5<%&#U #U #U #UL 759!233 
 ]011 < <    	 
 l      21 V|VV49LV
5<%&V V V V&7 7%, 7 7 7 7 7 7  759<-.. 9]+,,9EL 94 9EL 9 9 9 -,9 9 99 9s   A A'&A'