
    -`i1                     0   d dl mZmZ d dlmZ d dlmZmZ d dlZd dl	m
c mZ d dlmZ d dlmZ d dl	mZ d dlmZ d d	lmZmZ d d
lmZ d dlmZmZmZ d dlmZ d dlmZ ddl m!Z!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z-  ee.          Z/ ed          Z0 ej1                    Z2ej3        Z4ej5        j6        j7        j8        Z9ej5        j:        j;        j8        Z< G d de          Z= G d de=          Z> G d de=          Z? G d de-          Z@dS )    )ABCabstractmethod)Callable)Any	ParamSpecN)fx)auto_functionalized)PatternMatcherPass)	Attention)
VllmConfigget_layers_from_vllm_config)init_logger)QuantKeykNvfp4DynamickStaticTensorScale)current_platform)round_up   )	QUANT_OPS
empty_bf16
empty_fp32	empty_i32)is_func)enable_fake_mode)MatcherQuantFP8)VllmInductorPassVllmPatternMatcherPassPc            	       |   e Zd ZdZdededej        ddfdZde	d	e	dej
        fd
Zde	d	e	dej
        fdZedeeej        f         deej        gdf         deeej        f         fd            Zedej        j        ddfd            Zedej        j        ddfd            ZdeddfdZededdfd            ZdS )AttentionQuantPatternzQ
    The base class for Attn+Quant fusions.
    Should not be used directly.
    layer	quant_keydtypereturnNc                     || _         |j        | _        |j        | _        |j        | _        || _        |j        | _        || _        | j        t          v sJ d| j                     t          | j                 | _        d S )Nz unsupported quantization scheme )	r!   
layer_name	num_heads	head_sizer"   r#   quant_dtyper   QUANT_OP)selfr!   r"   r#   s       p/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/compilation/fusion_attn.py__init__zAttentionQuantPattern.__init__.   s{     
*"$?
~***?t~?? +** "$.1    argskwargsc                 <    | j         dd|}t          j        |i |S Ncudar#   device)r#   torchemptyr+   r/   r0   s      r,   r7   zAttentionQuantPattern.emptyA   s+    :BB6B{D+F+++r.   c                 <    | j         dd|}t          j        |i |S r2   )r)   r6   r7   r8   s      r,   empty_quantz!AttentionQuantPattern.empty_quantE   s,    +vHHH{D+F+++r.   trace_fnprocess_fx_fnsc                 `     dt           j        dt           j        dt          j        f fd}|S )Nr/   r0   r$   c                  8     | i |}D ]} ||           |S N )r/   r0   gm
process_fxr<   r;   s       r,   wrappedz4AttentionQuantPattern.wrap_trace_fn.<locals>.wrappedN   s:    4*6**B,  

2Ir.   )r   r/   r0   r   GraphModule)r;   r<   rC   s   `` r,   wrap_trace_fnz#AttentionQuantPattern.wrap_trace_fnI   sI    
	16 	QX 	". 	 	 	 	 	 	 	 r.   rA   c                 (    ddl m}  ||            d S )Nr   )view_to_reshape)#torch._inductor.fx_passes.post_gradrG   )rA   rG   s     r,   fx_view_to_reshapez(AttentionQuantPattern.fx_view_to_reshapeW   s*    GGGGGGr.   c                 T   | j         j        D ]}t          |t          j        j        j        j                  s,|j        d         }t          d t          |          D                       r`|                    |j        d                    | j                             |           d S )Nr   c              3   (   K   | ]\  }}||k    V  d S r?   r@   ).0idims      r,   	<genexpr>z=AttentionQuantPattern.remove_noop_permutes.<locals>.<genexpr>d   s*      ::33!8::::::r.   r   )graphnodesr   r6   opsatenpermutedefaultr/   any	enumeratereplace_all_uses_with
erase_node)rA   nodedimss      r,   remove_noop_permutesz*AttentionQuantPattern.remove_noop_permutes]   s    HN 
	& 
	&D4!7!?@@ 9Q<D::)D//:::::  &&ty|444H%%%%
	& 
	&r.   pm_passc                 |    | j         j                            | j                  r|                     |           d S d S r?   )r!   implfused_output_quant_supportedr"   	_registerr+   r]   s     r,   register_if_supportedz+AttentionQuantPattern.register_if_supportedk   s@    :?77GG 	$NN7#####	$ 	$r.   c                     t           r?   )NotImplementedErrorrb   s     r,   ra   zAttentionQuantPattern._registero   s    !!r.   )__name__
__module____qualname____doc__r   r   r6   r#   r-   r   Tensorr7   r:   staticmethodr   r   r   rD   rE   rI   r\   r
   rc   r   ra   r@   r.   r,   r    r    (   s        
22 2 {	2
 
2 2 2 2&,3 ,# ,%, , , , ,, , , , , , , 1bn,-!2>"2D"89 
!R^#	$   \ ux3     \
 &!5 &$ & & & \&$-? $D $ $ $ $ "!3 " " " " ^" " "r.   r    c            	       R     e Zd ZdZ	 ddedej        deddf fdZd	e	ddfd
Z
 xZS )AttentionFp8StaticQuantPatterna?  
    Fusion for Attention+Fp8StaticQuant.

    Only triggers when the attention implementation returns True in
    `fused_output_quant_supported()`. If the pattern is found, the
    Fp8StaticQuant op will be removed from the graph, and its scale
    will be passed into Attention op as the `output_scale` argument.
    Tr!   r#   	symmetricr$   Nc                     t          t          t          |          }t                                          |||           t          |          | _        d S )N)r#   scalern   )r   	FP8_DTYPEr   superr-   r   quant_matcher)r+   r!   r#   rn   r"   	__class__s        r,   r-   z'AttentionFp8StaticQuantPattern.__init__~   sU     #5
 
 
	 		5111,Y77r.   r]   c                     dt           j        dt           j        dt           j        dt           j        dt           j        dt           j        f fd}dt           j        dt           j        dt           j        dt           j        dt           j        dt           j        f fd}                     d	 j         j                                       d	 j         j                                       d	 j         j                                       d	 j         j                  t          d
d
          g}t          j        |||t          	                    t          j
        t          j        t          j                  |           d S )Nqkvoutput_attnrp   r$   c           
          t          t          | |||j        d d           }t          |d         | j        d         j        j        z  g          }                    ||          d         S )Nquerykeyvalueoutputr&   output_scaleoutput_block_scaler   r   )r	   ATTN_OPr&   
RESHAPE_OPshaper'   r(   rs   )rv   rw   rx   ry   rp   at1attn_out_viewr+   s          r,   patternz9AttentionFp8StaticQuantPattern._register.<locals>.pattern   s~     &"?!#'	 	 	C 'AT^dn%DE M %%mU;;A>>r.   c           
      .   t           j        j        j                            | j        d         j        j        gdj        | j	                  }t          t          | |||j        |d           }t          |d         dj        j        z  g          S )Nr           r4   r{   r   )r6   rR   rS   fullrU   r   r'   r(   r)   r5   r	   r   r&   r   )rv   rw   rx   ry   rp   r   r+   s         r,   replacementz=AttentionFp8StaticQuantPattern._register.<locals>.replacement   s      ).-55T^T^<&x	 6  K &"?"#'	 	 	C c!fr4>DN+J&KLLLr.      r   )r6   rj   r7   r'   r(   r   pmregister_replacementr    rE   fwd_onlyrI   r\   r+   r]   r   r   inputss   `    r,   ra   z(AttentionFp8StaticQuantPattern._register   s   	?|	?|	? |	? 		?
 <	? \	? 	? 	? 	? 	? 	?.	M|	M|	M |	M 		M
 <	M \	M 	M 	M 	M 	M 	M6 JJq$.$.99JJq$.$.99JJq$.$.99JJq$.$.99q!
 	!//%8%: 
 
	
 
	
 
	
 
	
 
	
r.   )T)rf   rg   rh   ri   r   r6   r#   boolr-   r
   ra   __classcell__rt   s   @r,   rm   rm   t   s          	
8 
8
8 {
8 	
8
 

8 
8 
8 
8 
8 
8D
!3 D
 D
 D
 D
 D
 D
 D
 D
 D
r.   rm   c                   J     e Zd ZdZdedej        ddf fdZdeddfdZ	 xZ
S )	AttentionNvfp4QuantPatterna7  
    Fusion for Attention+Nvfp4Quant.

    Only triggers when the attention implementation returns True in
    `fused_output_quant_supported()`. If the pattern is found, the
    Nvfp4Quant op will be removed from the graph, and its scale
    will be passed into Attention op as the `output_scale` argument.
    r!   r#   r$   Nc                 X    t                                          |t          |           d S r?   )rr   r-   r   )r+   r!   r#   rt   s      r,   r-   z#AttentionNvfp4QuantPattern.__init__   s%    u55555r.   r]   c                 2    dt           j        dt           j        dt           j        dt           j        dt           j        dt           j        dt           j        dt          t           j        t           j        f         f fd	}dt           j        dt           j        dt           j        dt           j        dt           j        dt           j        dt           j        dt          t           j        t           j        f         f fd
}t          d j         j                  t          d j         j                  t          d j         j                  t          d j         j                                       d j         j        z  dz            t          dt           j         j        z  dz  d                    t          dd          g}t          j        |||t                              t          j        t          j        t          j                  |           d S )Nrv   rw   rx   ry   output_quantr   input_scaler$   c           
      X   t          t          | |||j        d d           }t          |d         | j        d         j        j        z  g          }t          j        ||||d          }	t          j	        j
        j                            |	d         t                    }
|	d         |
fS )Nr{   r   r   T)r   inputr   r   is_sf_swizzled_layout   )r	   r   r&   r   r   r'   r(   r*   r6   rR   rS   viewr#   rq   )rv   rw   rx   ry   r   r   r   r   r   at2output_scale_viewr+   s              r,   r   z5AttentionNvfp4QuantPattern._register.<locals>.pattern   s     &"?!#'	 	 	C 'AT^dn%DE M &##)'&*  C !&	 3 9 9#a&) L Lq6,,,r.   c           
         t           j        j        j                            | j        d         
j        
j        dz  gd
j        | j	                  }t           j        j        j
                            |t                    }t          t          | |||
j        ||          }t!          |d         d
j        
j        z  dz  g          }	|	|d         fS )Nr   r   r   r4   r{   r   r   )r6   rR   rS   r   rU   r   r'   r(   r)   r5   r   r#   rq   r	   r   r&   r   )rv   rw   rx   ry   r   r   r   r   r   r   r+   s             r,   r   z9AttentionNvfp4QuantPattern._register.<locals>.replacement   s      ).-55T^T^q-@A&x	 6  K !&	 3 9 9,	 R R%"?(#4	 	 	C  AT^dn-LPQ-Q(RSSF3q6>!r.   r   r            r   )r6   rj   tupler   r'   r(   r:   r   r   r   r   r   r    rE   r   rI   r\   r   s   `    r,   ra   z$AttentionNvfp4QuantPattern._register   s   	-|	-|	- |	- 		-
  ,	-  ,	- 	- 5<-.	- 	- 	- 	- 	- 	-B	"|	"|	" |	" 		"
  ,	"  ,	" 	" 5<-.	" 	" 	" 	" 	" 	"@ q$.$.99q$.$.99q$.$.99q$.$.99Q ?1 DEEXdnt~=CQGG  q!

 	!//%8%: 
 
	
 
	
 
	
 
	
 
	
r.   )rf   rg   rh   ri   r   r6   r#   r-   r
   ra   r   r   s   @r,   r   r      s         6i 6 6 6 6 6 6 6 6W
!3 W
 W
 W
 W
 W
 W
 W
 W
 W
r.   r   c                        e Zd ZdZededdf fd            Zej        de	j
        j        j        ddfd            ZdefdZ xZS )	AttnFusionPassa  
    This pass fuses post-attention quantization onto attention if supported.

    It uses the pattern matcher and matches each layer manually, as strings
    cannot be wildcarded. This also lets us check support on attention layers
    upon registration instead of during pattern matching.

    Currently, only static fp8 quant is supported, but patterns could easily be
    added for other quant schemes and dtypes. The bigger hurdle for wider
    support are attention kernels, which need to support fusing output quant.
    configr$   Nc                    t                                          |           t          d          | _        t	          |t
                    }|                                D ]\  }}t          ||j        j	                  }|
                    | j                   t          j                    rSt          t          j        j        d          r4t#          ||j        j	                  }|
                    | j                   t%          |          dk    rt&                              d           |                     || j                   d S )Nattn_fusion_pass)	pass_namescaled_fp4_quantr   zAttention + quant fusion is enabled, but no attention layers were found in CompilationConfig.static_forward_context so no fusion patterns were registered.)rr   r-   r
   patternsr   r   itemsrm   model_configr#   rc   r   is_cudahasattrr6   rR   _Cr   lenloggerwarningdump_patterns)r+   r   attn_layersr&   r!   pattern_fp8pattern_nvfp4rt   s          r,   r-   zAttnFusionPass.__init__E  s;      *5GHHH1&)DD!,!2!2!4!4 
	C 
	CJ8v*0 K --dm<<<')) CgeilDV.W.W C :6.4! ! 33DMBBB{q  NN9   	64=11111r.   rP   c                     | j                             |          | _        t                              d| j                   d S )Nz#Fused quant onto %s attention nodes)r   applymatched_countr   debug)r+   rP   s     r,   __call__zAttnFusionPass.__call__a  s8    !]0077:D<NOOOOOr.   c                 N    t          j        | t          t          t                    S r?   )r   hash_sourcer    rm   r   )r+   s    r,   uuidzAttnFusionPass.uuidf  s#    +!*&	
 
 	
r.   )rf   rg   rh   ri   r   r   r-   r   time_and_logr6   r   rP   Graphr   strr   r   r   s   @r,   r   r   8  s        
 
 2z 2d 2 2 2 2 2 26 "Pehn2 Pt P P P #"P
c 
 
 
 
 
 
 
 
r.   r   )Aabcr   r   collections.abcr   typingr   r   r6   torch._inductor.pattern_matcher	_inductorpattern_matcherr   r   *torch._higher_order_ops.auto_functionalizer	   r
   vllm.attention.layerr   vllm.configr   r   vllm.loggerr   9vllm.model_executor.layers.quantization.utils.quant_utilsr   r   r   vllm.platformsr   vllm.utils.math_utilsr   fusionr   r   r   r   fx_utilsr   inductor_passr   matcher_utilsr   vllm_inductor_passr   r   rf   r   r   	fp8_dtyperq   uint8	FP4_DTYPErR   vllmunified_attention_with_outputrU   r   rS   reshaper   r    rm   r   r   r@   r.   r,   <module>r      s   $ # # # # # # # $ $ $ $ $ $ ! ! ! ! ! ! ! !  , , , , , , , , ,       J J J J J J > > > > > > * * * * * * ? ? ? ? ? ? ? ? # # # # # #         
 , + + + + + * * * * * * @ @ @ @ @ @ @ @ @ @ @ @       + + + + + + * * * * * * H H H H H H H H	X		IcNN&&((	K	
).
6
>Y^#+
I" I" I" I" I"C I" I" I"XZ
 Z
 Z
 Z
 Z
%: Z
 Z
 Z
zd
 d
 d
 d
 d
!6 d
 d
 d
N4
 4
 4
 4
 4
+ 4
 4
 4
 4
 4
r.   