
    .`ib                     $   d dl Z d dlmZ d dlmZ d dlZd dlZd dlm	Z	 d dl
mZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z( d dl)m*Z* d dl+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z;m<Z< d dl=m>Z> d dl?m@Z@ d dlAmBZBmCZC  eeD          ZEdZF G d d           ZG G d! d"eG          ZHd#ejI        d$e5d%eJejI        ejI        f         fd&ZKdS )'    N)replace)	find_spec)CUDAGraphMode
VllmConfigget_layers_from_vllm_config)get_pp_group)set_forward_context)init_logger)AttentionLayerBase)	get_model)supports_multimodal)DeepseekV32IndexerCache)Eagle3LlamaForCausalLM)MULTIMODAL_REGISTRY)current_platform)triton)is_pin_memory_available)AttentionMetadataBuilderCommonAttentionMetadata)AttentionBackendEnum)TreeAttentionMetadataTreeAttentionMetadataBuilder)TritonAttentionMetadata)CudagraphDispatcher)KVCacheConfig)SamplingMetadata)_SAMPLING_EPS)SpecDecodeMetadata)"eagle_prepare_inputs_padded_kernel&eagle_prepare_next_token_padded_kernel)CpuGpuBuffer)coordinate_batch_across_dp)CachedRequestState
InputBatchc                   F   e Zd Z	 d=dedej        defdZdefdZ	dedej
        fd	Z	 d=ded
ej
        dz  deeej
        f         fdZdeddfdZ	 	 	 d>dej
        dej
        dej
        dej
        dej
        dz  dededeeej
                 ej
        f         dz  dej
        dz  deeej
        f         eeeej
        f                  z  dz  dej
        fdZdej
        dej
        dej
        dej
        dz  dedej
        dz  deeej
        ef         fdZdefdZdeee                  deeef         ded eeef         dej
        f
d!Zdedej
        deeef         ded"ej
        deej
        ej
        f         fd#Zded$ed%ej
        deeej
        ej
        f         fd&Z	 d=d'ed(ej
        dej
        d)ej
        dedeeej
        f         eeeej
        f                  z  dz  deej
                 fd*Zdedeee                  d+ee         deeej
        f         fd,Zd-e j!        defd.Z"d/e j!        ddfd0Z# ej$                    	 	 	 d?ded3ed4edeeej
        f         dz  ddf
d5            Z%de&fd6Z'defd7Z(d8e)ddfd9Z*d:ed;edeeej
        f         fd<Z+dS )@SpecDecodeBaseProposerNvllm_configdevicepass_hidden_states_to_modelc                 n
   || _         |j        | _        | j        J | j        j        | _        | j        j        | _        || _        || _        || _        |j        j        | _        |j        j	        | _	        |j
        j        | _        | j        j        | _        |j        j        }|j        j        |z   | _        t%          j        | j                  | _        | j                                        | _        | j                                        | _        t2          | _        | j                            |j                  | _        d | _        d | _        g | _        g | _         | !                                | _"        | j         j#        | _#        tI          | j                   | _%        tM          j'        | j        tL          j(        |          | _)        | j        j*        | _*        | j*        r1tM          j'        d| j        dz   ftL          j+        |          | _,        n+tM          j'        | j        tL          j+        |          | _-        tM          j'        | j        | j        f| j        |          | _.        t_          |dz   | j                  }tM          j        ||tL          j(                  | _        tM          j'        | j        | j        f| j        |          | _0        tc          |tL          j(        te                      |d          | _3        tM          j'        | j        tL          j+        |          | _4        d | _5        tm          j7                    rddl8m9} tt          |g}tw          tx          j=        >                    d	
                    rddl?m@}	 |A                    |	           ddlBmC}
 |A                    |
           ddlDmE} |A                    |           t          |          | _5        | j        jG        }t          jI        |          | _J        t          | jJ        d                   }dg|z  }| jJ        D ]"}|t          |          dz
  xx         dz  cc<   #|d         g| _L        |d         g| _M        t          d|          D ]\}| jL        A                    | jL        d         ||         z              | jM        A                    ||         ||dz
           z             ]tM          j        dt          | jJ                  dz   |tL          j(                  O                    |d          | _P        d S )Ndtyper)         r)   r-   T)r-   
pin_memoryr)   
with_numpyr   )RocmAttentionMetadataF)include_classname)AiterFlashAttentionMetadata)MLACommonMetadata)FlexAttentionMetadatar%   )Qr(   speculative_configdraft_model_configmethodr*   runnerr)   model_configr-   max_model_lenparallel_configdata_parallel_rankdp_ranknum_speculative_tokensscheduler_configmax_num_seqsmax_num_batched_tokensmax_num_tokensnparangetoken_arange_npget_hidden_sizehidden_sizeget_inputs_embeds_sizeinputs_embeds_sizer   mm_registrysupports_multimodal_inputssupports_mm_inputsattn_metadata_builderdraft_indexer_metadata_builderattn_layer_namesindexer_layer_names,_get_eagle3_use_aux_hidden_state_from_configeagle3_use_aux_hidden_statecompilation_configr   cudagraph_dispatchertorchzerosint32	input_ids
uses_mropeint64mrope_positions	positionshidden_statesmaxinputs_embedsr!   r   backup_next_token_ids_slot_mapping_bufferallowed_attn_typesr   is_rocm$vllm.v1.attention.backends.rocm_attnr3   r   r   r   ROCM_AITER_FAget_path(vllm.v1.attention.backends.rocm_aiter_far5   append2vllm.model_executor.layers.attention.mla_attentionr6   )vllm.v1.attention.backends.flex_attentionr7   tuplespeculative_token_treeastliteral_evaltree_choiceslencu_drafts_per_levelchild_drafts_per_levelrangerepeattree_draft_pos_offsets)selfr(   r)   r*   r;   max_batch_sizemax_num_slots_for_aranger3   
rocm_typesr5   r6   r7   spec_token_tree
tree_depthnum_drafts_per_levelnodelevels                    m/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/spec_decode/eagle.py__init__zSpecDecodeBaseProposer.__init__9   s    '"-"@&222"&"9"L-4+F( -3
(5C"2E&*&=&T#$5B(?.P 	  "y)<==  2BBDD"&"9"P"P"R"R /"&"2"M"M$#
 #
 GK"OS++-.0 ==?? 	( #'"2"E %88H$I$I! u{6
 
 

 1<? 	 $);D'!+,EK$ $ $D  
 #[#5;v  DN #[ $"234:f
 
 
 $'~'94;N#O#O l$V5;
 
 
 #[ $"9:*
 
 
 &2+.00&
 &
 &
" %*Ku{6%
 %
 %
!
 15#%% 	8RRRRRR (%J
 $2;;e;TT  ?      !!"=>>>      /000 XWWWWW3444&+J&7&7D# 1H363CO3T3T*2.//
 !sZ/% 	5 	5D TQ///14////$8$;#< ';A'>&?#1j)) 	 	E$++(,/CE/JJ   '..$U+/CEAI/NN    ',ls4$%%)&'
 '
 '

&
#
# 	###    
num_tokensc                 T    | j         r| j        d d d |f         S | j        d |         S N)r\   r^   r_   )ry   r   s     r   _get_positionsz%SpecDecodeBaseProposer._get_positions   s7    ? 	8';J;77~kzk**r   r_   c                     | j         r|| j        d d d |f<   d S | j        j        j         r|d         }|| j        d |<   d S )Nr   )r\   r^   r(   r<   r_   )ry   r   r_   s      r   _set_positionsz%SpecDecodeBaseProposer._set_positions   s]    ? 	43<D KZK000
 ,7 )%aL	*3DN;J;'''r   slot_mappingreturnc                    |\|j         d         }| j        d|                             |           ||k    r'| j        ||                             t                     | j        d|         fd| j        | j        z   D             S )zzReturn slot_mapping dict for EAGLE layers.

        If slot_mapping is provided, copies it into the buffer first.
        Nr   c                     i | ]}|S  r   ).0nameviews     r   
<dictcomp>z<SpecDecodeBaseProposer._get_slot_mapping.<locals>.<dictcomp>   s    XXXtdXXXr   )shaperd   copy_fill_PADDING_SLOT_IDrR   rS   )ry   r   r   
num_actualr   s       @r   _get_slot_mappingz(SpecDecodeBaseProposer._get_slot_mapping   s     #%+A.J%kzk288FFFJ&&)*Z*?@FFWWW(*5XXXXt'<t?W'WXXXXr   cudagraph_modec                     | j         j        s9|                                t          j        t          j        fv rt          j        }nt          j        }| j                            |           dS )zInitialize cudagraph dispatcher keys for eagle.

        Eagle only supports PIECEWISE cudagraphs (via mixed_mode).
        This should be called after adjust_cudagraph_sizes_for_spec_decode.
        N)	r8   enforce_eager
mixed_moder   	PIECEWISEFULLNONErW   initialize_cudagraph_keys)ry   r   eagle_cudagraph_modes      r   r   z0SpecDecodeBaseProposer.initialize_cudagraph_keys   sn     '5	6))++');<= = $1#:  #0#5 !;;<PQQQQQr   target_token_idstarget_positionstarget_hidden_statesnext_token_idslast_token_indicescommon_attn_metadatasampling_metadatamm_embed_inputsnum_rejected_tokens_gpuslot_mappingsc                 x   |                                 }| j        dk    rNt          | j        t                    sJ | j                            |          }|j        d         | j        k    sJ |                     ||||||	          \  }}}| j	        J | j
        |                                 }n| j
        }|                    |d          }| j        r| j                            |d          }nd }i }| j        D ]}|||<   | j        D ]}|J |||<   |                     ||          \  }}| j                            |          \  }}|j        }|
||| j        <   | j        r|| j        d |<   | j        rM|pd\  }}| j                            | j        d |         ||          | j        d |<   d }| j        d |         }n| j        d |         }d }||                     |          |d	}| j        r| j        d |         |d
<   t9          || j        ||||                     ||j                            5   | j        di |}|                                  s|}|}n|\  }}d d d            n# 1 swxY w Y   ||         }| j        !                    |          } | j"        dk    r,| #                    d          }!|!$                    dd          S | j%        r| j&        d d |f         }"n| j&        |         }"| j        dv r| j        |         }n||         }t          |tN                    r1| (                    || |"|||
          }#tS          j*        |#d          S | #                    d          }!| j+        <t          || j+                  s'tY          dt[          |           d| j+                   |!g}#|                     ||          \  }$}%| j                            |$          \  }}|j        }&|%
|&|%| j        <   ||_.        d|_/        | j0        d |dz            |_1        tS          j2        | j3        d |dz                      4                                |_5        | j"        dk    r |	|xj6        |	z  c_6        d |_7        d |_8        ts          | j"        dz
            D ]r}'|#d         :                                }| j%        rR|"dz  }"|"d         | j;        k    }(tS          j<        |(=                    d          tS          j>        |"          |"          })n&|"dz  }"|"| j;        k    }(tS          j<        |(d|"          })|xj6        dz  c_6        |j6        ?                    |(d           t          |jA        dz   | j;                  |_A        |j7        |xj7        dz  c_7        |j8        |xj8        dz  c_8        |jB        jC        }*| j%        r|)d         |*z  }+n|)|*z  }+|jD        E                    d|+$                    dd                    },|,$                    d          },| j%        r|,|*z  |)d         |*z  z   |_        n|,|*z  |)|*z  z   |_        |j        ?                    |(t                     |                    ||'dz             }| j        D ]}|||<   || j        d |<   | G                    ||)           || j        d |<   | j        r6| j                            |          | j        d |<   d }| j        d |&         }n| j        d |&         }d }||                     |&          |d	}| j        r| j        d |&         |d
<   t9          || j        |&|%||                     |&|j                            5   | j        di |}|                                  s|}|}n|\  }}d d d            n# 1 swxY w Y   |d |         }| j        !                    |d |                   } | #                    d          }!|#H                    |!           ttS          jI        |#d          }!|!S )Neagle3r%   )r   r   r   r   cadr   r   r   draft_indexnum_tokens_unpaddednum_tokens_padded)NN)multimodal_embeddingsis_multimodalr[   r_   rb   r`   r   num_tokens_across_dpcudagraph_runtime_moder   r/   dim)deepseek_mtp	ernie_mtplongcat_flash_mtppangu_ultra_moe_mtp)
batch_sizelogitsr_   r`   r   r   z^Unsupported attention metadata type for speculative decoding with num_speculative_tokens > 1: z. Supported types are: r   indexr   )Jr   r:   
isinstancemodelr   combine_hidden_statesr   rJ   set_inputs_first_passr;   rP   _get_attention_metadata_builderbuild_for_draftingrQ   rR   rS   _pad_batch_across_dprW   dispatchr   r@   r*   r`   rO   embed_input_idsr[   rb   r   r	   r(   r   r   model_returns_tuplecompute_logitsrA   argmaxr   r\   r_   r   propose_treerX   catre   
ValueErrortypenum_actual_tokensmax_query_lenrG   query_start_loc
from_numpyrH   clonequery_start_loc_cpuseq_lens_seq_lens_cpu_num_computed_tokens_cpurv   intr=   where	unsqueeze
zeros_likemasked_fill_minmax_seq_lenkv_cache_spec
block_sizeblock_table_tensorgatherr   r   rk   stack)-ry   r   r   r   r   r   r   r   r   r   r   r   r   rP   attn_metadatadraft_indexer_metadataper_layer_attn_metadata
layer_namenum_tokens_dp_paddedr   r   
batch_descnum_input_tokens	mm_embedsis_mm_embedr[   rb   model_kwargsret_hidden_stateslast_hidden_statesr`   sample_hidden_statesr   draft_token_idsr_   draft_token_ids_listbatch_size_dp_paddedbatch_size_across_dpinput_batch_sizetoken_indexexceeds_max_model_lenclamped_positionsr   block_numbers	block_idss-                                                r   proposezSpecDecodeBaseProposer.propose  s]   & *4466
;(""dj*@AAAAA#':#C#C$$ $  (-b1T5EEEEE &&!1-!1#5((? '   	=
&(< {&&&%-$($H$H$J$J!!$($>!-@@!51 A 
 
 . 	*3FF)= ! G   #" &*" #%/ 	@ 	@J2?#J//2 	I 	IJ)5552H#J//595N5N *j 6O 6
 6
22 .2-F-O-O .
 .
*
 &0+1A .+ 	C /CD{
{+" 	!%4%D"I{.2j.H.H{
{+&/) /I / /D{
{+ I ./@0@/@AMM'8(8'89I M #,,-=>>*
 

 + 	R,0,>?P@P?P,QL) #'!5#9// "6"C 	
 	
 	
 	F 	F !+
 : :\ : :++-- F%6" 24E1"M	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F"  22DE**+?@@ &!++$mmm33O"''A...? 	;qqq*<'<=II'9:I; 
 
 
 !./ABMM)*<=Mm%:;; 	:#'#4#4%#+%9+ $5 $ $  91q9999 --B-//".z428
 8
. -&&- - *- -   !00595N5N *j 6O 6
 6
22 .2-F-O-O .
 .
*
 &0+1A .1;.-.*/3{;KZ!^;K/L,383C !1:>!124
 4

%'' 	0 &**/F/R ))-DD))15 .<@ 9 !<q!@AA |	9 |	9K -R04466I UQ	 )2!8J(J% %*K)33A66$Y//% %!! Q	(1T5G(G%$)K0Eq)$T$T!
 !))Q.)) !)667LaPPP 03$014d6H0 0 , $1=$22a722#<H$==B== /<GJ @ 1! 4
 B 1Z ?,?FF]//A66 G  I "r**I 
*->q-AJ-NN %11
 
*->-KK %1 !-::%  
 2DD%9{UV E  M #3 D D
6C'
33 +4DN;J;'
,=>>>.;D{
{+& %26*2L2LY2W2W";J;/ 	 $ 23D4D3D E N+<,<+<=	 $ '!001ABB!. L
 / V040BCTDTCT0U_-$' +%9'=!33$&:&G 	 	 	 J J %/DJ$>$>$>$>!//11 J):&$5MM8I5&J J J J J J J J J J J J J J J" *+:+6MZ../A+:+/NOOF$mmm33O ''8888  +&:BBBs$   ,I99I= I=,_  _	_	r   c                     ||j         dd          dz
  }|j        d         }|dd          | j        d |dz
  <   || j        |<   |                     ||           |||fS )Nr/   r   )r   r   r[   r   )ry   r   r   r   r   r   r   r   s           r   r   z,SpecDecodeBaseProposer.set_inputs_first_pass]  s     %!$!4QRR!81!<%+A.
 ,<ABB+?'a'( .<)* 	J(8999-s22r   c                     | j         dvS )N)mtpdraft_model)r:   )ry   s    r   r   z*SpecDecodeBaseProposer.model_returns_tuplev  s    {"888r   sampled_token_idsrequestsgpu_input_batchnum_scheduled_tokensc                 B   |j         }g }t          |          D ]Z\  }}|r	|d         }	n5||         }
||
         }|j        ||
         z   }|                    |          }	|                    |	           [t          j        |t
          j        | j        j	                  }|S )aj  
        This function is used to prepare the inputs for speculative decoding.
        It calculates the next token ids for each request based on the sampled
        token ids from the CPU. If a request has no sampled token ids (e.g.,
        during the initial decoding steps), it falls back to using the request
        state to get the next token id.
        r%   r,   )
req_ids	enumeratenum_computed_tokensget_token_idrk   rX   tensorrZ   r[   r)   )ry   r  r  r  r	  r  r   i	token_idsnext_token_idreq_id	req_stateseq_lens                r   prepare_next_token_ids_cpuz1SpecDecodeBaseProposer.prepare_next_token_ids_cpuy  s     ")$&%&788 	1 	1LAy 	@ )" !$V,	#7:Nv:VV ) 6 6w ? ?!!-0000%+dn6K
 
 
 r   discard_request_maskc                    j         }t          j        fdt          |          D             t          j                  | j        j        d|<   | j                            |           | j        j        }|j        \  }}	|j	        }
|j
        t          j        k    sJ |j
        t          j        k    sJ t          j        |t          j        |
          }|                    |          }|f}t          j        |	          }t#          |         |||||j        |	||                    d          |
  
         ||fS )a  
        This function is used to prepare the inputs for speculative decoding.
        It calculates the next token ids and the number of valid sampled tokens
        for each request, considering the "discarded" requests whose next token
        is not sampled and comes from `request.get_token_id()` instead. This is denoted
        the "backup" token id. It also counts rejected tokens via `sampled_token_ids`.
        c                     g | ]E}j         |                                      j        |                                                   FS r   )r  r  seq_lens_cpuitem)r   r  r   r  r  s     r   
<listcomp>zHSpecDecodeBaseProposer.prepare_next_token_ids_padded.<locals>.<listcomp>  s_         034AA(5a8==??   r   r-   Nr,   r   )BLOCK_SIZE_TOKENS)num_reqsrF   arrayrv   rZ   rc   copy_to_gpugpur   r)   r-   rX   boolempty	new_emptyr   next_power_of_2r    
vocab_sizestride)ry   r   r  r  r  r  r  backup_tokens_gpur   r   r)   r   valid_sampled_tokens_countgridr  s    ` ``          r   prepare_next_token_ids_paddedz4SpecDecodeBaseProposer.prepare_next_token_ids_padded  sz     #+358      x	   (4
 4
 4
"%ixi0 	"..x888 6:!2!8
J")#)UZ7777 &%+5555Zu{6RRR%3%=%=j%I%I" } #2:>>.t4 &&$$Q''/	
 	
 	
 	
 999r   spec_decode_metadatar*  c                    |j         }|j        }t          j        |ft          j        |          }t          j        |ft          j        |          }|f}t          |         |j        ||j        |||           |j        }	|	dd         |	dd         z
  }
|	d         	                                }t          |j        |j        |	|j        |j        |j         ||
                                	                                |j                                        	                                |j        |j        d|         d|j                  }|||fS )a  
        This function is used to prepare the inputs for speculative decoding
        It updates the common_attn_metadata for speculative decoding,
        but does not consider the rejected tokens. Instead, all tokens
        are included as inputs to the speculator, with the rejected tokens
        used as padding and filtered out later by `token_indices_to_sample`.
        No blocking CPU operations should be introduced in this function.
        r,   r/   Nr%   Tr   r   r   r   r   r  r   r   r   r   r   causaldcp_local_seq_lens)r  r)   rX   r$  rZ   r   cu_num_draft_tokensr   r   r  r   r   r   r   ra   r  r   r   r1  )ry   r   r-  r*  r  r)   token_indices_to_sampler   r+  r   new_query_len_per_reqtotal_num_tokensspec_common_attn_metadatas                r   prepare_inputs_paddedz,SpecDecodeBaseProposer.prepare_inputs_padded  sz    (0+2"'+Ku{6#
 #
 #
 #(+Ku{6#
 #
 #
 {*40 4& 0##	
 	
 	
 3F 3ABB 7:Mcrc:R R.r27799$;0@)2 3.<%9%R)2./3355::<<,9==??DDFF3F-:;L<L;LM3F%
 %
 %
!" &##
 	
r   r   r   r`   c                    | j         j        d         d                                         }t          |t                    sJ | j        d         }|}	| j        d         }
|
dk    r+|                    d                              |d          }n0t          j
        ||
d          j                            |d          }|g}|                    |dd          }t          j        d| j        j        | j        j                  }t          j        d| j        j        | j        j                  }t          j        d| j        j        | j        j                  }|                    |d          | j        d |d d f         z   }t'          | j                  }t)          |dz
            D ]}||dz   z   }||z   | j        k    }t          j        |d|                              |d          }|	dk    r|                    |	d          }|
dk    r|                    |
d          }t          j        ||gd          }t          j        ||gd          }t          j        ||gd          }|}t3          ||| j        d |dz            z  |j        |	z   ||z  |          }|                    ||dz             }i }| j        D ]}|||<   t=          |j        | j                  |_        |j                             |d           |j!        j"        }|d d |||z   f         }||z  }|j#        $                    d|          }||z  ||z  z   }tJ          ||<   |                    d          |_&        |j'        }|                    d          } | | j        d |<   |                    d          | j        d |<   |                    |d          | j        d |<   | j(        )                    |          \  }!}"|"j*        }#tW          || j,        |#|!| -                    |#|j&                  	          5  | .                    | j        d |#         | j        d |#         | j        d |#         d 
          \  }$}d d d            n# 1 swxY w Y   |d |                             ||d          d d |	 d f         }|$d |                             ||d          d d |	 d f         }%| j.        /                    |%0                    ||	z  d                    }| j        |dz            }
|
dk    r+|                    d                              |d          }n0t          j
        ||
d          j                            |d          }|1                    |           | j        |dz            |z
  }	| j        |dz            }|S )Nr   r/   r%   r   r0   )r   r   r   r   r   r   )r   r   r   )r[   r_   r`   rb   )2r;   attn_groupsget_metadata_builderr   r   rt   ru   r   r   rX   topkindicesr$  r[   r)   r-   r_   r`   rx   rs   rv   r=   r   repeat_interleaver   r   rG   r   r   rR   r   r   r   r   r   block_tabler   r   r   r   rW   r   r   r	   r(   r   r   r   reshaperk   )&ry   r   r   r_   r`   r   r   tree_attn_metadata_buildertotal_num_draftslevel_num_draftsnum_childrenr   r   draft_hidden_statestree_input_idstree_positionstree_hidden_statesflattened_draft_positionsr~   r   draft_positionsr   	query_lenr   r   r   r   query_positionsr   r   r   r   r[   r   r   r   r   draft_last_hidden_statess&                                         r   r   z#SpecDecodeBaseProposer.propose_tree  s&    &*[%<Q%?&



 
  	# 46RSSSSS3A6+2151$mmm3388RHHOO#j2FFFNSSB O !00+00QCC dn+4>3G
 
 
 dn+4>3G
 
 
 #[d(/t7I7O
 
 

 NN:r**T-H*VWVWVW-XX 	" 122
:>** {	C {	CE'5195O%.1A%AdFX$X! $k%  d:r""	   !##"1"C"C$! #D # # a&9&K&K a 'L ' '#
 #Y'HaPPPN"Y'HaPPPN!&#%89q" " " )I#*$ )DK8H*q.8H,I I-69II",y"8'$ $ $  7II%9uqy J  M
 ')#"3 D D
6C'
33 ),)4+=) )M%
 "//0EqIII 4ALJ7559CT;T8TUO+z9M%188Qm8TTI$z1Oj4PPL 3BL./)5):):2)>)>M& '8J&++B//I*3DN;J;'*8*=*=b*A*ADN;J;'.@.E.EjRT.U.UD{
{+151J1S1S2 2."J  *4$' +'=!33$m&@      59JJ"n->.>->?"n->.>->?"&"45F6F5F"G"&	 5? 5 51"M              " #0"<"A"AIr# #aa""####% (:+:+'F'K'KIr( (aa""###(%$
 Z..(00>N1NPRSS F
  6uqyALq  "(--B-"7"7"<"<Z"L"L"'*V\r"J"J"J"R"W"W# # !''888  $7	BEUU#7	B##s   2ARR		R	num_draft_tokensc                 b   fdt          |          D             }t          j        |t          j                  }|j        j        }|j        }|j        |z
  }|dd         |dd         z
  }||z
  }	|	                                }
t          j	        |j
        t          j        t                                }|                                }t          j        |
|dd                    |d         }t          j        |dd         |
          }| j        d|         |z
  }t          j        |dd                                         |
          }||z   }t          j        |                              |d	          }t%          |                    |d	          |                    |d	          |||j        |j        ||                                                                |                                                                |j        |j        |         d|j        
          }||fS )a+  
        This function is used to prepare the inputs for speculative decoding.
        It updates to the common_attn_metadata to account for the rejected
        tokens (and newly sampled tokens). It also returns the token indices
        of the tokens that should be fed to the speculator.
        c                 \    g | ](\  }}|d k    r|dz   t          |                   z
  nd )S )r   r/   )rs   )r   r  nr  s      r   r  z9SpecDecodeBaseProposer.prepare_inputs.<locals>.<listcomp>  sQ     
 
 
1 23QAEC)!,----A
 
 
r   r  r/   Nr%   )r-   r1   )outT)non_blockingr/  )r  rX   r  rZ   r   r)   r   r  numpyrY   r   r   rF   cumsumrw   rH   r   tor   r   r  ra   r  r   r   r1  )ry   r   r  rM  num_rejected_tokensr)   r   new_seq_lens_cpur4  new_num_tokens_per_reqnew_num_tokens_per_req_npnew_query_start_loc_cpunew_query_start_loc_npr5  new_query_start_locs_expandedtoken_offestsold_query_start_locs_expandedtoken_indices_nptoken_indicesr6  s     `                 r   prepare_inputsz%SpecDecodeBaseProposer.prepare_inputs  ss   8
 
 
 
!"233
 
 
 $l+>ekRRR%5<2F/<?RR !4ABB 7:Mcrc:R R!69L!L$:$@$@$B$B! #(+%+.00#
 #
 #

 "9!>!>!@!@
	+1G1KLLLL1"5 )+	"3B3')B)
 )
%  !2"2!236SS 	 )+	$**,,.G)
 )
% )+HH()9::==fSW=XX$;366vD6QQ%((d(CC 7*%9%R)2./3355::<<(,,..33553F-:=I3F%
 %
 %
!  )-77r   r   c                 H    t          |d          r|j        }|j        j        S )Nmodule)hasattrrc  	__class____name__)ry   r   s     r   get_model_namez%SpecDecodeBaseProposer.get_model_name$  s&    5(## 	!LE''r   target_modelc                    | j         j        j        }t          t	          | j         t
                                                              }t          t	          | j         t                                                              }ddlm	}  |d          5  t          | j         |          | _        d d d            n# 1 swxY w Y   t	          | j         t
                                                    |z
  }t	          | j         t                    }|                                |z
  }t          ||z
            | _        t          |          | _        | j        rx| j        d         }	 ||	                                                                         ||	                             | j                   | j        | j         | j                  | _        nd | _        | j        r~	 t+          j        dgg| j        j                  }
| j                            |
d            n># t2          t4          t6          f$ r$ t8                              d           d	| _        Y nw xY wt=          |          r|                     |          d
v r|j         j!        | j        j         _"        nU|                     |          dk    r!|j         j#        j!        | j        j         _"        n|j         j"        | j        j         _"        |$                                }n|}tK                      j&        dk    rtO          |j        d          r|j        j(        }n1tO          |j        d          r|j        j)        }nt5          d          d	}tO          | j        d          r | j        j*        sd}t8          +                    d           ntY          |j-        t*          j.                  rtY          | j        j        j(        j-        t*          j.                  rot+          j/        |j-        0                                | j        j        j(        j-        0                                          rd}t8          +                    d           n7t8          +                    d           nd}t8          +                    d           |r7tO          | j        j        d          r| j        j        `(|| j        j        _(        nt8          +                    d           d	}tO          | j        d          r| j        j1        sd}t8          +                    d           ntO          |d          rtY          |j2        j-        t*          j.                  rtY          | j        j2        j-        t*          j.                  rot+          j/        |j2        j-        0                                | j        j2        j-        0                                          rd}t8          +                    d           n7t8          +                    d           nd}t8          +                    d           |r?tO          |d          r1tO          | j        d          r| j        `2|j2        | j        _2        d S d S d S )Nr   )set_model_tag
eagle_head)r(   r<   r/   )r)   )r   zNDraft model does not support multimodal inputs, falling back to text-only modeF)"Qwen2_5_VLForConditionalGenerationQwen3VLForConditionalGeneration"Qwen3VLMoeForConditionalGenerationPixtralForConditionalGenerationembed_tokens	embeddingzBTarget model does not have 'embed_tokens' or 'embedding' attributehas_own_embed_tokensTzDetected EAGLE model without its own embed_tokens in the checkpoint. Sharing target model embedding weights with the draft model.zDetected EAGLE model with embed_tokens identical to the target model. Sharing target model embedding weights with the draft model.zrDetected EAGLE model with distinct embed_tokens weights. Keeping separate embedding weights from the target model.zPDetected MTP model. Sharing target model embedding weights with the draft model.zRThe draft model's vocab embedding will be loaded separately from the target model.has_own_lm_headzzDetected EAGLE model without its own lm_head in the checkpoint. Sharing target model lm_head weights with the draft model.lm_headz{Detected EAGLE model with lm_head identical to the target model. Sharing target model lm_head weights with the draft model.zkDetected EAGLE model with distinct lm_head weights. Keeping separate lm_head weights from the target model.zNDetected MTP model. Sharing target model lm_head weights with the draft model.)3r(   r8   r9   setr   r   keysr   vllm.compilation.backendsrj  r   r   listrR   rS   get_attn_backendget_builder_clsget_kv_cache_specr)   rQ   rO   rX   r  r[   r   NotImplementedErrorAttributeError	TypeErrorloggerwarningr   rg  configimage_token_idimage_token_indexvision_configget_language_modelr   
world_sizerd  rp  rq  rr  infor   weightTensorequalcpurs  rt  )ry   rh  r9   target_attn_layer_namestarget_indexer_layer_namesrj  draft_attn_layer_namesindexer_layersdraft_indexer_layer_namesfirst_layerdummy_input_idstarget_language_modeltarget_embed_tokensshare_embeddingsshare_lm_heads                  r   
load_modelz!SpecDecodeBaseProposer.load_model)  s   !-@S"%'(8:LMMRRTT#
 #
 &)' "9 dff&
 &
" 	<;;;;;]<(( 	 	" ,;M  DJ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ((8:LMMRRTT%& 	 55
 
 %3$7$7$9$9<V$V! $%;>W%W X X#'(A#B#B # 	7215K#{+!!## """;/AA$BRSS,$K  // 37D/" 	00"',uT^=R"S"S"S
**?RV*WWWW'C 0 0 05   +0'''0 |,, 	1""<00 5  
 7C6I6X
!33$$\226WWW '5D 
!33
 !'9 
!3 %1$C$C$E$E!!$0! >>$)),2NCC &;&A&N##.4kBB &;&A&K##$X    %tz#9:: $z6 '+$KK(    295<HH"4:#3#@#GVV
 +26688
(5<@@BB  (,$KK"    KKT    $( S  
   D4:+^<< 6
(50C
 -KK*   4:011 #	:-  $Q   
 -y994<CU\RR tz18%,GG K)18<<>>J&-1133  !%Q   
 N    !MKKM  
  	?W%:IFF 	?tz9-- 'J&!6!>DJ	? 	? 	? 	?s$   B==CC!>H   8IITFuse_cudagraphsis_graph_capturingc           	         t          |s| j        nd          D ]=}|dk    r[|                     ||          \  }}|r%| j                            |          \  }}	|	j        }
nt          j        }|}
|
|
|| j        <   | j	        r'|%| j	        d         |v r| 
                    |
          }n|pi }t          d | j        |
|||          5  | j        rd }| j        d |
         }n| j        d |
         }d }t!          ||                     |
          |          }| j        r| j        d |
         |d<    | j        di | d d d            n# 1 swxY w Y   ?d S )Nr/   r   r   r   r   r`   r   )rv   rA   r   rW   r   r   r   r   r@   rR   r   r	   r(   rO   rb   r[   dictr   r*   r`   r   )ry   r   r  r  r   fwd_idxr   r   r   r   r   slot_mapping_dictr[   rb   kwargss                  r   	dummy_runz SpecDecodeBaseProposer.dummy_run  s6    /AHD''q
 
 2	% 2	%G !||=A=V=V(2j >W > >:$&: " <1::;OPP 7*J (2'<$$-:-?*';$'39I(6 %8!-)!,==$($:$:;K$L$L!!$1$7R!$ +%9'=.   % % * ) $I$($67H8H7H$IMM $/@0@/@ AI$(M'"112BCC"/  
 3 T.2.@ARBRAR.SF?+
$$V$$$-% % % % % % % % % % % % % % %92	% 2	%s   	A6EE	E	c                     d}| j         d         }| j        j        D ]*}|D ]!}||j        v r|                                } n"| n+|
J d            |S )zFind and return the attention metadata builders for EAGLE layers.

        Returns:
            The metadata builders for EAGLE layers.

        Raises:
            AssertionError: If no metadata builders are found for EAGLE layers.
        Nr   z;Failed to find attention metadata builder for EAGLE layers.)rR   r;   r9  layer_namesr:  )ry   builderchosen_layerkv_cache_group
attn_groups        r   r   z6SpecDecodeBaseProposer._get_attention_metadata_builder  s     ,Q/"k5 	 	N,  
:#999(==??GE : " # ""I #"" r   c                     | j         dk    rdS d}t          | j        j        dd          }||                    dd          }|S )a5  
        Some eagle3 heads (e.g., nvidia/gpt-oss-120b-Eagle3-v2) do not use auxiliary
        hidden states and directly uses the last layer output just like eagle1.
        They might indicate this by setting "use_aux_hidden_state" to False
        inside the "eagle_config" dict of their hf_config.
        r   FTeagle_configNuse_aux_hidden_state)r:   getattrr9   	hf_configget)ry   r  r  s      r   rT   zCSpecDecodeBaseProposer._get_eagle3_use_aux_hidden_state_from_config6  sW     ;(""5#t6@.RVWW##/#3#34JD#Q#Q ##r   kv_cache_configc                     i t          |j                  D ]\  }}|j        D ]}||<   t          t	          fd| j        D                                 dk    s
J d            dS )z
        Validate that all drafting layers belong to the same KVCacheGroup.
        Need this assumption to ensure all drafting layers can use the
        same AttentionMetadata.
        May extend to multiple AttentionMetadata in the future.
        c                      g | ]
}|         S r   r   )r   r   kv_cache_groupss     r   r  zGSpecDecodeBaseProposer.validate_same_kv_cache_group.<locals>.<listcomp>T  s.       & (
3  r   r/   z<All drafting layers should belong to the same kv cache groupN)r  r  r  rs   ru  rR   )ry   r  idr  r   r  s        @r   validate_same_kv_cache_groupz3SpecDecodeBaseProposer.validate_same_kv_cache_groupF  s     +-"+O,K"L"L 	1 	1B,8 1 1
.0
++1    *.*?        J    r   r   r   c           	          t          || j        j        d| j        j        t
          j        k    |d d           \  }}}|r
J d            |}|,t          || j                 	                                          }||fS )NF)r   r>   allow_microbatchingallow_dp_paddingr   uniform_decode num_scheduled_tokens_per_requestz'DBO ubatching not implemented for EAGLE)
r"   r(   r>   rW   r   r   r   r   r@   r  )ry   r   r   should_ubatchnum_toks_across_dp_r   s          r   r   z+SpecDecodeBaseProposer._pad_batch_across_dp]  s     0J 3 ,< %!6E!"/-1	0
 	0
 	0
,)1 !KK"KKK 0)#&'9$,'G'L'L'N'N#O#O #%777r   r   )NNN)TFN),rf  
__module____qualname__r   rX   r)   r#  r   r   r   r  r   r  strr   r   r   r   r   rn   rx  r  r   r   r#   r$   r  r,  r   r7  r   ra  nnModulerg  r  inference_moder  r   r   rT   r   r  r   r   r   r   r'   r'   8   s8        b$ b$b$ b$ &*	b$ b$ b$ b$H+ + + + +
	4 	4 	4 	4 	4 	4 -1Y YY lT)Y 
c5<	 	Y Y Y Y$R R$ R R R R< KO7; #K K  ,K
  ,K $lK K "L4/K 6K ,K tEL15<?@4GK "'!4K C-.
tC%&
'(
K$ 
%K K K KZ
3,3 3  ,	3
 "L4/3 %3 "'!43 
sEL"99	:3 3 3 329T 9 9 9 9S	? s../ $	
 #38n 
   B8:58: !<8: s../	8:
 $8: $l8: 
u|U\)	*8: 8: 8: 8:t;
5;
 1;
 %*L	;

 
&elB	C;
 ;
 ;
 ;
R k$ k$k$ 	k$ <k$ |k$ 6k$ C-.
tC%&
'(
k$ 
el	k$ k$ k$ k$Zd85d8  S	?d8 s)	d8
 
&4	5d8 d8 d8 d8L(BI (# ( ( ( (
t?ry t?T t? t? t? t?l U  $#(8<;% ;%;% ;% !	;%
 C-.5;% 
;% ;% ;% ;%z1I    2$d $ $ $ $ JM Jd J J J J.8 8 8 
sEL 	!	8 8 8 8 8 8r   r'   c                   6     e Zd Z	 ddedej        f fdZ xZS )EagleProposerNr(   r)   c                 R    t                                          ||d|           d S )NT)r*   r;   )superr   )ry   r(   r)   r;   re  s       r   r   zEagleProposer.__init__v  s<     	(,	 	 	
 	
 	
 	
 	
r   r   )rf  r  r  r   rX   r)   r   __classcell__)re  s   @r   r  r  u  s[        
 	
 

 
 
 
 
 
 
 
 
 
 
r   r  r   r   r   c                    |j         r| }|                     d          }||fS |j        J |j        }|j        s!|t          k     }t          j        |d|          }|                     |                    dd                     | 	                    dt
          j
                  }t          j        |          }|                                 |                    |                              d                              d          }|j        s,|                    d          }t          j        |||          }||fS )Nr%   r   g      ?r/   )r   r-   )
all_greedyr   temperature
all_randomr   rX   r   div_r   softmaxfloat32
empty_likeexponential_div)r   r   probsr   r  	is_greedyqgreedy_token_idss           r   #compute_probs_and_sample_next_tokenr    s?    # % 2..u$$(444 $/K' ?-/	k)S+>>
KK  Q''(((NNrN77E 	ANN YYq\\((R(0055b99N' R <<B<//Y0@.QQ5  r   )Lrp   dataclassesr   importlib.utilr   rS  rF   rX   torch.nnr  vllm.configr   r   r   vllm.distributed.parallel_stater   vllm.forward_contextr	   vllm.loggerr
   /vllm.model_executor.layers.attention_layer_baser    vllm.model_executor.model_loaderr   vllm.model_executor.modelsr   &vllm.model_executor.models.deepseek_v2r   'vllm.model_executor.models.llama_eagle3r   vllm.multimodalr   vllm.platformsr   vllm.triton_utilsr   vllm.utils.platform_utilsr   vllm.v1.attention.backendr   r   #vllm.v1.attention.backends.registryr   $vllm.v1.attention.backends.tree_attnr   r   &vllm.v1.attention.backends.triton_attnr   vllm.v1.cudagraph_dispatcherr   vllm.v1.kv_cache_interfacer   vllm.v1.sample.metadatar   vllm.v1.sample.samplerr   vllm.v1.spec_decode.metadatar   vllm.v1.spec_decode.utilsr   r    vllm.v1.utilsr!   vllm.v1.worker.dp_utilsr"   vllm.v1.worker.gpu_input_batchr#   r$   rf  r  r   r'   r  r  rn   r  r   r   r   <module>r     sY   


       $ $ $ $ $ $                    
 9 8 8 8 8 8 4 4 4 4 4 4 # # # # # # N N N N N N 6 6 6 6 6 6 : : : : : : J J J J J J J J J J J J / / / / / / + + + + + + $ $ $ $ $ $ = = = = = =        E D D D D D        K J J J J J < < < < < < 4 4 4 4 4 4 4 4 4 4 4 4 0 0 0 0 0 0 ; ; ; ; ; ;        ' & & & & & > > > > > > I I I I I I I I	X		z8 z8 z8 z8 z8 z8 z8 z8z)
 
 
 
 
* 
 
 
*%!L%!'%! 5<%&%! %! %! %! %! %!r   