
    -`iP|                        d dl Z d dlmZmZmZmZ d dlmZmZm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZmZ erd dlmZ d dlm c m!c m"Z# neZ ed e$            d          Z# ee%          Z&ed         Z'edde'f         Z(eddddde(f         Z)ee G d d                                  Z*dS )    N)TYPE_CHECKINGAnyLiteralget_args)FieldSkipValidationmodel_validator)	dataclass)Self)ModelConfig)ParallelConfig)config)init_logger)get_hf_text_config)	safe_hash)
LazyLoaderhas_arctic_inference)PretrainedConfigmodel_executorz'vllm.model_executor.layers.quantization)
deepseek_mtpmimo_mtpglm4_moe_mtpglm4_moe_lite_mtp	ernie_mtpexaone_moe_mtpqwen3_next_mtplongcat_flash_mtpmtppangu_ultra_moe_mtpeagleeagle3ngrammedusamlp_speculatordraft_modelsuffixc            	          e Zd ZU dZdZedz  ed<   	  edd          Ze	ed<   	 dZ
edz  ed<   	 dZedz  ed<   	  edd	
          Ze	dz  ed<   	 dZe	dz  ed<   	 dZej        dz  ed<   	  edd	
          Ze	dz  ed<   	 dZedz  ed<   	 dZedz  ed<   	  edd
          Ze	dz  ed<   	 dZeed<   	  edd	
          Ze	dz  ed<   	  edd	
          Ze	dz  ed<   	 dZedz  ed<   	 dZee         ed<   	 dZee         ed<   	 dZ ee         ed<   	 dZ!ee         ed<   	 dZ"e	ed<   	 dZ#e	ed<   	 d Z$e%ed!<   	 d"Z&e%ed#<   	 d$efd%Z'e(d&e)d$e)fd'            Z*d( Z+d) Z,e(d*e	dz  d+e	d,e	d$e	fd-            Z-e(ded.e	dz  d/e)d$e	fd0            Z.e(ded.e	d$efd1            Z/ e0d23          d$e1fd4            Z2d5 Z3d$efd6Z4d$efd7Z5d$efd8Z6dS )9SpeculativeConfigz'Configuration for speculative decoding.Nenforce_eagerr   )defaultgtnum_speculative_tokensmodelmethod   )r*   gedraft_tensor_parallel_sizetensor_parallel_sizequantizationmax_model_lenrevisioncode_revision   disable_by_batch_sizeFdisable_padded_drafter_batchprompt_lookup_maxprompt_lookup_minspeculative_token_treetarget_model_configtarget_parallel_configdraft_model_configdraft_parallel_config   suffix_decoding_max_tree_depthi'  #suffix_decoding_max_cached_requestsg      ?suffix_decoding_max_spec_factorg?suffix_decoding_min_token_probreturnc                     g }|                     | j        dk               t          t          |                                          d                                          }|S )a  
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        r!   F)usedforsecurity)appendr.   r   strencode	hexdigest)selffactorshash_strs      k/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/config/speculative.pycompute_hashzSpeculativeConfig.compute_hash   sY       	t{h.///S\\0022EJJJTTVV    	hf_configc                 "   | j         d         }| j        dv rd| _        | j        dk    r*t          | dd           }|                     |dgd           | j        dv rd| _        | j        dk    r*t          | dd           }|                     |d	gd           | j         d         d
k    r2d| _        t          | dd           }|                     d|dgd           | j         d         dk    r1d| _        t          | dd           }|                     |dgd           | j         d         dk    r2d| _        t          | dd           }|                     d|dgd           | j        dk    rd| _        | j        dk    r*t          | dd           }|                     |dgd           | j        dk    rd| _        | j        dk    r*t          | dd           }|                     |dgd           | j        dk    rd| _        | j        dk    r*t          | dd           }|                     |dgd           | j        dk    r1d| _        t          | dd          }|                     |d gd           |d!k    r|                     d"d#gi           | S )$Nr   )deepseek_v3deepseek_v32r   num_nextn_predict_layersDeepSeekMTPModel)	n_predictarchitecturespangu_ultra_moer   OpenPanguMTPModelMiMoForCausalLMr   MiMoMTPModel)num_hidden_layersrY   rZ   Glm4MoeForCausalLMr   Glm4MoeMTPModelGlm4MoeLiteForCausalLMr   Glm4MoeLiteMTPModelernie4_5_moer   ErnieMTPModel
qwen3_nextr   Qwen3NextMTP
exaone_moer   ExaoneMoeMTPlongcat_flashr   r/   LongCatFlashMTPModelMistralLarge3ForCausalLMrZ   EagleMistralLarge3ForCausalLM)rZ   
model_typegetattrupdate)rS   initial_architecturerY   s      rP   hf_config_overridez$SpeculativeConfig.hf_config_override   s   (6q9#BBB#1I >11	+EtLLI';M:NOO   $566#8I #888	+EtLLI';N:OPP   "1%):::#-I 	+EtLLI)*!*&4%5    "1%)===#1I 	+EtLLI!*&7%8    "1%)AAA#6I 	+EtLLI)*!*&;%<    >11#.I ;..	+EtLLI'?:KLL   <//#3I #333	+EtLLI'>:JKK   <//#3I #333	+EtLLI'>:JKK   ?22#6I 	+EqIII';Q:RSS    #===o0O/PQRRRrR   c                    | j         t          t                    v r2| j         dk    r't                              d| j                    d| _         | j        | j        | j         dk    r\| j        t          d          | j        j	        j
        dk    rd| _        | j        j        | _        | j        s| j        j        | _        n3| j         dv rd| _        n"| j         dk    rd| _        nt          d	          | j         | j        | j        dv rd| _         | j         dv rd| _         | j        | j        d
| _        d
| _        nS| j        #| j        t          d          | j        | _        n)| j        "| j        t          d          | j        | _        | j        | j        k    rt          d| j         d| j                   | j        | _        | j        | _        n| j         dk    r|                                  nd| _        d| _        | j        t'          d9i d| j        ddd| j        j        d| j        j        d| j        j        d| j        j        d| j        j        d| j        j        d| j        j        d| j        d| j        d| j        j        d| j        j        d| j        d| j        j        d| j        j        d t@          j!        d!| j        j"        | _        | j         d"v rnAd#| j        j        #                                v r	d$| _         nd%| j        j        #                                v rd%| _         n| j        j$        j
        d&k    rd&| _         n| j        j$        j
        d'k    rd'| _         n| j        j$        j
        t          t                    v r-d| _         | j        d(k    rt                              d)           nd| j        j$        j
        d*v r-d*| _         | j        d(k    rt                              d+           n$| j         d,k    rntK          d-| j          d.          | j         d"v rdd/l&m'} dd0l(m)} tU          | j        j$        ||f          rn || j        j$        | j         d$1          }|| j        _$        tW          | j        j$                  | j        _	        | j        ,                                | j        _-        | j        j.        /                    | j        j0        | j                  \  }}|| j        _1        || j        _2        | j        0tg          | j        j$        d2          r| j        | j        j$        _4        tk          | j        j$        d3d           }|B| j        || _        n3| j        |k    r(| j        |z  dk    rt          d4| j         d5|          | j6        1to          d6 tq          | j                  D                       | _6        n=ts          j:        | j6                  }to          tw          |d7 8                    | _6        t@          <                    | j        | j=        | j        j$                  | _=        t@          >                    | j        | j        j        | j        j                  | j        _        t@          ?                    | j        | j=                  | _        | S ):Nr   z0method `%s` is deprecated and replaced with mtp.z+target_model_config must be present for mtprV   T)r"   z[ngram]r"   r&   zBnum_speculative_tokens was provided but without speculative model.   z[Either prompt_lookup_max or prompt_lookup_min must be provided when using the ngram method.zprompt_lookup_min=z must be <= prompt_lookup_max=r   r-   runnerdraft	tokenizertokenizer_modetrust_remote_codeallowed_local_media_pathallowed_media_domainsdtypeseedr5   r6   tokenizer_revisionspec_target_max_model_lenr3   r)   max_logprobshf_overridesconfig_format)r    r!   zeagle-r    r!   r#   r$   r/   zEnabling num_speculative_tokens > 1 will runmultiple times of forward on same MTP layer,which may result in lower acceptance rater   z`LongCat MTP models only have one layer. Might need some code changes to support multiple layers.r%   z!Unsupported speculative method: '')SpeculatorsConfig)EAGLEConfig)r.   rn   num_lookahead_tokensrY   znum_speculative_tokens:z  must be divisible by n_predict=c                     g | ]
}|d z   dz  S )r/   )r    ).0is     rP   
<listcomp>z3SpeculativeConfig.__post_init__.<locals>.<listcomp>  s     TTTA!a%4TTTrR   c                 $    t          |           | fS N)len)ts    rP   <lambda>z1SpeculativeConfig.__post_init__.<locals>.<lambda>  s    CFFA; rR   )keyr   )@r.   r   MTPModelTypesloggerwarningr-   r,   r=   
ValueErrorhf_text_configrn   r)   r3   r;   r:   r?   r>   r@   _validate_suffix_decodingr   rw   rx   ry   rz   r{   r|   r}   r5   r6   r~   r4   r   r(   rr   r   lowerrS   NotImplementedErrorvllm.transformers_utils.configsr   %vllm.transformers_utils.configs.eagler   
isinstancer   get_model_arch_configmodel_arch_configregistryinspect_model_clsrZ   _model_info_architecturehasattrr   ro   r<   rJ   rangeastliteral_evalsorted_verify_and_get_draft_tpr1   #_maybe_override_draft_max_model_lencreate_draft_parallel_config)rM   r   r   eagle_config
model_infoarchrY   tree_choicess           rP   __post_init__zSpeculativeConfig.__post_init__  sD    ;(=1111dkU6J6JNNBDK    DK:$"="I{e##+3$%RSSS+:EWW *.D&!5;
 ( N(,(@(MD% 444$

((%

 X   ;J"tz5I'I'I!DK;...!DK%-$2H2P)*&)*&&'/)1$@   *.)?&&'/)1$@   *.)?& %(>>> H)? H H/3/EH H   '+&>D#)-)DD&&[H$$**,,,,%&D"%&D"z%*5 + + +**+"7+ #6@@+ $(#;#J#J	+
 '+&>&P&P+ .2-E-^-^+ +/*B*X*X+ 288+ 166+ "]]+ #'"4"4+ (,'?'R'R+ /3.F.T.T+ "&!2!2+ #'":"H"H+  "&!9!F!F!+" "3!E!E#+$ #'":"H"H%+', ;"555
 !8!>!D!D!F!FFF")DKK!8!>!D!D!F!FFF"*DKK,6AXMM"*DKK,6AEUUU"2DKK,6AX!F F   #(DK2Q66I  
 ,6A'  #6DK2Q66:  
 [M11-JDKJJJ  
 ;"555QQQQQQQQQQQQ!/9$&78  E '2{ 3=#';'.( ( ( =I/9AS 3=B B/> !3IIKK /A !3<NN $ 7 E $ 7  )
D ?I/;@D/=.:w+57M@ @: 3 +5J $+5{D 	 (2:6?333i?? 7)CqHH )Bd6Q B B5>B B  
 .625TTt7R1S1STTT3 3D//
 $'#3D4O#P#PL25|1F1FGGG3 3D/
 &>>37/9  / &II*/=0>  '5 &BB3T5T  *
 rR   c                    t                      st          d          | j        ,| j        | _        t                              d| j                   | j        dk     rt          d| j         d          | j        dk     rt          d| j         d          | j        dk     rt          d	| j         d          d| j	        cxk    rdk    sn t          d
| j	         d          d S )NzdArctic Inference is required for suffix decoding. Install via `pip install arctic-inference==0.1.1`.z;Defaulted num_speculative_tokens to %s for suffix decoding.r/   zsuffix_decoding_max_tree_depth=z must be >= 1r   z$suffix_decoding_max_cached_requests=z must be >= 0z suffix_decoding_max_spec_factor=zsuffix_decoding_min_token_prob=z must be in [0, 1])
r   ImportErrorr,   rB   r   r   r   rC   rD   rE   rM   s    rP   r   z+SpeculativeConfig._validate_suffix_decoding  s{   #%% 	E   &. +/*MD'NNM+  
 .22F6F F F   3a77K;K K K   /!33G7G G G   D7<<<<1<<<<K6K K K   =<rR   speculative_max_model_lendraft_max_model_lentarget_max_model_lenc                     | 8| |k    rt          d| d|          | |k    rt          d| d|          | S t          ||          S )a  Determine the max sequence len for the draft model. This is usually
        the draft_max_model_len, but may be the target_max_model_len if it is
        less than the draft_max_model_len, or may be speculative_max_model_len
        if it is specified.

        This is necessary so that sequences do not exceed the capacity of the
        draft model or the target model.

        speculative_max_model_len is mainly used for testing that sequences can
        skip speculation.
        Nzspeculative_max_model_len=z+ cannot be larger than draft_max_model_len=z, cannot be larger than target_max_model_len=)r   min)r   r   r   s      rP   r   z5SpeculativeConfig._maybe_override_draft_max_model_len  s    $ %0(+>>> :0 : :#6: :  
 )+??? ;0 ; ;#7; ;  
 -, 
 
 	
rR   &speculative_draft_tensor_parallel_sizedraft_hf_configc                     |A|j         dk    r.d}| j        dk    r t                              d|j                    n&| j        }n|d| j        fvrt	          d|d          |S )z
        Verifies and adjusts the tensor parallel size for a draft model
        specified using speculative_draft_tensor_parallel_size.
        Nr$   r/   zV%s cannot currently be run with tp>1; setting speculative_draft_tensor_parallel_size=1z'speculative_draft_tensor_parallel_size=zB cannot be other value than 1 or target model tensor_parallel_size)rn   r2   r   r   r   )r>   r   r   s      rP   r   z*SpeculativeConfig._verify_and_get_draft_tpA  s     29)-===9:6)>BBNNK'2   +? 76 4"7<
 
 
 K9 K K K   65rR   c           	      n    t          | j        || j        | j        | j        | j        | j                  }|S )zCreate a parallel config for use by the draft worker.

        This is mostly a copy of the target parallel config, except the tp_size.
        )pipeline_parallel_sizer2   distributed_executor_backendmax_parallel_loading_workersdisable_custom_all_reduceray_workers_use_nsightplacement_group)r   r   r   r   r   r   r   )r>   r   r@   s      rP   r   z.SpeculativeConfig.create_draft_parallel_configd  sH     !/#9#P!G)?)\)?)\&<&V#9#P2B!
 !
 !
 %$rR   after)modec                      j         t          d           j        t          d           j        dk    rt          d j         d           j        r j                             j                    j        " j        dk     rt          d j                  g d} j        d	k    rF j        r?t           fd
|D                       s$t          d| d j        j
        j                                                     S )Nz{'tensor_parallel_size' is not a valid argument in the speculative_config. Please pass 'draft_tensor_parallel_size' instead.z}num_speculative_tokens must be provided with speculative model unless the draft model config contains an n_predict parameter.r   z9Expected num_speculative_tokens to be greater than zero (z).r7   zmExpect the batch size threshold of disabling speculative decoding is > 1, but got self.disable_by_batch_size=)llamaqwenminicpmgpt_ossr!   c              3   >   K   | ]}|j         j        j        v V  d S r   )r=   r   rn   )r   supported_modelrM   s     rP   	<genexpr>z1SpeculativeConfig._verify_args.<locals>.<genexpr>  sE        #  4#;#J#UU     rR   zEagle3 is only supported for z@ models. Got self.target_model_config.hf_text_config.model_type=)r2   r   r,   r?   verify_with_parallel_configr@   r8   r.   r=   anyr   rn   &verify_equal_vocab_size_if_draft_model)rM   eagle3_target_supporteds   ` rP   _verify_argszSpeculativeConfig._verify_argsy  s   $0X  
 &.'   &!++>"9> > >  
 " 	#??*   %1d6PST6T6T1-1 1   #J"I"IK8##( $    '>     $ M0G M M/>IM M   	33555rR   c                     | j         dk    r\| j        W| j        R| j                                        }| j                                        }||k    rt	          d| d| d          d S d S d S d S )Nr%   zUTarget and draft model should have the same vocabulary size. Target model vocab_size=z. Draft model vocab_size=zd. Using models with different tokenizers can cause out-of-bounds errors during speculative decoding.)r.   r=   r?   get_vocab_sizer   )rM   target_vocab_sizedraft_vocab_sizes      rP   r   z8SpeculativeConfig.verify_equal_vocab_size_if_draft_model  s    K=(((4'3 $ 8 G G I I#6EEGG $444 ;/@; ;.>; ; ;   )(4433 54rR   c                     | j         dv S )N)r    r!   r   r.   r   s    rP   	use_eaglezSpeculativeConfig.use_eagle  s    {888rR   c                     | j         dk    S )Nr%   r   r   s    rP   uses_draft_modelz"SpeculativeConfig.uses_draft_model  s    {m++rR   c                 Z    | j         }|dv rd n| j        j        }| j        }d|d|d|dS )N)r"   r&   zSpeculativeConfig(method=z, model=z, num_spec_tokens=))r.   r?   r-   r,   )rM   r.   r-   num_spec_tokenss       rP   __repr__zSpeculativeConfig.__repr__  sL    "5554;R;X5LFLLuLLLLLLrR   )7__name__
__module____qualname____doc__r)   bool__annotations__r   r,   intr-   rJ   r.   SpeculativeMethodr1   r2   r3   me_quantQuantizationMethodsr4   r5   r6   r8   r9   r:   r;   r<   r=   r   r   r>   r   r?   r@   rB   rC   rD   floatrE   rQ   staticmethodr   rr   r   r   r   r   r   r	   r   r   r   r   r   r   r   rR   rP   r(   r(   7   s         21!%M4$;%%%>"'%";";";C;;;OE3:'+F$+++1 .3U4A-F-F-Fd
FFF?'+#*+++C 9=L(.5<<<I !&dq 9 9 9M3:9997HcDj !%M3:$$$%
 ).dq(A(A(A3:AAAD). $...5 %*E$1$=$=$=sTz===$$)E$1$=$=$=sTz===  *.C$J--- 8<4;;;0=AN>:AAA: 7;{3:::D<@>.9@@@N +-"C,,,N 05'444
 .1#U000. -0"E///c    & S&6 S;K S S S \Sjq q qf" " "H #
#&:#
 #
 "#
 
	#
 #
 #
 \#
J  6 . 603d
 6 * 6 
	 6  6  6 \ 6D % .%03% 
% % % \%( _'""".d . . . #".`  "94 9 9 9 9,$ , , , ,M# M M M M M MrR   r(   )+r   typingr   r   r   r   pydanticr   r   r	   pydantic.dataclassesr
   typing_extensionsr   vllm.config.modelr   vllm.config.parallelr   vllm.config.utilsr   vllm.loggerr   vllm.transformers_utils.configr   vllm.utils.hashingr   vllm.utils.import_utilsr   r   transformersr   'vllm.model_executor.layers.quantizationr   layersr3   r   globalsr   r   r   EagleModelTypesr   r(   r   rR   rP   <module>r      s   


 8 8 8 8 8 8 8 8 8 8 8 8 ; ; ; ; ; ; ; ; ; ; * * * * * * " " " " " " ) ) ) ) ) ) / / / / / / $ $ $ $ $ $ # # # # # # = = = = = = ( ( ( ( ( ( D D D D D D D D 	------>>>>>>>>>>>>>z''))%N H 
X			 '8]:;  
L
M L
M L
M L
M L
M L
M L
M  L
M L
M L
MrR   