
    .`iX+              
          d dl mZ d dlZd dlmZ d dlmZmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZmZ d d	lmZmZ d d
lmZmZ  ee          Z G d de          ZdedefdZdedej        dej        dedef
dZej        d             ZdS )    )AnyN)	Attention)
VllmConfigget_layers_from_vllm_config)SpeculativeConfig)init_logger)	get_model)tltriton)CommonAttentionMetadataextend_all_queries_by_1)PADDING_SLOT_IDSpecDecodeBaseProposerc                        e Zd Z	 ddedej        f fdZdefdZd Z	d Z
d	 Zd
 Zd Zdej        dej        dej        dej        dz  dedej        dz  deeej        ef         fdZdeddfdZ xZS )DraftModelProposerNvllm_configdevicec                    t                                          ||d|           |                                  |                                  |                                  |                                  |                                  d S )NF)r   r   pass_hidden_states_to_modelrunner)super__init___raise_if_multimodal_raise_if_mrope'_raise_if_padded_drafter_batch_disabled_raise_if_vocab_size_mismatch_raise_if_draft_tp_mismatch)selfr   r   r   	__class__s       s/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/spec_decode/draft_model.pyr   zDraftModelProposer.__init__   s     	#(-	 	 	
 	
 	
 	!!###44666**,,,((*****    returnc                 B    |                                  }|j        j        S N)_get_attention_metadata_builderkv_cache_spec
block_size)r   builders     r    _block_sizezDraftModelProposer._block_size)   s    6688$//r!   c                 2    | j         rt          d          d S )NzMSpeculative Decoding with draft models does not support multimodal models yet)supports_mm_inputsNotImplementedErrorr   s    r    r   z'DraftModelProposer._raise_if_multimodal-   s.    " 	%9  	 	r!   c                 <    | j         j        rt          d          d S )NzBSpeculative Decoding with draft models does not support M-RoPE yet)draft_model_config
uses_mroper,   r-   s    r    r   z"DraftModelProposer._raise_if_mrope4   s/    "- 	%T  	 	r!   c                 F    | j         j        j        rt          d          d S )NzSpeculative Decoding with draft models only supports padded drafter batch. Please don't pass --disable-padded-drafter-batch in the speculative_config.)r   speculative_configdisable_padded_drafter_batchr,   r-   s    r    r   z:DraftModelProposer._raise_if_padded_drafter_batch_disabled:   s4    .K 	%.  	 	r!   c                 B    | j         j                                         d S r$   )r   r2   &verify_equal_vocab_size_if_draft_modelr-   s    r    r   z0DraftModelProposer._raise_if_vocab_size_mismatchB   s     +RRTTTTTr!   c                     | j         j        }|j        j        }|j        j        }||k    rt          d| d| d          d S )NzYCurrently, 'draft_tensor_parallel_size' and 'tensor_parallel_size' must be the same. Got z and zE. Please pass 'draft_tensor_parallel_size' in the speculative_config.)r   r2   target_parallel_configtensor_parallel_sizedraft_parallel_config
ValueError)r   spec_cfgtgt_tpdraft_tps       r    r   z.DraftModelProposer._raise_if_draft_tp_mismatchE   st     '+&6&I0E1FvV)1V V8>V V V   r!   target_token_idsnext_token_idstarget_positionslast_token_indicescadnum_rejected_tokens_gpuc           
         |                                 }|f}|j        d d         }	|j        dd          dz
  }
||
|z  }
|j        d         |z   }t          j        |f| j        j        t          j                  }t          |         |||	|
| j        ||j        d         d           t          |         |||
         dz   |	|
| j	        ||j        d         d           t          || j	        d |         ||                                 | j                  }t          || j        |          }|j        dd          dz
  }|||z  }|||fS )N   r   )r   dtype)target_toks_ptrnext_toks_ptrquery_start_locs_ptrquery_end_locs_ptrout_ptr_merged_toksout_ptr_is_rejected_toktarget_toks_sizerejected_tok_fill)rB   new_positionsis_rejected_token_maskr'   max_model_len)arangenew_slot_mapping)
batch_sizequery_start_locshapetorchempty	input_idsr   boolmerge_toks_kernel	positionscompute_new_slot_mappingr)   rR   r   rS   )r   r>   r?   r@   rA   rB   rC   rU   grid
start_locsend_locs
num_tokensis_rejected_tokrT   new_cadnew_last_token_indicess                   r    set_inputs_first_passz(DraftModelProposer.set_inputs_first_passV   s    ^^%%
}("-
&qrr*Q.".//H%+A.;
+M$."7uz
 
 
 	$,(!+' $$3-3A6  	
 	
 	
 	
 	$,*84q8!+' $$3-3A6		
 		
 		
 		
 4.*5#2'')),
 
 
 ,C;-,
 ,
 ,
 ")!8!<q!@"."&=="17::r!   target_modelc                    t          t          | j        t                                                              }ddlm} t          | j                  }t          	                    d|j
        j        |j        j        |j        j                    |d          5  t          |d          | _        ddd           n# 1 swxY w Y   t          | j        t                                                    |z
  }t!          |          | _        dS )z/Takes target_model to satisfy the type checker.r   )set_model_tag)target_model_vllm_configz/Starting to load draft model %s. TP=%d, rank=%ddraft_model)r   prefixN)setr   r   r   keysvllm.compilation.backendsri   "create_vllm_config_for_draft_modelloggerinfomodel_configmodelparallel_configr8   rankr	   listattn_layer_names)r   rg   target_attn_layer_namesri   draft_vllm_configdraft_attn_layer_namess         r    
load_modelzDraftModelProposer.load_model   sf   
 #&'(8)DDIIKK#
 #
 	<;;;;;(J%)%5)
 )
 )
 	=*0-B-2		
 	
 	
 ]=)) 	X 	X"/@WWWDJ	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	X ((8)DDIIKK%& 	 !%%; < <s   B>>CCr$   )__name__
__module____qualname__r   rX   r   r   intr)   r   r   r   r   r   Tensorr   tuplerf   r   r|   __classcell__)r   s   @r    r   r      sb       
 	+ ++ + + + + + +$0S 0 0 0 0      U U U  "?;,?; ?;  ,	?;
 "L4/?; %?; "'!4?; 
sEL"99	:?; ?; ?; ?;B=s =t = = = = = = = =r!   r   rj   r"   c                     | }|j         j                            |j        j                  }|                    d|j         j        |          }|S )ar  The vllm_config is configured for the target model, e.g.
    its quant_config and parallel_config. But the draft model is potentially
    quantized differently, and has potentially different tensor_parallel_size.
    This function creates a new vllm_config configured for the draft model.
    The vllm_config is useful when loading the draft model with get_model().
    )rv   N)quant_configrs   ru   )r2   r9   replaceru   rv   r/   )rj   oldnew_parallel_confignews       r    rp   rp      sd     #C0FNN % O   kk+>+ "  C
 Jr!   rB   rP   rQ   r'   rR   c                    | j         j        \  }}t          j        || j        j                  }t          j        ||                                 dz   t          |                    }t          j	        ||dz
            }||z  ||z  z   }	| j         
                    d          |	         }
||z  }|
|z  |z   }||k    }|                    |t                     |                    |t                     |S )N)r   rF   )output_size)maxrE   )block_table_tensorrW   rX   rS   rV   r   repeat_interleavenaive_query_lenslenclampviewmasked_fill_r   )rB   rP   rQ   r'   rR   rU   n_blocks_per_reqreq_indicesclamped_positionsblock_table_indices
block_numsblock_offsetsrT   exceeds_max_model_lens                 r    r^   r^      s    $'#9#? J ,z#2E2LMMMK)S))++a/S=O=O  K
 M}q7HIII&&):j)HH  ',,R001DEJ%
2M!J.>)]:!!"7III!!"8/JJJr!   c                 `   t          j        d          }t          j        ||z             }	|t          j        d          dz
  k    }
|
r |                    t           j                  }n7t          j        ||z   dz                                 t           j                  }t          j        ||z             }t          j        ||z             }t          |	|dz             D ]}||k    rNt          j        | |z             }t          j        ||z   |z   |           t          j        ||z   |z   d           V||dz   k    r7t          j        ||z   |z   |           t          j        ||z   |z   d           t          j        ||z   |z   |           t          j        ||z   |z   d           dS )ar  
    Merges the `target_toks_ptr` and the `next_toks_ptr` into a new tensor
    called `out_ptr_merged_toks`. Rejected tokens are those after the
    `query_end_locs_ptr` and before the next `query_start_locs_ptr`. Fills the
    rejected tokens positions with the value `rejected_tok_fill`. Also fills a mask
    of the rejected tokens in `out_ptr_is_rejected_tok`.
    r   rF   FTN)r
   
program_idloadnum_programstoint32rangestore)rH   rI   rJ   rK   rL   rM   rN   rO   pid	start_locis_last_programnext_start_locend_locnew_valiold_vals                   r    r\   r\      s   $ -

C,s233IR_Q//!33O N),,RX66!5!;a!?@@CCBHMMg(3.//Ggmc)**G9nq011 
> 
><<go122GH(3.2G<<<H,s2Q6>>>>'A+H(3.2G<<<H,s2Q6>>>>H(3.24EFFFH,s2Q6====
> 
>r!   ) typingr   rX   vllm.attention.layerr   vllm.configr   r   vllm.config.speculativer   vllm.loggerr    vllm.model_executor.model_loaderr	   vllm.triton_utilsr
   r    vllm.v1.attention.backends.utilsr   r   vllm.v1.spec_decode.eagler   r   r}   rq   r   rp   r   r   r^   jitr\    r!   r    <module>r      s          * * * * * * ? ? ? ? ? ? ? ? 5 5 5 5 5 5 # # # # # # 6 6 6 6 6 6 ( ( ( ( ( ( ( (        N M M M M M M M	X		^= ^= ^= ^= ^=/ ^= ^= ^=B(   *	 < "L 	
    : %> %> %> %> %>r!   