§
    .`ƒi+y  ã                   óâ  — U d dl mZ d dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ  ee¦  «        ZdZe
j        e d<   d Z!e
j        e d<   dZ" G d„ dej#        ¦  «        Z$dej%        de&e'         de'dej%        dej%        dz  dej%        dej%        dedej%        fd„Z(dej%        dej%        dedej%        fd„Z)	 	 d1dej%        d ej%        d!e'd"e'd#e'dej%        fd$„Z*d!e'de&e'         d%e+e'ej,        f         d&ej-        dej%        f
d'„Z.de'de&e'         dej%        dej%        dej%        dz  dej%        ded&ej-        dej%        fd(„Z/ ej0        dg¬)¦  «        d*„ ¦   «         Z1 ej0        dg¬)¦  «        d+e
j        fd,„¦   «         Z2 ej0        d"d#g¬)¦  «        d-e
j        fd.„¦   «         Z3ej0        d/e
j        d+e
j        fd0„¦   «         Z4dS )2é    )ÚSequence)ÚreplaceN)Úinit_logger)ÚtlÚtriton)ÚLogprobsListsÚLogprobsTensorsÚSamplerOutput)ÚSamplingMetadata)Úapply_bad_words_with_drafts)Úapply_all_penalties)Úapply_top_k_top_p)ÚSampler)ÚSpecDecodeMetadataéÿÿÿÿÚPLACEHOLDER_TOKEN_IDÚGREEDY_TEMPERATUREé€   c                   ó.  ‡ — e Zd ZdZdefˆ fd„Zdedej        dz  dej        de	d	e
f
d
„Zdededej        dej        dej        dej        d	efd„Ze	 	 ddej        dedee         dedz  d	eeee                  edz  f         f
d„¦   «         Zdej        de	ded	ej        fd„Zedej        de	dedej        deee                  d	ej        fd„¦   «         Ze	 ddeee                  deee                  dz  d	eee                  fd„¦   «         Zˆ xZS )ÚRejectionSamplerau  
    The implementation strictly follows the algorithm described in
        https://arxiv.org/abs/2211.17192.
    However, we want to clarify the terminology used in the implementation:
    accepted tokens: tokens that are accepted based on the relationship
            between the "raw" draft and target probabilities.
    recovered tokens: tokens that are sampled based on the adjusted probability
        distribution, which is derived from both the draft and target
        probabilities.
    bonus tokens:
        If all proposed tokens are accepted, the bonus token is added to the
        end of the sequence. The bonus token is only sampled from the target
        probabilities. We pass in the bonus tokens instead of sampling them
        in the rejection sampler to allow for more flexibility in the
        sampling process. For example, we can use top_p, top_k sampling for
        bonus tokens, while spec decode does not support these sampling
        strategies.
    output tokens:
        Tokens are finally generated with the rejection sampler.
        output tokens = accepted tokens + recovered tokens + bonus tokens
    Úsamplerc                 óÖ   •— t          ¦   «                              ¦   «          || _        | j        j        }|                     d¦  «        | _        |                     d¦  «        | _        d S )NÚ	processedÚlogits)ÚsuperÚ__init__r   Úlogprobs_modeÚ
startswithÚis_processed_logprobs_modeÚendswithÚis_logits_logprobs_mode)Úselfr   r   Ú	__class__s      €út/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/sample/rejection_sampler.pyr   zRejectionSampler.__init__4   s\   ø€ Ý‰Œ×ÒÑÔÐØˆŒØœÔ2ˆØ*7×*BÒ*BÀ;Ñ*OÔ*OˆÔ'Ø'4×'=Ò'=¸hÑ'GÔ'GˆÔ$Ð$Ð$ó    ÚmetadataÚdraft_probsNr   Úsampling_metadataÚreturnc           
      óÚ  — |j         t          k    sJ ‚|j        }|j        }|€J ‚||         }|                      |t          |d¬¦  «        d| j        rdnd¬¦  «        }|j        }	||         }
|
                     t          j
        ¦  «        }
|
}| j        s|                     ¦   «         }|                      |||¦  «        }t          ||j        |¦  «        }|                     dt          j
        ¬¦  «        }t!          |j        |j        |j         |j        |||	|¦  «        }d}|j        2|                      |j        ||| j        r|n|
|j        j        |¦  «        }t/          ||¬	¦  «        S )
a^  
        Args:
            metadata:
                Metadata for spec decoding.
            draft_probs (Optional[torch.Tensor]):
                Probability distribution for the draft tokens. Shape is
                [num_tokens, vocab_size]. Can be None if probabilities are
                not provided, which is the case for ngram spec decode.
            logits (torch.Tensor):
                Target model's logits probability distribution.
                Shape is [num_tokens + batch_size, vocab_size]. Here,
                probabilities from different requests are flattened into a
                single tensor because this is the shape of the output logits.
                NOTE: `logits` can be updated in place to save memory.
            sampling_metadata (vllm.v1.sample.metadata.SamplingMetadata):
                Additional metadata needed for sampling, such as temperature,
                top-k/top-p parameters, or other relevant information.
        Returns:
            SamplerOutput:
                Contains the final output token IDs and their logprobs if
                requested.
        Nr   )Úmax_num_logprobsTÚprocessed_logitsÚ
raw_logits)r   r(   Úpredict_bonus_tokenÚlogprobs_mode_override)ÚdimÚdtype)Úsampled_token_idsÚlogprobs_tensors)Úmax_spec_lenÚMAX_SPEC_LENÚbonus_logits_indicesÚtarget_logits_indicesr   r   r   r2   ÚtoÚtorchÚfloat32ÚcloneÚapply_logits_processorsÚapply_sampling_constraintsÚcu_num_draft_tokensÚsoftmaxÚrejection_sampleÚdraft_token_idsÚnum_draft_tokensr+   Ú_get_logprobs_tensorsr3   Úlogprobsr
   )r"   r&   r'   r   r(   r6   r7   Úbonus_logitsÚbonus_sampler_outputÚbonus_token_idsÚraw_target_logitsÚtarget_logitsÚtarget_probsÚoutput_token_idsr3   s                  r$   ÚforwardzRejectionSampler.forward;   sÛ  € ð> Ô$­Ò4Ð4Ð4Ð4à'Ô<ÐØ (Ô >Ðð Ð!Ð!Ð!ØÐ2Ô3ˆØ#Ÿ|š|ØÝ%Ø!Ø!#ðñ ô ð !%ð Ô.ð$Ð#5Ð#5àð  ,ñ  
ô  
Ðð /Ô@ˆð
 #Ð#8Ô9Ðà-×0Ò0µ´Ñ?Ô?ÐØ)ˆØÔ.ð 	2ð *×/Ò/Ñ1Ô1ˆMØ×4Ò4ØÐ,¨hñ
ô 
ˆõ 3ØØÔ(Øñ
ô 
ˆð %×,Ò,°½5¼=Ð,ÑIÔIˆå+ØÔ$ØÔ%ØÔ!ØÔ(ØØØØñ	
ô 	
Ðð  ÐØÔ-Ð9Ø#×9Ò9Ø!Ô2ØØØ!%Ô!@ÐWÐFWØ$Ô5Ô>Ø ñ ô  Ðõ Ø.Ø-ð
ñ 
ô 
ð 	
r%   r+   rI   rE   r2   c                 óŽ  — t          j        |j        ¦  «        }|j        d d…         |dd …<   |j        }|j        }	t          j        |t           j        ¬¦  «        }
|                     t           j        ¦  «        |
|	<   |                     t           j        ¦  «        |
|<   |}t          j        |j        d         |j	        |j
        ¬¦  «        }|                     d¦  «        |                     d¦  «        z                        ¦   «         }|                     |
j        d         dz
  ¬¦  «         |                     ¦   «                              ¦   «         }d||t          k    <   |
|         }| j        r|n| j                             |¦  «        }| j                             |||                     t           j        ¦  «        ¦  «        S )Nr   é   )r1   )Údevicer1   r   )Úmax)r9   Ú
zeros_likeÚcu_num_sampled_tokensr6   r7   r:   r8   ÚarangeÚshaperO   r1   Ú	unsqueezeÚflattenÚclamp_r;   r   r!   r   Úcompute_logprobsÚgather_logprobsÚint64)r"   r+   r&   r   rI   rE   r2   rR   r6   r7   Úfinal_logitsÚlogit_start_indicesÚoffsetsÚaccepted_logit_indicesÚaccepted_tokensÚaccepted_logitsÚaccepted_logprobss                    r$   rC   z&RejectionSampler._get_logprobs_tensors©   sº  € õ !&Ô 0°Ô1OÑ PÔ PÐØ$,Ô$BÀ3ÀBÀ3Ô$GÐ˜a˜b˜bÑ!ð  (Ô<ÐØ (Ô >ÐÝÔ'¨µe´mÐDÑDÔDˆØ.;×.>Ò.>½u¼}Ñ.MÔ.MˆÐ*Ñ+Ø-9¯_ª_½U¼]Ñ-KÔ-KˆÐ)Ñ*ð
 4ÐÝ”,ØÔ# BÔ'Ø&Ô-Ø%Ô+ð
ñ 
ô 
ˆð  ×)Ò)¨!Ñ,Ô,¨w×/@Ò/@ÀÑ/CÔ/CÑCß
Š'‰)Œ)ð 	ð 	×%Ò%¨,Ô*<¸QÔ*?À!Ñ*CÐ%ÑDÔDÐDØ+×1Ò1Ñ3Ô3×;Ò;Ñ=Ô=ˆàCDˆ˜Õ+?Ò?Ñ@ð 'Ð'=Ô>ˆð Ô+ð@ˆOˆOà”×.Ò.¨Ñ?Ô?ð 	ð
 Œ|×+Ò+ØØØ×Òuœ{Ñ+Ô+ñ
ô 
ð 	
r%   © rK   Ú
vocab_sizeÚdiscard_req_indicesr3   c                 óÞ  ‡	— |                       ¦   «                              ¦   «         }|t          k    ||k     z  Š	d}|zdg‰	                     d¬¦  «                             ¦   «                              ¦   «         z   }|                     ‰	                     ¦   «         ¦  «        }|                     |¦  «        }t          |¦  «        dk    rd‰	|<   ˆ	fd„t          |¦  «        D ¦   «         }||fS )aM  Parse the output of the rejection sampler.
        Args:
            output_token_ids: The sampled token IDs in shape
                [batch_size, max_spec_len + 1]. The rejected tokens are
                replaced with `PLACEHOLDER_TOKEN_ID` by the rejection sampler
                and will be filtered out in this function.
            vocab_size: The size of the vocabulary.
            discard_req_indices: Optional row indices to discard tokens in.
            logprobs_tensors: Optional logprobs tensors to filter.
        Returns:
            A list of lists of token IDs.
        Nr   rN   ©ÚaxisFc                 óV   •— g | ]%\  }}|‰|                                        ¦   «         ‘Œ&S rb   )Útolist)Ú.0ÚiÚrowÚ
valid_masks      €r$   ú
<listcomp>z1RejectionSampler.parse_output.<locals>.<listcomp>ú   s?   ø€ ð 
ð 
ð 
Ù,2¨A¨sˆC
˜1”Ô×%Ò%Ñ'Ô'ð
ð 
ð 
r%   )ÚcpuÚnumpyr   ÚsumÚcumsumri   ÚfilterrV   ÚtolistsÚlenÚ	enumerate)
rK   rc   rd   r3   Úoutput_token_ids_npÚoutput_logprobsÚcu_num_tokensÚfiltered_tensorsÚoutputsrm   s
            @r$   Úparse_outputzRejectionSampler.parse_outputÚ   s
  ø€ ð& /×2Ò2Ñ4Ô4×:Ò:Ñ<Ô<Ðà)Õ-AÒAØ *Ò,ñ
ˆ
ð ˆØÐ'Ø˜C *§.¢.°a .Ñ"8Ô"8×"?Ò"?Ñ"AÔ"A×"HÒ"HÑ"JÔ"JÑJˆMØ/×6Ò6°z×7IÒ7IÑ7KÔ7KÑLÔLÐØ.×6Ò6°}ÑEÔEˆOåÐ"Ñ#Ô# aÒ'Ð'Ø.3ˆJÐ*Ñ+ð
ð 
ð 
ð 
Ý6?Ð@SÑ6TÔ6Tð
ñ 
ô 
ˆð ˜Ð'Ð'r%   c                 óN  — |j          }|j        p|}|j        }|r|                      ||j        ¦  «        }|j        €|rÆt          |j        ¦  «        }t          j        |j	        d¬¦  «        }t          j
        |d¬¦  «        }	|	                     |¦  «        }
|
                     |j        d¬¦  «        }|                      |||||¦  «        }|j        0|j        |         }|                     |t!          d¦  «        ¦  «         |j        x}rt#          ||||j	        ¦  «         |S )Nro   )rO   T)rO   Únon_blockingú-inf)Úno_penaltiesÚbad_words_token_idsrK   Ú!_combine_outputs_with_spec_tokensÚspec_token_idsÚallowed_token_ids_maskru   r9   ÚtensorrB   rS   Úrepeat_interleaver8   rO   Úapply_penaltiesÚmasked_fill_Úfloatr   )r"   r   r(   r&   Úhas_penaltiesÚany_penalties_or_bad_wordsrK   Únum_requestsrB   Úoriginal_indicesÚrepeat_indices_cpuÚrepeat_indicesÚ
token_maskr   s                 r$   r<   z(RejectionSampler.apply_logits_processorsÿ   sg  € ð .Ô:Ð:ˆàÔ1ÐB°]ð 	#ð -Ô=ÐØ%ð 	Ø#×EÒEØ Ø!Ô0ñ ô  Ðð Ô3Ð?À=Ð?ÝÐ0ÔAÑBÔBˆLÝ$œ|¨HÔ,EÈeÐTÑTÔTÐÝ$œ|¨LÀÐGÑGÔGÐØ!1×!CÒ!CÐDTÑ!UÔ!UÐØ/×2Ò2Ø”}°4ð 3ñ ô ˆNð ×)Ò)ØÐ)¨8°^ÐEUñô ˆFð
 !Ô7ÐCØ.ÔEÀnÔU
Ø×#Ò# Jµ°f±´Ñ>Ô>Ð>ð #4Ô"GÐGÐð 	Ý'ØÐ+Ð-=¸xÔ?Xñô ð ð ˆr%   r   c                 óº   — |j         r| S |j        €J ‚|j        |         }|j        |         }|j        |         }|j        |         }t          | |||||¦  «        } | S ©N)r€   Úprompt_token_idsÚpresence_penaltiesÚfrequency_penaltiesÚrepetition_penaltiesr   )	r   r(   r&   r   rK   r“   r”   r•   r–   s	            r$   r‡   z RejectionSampler.apply_penalties+  sƒ   € ð Ô)ð 	ØˆMà Ô1Ð=Ð=Ð=à,Ô=¸nÔMÐØ.ÔAÀ.ÔQÐØ/ÔCÀNÔSÐØ0ÔEÀnÔUÐå$ØØØØØ Øñ
ô 
ˆð ˆr%   rƒ   c                 ó  — |€| S g }t          | |¦  «        D ]u\  }}t          |¦  «        dk    rŒ|                     |¦  «         t          t          |¦  «        dz
  ¦  «        D ]'}|                     g |d         ¢||         ‘¦  «         Œ(Œv|S )Nr   rN   r   )Úzipru   ÚappendÚrange)rK   rƒ   ÚresultÚoutÚspecrk   s         r$   r‚   z2RejectionSampler._combine_outputs_with_spec_tokensG  sª   € ð
 Ð!Ø#Ð#àˆÝÐ-¨~Ñ>Ô>ð 	6ð 	6‰IˆCÝ4‰yŒy˜AŠ~ˆ~ØØMŠM˜#ÑÔÐÝ3˜t™9œ9 q™=Ñ)Ô)ð 6ð 6Ø—’Ð4  r¤
Ð4¨D°¬GÐ4Ñ5Ô5Ð5Ð5ð6àˆr%   )rb   Nr’   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   r   r9   ÚTensorr   r
   rL   Úintr	   rC   Ústaticmethodr   ÚtupleÚlistr   r|   r<   r‡   r‚   Ú__classcell__)r#   s   @r$   r   r      s‘  ø€ € € € € ðð ð,H ð Hð Hð Hð Hð Hð Hðl
à$ðl
ð ”\ DÑ(ð	l
ð ”ðl
ð ,ðl
ð 
ðl
ð l
ð l
ð l
ð\/
àð/
ð %ð/
ð ”ð	/
ð
 ”|ð/
ð ”lð/
ð !œ<ð/
ð 
ð/
ð /
ð /
ð /
ðb ð .0Ø37ð	"(ð "(Øœ,ð"(àð"(ð & cœ]ð"(ð *¨DÑ0ð	"(ð
 
ˆtD˜”IŒ °Ñ 4Ð4Ô	5ð"(ð "(ð "(ñ „\ð"(ðH*à”ð*ð ,ð*ð %ð	*ð
 
Œð*ð *ð *ð *ðX ðØ”ðà+ðð %ðð œð	ð
 ˜t Cœyœ/ðð 
Œðð ð ñ „\ðð6 ð 26ðð Ø˜t Cœyœ/ðà˜T #œYœ¨$Ñ.ðð 
ˆd3ŒiŒðð ð ñ „\ðð ð ð ð r%   r   rA   rB   r4   r>   r'   rJ   rG   r(   r)   c                 óV  — | j         dk    sJ ‚||j         dk    sJ ‚|j         dk    sJ ‚|j         dk    sJ ‚t          |¦  «        }| j        d         }	|j        d         }
|j        }|                      ¦   «         sJ ‚||                     ¦   «         sJ ‚|                     ¦   «         sJ ‚|                     ¦   «         sJ ‚|j        |	|
fk    sJ ‚t          j        ||dz   ft          t
          j        |¬¦  «        }|j	        rd }n|j
        t          k    }|j        s;|                     d¬¦  «        }t          |f         ||| ||||¦  «         |j	        r|S t          |	||j        |¦  «        }t#          |||| ||||¦  «        }t%          |f         ||| ||||||||
|d u ¬¦  «         |S )NrN   é   r   r   ©r1   rO   )r0   ©ÚNO_DRAFT_PROBS)Úndimru   rT   rO   Úis_contiguousr9   Úfullr   Úint32Ú
all_greedyÚtemperaturer   Ú
all_randomÚargmaxÚrejection_greedy_sample_kernelÚgenerate_uniform_probsÚ
generatorsÚsample_recovered_tokensÚrejection_random_sample_kernel)rA   rB   r4   r>   r'   rJ   rG   r(   Ú
batch_sizeÚ
num_tokensrc   rO   rK   Ú	is_greedyÚtarget_argmaxÚuniform_probsÚrecovered_token_idss                    r$   r@   r@   Y  sX  € ð  Ô 1Ò$Ð$Ð$Ð$ØÐ +Ô"2°aÒ"7Ð"7Ð"7Ð7ØÔ# qÒ(Ð(Ð(Ð(ØÔ Ò!Ð!Ð!Ð!åÐ%Ñ&Ô&€JØ Ô& qÔ)€JØÔ# BÔ'€JØÔ €FØ×(Ò(Ñ*Ô*Ð*Ð*Ð*ØÐ +×";Ò";Ñ"=Ô"=ÐÐÐ=Ø×%Ò%Ñ'Ô'Ð'Ð'Ð'Ø×(Ò(Ñ*Ô*Ð*Ð*Ð*ØÔ *¨jÐ!9Ò9Ð9Ð9Ð9õ ”zØ	\ AÑ%Ð&ÝÝŒkØð	ñ ô Ðð Ô#ð HØˆ	ˆ	à%Ô1Õ5GÒGˆ	ØÔ'ð $à$×+Ò+°Ð+Ñ3Ô3ˆÝ&¨
 }Ô5ØØØØØØØñ	
ô 	
ð 	
ð Ô'ð 	$Ø#Ð#õ +ØØØÔ$Øñ	ô €Mõ 2ØØØØØØØØñ	ô 	Ðõ # J =Ô1ØØØØØØØØØØØØ" dÐ*ðñ ô ð ð Ðr%   r   c                 óŒ  — | j         dk    sJ ‚|j         dk    sJ ‚|j        r| S | j        d         }t          |j        ||t
          d¬¦  «        }|                      |                     d¦  «        ¦  «         d}|j        t          |j        ||¦  «        }d}|j	        t          |j	        ||¦  «        }t          | ||¦  «        S )aR  Process logits based on sampling metadata.

    This function applies temperature scaling to the logits,
    as well as top-k and top-p. For greedy decoding, it returns
    the original logits.

    Args:
        logits: Input logits tensor to be processed.
        cu_num_draft_tokens: Cumulative number of draft tokens.
        sampling_metadata: Metadata containing sampling parameters such as
            temperature and whether greedy sampling is used.

    Returns:
        torch.Tensor: Processed logits if non-greedy sampling is used,
        otherwise returns the original logits.
    r©   rN   r   )Úreplace_fromÚ
replace_tor   N)r­   r±   rT   Úexpand_batch_to_tokensr²   r   Údiv_rU   Útop_kÚtop_pr   )r   r>   r(   r»   r²   rÅ   rÆ   s          r$   r=   r=   »  sø   € ð* Œ;˜!ÒÐÐÐØÔ# qÒ(Ð(Ð(Ð(ØÔ#ð Øˆà”˜a”€JÝ(ØÔ%ØØÝ'Øðñ ô €Kð ‡K‚K×%Ò% bÑ)Ô)Ñ*Ô*Ð*ð €EØÔÐ*Ý&ØÔ#ØØñ
ô 
ˆð
 €EØÔÐ*Ý&ØÔ#ØØñ
ô 
ˆõ ˜V U¨EÑ2Ô2Ð2r%   Úxry   r»   rÁ   rÂ   c                 ó²   — | j         d         }|j         d         |k    sJ ‚|                      |¦  «        }t          |f         || |||t          ¬¦  «         |S )a  Expand [batch_size] tensor to [num_tokens] tensor based on the number of
    tokens per batch in cu_num_tokens.

    For example, if x = [a, b, c] and cu_num_tokens = [2, 5, 6], then
    num_tokens = 6, and expanded_x = [a, a, b, b, b, c].

    Args:
        x: [batch_size] tensor to expand.
        cu_num_tokens: [batch_size] tensor containing the cumulative number of
            tokens per batch. Each element represents the total number of
            tokens up to and including that batch.
        num_tokens: Total number of tokens.
        replace_from: int = 0
            Value to be replaced if it is found in x.
        replace_to: int = 0
            Value to replace with when replace_from is found.
    Returns:
        expanded_x: [num_tokens] tensor.
    r   )ÚMAX_NUM_TOKENS)rT   Ú	new_emptyÚexpand_kernelr5   )rÇ   ry   r»   rÁ   rÂ   rº   Ú
expanded_xs          r$   rÃ   rÃ   õ  sq   € ð4 ”˜”€JØÔ˜qÔ! ZÒ/Ð/Ð/Ð/Ø—’˜ZÑ(Ô(€JÝ:-Ô ØØ	ØØØÝ#ðñ ô ð ð Ðr%   r·   rO   c                 óþ   — t          j        | ft           j        |¬¦  «        }d}t          |¦  «        D ]H\  }}|dk    rŒ||z   }|                     |¦  «        }	|	|||…                              |	¬¦  «         |}ŒI|S )aÒ  
    Generates a batch of uniform random samples, with optional seeding
    if available.

    This method creates a tensor of shape `(num_tokens, )` filled
    with uniform random values in the range [0, 1). If `generators` is provided,
    the requests with their own seeds will use the provided `torch.Generator`
    for reproducibility. The samples for the other requests will be generated
    without a seed.

    Args:
        num_tokens: int
            Total number of tokens.
        num_draft_tokens: List[List[int]]
            Number of draft tokens per request.
        generators: Optional[Dict[int, torch.Generator]]
            A dictionary mapping indices in the batch to
            `torch.Generator` objects.
        device: torch.device
            The device on which to allocate the tensor.
    Returns:
        uniform_rand: torch.Tensor
            A tensor of shape `(num_tokens, )` containing uniform
            random values in the range [0, 1).
    rª   r   N©Ú	generator)r9   ÚrandÚfloat64rv   ÚgetÚuniform_)
r»   rB   r·   rO   r¾   Ú	start_idxÚreq_idxÚnÚend_idxrÏ   s
             r$   r¶   r¶     s§   € õH ”JØ	ˆÝŒmØðñ ô €Mð
 €IÝÐ 0Ñ1Ô1ð 	ð 	‰
ˆð Š6ˆ6ØØ˜a‘-ˆØ—N’N 7Ñ+Ô+ˆ	ØÐ Ø˜) GÐ+Ô,×5Ò5À	Ð5ÑJÔJÐJØˆ	ˆ	ØÐr%   c                 óÊ  — t          |¦  «        }|j        d         }	t          j        ||	ft          j        |¬¦  «        }
|
                     ¦   «          |j                             ¦   «         D ]-\  }}||         dk    r|
|                              |¬¦  «         Œ.t          j        |¦  «        }t          || f         ||||||
|	t          j        |	¦  «        |d u ¬¦	  «	         |S )Nr   rª   r   rÎ   r«   )ru   rT   r9   Úemptyr:   Úexponential_r·   ÚitemsÚ
empty_likeÚsample_recovered_tokens_kernelr   Únext_power_of_2)r4   rB   r>   rA   r'   rJ   r(   rO   rº   rc   Úqrk   rÏ   r¿   s                 r$   r¸   r¸   T  s  € õ Ð%Ñ&Ô&€JØÔ# BÔ'€JÝŒØ	ZÐ ÝŒmØð	ñ 	ô 	€Að
 ‡N‚NÑÔÐØ)Ô4×:Ò:Ñ<Ô<ð 3ð 3‰ˆˆ9ð ˜AÔ Ò"Ð"ØˆaŒD×Ò¨	ÐÑ2Ô2Ð2øåÔ*¨?Ñ;Ô;ÐÝ" J°Ð#=Ô>ØØØØØØ	ØÝÔ˜zÑ*Ô*Ø" dÐ*ð
ñ 
ô 
ð 
ð Ðr%   )Údo_not_specializec                 óF  — t          j        d¦  «        }|€dnt          j        ||z   ¦  «        }|sd S |dk    rdnt          j        ||z   dz
  ¦  «        }	t          j        ||z   ¦  «        }
|
|	z
  }d}t          |¦  «        D ]a}|s]t          j        ||	z   |z   ¦  «        }t          j        ||	z   |z   ¦  «        }t          j        | ||dz   z  z   |z   |¦  «         ||k    rd}Œb|s:t          j        ||z   ¦  «        }t          j        | ||dz   z  z   |z   |¦  «         d S d S )Nr   TrN   F©r   Ú
program_idÚloadrš   Ústore)Úoutput_token_ids_ptrÚcu_num_draft_tokens_ptrÚdraft_token_ids_ptrÚtarget_argmax_ptrÚbonus_token_ids_ptrÚis_greedy_ptrr4   rÕ   r¼   rÔ   r×   rB   ÚrejectedÚposÚdraft_token_idÚtarget_argmax_idÚbonus_token_ids                    r$   rµ   rµ     s‚  € õ Œm˜AÑÔ€Gð &Ð-µ2´7¸=È7Ñ;RÑ3SÔ3S€IØð àˆà ’\\¥r¤wÐ/FÈÑ/PÐSTÑ/TÑ'UÔ'U€IÝŒgÐ-°Ñ7Ñ8Ô8€GØ Ñ*Ðà€HÝÐ%Ñ&Ô&ð 
 ð 
 ˆØð 		 ÝœWÐ%8¸9Ñ%DÀsÑ%JÑKÔKˆNÝ!œwÐ'8¸9Ñ'DÀsÑ'JÑKÔKÐÝŒHØ$ w°,ÀÑ2BÑ'CÑCÀcÑIØ ñô ð ð Ð!1Ò1Ð1àøàð 
åœÐ!4°wÑ!>Ñ?Ô?ˆÝ
ŒØ  7¨l¸QÑ.>Ñ#?Ñ?ÐBRÑRØñ	
ô 	
ð 	
ð 	
ð 	
ð
ð 
r%   r¬   c                 ó  — t          j        d¦  «        }t          j        ||z   ¦  «        }|rd S |dk    rdnt          j        ||z   dz
  ¦  «        }t          j        ||z   ¦  «        }||z
  }d}t          |¦  «        D ]Ì}|sÈt          j        ||z   |z   ¦  «        }|rd}n t          j        |||z   |
z  z   |z   ¦  «        }t          j        |||z   |
z  z   |z   ¦  «        }t          j        ||z   |z   ¦  «        }|dk    r||z  |k    r|}nd}t          j        ||z   |z   ¦  «        }t          j        | ||	dz   z  z   |z   |¦  «         ŒÍ|s:t          j        ||z   ¦  «        }t          j        | ||	dz   z  z   |z   |¦  «         d S d S )Nr   rN   FTrâ   )ræ   rç   rè   Údraft_probs_ptrÚtarget_probs_ptrrê   Úrecovered_token_ids_ptrÚuniform_probs_ptrrë   r4   rc   r¬   rÕ   r¼   rÔ   r×   rB   rì   rí   rî   Ú
draft_probÚtarget_probÚuniform_probÚtoken_idrð   s                            r$   r¹   r¹   ®  s  € õ Œm˜AÑÔ€GÝ”˜¨Ñ/Ñ0Ô0€IØð àˆà ’\\¥r¤wÐ/FÈÑ/PÐSTÑ/TÑ'UÔ'U€IÝŒgÐ-°Ñ7Ñ8Ô8€GØ Ñ*Ðà€HÝÐ%Ñ&Ô&ð ð ˆØð 	ÝœWÐ%8¸9Ñ%DÀsÑ%JÑKÔKˆNØð Ø

åœWØ# y°3¡¸*Ñ&DÑDÀ~ÑUñô 
õ œ'Ø  I°¡O°zÑ#AÑAÀNÑRñô ˆKõ œ7Ð#4°yÑ#@À3Ñ#FÑGÔGˆLð ˜AŠ~ˆ~ +°
Ñ":¸lÒ"JÐ"Jà)ð  Ýœ7Ð#:¸YÑ#FÈÑ#LÑMÔMÝŒHØ$ w°,ÀÑ2BÑ'CÑCÀcÑIÈ8ñô ð øð ð 
åœÐ!4°wÑ!>Ñ?Ô?ˆÝ
ŒØ  7¨l¸QÑ.>Ñ#?Ñ?ÐBRÑRØñ	
ô 	
ð 	
ð 	
ð 	
ð
ð 
r%   rÉ   c                 óz  — t          j        d¦  «        }|dk    rd}nt          j        ||z   dz
  ¦  «        }t          j        ||z   ¦  «        }||z
  }	t          j        ||z   ¦  «        }
t          j        |
|k    ||
¦  «        }
t          j        d|¦  «        }t          j        | |z   |z   |
||	k     ¬¦  «         d S )Nr   rN   )Úmask)r   rã   rä   ÚwhererS   rå   )Ú
output_ptrÚ	input_ptrÚcu_num_tokens_ptrrÁ   rÂ   rÉ   rÕ   rÔ   r×   r»   Úsrc_valÚoffsets               r$   rË   rË   ì  sÅ   € õ Œm˜AÑÔ€GØ!‚|€|Øˆ	ˆ	å”GÐ-°Ñ7¸!Ñ;Ñ<Ô<ˆ	ÝŒgÐ'¨'Ñ1Ñ2Ô2€GØ˜9Ñ$€JåŒgi 'Ñ)Ñ*Ô*€GÝŒhw ,Ò.°
¸GÑDÔD€GÝŒYq˜.Ñ)Ô)€FÝ„HˆZ˜)Ñ# fÑ,¨g¸FÀZÒ<OÐPÑPÔPÐPÐPÐPr%   ÚPADDED_VOCAB_SIZEc	                 ó8  — t          j        d¦  «        }	|	dk    rdnt          j        ||	z   dz
  ¦  «        }
t          j        ||	z   ¦  «        }||
z
  }t          j        d¦  «        }||k    rd S t          j        d|¦  «        }|rIt          j        ||
z   |z   ¦  «        }t          j        ||
|z   |z  z   |z   ||k     ||k    z  d¬¦  «        }nft          j        ||
|z   |z  z   |z   ||k     d¬¦  «        }t          j        ||
|z   |z  z   |z   ||k     d¬¦  «        }t          j        ||z
  d¦  «        }t          j        ||	|z  z   |z   ||k     t          d¦  «        ¬¦  «        }t          j        ||z  d¬¦  «        }t          j        | |
z   |z   |¦  «         d S )Nr   rN   )rû   Úotherr   r   rf   )r   rã   rä   rS   Úmaximumr‰   r´   rå   )ræ   rç   rè   rò   ró   Úq_ptrrc   r  r¬   rÕ   rÔ   r×   rB   rí   Úvocab_offsetrî   Úprobrö   r÷   rß   Úrecovered_ids                        r$   rÝ   rÝ     sõ  € õ Œm˜AÑÔ€GØ ’\\¥r¤wÐ/FÈÑ/PÐSTÑ/TÑ'UÔ'U€IÝŒgÐ-°Ñ7Ñ8Ô8€GØ Ñ*Ðõ Œ-˜Ñ
Ô
€CØ
ÐÒÐØˆå”9˜QÐ 1Ñ2Ô2€LØð 7ÝœÐ!4°yÑ!@À3Ñ!FÑGÔGˆÝŒwØ 	¨C¡°:Ñ=Ñ=ÀÑLØ *Ò,°ÀÒ1OÑPØð
ñ 
ô 
ˆˆõ ”WØ˜y¨3™°*Ñ<Ñ<¸|ÑKØ 
Ò*Øð
ñ 
ô 
ˆ
õ
 ”gØ 	¨C¡°:Ñ=Ñ=ÀÑLØ 
Ò*Øð
ñ 
ô 
ˆõ
 Œz˜+¨
Ñ2°AÑ6Ô6ˆõ 	ŒØ˜*Ñ$Ñ$ |Ñ3Ø˜JÒ&ÝF‰mŒmð	ñ 	ô 	€Aõ
 ”9˜T A™X¨BÐ/Ñ/Ô/€LÝ„HÐ! IÑ-°Ñ3°\ÑBÔBÐBÐBÐBr%   )r   r   )5Úcollections.abcr   Údataclassesr   r9   Útorch.nnÚnnÚvllm.loggerr   Úvllm.triton_utilsr   r   Úvllm.v1.outputsr   r	   r
   Úvllm.v1.sample.metadatar   Úvllm.v1.sample.ops.bad_wordsr   Úvllm.v1.sample.ops.penaltiesr   Ú$vllm.v1.sample.ops.topk_topp_samplerr   Úvllm.v1.sample.samplerr   Úvllm.v1.spec_decode.metadatar   rž   Úloggerr   Ú	constexprÚ__annotations__r   r5   ÚModuler   r¢   r¦   r£   r@   r=   rÃ   ÚdictÚ	GeneratorrO   r¶   r¸   Újitrµ   r¹   rË   rÝ   rb   r%   r$   ú<module>r     st  ðð %Ð $Ð $Ð $Ð $Ð $Ð $Ø Ð Ð Ð Ð Ð à €€€Ø Ð Ð Ð Ð Ð à #Ð #Ð #Ð #Ð #Ð #Ø (Ð (Ð (Ð (Ð (Ð (Ð (Ð (Ø IÐ IÐ IÐ IÐ IÐ IÐ IÐ IÐ IÐ IØ 4Ð 4Ð 4Ð 4Ð 4Ð 4Ø DÐ DÐ DÐ DÐ DÐ DØ <Ð <Ð <Ð <Ð <Ð <Ø BÐ BÐ BÐ BÐ BÐ BØ *Ð *Ð *Ð *Ð *Ð *Ø ;Ð ;Ð ;Ð ;Ð ;Ð ;à	ˆXÑ	Ô	€à%'Ð b”lÐ 'Ð 'Ñ 'Ø#$Ð B”LÐ $Ð $Ñ $ð €ðyð yð yð yð yr”yñ yô yð yðx	_à”\ð_ð ˜3”ið	_ð
 ð_ð œð_ð ” Ñ$ð_ð ”,ð_ð ”\ð_ð (ð_ð „\ð_ð _ð _ð _ðD73ØŒLð73àœð73ð (ð73ð „\ð	73ð 73ð 73ð 73ð| Øð%ð %Ø„|ð%à”<ð%ð ð%ð ð	%ð
 ð%ð „\ð%ð %ð %ð %ðP4Øð4à˜3”ið4ð S˜%œ/Ð)Ô*ð4ð ŒLð	4ð
 „\ð4ð 4ð 4ð 4ðn)Øð)à˜3”ið)ð œð	)ð ”\ð)ð ” Ñ$ð)ð ”,ð)ð (ð)ð ŒLð)ð „\ð)ð )ð )ð )ðZ €„˜~Ð.Ð/Ñ/Ô/ð(
ð (
ñ 0Ô/ð(
ðX €„˜~Ð.Ð/Ñ/Ô/ð9
ð ”Lð9
ð 9
ð 9
ñ 0Ô/ð9
ðz €„˜~¨|Ð<Ð=Ñ=Ô=ðQð ”LðQð Qð Qñ >Ô=ðQð, „ð2Cð ”|ð2Cð ”Lð2Cð 2Cð 2Cñ „ð2Cð 2Cð 2Cr%   