
    .`i+y                        U d dl mZ d dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ  ee          ZdZe
j        e d<   d Z!e
j        e d<   dZ" G d dej#                  Z$dej%        de&e'         de'dej%        dej%        dz  dej%        dej%        dedej%        fdZ(dej%        dej%        dedej%        fdZ)	 	 d1dej%        d ej%        d!e'd"e'd#e'dej%        fd$Z*d!e'de&e'         d%e+e'ej,        f         d&ej-        dej%        f
d'Z.de'de&e'         dej%        dej%        dej%        dz  dej%        ded&ej-        dej%        fd(Z/ ej0        dg)          d*             Z1 ej0        dg)          d+e
j        fd,            Z2 ej0        d"d#g)          d-e
j        fd.            Z3ej0        d/e
j        d+e
j        fd0            Z4dS )2    )Sequence)replaceN)init_logger)tltriton)LogprobsListsLogprobsTensorsSamplerOutput)SamplingMetadata)apply_bad_words_with_drafts)apply_all_penalties)apply_top_k_top_p)Sampler)SpecDecodeMetadataPLACEHOLDER_TOKEN_IDGREEDY_TEMPERATURE   c                   .    e Zd ZdZdef fdZdedej        dz  dej        de	d	e
f
d
Zdededej        dej        dej        dej        d	efdZe	 	 ddej        dedee         dedz  d	eeee                  edz  f         f
d            Zdej        de	ded	ej        fdZedej        de	dedej        deee                  d	ej        fd            Ze	 ddeee                  deee                  dz  d	eee                  fd            Z xZS )RejectionSamplerau  
    The implementation strictly follows the algorithm described in
        https://arxiv.org/abs/2211.17192.
    However, we want to clarify the terminology used in the implementation:
    accepted tokens: tokens that are accepted based on the relationship
            between the "raw" draft and target probabilities.
    recovered tokens: tokens that are sampled based on the adjusted probability
        distribution, which is derived from both the draft and target
        probabilities.
    bonus tokens:
        If all proposed tokens are accepted, the bonus token is added to the
        end of the sequence. The bonus token is only sampled from the target
        probabilities. We pass in the bonus tokens instead of sampling them
        in the rejection sampler to allow for more flexibility in the
        sampling process. For example, we can use top_p, top_k sampling for
        bonus tokens, while spec decode does not support these sampling
        strategies.
    output tokens:
        Tokens are finally generated with the rejection sampler.
        output tokens = accepted tokens + recovered tokens + bonus tokens
    samplerc                     t                                                       || _        | j        j        }|                    d          | _        |                    d          | _        d S )N	processedlogits)super__init__r   logprobs_mode
startswithis_processed_logprobs_modeendswithis_logits_logprobs_mode)selfr   r   	__class__s      t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/sample/rejection_sampler.pyr   zRejectionSampler.__init__4   s\    2*7*B*B;*O*O''4'='=h'G'G$$$    metadatadraft_probsNr   sampling_metadatareturnc           
         |j         t          k    sJ |j        }|j        }|J ||         }|                     |t          |d          d| j        rdnd          }|j        }	||         }
|
                    t          j
                  }
|
}| j        s|                                }|                     |||          }t          ||j        |          }|                    dt          j
                  }t!          |j        |j        |j         |j        |||	|          }d}|j        2|                     |j        ||| j        r|n|
|j        j        |          }t/          ||	          S )
a^  
        Args:
            metadata:
                Metadata for spec decoding.
            draft_probs (Optional[torch.Tensor]):
                Probability distribution for the draft tokens. Shape is
                [num_tokens, vocab_size]. Can be None if probabilities are
                not provided, which is the case for ngram spec decode.
            logits (torch.Tensor):
                Target model's logits probability distribution.
                Shape is [num_tokens + batch_size, vocab_size]. Here,
                probabilities from different requests are flattened into a
                single tensor because this is the shape of the output logits.
                NOTE: `logits` can be updated in place to save memory.
            sampling_metadata (vllm.v1.sample.metadata.SamplingMetadata):
                Additional metadata needed for sampling, such as temperature,
                top-k/top-p parameters, or other relevant information.
        Returns:
            SamplerOutput:
                Contains the final output token IDs and their logprobs if
                requested.
        Nr   )max_num_logprobsTprocessed_logits
raw_logits)r   r(   predict_bonus_tokenlogprobs_mode_override)dimdtype)sampled_token_idslogprobs_tensors)max_spec_lenMAX_SPEC_LENbonus_logits_indicestarget_logits_indicesr   r   r   r2   totorchfloat32cloneapply_logits_processorsapply_sampling_constraintscu_num_draft_tokenssoftmaxrejection_sampledraft_token_idsnum_draft_tokensr+   _get_logprobs_tensorsr3   logprobsr
   )r"   r&   r'   r   r(   r6   r7   bonus_logitsbonus_sampler_outputbonus_token_idsraw_target_logitstarget_logitstarget_probsoutput_token_idsr3   s                  r$   forwardzRejectionSampler.forward;   s   > $4444'< ( > !!!23#||%!!#   !% .$#5#5  ,  
  
 /@
 ##89-00??). 	2 *//11M44,h
 
 3(
 
 %,,5=,II+$%!(	
 	
  -9#99!2!%!@WFW$5>     .-
 
 
 	
r%   r+   rI   rE   r2   c                    t          j        |j                  }|j        d d         |dd <   |j        }|j        }	t          j        |t           j                  }
|                    t           j                  |
|	<   |                    t           j                  |
|<   |}t          j        |j        d         |j	        |j
                  }|                    d          |                    d          z                                   }|                    |
j        d         dz
             |                                                                }d||t          k    <   |
|         }| j        r|n| j                            |          }| j                            |||                    t           j                            S )Nr      )r1   )devicer1   r   )max)r9   
zeros_likecu_num_sampled_tokensr6   r7   r:   r8   arangeshaperO   r1   	unsqueezeflattenclamp_r;   r   r!   r   compute_logprobsgather_logprobsint64)r"   r+   r&   r   rI   rE   r2   rR   r6   r7   final_logitslogit_start_indicesoffsetsaccepted_logit_indicesaccepted_tokensaccepted_logitsaccepted_logprobss                    r$   rC   z&RejectionSampler._get_logprobs_tensors   s    !& 01O P P$,$B3B3$Gabb!  (< ( >'emDDD.;.>.>u}.M.M*+-9__U]-K-K)*
 4,#B'&-%+
 
 
  ))!,,w/@/@/C/CC
')) 	 	%%,*<Q*?!*C%DDD+1133;;==CD+??@ ''=> +@OO..?? 	
 |++u{++
 
 	
r%    rK   
vocab_sizediscard_req_indicesr3   c                   	 |                                                                  }|t          k    ||k     z  	d}|zdg	                    d                                                                          z   }|                    	                                          }|                    |          }t          |          dk    rd	|<   	fdt          |          D             }||fS )aM  Parse the output of the rejection sampler.
        Args:
            output_token_ids: The sampled token IDs in shape
                [batch_size, max_spec_len + 1]. The rejected tokens are
                replaced with `PLACEHOLDER_TOKEN_ID` by the rejection sampler
                and will be filtered out in this function.
            vocab_size: The size of the vocabulary.
            discard_req_indices: Optional row indices to discard tokens in.
            logprobs_tensors: Optional logprobs tensors to filter.
        Returns:
            A list of lists of token IDs.
        Nr   rN   axisFc                 V    g | ]%\  }}||                                                   &S rb   )tolist).0irow
valid_masks      r$   
<listcomp>z1RejectionSampler.parse_output.<locals>.<listcomp>   s?     
 
 
,2AsC
1%%''
 
 
r%   )cpunumpyr   sumcumsumri   filterrV   tolistslen	enumerate)
rK   rc   rd   r3   output_token_ids_npoutput_logprobscu_num_tokensfiltered_tensorsoutputsrm   s
            @r$   parse_outputzRejectionSampler.parse_output   s
   & /2244::<<)-AA*,

 'C*..a."8"8"?"?"A"A"H"H"J"JJM/66z7I7I7K7KLL.66}EEO"##a''.3J*+
 
 
 
6?@S6T6T
 
 
 ''r%   c                 N   |j          }|j        p|}|j        }|r|                     ||j                  }|j        |rt          |j                  }t          j        |j	        d          }t          j
        |d          }	|	                    |          }
|
                    |j        d          }|                     |||||          }|j        0|j        |         }|                    |t!          d                     |j        x}rt#          ||||j	                   |S )Nro   )rO   T)rO   non_blocking-inf)no_penaltiesbad_words_token_idsrK   !_combine_outputs_with_spec_tokensspec_token_idsallowed_token_ids_maskru   r9   tensorrB   rS   repeat_interleaver8   rO   apply_penaltiesmasked_fill_floatr   )r"   r   r(   r&   has_penaltiesany_penalties_or_bad_wordsrK   num_requestsrB   original_indicesrepeat_indices_cpurepeat_indices
token_maskr   s                 r$   r<   z(RejectionSampler.apply_logits_processors   sg    .::1B] 	# -=% 	#EE !0    3?=?0ABBL$|H,EeTTT$|LGGG!1!C!CDT!U!U/22}4 3  N )))8^EU F
 !7C.EnU
##Jf>>> #4"GG 	'+-=x?X   r%   r   c                     |j         r| S |j        J |j        |         }|j        |         }|j        |         }|j        |         }t          | |||||          } | S N)r   prompt_token_idspresence_penaltiesfrequency_penaltiesrepetition_penaltiesr   )	r   r(   r&   r   rK   r   r   r   r   s	            r$   r   z RejectionSampler.apply_penalties+  s     ) 	M 1===,=nM.A.Q/CNS0EnU$ 
 
 r%   r   c                    || S g }t          | |          D ]u\  }}t          |          dk    r|                    |           t          t          |          dz
            D ]'}|                    g |d         ||                    (v|S )Nr   rN   r   )zipru   appendrange)rK   r   resultoutspecrk   s         r$   r   z2RejectionSampler._combine_outputs_with_spec_tokensG  s    
 !##-~>> 	6 	6IC4yyA~~MM#3t99q=)) 6 64r
4DG455556r%   )rb   Nr   )__name__
__module____qualname____doc__r   r   r   r9   Tensorr   r
   rL   intr	   rC   staticmethodr   tuplelistr   r|   r<   r   r   __classcell__)r#   s   @r$   r   r      s        ,H H H H H H Hl
$l
 \D(	l
 l
 ,l
 
l
 l
 l
 l
\/
/
 %/
 	/

 |/
 l/
 !</
 
/
 /
 /
 /
b  .037	"( "(,"("( &c]"( *D0	"(
 
tDI 44	5"( "( "( \"(H** ,* %	*
 
* * * *X + % 	
 tCy/ 
   \6  26 tCy/T#Y$. 
d3i   \    r%   r   rA   rB   r4   r>   r'   rJ   rG   r(   r)   c                 V   | j         dk    sJ ||j         dk    sJ |j         dk    sJ |j         dk    sJ t          |          }| j        d         }	|j        d         }
|j        }|                                 sJ ||                                sJ |                                sJ |                                sJ |j        |	|
fk    sJ t          j        ||dz   ft          t
          j        |          }|j	        rd }n|j
        t          k    }|j        s;|                    d          }t          |f         ||| ||||           |j	        r|S t          |	||j        |          }t#          |||| ||||          }t%          |f         ||| ||||||||
|d u            |S )NrN      r   r   r1   rO   )r0   NO_DRAFT_PROBS)ndimru   rT   rO   is_contiguousr9   fullr   int32
all_greedytemperaturer   
all_randomargmaxrejection_greedy_sample_kernelgenerate_uniform_probs
generatorssample_recovered_tokensrejection_random_sample_kernel)rA   rB   r4   r>   r'   rJ   rG   r(   
batch_size
num_tokensrc   rO   rK   	is_greedytarget_argmaxuniform_probsrecovered_token_idss                    r$   r@   r@   Y  sX     1$$$$+"2a"7"7"77#q((((!!!!%&&J &q)J#B'J F((*****+";";"="==%%'''''((******j!99999 z	\A%&k	   # H		%15GG	' $$+++33&
}5	
 	
 	
 ' 	$## +$	 M 2	 	 #J=1"d*    r%   r   c                    | j         dk    sJ |j         dk    sJ |j        r| S | j        d         }t          |j        ||t
          d          }|                     |                    d                     d}|j        t          |j        ||          }d}|j	        t          |j	        ||          }t          | ||          S )aR  Process logits based on sampling metadata.

    This function applies temperature scaling to the logits,
    as well as top-k and top-p. For greedy decoding, it returns
    the original logits.

    Args:
        logits: Input logits tensor to be processed.
        cu_num_draft_tokens: Cumulative number of draft tokens.
        sampling_metadata: Metadata containing sampling parameters such as
            temperature and whether greedy sampling is used.

    Returns:
        torch.Tensor: Processed logits if non-greedy sampling is used,
        otherwise returns the original logits.
    r   rN   r   )replace_from
replace_tor   N)r   r   rT   expand_batch_to_tokensr   r   div_rU   top_ktop_pr   )r   r>   r(   r   r   r   r   s          r$   r=   r=     s    * ;!#q((((# aJ(%'  K KK%%b))*** E*&#
 

 E*&#
 
 VUE222r%   xry   r   r   r   c                     | j         d         }|j         d         |k    sJ |                     |          }t          |f         || |||t                     |S )a  Expand [batch_size] tensor to [num_tokens] tensor based on the number of
    tokens per batch in cu_num_tokens.

    For example, if x = [a, b, c] and cu_num_tokens = [2, 5, 6], then
    num_tokens = 6, and expanded_x = [a, a, b, b, b, c].

    Args:
        x: [batch_size] tensor to expand.
        cu_num_tokens: [batch_size] tensor containing the cumulative number of
            tokens per batch. Each element represents the total number of
            tokens up to and including that batch.
        num_tokens: Total number of tokens.
        replace_from: int = 0
            Value to be replaced if it is found in x.
        replace_to: int = 0
            Value to replace with when replace_from is found.
    Returns:
        expanded_x: [num_tokens] tensor.
    r   )MAX_NUM_TOKENS)rT   	new_emptyexpand_kernelr5   )r   ry   r   r   r   r   
expanded_xs          r$   r   r     sq    4 Jq!Z////Z((J:- 	#    r%   r   rO   c                     t          j        | ft           j        |          }d}t          |          D ]H\  }}|dk    r||z   }|                    |          }	|	|||                             |	           |}I|S )a  
    Generates a batch of uniform random samples, with optional seeding
    if available.

    This method creates a tensor of shape `(num_tokens, )` filled
    with uniform random values in the range [0, 1). If `generators` is provided,
    the requests with their own seeds will use the provided `torch.Generator`
    for reproducibility. The samples for the other requests will be generated
    without a seed.

    Args:
        num_tokens: int
            Total number of tokens.
        num_draft_tokens: List[List[int]]
            Number of draft tokens per request.
        generators: Optional[Dict[int, torch.Generator]]
            A dictionary mapping indices in the batch to
            `torch.Generator` objects.
        device: torch.device
            The device on which to allocate the tensor.
    Returns:
        uniform_rand: torch.Tensor
            A tensor of shape `(num_tokens, )` containing uniform
            random values in the range [0, 1).
    r   r   N	generator)r9   randfloat64rv   getuniform_)
r   rB   r   rO   r   	start_idxreq_idxnend_idxr   s
             r$   r   r     s    H J	m  M
 I 011 	 	
 66a-NN7++	 )G+,55	5JJJ		r%   c                    t          |          }|j        d         }	t          j        ||	ft          j        |          }
|
                                 |j                                        D ]-\  }}||         dk    r|
|                             |           .t          j        |          }t          || f         ||||||
|	t          j        |	          |d u 	  	         |S )Nr   r   r   r   r   )ru   rT   r9   emptyr:   exponential_r   items
empty_likesample_recovered_tokens_kernelr   next_power_of_2)r4   rB   r>   rA   r'   rJ   r(   rO   r   rc   qrk   r   r   s                 r$   r   r   T  s    %&&J#B'J	Z m	 	 	A
 NN)4::<< 3 39 A""aD	222*?;;"J#=>	z**"d*
 
 
 
 r%   )do_not_specializec                 F   t          j        d          }|dnt          j        ||z             }|sd S |dk    rdnt          j        ||z   dz
            }	t          j        ||z             }
|
|	z
  }d}t          |          D ]a}|s]t          j        ||	z   |z             }t          j        ||	z   |z             }t          j        | ||dz   z  z   |z   |           ||k    rd}b|s:t          j        ||z             }t          j        | ||dz   z  z   |z   |           d S d S )Nr   TrN   Fr   
program_idloadr   store)output_token_ids_ptrcu_num_draft_tokens_ptrdraft_token_ids_ptrtarget_argmax_ptrbonus_token_ids_ptris_greedy_ptrr4   r   r   r   r   rB   rejectedposdraft_token_idtarget_argmax_idbonus_token_ids                    r$   r   r     s    mAG &-27=7;R3S3SI \\rw/F/PST/T'U'UIg-788G*H%&& 
  
  		 W%89%Ds%JKKN!w'89'Ds'JKKH$w,2B'CCcI    !111 
!4w!>??
 7lQ.>#??BRR	
 	
 	
 	
 	

 
r%   r   c                    t          j        d          }t          j        ||z             }|rd S |dk    rdnt          j        ||z   dz
            }t          j        ||z             }||z
  }d}t          |          D ]}|st          j        ||z   |z             }|rd}n t          j        |||z   |
z  z   |z             }t          j        |||z   |
z  z   |z             }t          j        ||z   |z             }|dk    r||z  |k    r|}nd}t          j        ||z   |z             }t          j        | ||	dz   z  z   |z   |           |s:t          j        ||z             }t          j        | ||	dz   z  z   |z   |           d S d S )Nr   rN   FTr   )r   r   r   draft_probs_ptrtarget_probs_ptrr   recovered_token_ids_ptruniform_probs_ptrr   r4   rc   r   r   r   r   r   rB   r   r   r   
draft_probtarget_probuniform_probtoken_idr   s                            r$   r   r     s    mAG/00I \\rw/F/PST/T'U'UIg-788G*H%&&   	W%89%Ds%JKKN 

W#y3*&DD~U 
 ' IOz#AANR K 7#4y#@3#FGGL A~~+
":l"J"J)  7#:Y#F#LMMH$w,2B'CCcI8    
!4w!>??
 7lQ.>#??BRR	
 	
 	
 	
 	

 
r%   r   c                 z   t          j        d          }|dk    rd}nt          j        ||z   dz
            }t          j        ||z             }||z
  }	t          j        ||z             }
t          j        |
|k    ||
          }
t          j        d|          }t          j        | |z   |z   |
||	k                d S )Nr   rN   )mask)r   r   r   whererS   r   )
output_ptr	input_ptrcu_num_tokens_ptrr   r   r   r   r   r   r   src_valoffsets               r$   r   r     s     mAG!||		G-7!;<<	g''122G9$Jgi')**Ghw,.
GDDGYq.))FHZ)#f,gFZ<OPPPPPPr%   PADDED_VOCAB_SIZEc	                 8   t          j        d          }	|	dk    rdnt          j        ||	z   dz
            }
t          j        ||	z             }||
z
  }t          j        d          }||k    rd S t          j        d|          }|rIt          j        ||
z   |z             }t          j        ||
|z   |z  z   |z   ||k     ||k    z  d          }nft          j        ||
|z   |z  z   |z   ||k     d          }t          j        ||
|z   |z  z   |z   ||k     d          }t          j        ||z
  d          }t          j        ||	|z  z   |z   ||k     t          d                    }t          j        ||z  d          }t          j        | |
z   |z   |           d S )Nr   rN   )r   otherr   r   rf   )r   r   r   rS   maximumr   r   r   )r   r   r   r   r   q_ptrrc   r  r   r   r   r   rB   r   vocab_offsetr   probr   r   r   recovered_ids                        r$   r   r     s    mAG\\rw/F/PST/T'U'UIg-788G* -

C
9Q 122L 7!4y!@3!FGGw	C:==L*,1OP
 
 
 Wy3*<<|K
*
 
 


 g	C:==L
*
 
 

 z+
2A66 	*$$|3J&Fmm	 	 	A
 9TAXB///LH!I-3\BBBBBr%   )r   r   )5collections.abcr   dataclassesr   r9   torch.nnnnvllm.loggerr   vllm.triton_utilsr   r   vllm.v1.outputsr   r	   r
   vllm.v1.sample.metadatar   vllm.v1.sample.ops.bad_wordsr   vllm.v1.sample.ops.penaltiesr   $vllm.v1.sample.ops.topk_topp_samplerr   vllm.v1.sample.samplerr   vllm.v1.spec_decode.metadatar   r   loggerr   	constexpr__annotations__r   r5   Moduler   r   r   r   r@   r=   r   dict	GeneratorrO   r   r   jitr   r   r   r   rb   r%   r$   <module>r     st   % $ $ $ $ $ $              # # # # # # ( ( ( ( ( ( ( ( I I I I I I I I I I 4 4 4 4 4 4 D D D D D D < < < < < < B B B B B B * * * * * * ; ; ; ; ; ;	X		%' bl ' ' '#$ BL $ $ $ y y y y yry y y yx	_\_ 3i	_
 _ _ $_ ,_ \_ (_ \_ _ _ _D73L7373 (73 \	73 73 73 73| % %|%<% % 	%
 % \% % % %P443i4 S%/)*4 L	4
 \4 4 4 4n))3i) 	) \) $) ,) () L) \) ) ) )Z ~.///(
 (
 0/(
X ~.///9
 L9
 9
 9
 0/9
z ~|<===Q LQ Q Q >=Q, 2C |2C L2C 2C 2C 2C 2C 2Cr%   