
    .`i'1                         d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
 ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ dZ G d dej                  ZdS )z>A layer that samples the next tokens from the model's outputs.    N)LogprobsMode)is_pin_memory_available)LogprobsTensorsSamplerOutput)SamplingMetadata)apply_bad_words)batched_count_greater_than)apply_all_penalties)TopKTopPSamplergh㈵>c                       e Zd ZdZddef fdZ	 	 ddej        ded	e	d
edz  de
f
dZedej        dej        de	dej        fd            Zedej        dej        fd            Z	 ddej        ded
edz  deej        ej        dz  f         fdZedej        dej        fd            Zedej        dedej        defd            Ze	 ddeee                  deee                  dz  deee                  fd            Zdej        ded	e	dej        fdZedej        dedeee                  dej        fd            Z xZS )SampleraM  
    A layer that samples the next tokens from the model's outputs
    with the following steps in order:

    1. If logprobs are requested:
        a) If `logprobs_mode` is `raw_logprobs`, compute logprobs
           as the final logprobs to return.
        b) If `logprobs_mode` is `raw_logits`, clone the logits
           as the final logprobs to return.
    2. Convert logits to float32.
    3. Apply allowed token ids whitelist.
    4. Apply bad words exclusion.
    5. Apply logit processors which are not argmax-invariant,
       i.e. that can impact greedy sampling.
        a) Min tokens processor
        b) Logit bias processor
    6. Apply penalties
        a) Repetition penalty
        b) Frequency penalty
        c) Presence penalty
    7. Sample the next tokens. `sample` method performs the following steps:
        a) If not `all_random`, perform greedy sampling. If `all_greedy`,
           return the greedily sampled tokens and final logprobs if requested.
        b) Apply temperature.
        c) Apply logit processors which are argmax-invariant, by default
           the min_p processor.
        d) Apply top_k and/or top_p.
        e) Sample the next tokens with the probability distribution.
        f) If `all_random` or temperature >= epsilon (1e-5), return the
           randomly sampled tokens and final logprobs if requested. Else,
           return the greedily sampled tokens and logprobs if requested.
    8. Gather the logprobs of the top `max_num_logprobs` and sampled token
       (if requested). Note that if the sampled token is within the top
       `max_num_logprobs`, the logprob will be eventually merged in
       `LogprobsProcessor` during output processing. Therefore, the
       final output may contain either `max_num_logprobs + 1` or
       `max_num_logprobs` logprobs.
    9. Return the final `SamplerOutput`.
    raw_logprobslogprobs_modec                     t                                                       t          |          | _        t	                      | _        || _        d S N)super__init__r   topk_topp_samplerr   
pin_memoryr   )selfr   	__class__s     j/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/sample/sampler.pyr   zSampler.__init__=   sD    !0!?!?133*    FNlogitssampling_metadatapredict_bonus_tokenlogprobs_mode_overridereturnc                    |p| j         }|j        }|k|dk    r|                     |          }nO|dk    rI|j        t          j        k    r|                                }n|                    t          j                  }|                    t          j                  }|                     |||          }| 	                    ||          \  }}	|	|	}|
                                }|d }
nT|dk    r6t          t	          j        d          |t	          j        d                    }
n|                     |||          }
|                    t          j                  }t          |                    d          |
          }|S )Nr   
raw_logitsr   )	token_ids)sampled_token_idslogprobs_tensors)r   max_num_logprobscompute_logprobsdtypetorchfloat32clonetoapply_logits_processorssamplelongr   emptygather_logprobsint32r   	unsqueeze)r   r   r   r   r   r   num_logprobsr   sampledprocessed_logprobsr$   sampler_outputs               r   forwardzSampler.forwardC   s    /D$2D
 )9#..#44V<<,..<5=00#)<<>>LL#)99U]#;#;L 5=))--%':
 
 '+kk&:K&L&L##)-L
 ,,..#R.Aek!nn   
  $33lg  4    
 **U[)) ' &//33-
 
 
 r   temp
all_randomc                     |st          j        |t          k     d|          }|                     |                    d                    S )Ng      ?   dim)r(   where_SAMPLING_EPSdiv_r2   )r   r8   r9   s      r   apply_temperaturezSampler.apply_temperature   sE      	@;tm3S$??D{{4>>a>00111r   c                 T    |                      d                              d          S )Nr!   r<   )argmaxviewr   s    r   greedy_samplezSampler.greedy_sample   s$    }}}$$))"---r   c                 2   |p| j         }|j        r	|j        rJ |j        rd}nM|                     |          }|j        r1d}|j        $|dk    r|}n|dk    r|                     |          }||fS |j        J |                     ||j        |j                  }|j        j	        D ]}|
                    |          }|                     ||j        |j        |j                  \  }}|||fS t          j        |j        t"          k     |||          }	|	|fS )zSample logits based on sampling metadata.

        The various logits processing functions called in this method
        may update the logits tensor in-place.
        Nprocessed_logitsr5   )out)r   
all_greedyr9   rF   r%   r&   temperaturerA   logitsprocsargmax_invariantapplyr   
generatorstop_ktop_pr(   r>   r?   )
r   r   r   r   r   greedy_sampledr5   	processorrandom_sampledr4   s
             r   r-   zSampler.sample   s    /D$2D%0R5F5QRRR' 	:!NN!//77N + :%)"$5A$(:::-3**&*>>>-1-B-B6-J-J*%'999 ,888 ''%13D3O
 
 +6G 	- 	-I__V,,FF .2-C-C(##	.
 .
** !!#555+)M9	
 
 
 ***r   c                 D    |                      dt          j                  S )Nr!   )r=   r'   )log_softmaxr(   r)   rE   s    r   r&   zSampler.compute_logprobs   s    !!b!>>>r   logprobsr3   r"   c                    |j         t          j        k    sJ t          j        | |d          \  }}|                    d          }|                     d|          }t          | |          }t          j        ||fd          }t          j        ||fd          } |                    t          j	                  }t          || |          S )a  
        Gather logprobs for topk and sampled/prompt token.

        Args:
          logprobs: (num tokens) x (vocab) tensor
          num_logprobs: minimum number of logprobs to
                        retain per token
          token_ids: prompt tokens (if prompt logprobs)
                     or sampled tokens (if sampled
                     logprobs); 1D token ID tensor
                     with (num tokens) elements
                     Must be int64.

        Returns:
          Top-k int indices tensor, (num tokens) x (num_logprobs + 1)
          Top-k float logprobs tensor, (num tokens) x (num_logprobs + 1)
          Sampled token rank tensor, (num tokens)
        r!   r<   r;   )r'   r(   int64topkr2   gatherr	   catr+   r1   r   )rW   r3   r"   topk_logprobstopk_indicestoken_logprobstoken_ranksindicess           r   r0   zSampler.gather_logprobs   s    0 %+----&+j<R&P&P&P#| ''++	!Y77 1>JJ )Y51===9nm<!DDD **U[))w+>>>r   output_token_idsspec_token_idsc                 >    || S d t          | |          D             S )Nc                 (    g | ]\  }}|rg ||n|S  rf   ).0rI   specs      r   
<listcomp>z=Sampler._combine_outputs_with_spec_tokens.<locals>.<listcomp>  s=     
 
 
T "*McMDMMs
 
 
r   )zip)rb   rc   s     r   !_combine_outputs_with_spec_tokensz)Sampler._combine_outputs_with_spec_tokens   s<    
 !##
 
 !1>BB
 
 
 	
r   c                    |j         }t          |          p|j         }|j        }|r|r|                     ||j                  }|j        (|                    |j        t          d                     |rt          |||           |j
        j        D ]}|                    |          }|                     |||          }|S )Nz-inf)bad_words_token_idsboolno_penaltiesrb   rk   rc   allowed_token_ids_maskmasked_fill_floatr   rL   non_argmax_invariantrN   apply_penalties)r   r   r   r   rm   any_penalties_or_bad_wordsrb   rS   s           r   r,   zSampler.apply_logits_processors
  s     0C$%%K->-K)K 	# -= 	#= 	  $EE !0    3? 1 H%PV--XXX  	KF$79IJJJ +6K 	- 	-I__V,,FF %%f.?AQRRr   c                 v    |j         r| S |j        J t          | |j        |j        |j        |j        |          S r   )ro   prompt_token_idsr
   presence_penaltiesfrequency_penaltiesrepetition_penalties)r   r   rb   s      r   rt   zSampler.apply_penalties.  sS     ) 	M 1===".012
 
 	
r   )r   )FNr   )__name__
__module____qualname____doc__r   r   r(   Tensorr   rn   r   r7   staticmethodrA   rF   tupler-   r&   intr   r0   listrk   r,   rt   __classcell__)r   s   @r   r   r      s       & &P+ +l + + + + + + %*6:> >> ,> "	>
 !-t 3> 
> > > >@ 	2	2l	2 	2 
		2 	2 	2 \	2 .el .u| . . . \. 7;	8+ 8+8+ ,8+ !-t 3	8+
 
u|U\D00	18+ 8+ 8+ 8+t ? ?%, ? ? ? \? )?,)?)? <)? 
	)? )? )? \)?V  26

 

tCy/

T#Y$.

 
d3i

 

 

 \

"" ," "	"
 
" " " "H 

+
 tCy/
 
	
 
 
 \
 
 
 
 
r   r   )r~   r(   torch.nnnnvllm.config.modelr   vllm.utils.platform_utilsr   vllm.v1.outputsr   r   vllm.v1.sample.metadatar   vllm.v1.sample.ops.bad_wordsr   vllm.v1.sample.ops.logprobsr	   vllm.v1.sample.ops.penaltiesr
   $vllm.v1.sample.ops.topk_topp_samplerr   r?   Moduler   rf   r   r   <module>r      s   E D        * * * * * * = = = = = = : : : : : : : : 4 4 4 4 4 4 8 8 8 8 8 8 B B B B B B < < < < < < @ @ @ @ @ @k
 k
 k
 k
 k
bi k
 k
 k
 k
 k
r   