
    .`i9              
       $   d dl Z d dlmZ d dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZmZ  ee          Z G d d	ej                  Z e j        d
          de j        de j        fd            Zde j        de j        dz  de j        dz  de j        fdZde j        de j        de j        fdZde j        deee j        f         de j        fdZde j        de j        dz  de j        dz  deee j        f         de j        f
dZd ZdS )    N)version)envs)rocm_aiter_ops)LogprobsMode)init_logger)CpuArchEnumcurrent_platformc                       e Zd ZdZddeddf fdZdej        dee	ej
        f         d	ej        dz  d
ej        dz  deej        ej        dz  f         f
dZdej        dee	ej
        f         d	ej        dz  d
ej        dz  deej        ej        dz  f         f
dZdej        dee	ej
        f         d	ej        dz  d
ej        dz  deej        ej        dz  f         f
dZdej        dee	ej
        f         d	ej        dz  d
ej        dz  deej        ej        dz  f         f
dZdej        d	ej        dz  d
ej        dz  dee	ej
        f         dej        f
dZ xZS )TopKTopPSamplerz
    Module that performs optional top-k and top-p filtering followed by
    weighted random sampling of logits.

    Implementations may update the logits tensor in-place.
    raw_logprobslogprobs_modereturnNc                    t                                                       || _        |dvrt          j                    rt
          j        rddlm} t          j	                    }|J |
                    |          s'|                                }t          d| d          t                              dd           | j        | _        nt                              d	           | j        | _        nt          j                    rGt          j                    }|t*          j        t*          j        fv r| j        | _        n| j        | _        n|dvrt3          j                    rx	 dd l}t8          j        j        | _        t                              d
           | j         | _        nB# tB          $ r) t          "                    d           | j        | _        Y nw xY w| j        | _        tF          | _#        d S )Nprocessed_logitsprocessed_logprobsr   )FlashInferBackendz/FlashInfer does not support compute capability z&, unset VLLM_USE_FLASHINFER_SAMPLER=1.z,Using FlashInfer for top-p & top-k sampling.global)scopezFlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads.z9Using aiter sampler on ROCm (lazy import, sampling-only).z[aiter.ops.sampling is not available on ROCm. Falling back to forward_native implementation.)$super__init__r   r	   is_cudar   VLLM_USE_FLASHINFER_SAMPLER%vllm.v1.attention.backends.flashinferr   get_device_capabilitysupports_compute_capabilityas_version_strRuntimeErrorlogger	info_onceforward_cudaforward
debug_onceforward_nativeis_cpuget_cpu_architecturer   RISCVPOWERPCforward_cpur   
is_enabledaiter.ops.samplingtorchopsaiter	aiter_opsforward_hipImportErrorwarning_onceapply_top_k_top_p)selfr   r   
capabilitycapability_strarchr.   	__class__s          x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/sample/ops/topk_topp_sampler.pyr   zTopKTopPSampler.__init__   sB   * !KKK (** L / 3SSSSSS-CEE
!---(DDZPP %/%>%>%@%@N&R)R R R  
   B" !     $0!!C  
  $2$&& 	/#8::D );+>???#2#/!KKK)++ L3))))!&  O    $/ 3 3 3##E    $23  .DL!2s   (A F) )0GGlogits
generatorskpc                    |                      |||          }d}| j        dk    r|}n,| j        dk    r!|                    dt          j                  }|                    dt          j                  }t          ||          |fS )z
        PyTorch-native implementation of top-k and top-p sampling.

        The logits tensor may be updated in-place.
        Nr   r   dimdtype)r3   r   log_softmaxr,   float32softmaxrandom_sample)r4   r:   r;   r<   r=   logits_to_returnprobss          r9   r$   zTopKTopPSampler.forward_native\   s     ''155!333%#777%11b1NN2U];;UJ//1AAA    c                     |||r4|rt                               d           |                     ||||          S | j        dvs
J d            t	          |                                |||          dfS )z;More optimized implementation for top-k and top-p sampling.NziFlashInfer 0.2.3+ does not support per-request generators. Falling back to PyTorch-native implementation.r   z5FlashInfer does not support returning logits/logprobs)r   r#   r$   r   flashinfer_sample
contiguous)r4   r:   r;   r<   r=   s        r9   r!   zTopKTopPSampler.forward_cudaq   s     I!)
) !!5  
 &&vz1a@@@!)SSSSC TSS !!2!2!4!4aJGGMMrI   c                    |                      |||          }d}| j        dk    r|}n,| j        dk    r!|                    dt          j                  }t          |          |j        d         k    rt          |          |fS |                    dt          j                  }t          j	        |          }|
                                 |                                D ]!\  }}	||         
                    |	           "|                    |                              d                              d          |fS )	z
        PyTorch-native implementation of top-k and top-p sampling for CPU.

        The logits tensor may be updated in-place.
        Nr   r   r?   r@   r   	generatorrA   )r3   r   rC   r,   rD   lenshapecompiled_random_samplerE   
empty_likeexponential_itemsdiv_argmaxview)
r4   r:   r;   r<   r=   rG   rH   qirO   s
             r9   r)   zTopKTopPSampler.forward_cpu   s7    ''155!333%#777%11b1NNz??fl1o--)&113CCCNNrN??E ''ANN * 0 0 2 2 7 79!!!I!6666::a==''B'//44R88:JJJrI   c                 
   d}	 |||r4|rt                               d           |                     ||||          S | j        dvs
J d            |r|                     ||||          S |                     ||||          d fS )NTzVaiter sampler does not support per-request generators; falling back to PyTorch-native.r   z9aiter sampler does not support returning logits/logprobs.)r   r2   r$   r   aiter_sample)r4   r:   r;   r<   r=   DISABLE_AITER_SAMPLERs         r9   r0   zTopKTopPSampler.forward_hip   s     !%II!)
) ##6   &&vz1a@@@! *
 
 
 
 G
 
 
 ! 	A&&vz1a@@@  Az::D@@rI   c                    |du}|du}|r}|r{|                     dt          j                                                  } | j        j        |dgt          |          t          |          R ddi}|                    d          S |rl|                     dt          j                                                  } | j        j        |dgt          |          R ddi}|                    d          S |r||                     dt          j                                                  } | j        j	        |gt          |          R  }	t          j
        |	d                              d          S t          d          )	z#Sample from logits using aiter ops.Nr?   r@   deterministicT   )num_samplesz6aiter_sample was called with no active top-k or top-p.)rE   r,   rD   rL   r/   top_k_top_p_sampling_from_probs_to_tensor_scalar_tuplerY   top_p_sampling_from_probstop_k_renorm_probsmultinomialr   )
r4   r:   r<   r=   r;   	use_top_k	use_top_prH   next_token_idsrenorm_probss
             r9   r]   zTopKTopPSampler.aiter_sample   s    TM	TM	 	K 	KNNrN??JJLLEKT^K )++ )++	  
 # N "&&r*** 	KNNrN??JJLLEET^Et5a88  HL N "&&r*** 	KNNrN??JJLLE<4></22  L $\qAAAFFrJJJSTTTrI   )r   )__name__
__module____qualname____doc__r   r   r,   Tensordictint	Generatortupler$   r!   r)   r0   r]   __classcell__)r8   s   @r9   r   r      s        @3 @3l @3 @3 @3 @3 @3 @3 @3DBB eo-.B <$	B
 <$B 
u|U\D00	1B B B B*NN eo-.N <$	N
 <$N 
u|U\D00	1N N N N6KK eo-.K <$	K
 <$K 
u|U\D00	1K K K K<AA eo-.A <$	A
 <$A 
u|U\D00	1A A A A2#U#U <$#U <$	#U
 eo-.#U 
#U #U #U #U #U #U #U #UrI   r   T)dynamicr:   r   c                    |                      dt          j                  }t          j        |          }|                                 |                    |                              d                              d          S )Nr?   r@   rP   )rE   r,   rD   rT   rU   divrX   rY   )r:   rH   rZ   s      r9   rS   rS      sh    NNrN77EANN99Q<<2&&++B///rI   r<   r=   c                    ||| S t          | |          S |                     dd          \  }}||                    d          |                    t          j                  z
  }|                    d|                    d                    }||k     }|                    |t          d                      |w|
                    d          }t	          j        |d|          }|d|                    d          z
  k    }d|dddf<   |                    |t          d                      |                    d||	          } | S )
zApply top-k and top-p masks to the logits.

    If a top-p is used, this function will sort the logits tensor,
    which can be slow for large batches.

    The logits tensor may be updated in-place.
    Nr?   F)rA   
descendingra   rP   inf)rA   out)rA   indexsrc)apply_top_k_onlysortsizetor,   longgather	unsqueezemasked_fill_floatrE   cumsumscatter)	r:   r<   r=   logits_sort
logits_idx
top_k_mask
probs_sort	probs_sum
top_p_masks	            r9   r3   r3      sW    	y9M  ***$kkbUkCCK} %%a((144
+;+;;
 '':+?+?A+?+F+FGG
 :-
  eEll];;;} ((R(00
LDDD	!akkak&8&8"88
!
111b5  eEll];;;   Rz{ KKFMrI   c                    || j         d         k    }|                    |d          }|                                }|                    d                              d          }|                     |d          j                            d|                                          }|	                    |                    d          t          d                      | 	                    | |k     t          d                      | S )z
    Apply top-k mask to the logits.

    This implementation doesn't involve sorting the entire vocab.

    The logits tensor may be updated in-place.
    ra   rP   r{   )rR   masked_fillmaxsub_r   topkvaluesr   r   r   r   )r:   r<   no_top_k_mask	max_top_kk_indexr   s         r9   r   r     s     a(M	mQ''AI ffQii!!!$$GYA..5<<QOOJM33A66uFFF
+eEll];;;MrI   rH   r;   c                 v   t          j        |           }t          |          | j        d         k    r|                                 |r6|                                D ]!\  }}||                             |           "|                     |                              d                              d          S )zRandomly sample from the probabilities.

    We use this function instead of torch.multinomial because torch.multinomial
    causes CPU-GPU synchronization.
    r   rN   r?   rP   )	r,   rT   rQ   rR   rU   rV   rW   rX   rY   )rH   r;   rZ   r[   rO   s        r9   rF   rF   7  s     	A
 :%+a.((	 3 ',,.. 	3 	3LAyaD	2222::a==B'',,R000rI   c                    ddl }t          j        |j                  t          j        d          k     rt	          d          ||J |?|                     dt          j                  }|j        	                    ||d          }n_|?|                     dt          j                  }|j        
                    ||d          }n|j                            | ||d          }|                    d          S )	ab  Sample from the logits using FlashInfer.

    Statistically, this function is equivalent to the `random_sample` function.
    However, this function is faster because it avoids sorting the logits tensor
    via rejection sampling.

    NOTE: The outputs of this function do not necessarily match the outputs of
    the `random_sample` function. It only guarantees that the outputs are
    statistically equivalent.

    NOTE: This function includes CPU-GPU synchronization, while `random_sample`
    does not. Call this function at the end of the forward pass to minimize
    the synchronization overhead.
    r   Nz0.2.3zCFlashInfer version >= 0.2.3 required for top-k and top-p sampling. r?   r@   T)r`   )
flashinferr   parse__version__r1   rE   r,   rD   samplingre   top_k_sampling_from_probs top_k_top_p_sampling_from_logitsrY   )r:   r<   r=   r;   r   rH   rj   s          r9   rK   rK   O  s   ( }Z+,,w}W/E/EEEQ
 
 	
 	aii(y2U];;#,FF1D G 
 
 
2U];;#,FF1D G 
 

 $,MMAq N 
 
 r"""rI   c                 F    t          | t          j                  r| dfS d | fS )Nr   )
isinstancer,   rp   )xs    r9   rd   rd     s)    !U\"" 1vayrI   )r,   torch.nnnn	packagingr   vllmr   vllm._aiter_opsr   vllm.config.modelr   vllm.loggerr   vllm.platformsr   r	   rl   r   Moduler   compilerp   rS   r3   r   rq   rr   rs   rF   rK   rd    rI   r9   <module>r      sY  
                    * * * * * * * * * * * * # # # # # # 8 8 8 8 8 8 8 8	X		TU TU TU TU TUbi TU TU TUr t05< 0EL 0 0 0 0(L(|d( |d( \	( ( ( (VL| \   21<1S%/)*1 \1 1 1 10.#L.#|d.# |d.# S%/)*	.#
 \.# .# .# .#b    rI   