
    )`i                     6   d Z ddlZddlmZ ddlmZmZmZ ddlZddl	m
Z
 ddlmZ ddlmZmZmZmZmZ 	 	 d.d	ed
eej                 deej                 deeef         fdZej        d             Zd Ze
	 	 d.dej        deeej        ef                  dee         dej        fd            Ze
	 	 	 	 	 	 d/dej        deej                 ded
eej                 dedee         dee         dej        fd            Ze
	 	 	 	 	 	 d/dej        deej                 ded
eej                 dedee         dee         dej        fd            Ze
	 	 	 	 	 	 d/dej        deej        ef         deej                 ded
eej                 dedee         dee         dej        fd            Z e
	 	 	 	 	 	 d/dej        d eej        ef         deej                 ded
eej                 dedee         dee         dej        fd!            Z!e
	 	 	 	 	 	 d/dej        d"eej        ef         deej                 ded
eej                 dedee         dee         dej        fd#            Z"e
	 	 	 	 	 	 	 d0dej        d eej        ef         deej        ef         deej                 d%e#ded
eej                 dedee         dee         dej        fd&            Z$e
	 	 	 	 	 	 	 d0dej        d eej        ef         deej        ef         deej                 d%e#ded
eej                 dedee         dee         dej        fd'            Z%e
dej        deej        ef         dej        fd(            Z&e&Z'e
dej        d eej        ef         dej        fd)            Z(e(Z)e
dej        d eej        ef         dej        fd*            Z*e
	 	 	 	 	 	 d1d+eej                 d,eej                 ded
eej                 dee         dee         dej        fd-            Z+dS )2a3  
Copyright (c) 2024 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)SimpleNamespace)OptionalTupleUnion   )flashinfer_api)gen_sampling_module)_get_cache_bufdevice_support_pdlget_default_generatorsregister_custom_opregister_fake_op	increment	generatordevicereturnc           	         |t          |          }|                                }|                    t          j                  \  }}|| dz   dz  dz  z  }|                    t          j        ||gt          j        t          j        d                                        t          j                             t          |          t          |          fS )N      cpudtyper   )
r   	get_stateviewtorchint64	set_statetensorr   uint8int)r   r   r   stateseedoffsets         g/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/sampling.pyget_seed_and_offsetr%   !   s    
 *622	!!E::ek**LD&
y1}"Q&&F6N%+el56I6I	
 	
 	

$u{

  
 t99c&kk!!    c                  j   t                                                      t          dd          dt          j        dt          j        dt
          t          j                 dt          dt          d	t          j        ffd
            } t          d          dt          j        dt          j        dt
          t          j                 dt          dt          d	t          j        fd            }t          dd          	 	 d?dt          j        dt
          t          j                 dt          dt
          t          j	                 dt
          t                   dt
          t                   d	t          j        ffd            }t          d          dt          j        dt
          t          j                 dt          dt
          t          j	                 d	t          j        f
d            }t          dd          	 	 d?dt          j        dt
          t          j                 dt          dt
          t          j	                 dt
          t                   dt
          t                   d	t          j        ffd            }t          d          dt          j        dt
          t          j                 dt          dt
          t          j	                 d	t          j        f
d            }t          dd          	 	 d?dt          j        dt
          t          j                 dt
          t          j                 dt          dt          dt
          t          j	                 dt
          t                   dt
          t                   d	t          j        ffd            }t          d          dt          j        dt
          t          j                 dt
          t          j                 dt          dt          dt
          t          j	                 d	t          j        fd            }t          dd          	 	 d?dt          j        dt
          t          j                 dt
          t          j                 d t          dt          dt
          t          j	                 dt
          t                   dt
          t                   d	t          j        ffd!            }t          d          dt          j        dt
          t          j                 dt
          t          j                 d t          dt          dt
          t          j	                 d	t          j        fd"            }	t          d#d          	 	 d?dt          j        dt
          t          j                 d$t
          t          j                 d%t          dt          dt
          t          j	                 dt
          t                   dt
          t                   d	t          j        ffd&            }
t          d'd          	 	 d?dt          j        dt
          t          j                 dt
          t          j                 d t          dt
          t          j                 dt          dt          dt
          t          j	                 dt
          t                   dt
          t                   d	t          j        ffd(            }t          d'          dt          j        dt
          t          j                 dt
          t          j                 d t          dt
          t          j                 dt          dt          dt
          t          j	                 d	t          j        fd)            }t          d*d          dt          j        dt
          t          j                 dt          d	t          j        ffd+            }t          d*          dt          j        dt
          t          j                 dt          d	t          j        fd,            }t          d-d.          dt          j        dt
          t          j                 d t          d/t          j        d	t          j        f
fd0            }t          d-          dt          j        dt
          t          j                 d t          d/t          j        d	t          j        f
d1            }t          d2d.          dt          j        dt
          t          j                 d t          d/t          j        d	t          j        f
fd3            }t          d2          dt          j        dt
          t          j                 d t          d/t          j        d	t          j        f
d4            }t          d5d6          	 	 d?d7t          j        d8t          j        d9t          j        d:t          j        d;t          j        dt          dt
          t          j	                 dt
          t                   dt
          t                   d	t          j        ffd<            }t          d5          d7t          j        d8t          j        d9t          j        d:t          j        d;t          j        dt          dt
          t          j	                 d	t          j        fd=            }t          | |||||
|||||>          S )@Nzflashinfer::softmax)workspace_buffer)mutates_argsr(   logitsmaybe_temperature_arrtemperature_val
enable_pdlr   c                     |                                 }t          j        ||j                  }||                                 nd }                    | |||||           |S )N)r   )floatr   
empty_liker   softmax)r(   r*   r+   r,   r-   probsmodules         r$   r1   z$get_sampling_module.<locals>.softmax8   sz      >>>-B-N!'')))TX 	 	!	
 	
 	
 r&   c                 N    t          j        ||j        t           j                  S )N)r   r   )r   r0   r   float32)r(   r*   r+   r,   r-   s        r$   _fake_softmaxz*get_sampling_module.<locals>._fake_softmaxO   s      v}EMRRRRr&   z flashinfer::sampling_from_logits indicesdeterministicr   r"   r#   c                 ~   | j         }|                                 } ||                    d          n|                     d          }||j        nt          j        }t	          j        |||          }	||*t          ||                     d          z  ||          \  }}
                    | |	||||           |	S )Nr   r   r   )	r   r/   sizer   r   int32emptyr%   sampling_from_logits)r*   r8   r9   r   r"   r#   r   
batch_size	out_dtypesamplesr3   s             r$   r>   z1get_sampling_module.<locals>.sampling_from_logitsZ   s      (/(;W\\!___Q
%,%8GMMek	+j	&III<6>.V[[^^+Y LD& 	##	
 	
 	
 r&   c                     ||                     d          n|                      d          }||j        nt          j        }t          j        ||| j                  S Nr   r   r;   r   r   r<   r=   r   )r*   r8   r9   r   r?   r@   s         r$   _fake_sampling_from_logitsz7get_sampling_module.<locals>._fake_sampling_from_logitsx   sQ     )0(;W\\!___Q
%,%8GMMek	{:Yv}MMMMr&   zflashinfer::sampling_from_probsr2   c                 R   | j         }|                                 } ||                    d          n|                     d          }||j        nt          j        }t	          j        |||          }	||t          |||          \  }}
                    | |	||||           |	S rC   )	r   r/   r;   r   r   r<   r=   r%   sampling_from_probs)r2   r8   r9   r   r"   r#   r   r?   r@   rA   r3   s             r$   rG   z0get_sampling_module.<locals>.sampling_from_probs   s     (/(;W\\!___A
%,%8GMMek	+j	&III<6>.z9fMMLD&""	
 	
 	
 r&   c                     ||                     d          n|                      d          }||j        nt          j        }t          j        ||| j                  S rC   rD   )r2   r8   r9   r   r?   r@   s         r$   _fake_sampling_from_probsz6get_sampling_module.<locals>._fake_sampling_from_probs   sQ     )0(;W\\!___A
%,%8GMMek	{:Yu|LLLLr&   z%flashinfer::top_p_sampling_from_probsmaybe_top_p_arr	top_p_valc           
         | j         }|                                 } ||                                nd }||                    d          n|                     d          }	||j        nt          j        }
t	          j        |	|
|          }||t          |	dz  ||          \  }}                    | |||||||           |S Nr   r       )	r   r/   r;   r   r   r<   r=   r%   top_p_sampling_from_probs)r2   r8   rJ   rK   r9   r   r"   r#   r   r?   r@   rA   r3   s               r$   rO   z6get_sampling_module.<locals>.top_p_sampling_from_probs   s     '6'BO!!### 	 )0(;W\\!___A
%,%8GMMek	+j	&III<6>.zB	6RRLD&((		
 		
 		
 r&   c                     ||                     d          n|                      d          }||j        nt          j        }t          j        ||| j                  }|S rC   rD   )	r2   r8   rJ   rK   r9   r   r?   r@   samples	            r$   _fake_top_p_sampling_from_probsz<get_sampling_module.<locals>._fake_top_p_sampling_from_probs   U     )0(;W\\!___A
%,%8GMMek	ZyNNNr&   z%flashinfer::top_k_sampling_from_probsmaybe_top_k_arr	top_k_valc           
         | j         }|                                 } ||                    d          n|                     d          }	||                                nd }||j        nt
          j        }
t          j        |	|
|          }||t          |	dz  ||          \  }}	                    | |||||||           |S rM   )
r   r/   r;   r    r   r   r<   r=   r%   top_k_sampling_from_probs)r2   r8   rT   rU   r9   r   r"   r#   r   r?   r@   rA   r3   s               r$   rW   z6get_sampling_module.<locals>.top_k_sampling_from_probs   s     (/(;W\\!___A
3B3N/--///TX%,%8GMMek	+j	&III<6>.zB	6RRLD&((		
 		
 		
 r&   c                     ||                     d          n|                      d          }||j        nt          j        }t          j        ||| j                  }|S rC   rD   )	r2   r8   rT   rU   r9   r   r?   r@   rQ   s	            r$   _fake_top_k_sampling_from_probsz<get_sampling_module.<locals>._fake_top_k_sampling_from_probs   rS   r&   z%flashinfer::min_p_sampling_from_probsmaybe_min_p_arr	min_p_valc           
         | j         }|                                 } ||                                nd }||                    d          n|                     d          }	||j        nt          j        }
t	          j        |	|
|          }||t          |	||          \  }}                    | |||||||           |S rC   )	r   r/   r;   r   r   r<   r=   r%   min_p_sampling_from_probs)r2   r8   rZ   r[   r9   r   r"   r#   r   r?   r@   rA   r3   s               r$   r]   z6get_sampling_module.<locals>.min_p_sampling_from_probs  s     '6'BO!!### 	 )0(;W\\!___A
%,%8GMMek	+j	&III<6>.z9fMMLD&((		
 		
 		
 r&   z+flashinfer::top_k_top_p_sampling_from_probsc
                    | j         }
|                                 } ||                                nd }||                                nd }||                    d          n|                     d          }||j        nt
          j        }t          j        |||
          }||	t          |dz  ||
          \  }}		                    | |||||||||	
  
         |S rM   )
r   r/   r    r;   r   r   r<   r=   r%   top_k_top_p_sampling_from_probs)r2   r8   rT   rU   rJ   rK   r9   r   r"   r#   r   r?   r@   rA   r3   s                 r$   r_   z<get_sampling_module.<locals>.top_k_top_p_sampling_from_probs1  s     3B3N/--///TX'6'BO!!### 	 )0(;W\\!___A
%,%8GMMek	+j	&III<6>.zB	6RRLD&..	
 	
 	
 r&   c                     ||                     d          n|                      d          }||j        nt          j        }	t          j        ||	| j                  }
|
S rC   rD   )r2   r8   rT   rU   rJ   rK   r9   r   r?   r@   rQ   s              r$   %_fake_top_k_top_p_sampling_from_probszBget_sampling_module.<locals>._fake_top_k_top_p_sampling_from_probsW  sU     )0(;W\\!___A
%,%8GMMek	ZyNNNr&   zflashinfer::top_p_renorm_probsc                     |                                  } ||                                 nd }t          j        |           }                    | |||           |S N)r/   r   r0   top_p_renorm_probs)r2   rJ   rK   renorm_probsr3   s       r$   rd   z/get_sampling_module.<locals>.top_p_renorm_probsi  sn     '6'BO!!### 	 '..!!		
 	
 	
 r&   c                 *    t          j        |           S rc   r   r0   )r2   rJ   rK   s      r$   _fake_top_p_renorm_probsz5get_sampling_module.<locals>._fake_top_p_renorm_probs|  s     &&&r&   zflashinfer::top_k_renorm_probs)row_states_bufferri   c                    | j         t          j        t          j        t          j        fv sJ d| j          d            ||                                nd }t          j        |           }                    | ||||           |S NzUnsupported dtype z(, expected float32, float16, or bfloat16)r   r   r5   float16bfloat16r    r0   top_k_renorm_probs)r2   rT   rU   ri   re   r3   s        r$   rn   z/get_sampling_module.<locals>.top_k_renorm_probs  s     {u}emU^LLLLVVVV MLL 4C3N/--///TX'..!!	
 	
 	
 r&   c                 *    t          j        |           S rc   rg   )r2   rT   rU   ri   s       r$   _fake_top_k_renorm_probsz5get_sampling_module.<locals>._fake_top_k_renorm_probs  s     &&&r&   zflashinfer::top_k_mask_logitsc                    | j         t          j        t          j        t          j        fv sJ d| j          d            ||                                nd }t          j        |           }                    | ||||           |S rk   )r   r   r5   rl   rm   r    r0   top_k_mask_logits)r*   rT   rU   ri   mask_logitsr3   s        r$   rr   z.get_sampling_module.<locals>.top_k_mask_logits  s     |u}enMMMMWWWW NMM 4C3N/--///TX&v..  	
 	
 	
 r&   c                 *    t          j        |           S rc   rg   )r*   rT   rU   ri   s       r$   _fake_top_k_mask_logitsz4get_sampling_module.<locals>._fake_top_k_mask_logits  s     '''r&   z&flashinfer::chain_speculative_sampling)output_accepted_token_numoutput_emitted_draft_token_numdraft_probsdraft_token_idstarget_probsrv   rw   c	                     | j         }	|                                 } |                                }|                                }|                                }|                                }|j        \  }
}t	          j        |
|dz   ft          j        |	          }||@t          |                     d          |                     d          dz   z  ||	          \  }}	                    | ||||||||	  	         |S )Nr   r   r   )
r   r/   r    shaper   r=   r<   r%   r;   chain_speculative_sampling)rx   ry   rz   rv   rw   r9   r   r"   r#   r   bnoutput_token_idsr3   s                r$   r}   z7get_sampling_module.<locals>.chain_speculative_sampling  s   $ #!'')))--//#))++$=$A$A$C$C!)G)K)K)M)M&$1 ;1q5zVTTT<6>.  ##{'7'7':':Q'>?F LD& 	))%*
	
 
	
 
	
  r&   c                 p    |j         \  }}|j        }	t          j        ||dz   ft          j        |	          S )Nr   r   )r|   r   r   r=   r<   )
rx   ry   rz   rv   rw   r9   r   r~   r   r   s
             r$    _fake_chain_speculative_samplingz=get_sampling_module.<locals>._fake_chain_speculative_sampling  s9     $1 '{Aq1u:U[HHHHr&   )r1   rG   r>   rO   rW   r]   r_   rd   rn   rr   r}   NN)r	   build_and_loadr   r   Tensorr   r/   boolr   	Generatorr    r   )r1   r6   r>   rE   rG   rI   rO   rR   rW   rY   r]   r_   ra   rd   rh   rn   rp   rr   ru   r}   r   r3   s                        @r$   get_sampling_moduler   4   s"    ""1133F-<QRRR,  (5 	
  
     SR, +,,S,SS  (5S 	S
 S 
S S S -,S :LLL # $ %,'  EO,	
 sm  
     ML: 899NN%,'N N EO,	N
 
N N N :9N 9KKK # $ |%,'  EO,	
 sm  
     LK6 788M|M%,'M M EO,	M
 
M M M 98M ?bQQQ # $ |%,' "%,/ 	
  EO, sm  
     RQ@ =>>|%,' "%,/ 	
  EO, 
   ?> ?bQQQ # $ |%,' "%,/ 	
  EO, sm  
     RQ< =>>|%,' "%,/ 	
  EO, 
   ?> ?bQQQ # $ |%,' "%,/ 	
  EO, sm  
     RQD ETVWWW # $# #|#%,'# "%,/# 	#
 "%,/# # # EO,# sm# # 
# # # # # XW#J CDD|%,' "%,/ 	
 "%,/   EO, 
   ED" 8rJJJ|!%,/  
	     KJ$ 677'|'!%,/' ' 
	' ' ' 87' (7M  |!%,/  !<	
 
     * 677'|'!%,/' ' !<	'
 
' ' ' 87' '6L  !%,/  !<	
 
     , 566((!%,/( ( !<	(
 
( ( ( 76( 0
   # $"  " \" "  l"  $)<	" 
 )."  "  EO,"  sm"  "  
"  "  "  "  "  " H >??I\II lI $)<	I
 ).I I EO,I 
I I I @?I /1";";";(G--+#=   r&   c                 F    t          | t          j                  r| dfS d | fS )Nr   )
isinstancer   r   )xs    r$   _to_tensor_scalar_tupler     s)    !U\"" 1vayr&   r*   temperaturer-   c                     t          dd| j                  }|d}|t          | j                  } t                      j        || gt          |          |R  S )ao  Fused GPU kernel for `online safe softmax <https://arxiv.org/abs/1805.02867>`_ with temperature scaling.


    Parameters
    ----------
    logits : torch.Tensor
        Input tensor of logits.
    temperature: Optional[Union[torch.Tensor, float]]
        Either a scalar or a tensor of shape ``(batch_size,)``, representing the temperature for temperature scaling.
        If a scalar, the same temperature is used for all requests.
        If a tensor, each request has its own temperature.
    enable_pdl : Optional[bool]
        Whether to enable Programmatic Dependent Launch (PDL) for improved performance on supported hardware.
        If None (default), PDL will be automatically enabled on devices with compute capability >= 9.0.
    Returns
    -------
    probs : torch.Tensor
        Tensor of the same shape as input containing the softmax probabilities.

    Examples
    --------
    >>> import torch
    >>> import flashinfer
    >>> torch.manual_seed(42)
    >>> batch_size = 4
    >>> vocab_size = 5
    >>> logits = torch.rand(batch_size, vocab_size).to(0)
    >>> logits
    tensor([[0.8823, 0.9150, 0.3829, 0.9593, 0.3904],
            [0.6009, 0.2566, 0.7936, 0.9408, 0.1332],
            [0.9346, 0.5936, 0.8694, 0.5677, 0.7411],
            [0.4294, 0.8854, 0.5739, 0.2666, 0.6274]], device='cuda:0')
    >>> probs = flashinfer.sampling.softmax(logits, temperature=1.0)
    >>> probs
    tensor([[0.2309, 0.2385, 0.1401, 0.2493, 0.1412],
            [0.2019, 0.1431, 0.2448, 0.2837, 0.1265],
            [0.2401, 0.1707, 0.2249, 0.1664, 0.1979],
            [0.1724, 0.2719, 0.1991, 0.1465, 0.2101]], device='cuda:0')
    softmax_workspace   Ng      ?)r
   r   r   r   r1   r   )r*   r   r-   r(   s       r$   r1   r1     s|    Z &&9;VV '66
(  (&#:;#G#GIS   r&   TFr8   r9   	check_nanr"   r#   c                     |r5t          j        t          j        |                     rt          d          t	                                          | |||||          S )a	  Fused GPU kernel for category sampling from logits. It's equivalent to sampling
    from :attr:`logits` after applying softmax.
    Parameters
    ----------
    logits: torch.Tensor
        Logits for sampling. When indices is not provided, shape should be ``(batch_size, num_classes)``
        and the i-th output will be sampled from the i-th row of logits. When indices is provided,
        shape should be ``(unique_batch_size, num_classes)`` where unique_batch_size is the number of unique
        probability distributions.
    indices: Optional[torch.Tensor]
        Optional indices tensor of shape ``(batch_size,)``, dtype ``torch.int32`` or ``torch.int64``
        that maps each output to a row in logits. The output tensor will have the same dtype as indices.
        For example, if indices[i] = j, then the i-th output will be sampled from logits[j].
        This allows reusing the same probability distribution for multiple outputs.
        If indices is not provided, the i-th output will be sampled from the i-th row of logits
        and output dtype defaults to ``torch.int32``.
    deterministic: bool
        Since the sampling doesn't use cub's BlockScan, the sampling is deterministic. We keep this
        argument for compatibility with other sampling functions.
    generator: Optional[torch.Generator]
        A random number generator for the operation.
    check_nan: bool
        Whether to check nan in :attr:`logits`, default is ``False``.
    seed: Optional[int]
        seed value to use for the rng during the sampling operation.
    offset: Optional[int]
        offset value to use for the rng during the sampling operation.
    Returns
    -------
    samples: torch.Tensor
        Sampled categories, shape (batch_size,). It's equivalent to sampling from
        :attr:`logits` after applying softmax.
    Examples
    --------
    >>> import torch
    >>> import flashinfer
    >>> torch.manual_seed(42)
    >>> batch_size = 4
    >>> vocab_size = 5
    >>> logits = torch.rand(batch_size, vocab_size).to(0)
    >>> logits
    tensor([[0.8823, 0.9150, 0.3829, 0.9593, 0.3904],
            [0.6009, 0.2566, 0.7936, 0.9408, 0.1332],
            [0.9346, 0.5936, 0.8694, 0.5677, 0.7411],
            [0.4294, 0.8854, 0.5739, 0.2666, 0.6274]], device='cuda:0')
    >>> samples = flashinfer.sampling.sampling_from_logits(logits)
    >>> samples
    tensor([0, 1, 1, 1], device='cuda:0', dtype=torch.int32)
    zInput logits contains NaN.)r   anyisnan
ValueErrorr   r>   )r*   r8   r9   r   r   r"   r#   s          r$   r>   r>   W  sc    v  ;9U[(()) 	;9:::  55	4  r&   r2   c                     |r5t          j        t          j        |                     rt          d          t	                                          | |||||          S )a`	  Fused GPU kernel for category sampling from probabilities.

    Parameters
    ----------
    probs: torch.Tensor
        Probabilities for sampling. When indices is not provided, shape should be ``(batch_size, num_classes)``
        and the i-th output will be sampled from the i-th row of probabilities. When indices is provided,
        shape should be ``(unique_batch_size, num_classes)`` where unique_batch_size is the number of unique
        probability distributions.
    indices: Optional[torch.Tensor]
        Optional indices tensor of shape ``(batch_size,)``, dtype ``torch.int32`` or ``torch.int64``
        that maps each output to a row in probs. The output tensor will have the same dtype as indices.
        For example, if indices[i] = j, then the i-th output will be sampled from probs[j].
        This allows reusing the same probability distribution for multiple outputs.
        If indices is not provided, the i-th output will be sampled from the i-th row of probs
        and output dtype defaults to ``torch.int32``.
    deterministic: bool
        Whether to use deterministic kernel implementation, default is ``True``.
    generator: Optional[torch.Generator]
        A random number generator for the operation.
    check_nan: bool
        Whether to check nan in :attr:`probs`, default is ``False``.
    seed: Optional[int]
        seed value to use for the rng during the sampling operation.
    offset: Optional[int]
        offset value to use for the rng during the sampling operation.

    Returns
    -------
    samples: torch.Tensor
        Sampled categories, shape (batch_size,).

    Examples
    --------

    >>> import torch
    >>> import flashinfer
    >>> torch.manual_seed(42)
    >>> batch_size = 4
    >>> vocab_size = 5
    >>> pre_norm_prob = torch.rand(batch_size, vocab_size).to(0)
    >>> norm_prob = pre_norm_prob / pre_norm_prob.sum(dim=-1, keepdim=True)
    >>> norm_prob
    tensor([[0.2499, 0.2592, 0.1085, 0.2718, 0.1106],
            [0.2205, 0.0942, 0.2912, 0.3452, 0.0489],
            [0.2522, 0.1602, 0.2346, 0.1532, 0.2000],
            [0.1543, 0.3182, 0.2062, 0.0958, 0.2255]], device='cuda:0')
    >>> samples = flashinfer.sampling.sampling_from_probs(norm_prob)
    >>> samples
    tensor([1, 2, 1, 4], device='cuda:0', dtype=torch.int32)

    Note
    ----
    This function expects float32 inputs, and the output is int32.
    Input probs contains NaN.)r   r   r   r   r   rG   )r2   r8   r9   r   r   r"   r#   s          r$   rG   rG     sc    B  :9U[''(( 	:8999  44wy$  r&   top_pc                     |r5t          j        t          j        |                     rt          d           t	                      j        | |gt          |          ||||R  S )a  Fused GPU kernel for top-p sampling (nucleus sampling) from probabilities,
    this operator implements GPU-based rejection sampling without explicit sorting.
    Check the `blog post <https://flashinfer.ai/2025/03/10/sampling.html>`_ for more details.

    The multiple rounds of rejection sampling are implemented in a single CUDA kernel,
    which is more efficient than the naive implementation that launches a series of kernels.

    Parameters
    ----------
    probs: torch.Tensor
        Probabilities for sampling. When indices is not provided, shape should be ``(batch_size, num_classes)``
        and the i-th output will be sampled from the i-th row of probabilities. When indices is provided,
        shape should be ``(unique_batch_size, num_classes)`` where unique_batch_size is the number of unique
        probability distributions.
    top_p: Union[torch.Tensor, float]
        Either a float or a tensor of shape ``(batch_size,)``, representing the threshold for top-p sampling.
        If a float, the same threshold is used for all requests.
        If a tensor, each request has its own threshold.
    indices: Optional[torch.Tensor]
        Optional indices tensor of shape ``(batch_size,)``, dtype ``torch.int32`` or ``torch.int64``
        that maps each output to a row in probs. The output tensor will have the same dtype as indices.
        For example, if indices[i] = j, then the i-th output will be sampled from probs[j].
        This allows reusing the same probability distribution for multiple outputs.
        If indices is not provided, the i-th output will be sampled from the i-th row of probs
        and output dtype defaults to ``torch.int32``.
    deterministic: bool
        Whether to use deterministic kernel implementation, default is ``True``.
    generator: Optional[torch.Generator]
        A random number generator for the operation.
    check_nan: bool
        Whether to check nan in :attr:`probs`, default is ``False``.
    seed: Optional[int]
        seed value to use for the rng during the sampling operation.
    offset: Optional[int]
        offset value to use for the rng during the sampling operation.

    Returns
    -------
    samples: torch.Tensor
        Sampled categories, shape ``(batch_size,)``.

    Examples
    --------

    >>> import torch
    >>> import flashinfer
    >>> torch.manual_seed(42)
    >>> batch_size = 4
    >>> vocab_size = 5
    >>> top_p = 0.5
    >>> pre_norm_prob = torch.rand(batch_size, vocab_size).to(0)
    >>> norm_prob = pre_norm_prob / pre_norm_prob.sum(dim=-1, keepdim=True)
    >>> norm_prob
    tensor([[0.2499, 0.2592, 0.1085, 0.2718, 0.1106],
            [0.2205, 0.0942, 0.2912, 0.3452, 0.0489],
            [0.2522, 0.1602, 0.2346, 0.1532, 0.2000],
            [0.1543, 0.3182, 0.2062, 0.0958, 0.2255]], device='cuda:0')
    >>> samples = flashinfer.sampling.top_p_sampling_from_probs(norm_prob, top_p)
    >>> samples
    tensor([1, 2, 0, 4], device='cuda:0', dtype=torch.int32)


    Note
    ----
    This function expects float32 inputs, and the output is int32.

    See Also
    --------
    top_k_top_p_sampling_from_probs
    top_k_sampling_from_probs
    top_p_renorm_probs
    r   )r   r   r   r   r   rO   r   )r2   r   r8   r9   r   r   r"   r#   s           r$   rO   rO         f  :9U[''(( 	:8999:  : 
!	'	' 		
 	 	 	   r&   top_kc                     |r5t          j        t          j        |                     rt          d           t	                      j        | |gt          |          ||||R  S )aj  Fused GPU kernel for top-k sampling from probabilities,
    this operator implements GPU-based rejection sampling without explicit sorting.
    Check the `blog post <https://flashinfer.ai/2025/03/10/sampling.html>`_ for more details.

    The multiple rounds of rejection sampling are implemented in a single CUDA kernel,
    which is more efficient than the naive implementation that launches a series of kernels.

    Parameters
    ----------
    probs: torch.Tensor
        Probabilities for sampling. When indices is not provided, shape should be ``(batch_size, num_classes)``
        and the i-th output will be sampled from the i-th row of probabilities. When indices is provided,
        shape should be ``(unique_batch_size, num_classes)`` where unique_batch_size is the number of unique
        probability distributions.
    top_k: Union[torch.Tensor, int]
        Either a scalar or a tensor of shape ``(batch_size,)``, representing the threshold for top-k sampling.
        If a scalar, the same threshold is used for all requests.
        If a tensor, each request has its own threshold.
    indices: Optional[torch.Tensor]
        Optional indices tensor of shape ``(batch_size,)``, dtype ``torch.int32`` or ``torch.int64``
        that maps each output to a row in probs. The output tensor will have the same dtype as indices.
        For example, if indices[i] = j, then the i-th output will be sampled from probs[j].
        This allows reusing the same probability distribution for multiple outputs.
        If indices is not provided, the i-th output will be sampled from the i-th row of probs
        and output dtype defaults to ``torch.int32``.
    deterministic: bool
        Whether to use deterministic kernel implementation, default is ``True``.
    generator: Optional[torch.Generator]
        A random number generator for the operation.
    check_nan: bool
        Whether to check nan in :attr:`probs`, default is ``False``.
    seed: Optional[int]
        seed value to use for the rng during the sampling operation.
    offset: Optional[int]
        offset value to use for the rng during the sampling operation.

    Returns
    -------
    samples: torch.Tensor
        Sampled categories, shape ``(batch_size,)``.

    Examples
    --------

    >>> import torch
    >>> import flashinfer
    >>> torch.manual_seed(42)
    >>> batch_size = 4
    >>> vocab_size = 5
    >>> top_k = 1
    >>> pre_norm_prob = torch.rand(batch_size, vocab_size).to(0)
    >>> norm_prob = pre_norm_prob / pre_norm_prob.sum(dim=-1, keepdim=True)
    >>> norm_prob
    tensor([[0.2499, 0.2592, 0.1085, 0.2718, 0.1106],
            [0.2205, 0.0942, 0.2912, 0.3452, 0.0489],
            [0.2522, 0.1602, 0.2346, 0.1532, 0.2000],
            [0.1543, 0.3182, 0.2062, 0.0958, 0.2255]], device='cuda:0')
    >>> samples = flashinfer.sampling.top_k_sampling_from_probs(norm_prob, top_k)
    >>> samples
    tensor([3, 3, 0, 1], device='cuda:0', dtype=torch.int32)


    Note
    ----
    This function expects float32 inputs, and the output is int32.

    See Also
    --------
    top_k_top_p_sampling_from_probs
    top_p_sampling_from_probs
    top_k_renorm_probs
    r   )r   r   r   r   r   rW   r   )r2   r   r8   r9   r   r   r"   r#   s           r$   rW   rW   D  r   r&   min_pc                     |r5t          j        t          j        |                     rt          d           t	                      j        | |gt          |          ||||R  S )at  Fused GPU kernel for `min_p sampling <https://arxiv.org/abs/2407.01082>`_ from probabilities,

    this operator implements GPU-based rejection sampling without explicit sorting.
    Check the `blog post <https://flashinfer.ai/2025/03/10/sampling.html>`_ for more details.

    The multiple rounds of rejection sampling are implemented in a single CUDA kernel,
    which is more efficient than the naive implementation that launches a series of kernels.

    Parameters
    ----------
    probs: torch.Tensor
        Probabilities for sampling. When indices is not provided, shape should be ``(batch_size, num_classes)``
        and the i-th output will be sampled from the i-th row of probabilities. When indices is provided,
        shape should be ``(unique_batch_size, num_classes)`` where unique_batch_size is the number of unique
        probability distributions.
    min_p: Union[torch.Tensor, float]
        Either a scalar or a tensor of shape ``(batch_size,)``, representing the threshold for min-p sampling.
        If a scalar, the same threshold is used for all requests.
        If a tensor, each request has its own threshold.
    indices: Optional[torch.Tensor]
        Optional indices tensor of shape ``(batch_size,)``, dtype ``torch.int32`` or ``torch.int64``
        that maps each output to a row in probs. The output tensor will have the same dtype as indices.
        For example, if indices[i] = j, then the i-th output will be sampled from probs[j].
        This allows reusing the same probability distribution for multiple outputs.
        If indices is not provided, the i-th output will be sampled from the i-th row of probs
        and output dtype defaults to ``torch.int32``.
    deterministic: bool
        Whether to use deterministic kernel implementation, default is ``True``.
    generator: Optional[torch.Generator]
        A random number generator for the operation.
    check_nan: bool
        Whether to check nan in :attr:`probs`, default is ``False``.
    seed: Optional[int]
        seed value to use for the rng during the sampling operation.
    offset: Optional[int]
        offset value to use for the rng during the sampling operation.

    Returns
    -------
    samples: torch.Tensor
        Sampled categories, shape ``(batch_size,)``.

    Examples
    --------

    >>> import torch
    >>> import flashinfer
    >>> torch.manual_seed(42)
    <torch._C.Generator object at 0x7f8b3db06df0>
    >>> batch_size = 4
    >>> vocab_size = 5
    >>> min_p = torch.full((batch_size,), 0.05).to(0)
    >>> pre_norm_prob = torch.rand(batch_size, vocab_size).to(0)
    >>> norm_prob = pre_norm_prob / pre_norm_prob.sum(dim=-1, keepdim=True)
    >>> norm_prob
    tensor([[0.2499, 0.2592, 0.1085, 0.2718, 0.1106],
            [0.2205, 0.0942, 0.2912, 0.3452, 0.0489],
            [0.2522, 0.1602, 0.2346, 0.1532, 0.2000],
            [0.1543, 0.3182, 0.2062, 0.0958, 0.2255]], device='cuda:0')
    >>> samples = flashinfer.sampling.min_p_sampling_from_probs(norm_prob, min_p)
    >>> samples
    tensor([1, 2, 1, 4], device='cuda:0', dtype=torch.int32)

    Note
    ----
    This function expects float32 inputs, and the output is int32.
    r   )r   r   r   r   r   r]   r   )r2   r   r8   r9   r   r   r"   r#   s           r$   r]   r]     s    ^  :9U[''(( 	:8999:  : 
!	'	' 		
 	 	 	   r&   top_k_firstfilter_apply_orderc
           
         |dk    r=t          | |          }
t          j        |
d          }t          ||||||||	          S |dk    rt          j        | d          }|r5t          j        t          j        |                    rt          d           t                      j        ||gt          |          t          |          ||||	R  S t          d|           )a0  Fused GPU kernel for top-k and top-p sampling from pre-softmax logits,

    this operator implements GPU-based rejection sampling without explicit sorting.
    Check the `blog post <https://flashinfer.ai/2025/03/10/sampling.html>`_ for more details.

    The multiple rounds of rejection sampling are implemented in a single CUDA kernel,
    which is more efficient than the naive implementation that launches a series of kernels.

    Parameters
    ----------
    logits: torch.Tensor
        Pre-softmax logits for sampling. When indices is not provided, shape should be ``(batch_size, num_classes)``
        and the i-th output will be sampled from the i-th row of logits. When indices is provided,
        shape should be ``(unique_batch_size, num_classes)`` where unique_batch_size is the number of unique
        probability distributions.
    top_k: Union[torch.Tensor, int]
        Either a scalar or a tensor of shape ``(batch_size,)``, representing the threshold for top-k sampling.
        If a scalar, the same threshold is used for all requests.
        If a tensor, each request has its own threshold.
    top_p: Union[torch.Tensor, float]
        Either a scalar or a tensor of shape ``(batch_size,)``, representing the threshold for top-p sampling.
        If a scalar, the same threshold is used for all requests.
        If a tensor, each request has its own threshold.
    indices: Optional[torch.Tensor]
        Optional indices tensor of shape ``(batch_size,)``, dtype ``torch.int32`` or ``torch.int64``
        that maps each output to a row in probs. The output tensor will have the same dtype as indices.
        For example, if indices[i] = j, then the i-th output will be sampled from probs[j].
        This allows reusing the same probability distribution for multiple outputs.
        If indices is not provided, the i-th output will be sampled from the i-th row of probs
        and output dtype defaults to ``torch.int32``.
    filter_apply_order: str
        The order of applying top-k and top-p sampling, should be either ``"top_k_first"`` or ``"joint"``.
        If ``"top_k_first"``, we first apply top-k filter, then apply top-p sampling on the top-k results.
        If ``"joint"``, we apply top-k and top-p filter simultaneously in each round. Default is ``"top_k_first"``.
    deterministic: bool
        Whether to use deterministic kernel implementation, default is ``True``.
    generator: Optional[torch.Generator]
        A random number generator for the operation.
    check_nan: bool
        Whether to check nan in :attr:`probs`, default is ``False``.
    seed: Optional[int]
        seed value to use for the rng during the sampling operation.
    offset: Optional[int]
        offset value to use for the rng during the sampling operation.

    Returns
    -------
    samples: torch.Tensor
        Sampled categories, shape ``(batch_size,)``.

    Examples
    --------

    >>> import torch
    >>> import flashinfer
    >>> torch.manual_seed(42)
    >>> batch_size = 4
    >>> vocab_size = 5
    >>> top_p = 0.5
    >>> top_k = 3
    >>> logits = torch.rand(batch_size, vocab_size).to(0)
    >>> logits
    tensor([[ 1.9269,  1.4873,  0.9007, -2.1055, -0.7581],
            [ 1.0783,  0.8008,  1.6806,  0.3559, -0.6866],
            [-0.4934,  0.2415, -0.2316,  0.0418, -0.2516],
            [ 0.8599, -0.3097, -0.3957,  0.8034, -0.6216]], device='cuda:0')
    >>> samples = flashinfer.sampling.top_k_top_p_sampling_from_logits(logits, top_k, top_p)
    >>> samples
    tensor([0, 2, 1, 3], device='cuda:0', dtype=torch.int32
    >>> probs = torch.softmax(logits, dim=-1)
    >>> probs
    tensor([[0.4788, 0.3085, 0.1716, 0.0085, 0.0327],
        [0.2358, 0.1787, 0.4307, 0.1145, 0.0404],
        [0.1358, 0.2831, 0.1764, 0.2318, 0.1729],
        [0.3613, 0.1122, 0.1029, 0.3415, 0.0821]], device='cuda:0')
    >>> samples
    tensor([0, 2, 1, 3], device='cuda:0', dtype=torch.int32)

    Note
    ----
    This function expects float32 inputs, and the output is int32.

    See Also
    --------
    top_k_top_p_sampling_from_probs
    top_k_mask_logits
    top_p_sampling_from_probs
    r   )dimr   r   r"   r#   jointr   Invalid filter_apply_order: )
rr   r   r1   rO   r   r   r   r   r_   r   )r*   r   r   r8   r   r9   r   r   r"   r#   masked_logitsr2   s               r$    top_k_top_p_sampling_from_logitsr     sE   J ]**)&%88m444(	
 	
 	
 		
 
w	&	&f"--- 	>yU++,, > !<===D"$$D	
 %U++	
 %U++		

 	
 	
 	
 	
 	
 	
 		
 L8JLLMMMr&   c
           
      x   |dk    r't          | |          }
t          |
|||||||	          S |dk    rv|r5t          j        t          j        |                     rt          d           t                      j        | |gt          |          t          |          ||||	R  S t          d|           )at  Fused GPU kernel for top-k and top-p sampling from probabilities,

    this operator implements GPU-based rejection sampling without explicit sorting.
    Check the `blog post <https://flashinfer.ai/2025/03/10/sampling.html>`_ for more details.

    The multiple rounds of rejection sampling are implemented in a single CUDA kernel,
    which is more efficient than the naive implementation that launches a series of kernels.

    Parameters
    ----------
    probs: torch.Tensor
        Probabilities for sampling. When indices is not provided, shape should be ``(batch_size, num_classes)``
        and the i-th output will be sampled from the i-th row of probabilities. When indices is provided,
        shape should be ``(unique_batch_size, num_classes)`` where unique_batch_size is the number of unique
        probability distributions.
    top_k: Union[torch.Tensor, int]
        Either a scalar or a tensor of shape ``(batch_size,)``, representing the threshold for top-k sampling.
        If a scalar, the same threshold is used for all requests.
        If a tensor, each request has its own threshold.
    top_p: Union[torch.Tensor, float]
        Either a scalar or a tensor of shape ``(batch_size,)``, representing the threshold for top-p sampling.
        If a scalar, the same threshold is used for all requests.
        If a tensor, each request has its own threshold.
    indices: Optional[torch.Tensor]
        Optional indices tensor of shape ``(batch_size,)``, dtype ``torch.int32`` or ``torch.int64``
        that maps each output to a row in probs. The output tensor will have the same dtype as indices.
        For example, if indices[i] = j, then the i-th output will be sampled from probs[j].
        This allows reusing the same probability distribution for multiple outputs.
        If indices is not provided, the i-th output will be sampled from the i-th row of probs
        and output dtype defaults to ``torch.int32``.
    filter_apply_order: str
        The order of applying top-k and top-p sampling, should be either ``"top_k_first"`` or ``"joint"``.
        If ``"top_k_first"``, we first apply top-k filter, then apply top-p sampling on the top-k results.
        If ``"joint"``, we apply top-k and top-p filter simultaneously in each round. Default is ``"top_k_first"``.
    deterministic: bool
        Whether to use deterministic kernel implementation, default is ``True``.
    generator: Optional[torch.Generator]
        A random number generator for the operation.
    check_nan: bool
        Whether to check nan in :attr:`probs`, default is ``False``.
    seed: Optional[int]
        seed value to use for the rng during the sampling operation.
    offset: Optional[int]
        offset value to use for the rng during the sampling operation.

    Returns
    -------
    samples: torch.Tensor
        Sampled categories, shape ``(batch_size,)``.

    Examples
    --------

    >>> import torch
    >>> import flashinfer
    >>> torch.manual_seed(42)
    >>> batch_size = 4
    >>> vocab_size = 5
    >>> top_p = torch.full((batch_size,), 0.2).to(0)
    >>> top_k = torch.full((batch_size,), 2).to(0)
    >>> pre_norm_prob = torch.rand(batch_size, vocab_size).to(0)
    >>> norm_prob = pre_norm_prob / pre_norm_prob.sum(dim=-1, keepdim=True)
    >>> norm_prob
    tensor([[0.2499, 0.2592, 0.1085, 0.2718, 0.1106],
            [0.2205, 0.0942, 0.2912, 0.3452, 0.0489],
            [0.2522, 0.1602, 0.2346, 0.1532, 0.2000],
            [0.1543, 0.3182, 0.2062, 0.0958, 0.2255]], device='cuda:0')
    >>> samples = flashinfer.sampling.top_k_top_p_sampling_from_probs(norm_prob, top_k, top_p)
    >>> samples
    tensor([3, 3, 0, 1], device='cuda:0', dtype=torch.int32)

    Note
    ----
    This function expects float32 inputs, and the output is int32.

    See Also
    --------
    top_k_sampling_from_probs
    top_p_sampling_from_probs
    top_k_renorm_probs
    top_p_renorm_probs
    top_k_mask_logits
    r   r   r   r   r   )	rn   rO   r   r   r   r   r   r_   r   )r2   r   r   r8   r   r9   r   r   r"   r#   re   s              r$   r_   r_     s   @ ]**)%77(	
 	
 	
 		
 
w	&	& 	>yU++,, > !<===D"$$D	
 %U++	
 %U++		

 	
 	
 	
 	
 	
 	
 		
 L8JLLMMMr&   c                 P     t                      j        | gt          |          R  S )a  Fused GPU kernel for renormalizing probabilities by top-p thresholding.

    Parameters
    ----------
    probs: torch.Tensor
        Probabilities, shape ``(batch_size, num_classes)``.
    top_p: Union[torch.Tensor, float]
        Either a scalar or a tensor of shape ``(batch_size,)``, representing the top-p threshold for for
        re-normalizing probabilities, should be in ``(0, 1)``.
        If a scalar, the same threshold is used for all requests.
        If a tensor, each request has its own threshold.
        We mask out the probabilities less than `threshold` where the cumulative sum
        of ``probs[probs >= threshold]`` is `top_p`, and renormalize the probabilities.

    Returns
    -------
    renorm_probs: torch.Tensor
        Renormalized probabilities, shape ``(batch_size, num_classes)``.

    Examples
    --------

    >>> import torch
    >>> import flashinfer
    >>> torch.manual_seed(42)
    >>> batch_size = 4
    >>> vocab_size = 5
    >>> top_p = 0.3
    >>> pre_norm_prob = torch.rand(batch_size, vocab_size).to(0)
    >>> prob = pre_norm_prob / pre_norm_prob.sum(dim=-1, keepdim=True)
    >>> prob
    tensor([[0.2499, 0.2592, 0.1085, 0.2718, 0.1106],
            [0.2205, 0.0942, 0.2912, 0.3452, 0.0489],
            [0.2522, 0.1602, 0.2346, 0.1532, 0.2000],
            [0.1543, 0.3182, 0.2062, 0.0958, 0.2255]], device='cuda:0')
    >>> renormed_probs = flashinfer.sampling.top_p_renorm_probs(prob, top_p)
    >>> renormed_probs
    tensor([[0.0000, 0.4882, 0.0000, 0.5118, 0.0000],
            [0.0000, 0.0000, 0.0000, 1.0000, 0.0000],
            [0.5181, 0.0000, 0.4819, 0.0000, 0.0000],
            [0.0000, 1.0000, 0.0000, 0.0000, 0.0000]], device='cuda:0')

    Note
    ----
    This combination of ``top_p_renorm_probs`` and ``sampling_from_probs`` should be equivalent to
    ``top_p_sampling_from_probs``.

    See Also
    --------
    top_p_sampling_from_probs
    sampling_from_probs
    top_k_renorm_probs
    )r   rd   r   )r2   r   s     r$   rd   rd     s;    t 4  3'..   r&   c                     d}t          d| j         || j        d          } t                      j        | gt	          |          |R  S )a   Fused GPU kernel for renormalizing probabilities by top-k thresholding.

    Parameters
    ----------
    probs: torch.Tensor
        Probabilities, shape ``(batch_size, num_classes)``.
        Supported dtypes: ``float32``, ``float16``, ``bfloat16``.
    top_k: Union[torch.Tensor, int]
        Either a scalar or a tensor of shape ``(batch_size,)``, representing the top-k threshold for for
        for re-normalizing probabilities, should be in ``(0, num_classes)``.
        If a scalar, the same threshold is used for all requests.
        If a tensor, each request has its own threshold.
        We keep the top-k probabilities, set the rest to zero, and renormalize the probabilities.

    Returns
    -------
    renorm_probs: torch.Tensor
        Renormalized probabilities, shape ``(batch_size, num_classes)``.
        Same dtype as input ``probs``.

    Examples
    --------

    >>> import torch
    >>> import flashinfer
    >>> torch.manual_seed(42)
    >>> batch_size = 4
    >>> vocab_size = 5
    >>> top_k = 3
    >>> pre_norm_prob = torch.rand(batch_size, vocab_size).to(0)
    >>> prob = pre_norm_prob / pre_norm_prob.sum(dim=-1, keepdim=True)
    >>> prob
    tensor([[0.2499, 0.2592, 0.1085, 0.2718, 0.1106],
            [0.2205, 0.0942, 0.2912, 0.3452, 0.0489],
            [0.2522, 0.1602, 0.2346, 0.1532, 0.2000],
            [0.1543, 0.3182, 0.2062, 0.0958, 0.2255]], device='cuda:0')
    >>> renormed_probs = flashinfer.sampling.top_k_renorm_probs(prob, top_k)
    >>> renormed_probs
    tensor([[0.3201, 0.3319, 0.0000, 0.3480, 0.0000],
            [0.2573, 0.0000, 0.3398, 0.4028, 0.0000],
            [0.3672, 0.0000, 0.3416, 0.0000, 0.2912],
            [0.0000, 0.4243, 0.2750, 0.0000, 0.3007]], device='cuda:0')

    Note
    ----
    This combination of ``top_k_renorm_probs`` and ``sampling_from_probs`` should be equivalent to
    ``top_k_sampling_from_probs``.

    See Also
    --------
    top_k_sampling_from_probs
    sampling_from_probs
    top_p_renorm_probs
    top_k : General-purpose top-k selection (returns indices and values)
    r   top_k_renorm_probs_row_states_T	zero_init)r
   r   r   rn   r   )r2   r   buffer_bytesri   s       r$   rn   rn   G  sw    z L&777	   4  3'..0A   r&   c                     d}t          d| j         || j        d          } t                      j        | gt	          |          |R  S )as  Fused GPU kernel for masking logits by top-k thresholding.

    Parameters
    ----------
    logits: torch.Tensor
        Logits before softmax, shape ``(batch_size, num_classes)``.
        Supported dtypes: ``float32``, ``float16``, ``bfloat16``.
    top_k: Union[torch.Tensor, int]
        Either a scalar or a tensor of shape ``(batch_size,)``, representing the top-k threshold for for
        for masking logits, should be in ``(0, num_classes)``.
        If a scalar, the same threshold is used for all requests.
        If a tensor, each request has its own threshold.
        We keep the top-k logits, set the rest to negative infinity.

    Returns
    -------
    masked_logits: torch.Tensor
        Masked logits, shape ``(batch_size, num_classes)``.
        Same dtype as input ``logits``.

    Examples
    --------

    >>> import torch
    >>> import flashinfer
    >>> torch.manual_seed(42)
    >>> batch_size = 4
    >>> vocab_size = 5
    >>> top_k = 3
    >>> logits = torch.randn(batch_size, vocab_size).to(0)
    >>> logits
    tensor([[ 1.9269,  1.4873,  0.9007, -2.1055, -0.7581],
            [ 1.0783,  0.8008,  1.6806,  0.3559, -0.6866],
            [-0.4934,  0.2415, -0.2316,  0.0418, -0.2516],
            [ 0.8599, -0.3097, -0.3957,  0.8034, -0.6216]], device='cuda:0')
    >>> masked_logits = flashinfer.sampling.top_k_mask_logits(logits, top_k)
    >>> masked_logits
    tensor([[ 1.9269,  1.4873,  0.9007,    -inf,    -inf],
            [ 1.0783,  0.8008,  1.6806,    -inf,    -inf],
            [   -inf,  0.2415, -0.2316,  0.0418,    -inf],
            [ 0.8599, -0.3097,    -inf,  0.8034,    -inf]], device='cuda:0')

    Note
    ----
    The combination of ``top_k_mask_logits`` and ``softmax`` should be equivalent to ``top_k_renorm_probs``.

    See Also
    --------
    top_k_renorm_probs
    top_k : General-purpose top-k selection (returns indices and values)
    r   top_k_mask_logits_row_states_Tr   )r
   r   r   rr   r   )r*   r   r   ri   s       r$   rr   rr     sw    p L&777	   3  2(//1B   r&   maybe_output_accepted_token_num$maybe_output_emitted_draft_token_numc	                 .   |                      d          }	| j        }
|"t          j        |	t          j        |
          }n|}|"t          j        |	t          j        |
          }n|}t                                          | ||||||||	  	        }|||fS )a  Fused-GPU kernel for speculative sampling for sequence generation (proposed in
    paper `Accelerating Large Language Model Decoding with Speculative Sampling <https://arxiv.org/pdf/2302.01318>`_),
    where the draft model generates a sequence(chain) of tokens for each request.

    Parameters
    ----------
    draft_probs: torch.Tensor
        The probability over vocabulary generated by draft model.
        Shape: ``(batch_size, num_speculate_tokens, vocab_size)``
    draft_token_ids: torch.Tensor
        The draft model's generated token indices.
        Shape: ``(batch_size, num_speculate_tokens)``
    target_probs: torch.Tensor
        The probability over vocabulary generated by target model.
        Compared to input :attr:`draft_probs`, the target model's probability has an additional
        slot at the end because the target model will generate one more token than the draft model.
        Shape: ``(batch_size, num_speculate_tokens + 1, vocab_size)``
    maybe_output_accepted_token_num: Optional[torch.Tensor]
        The number of tokens that can be accepted if each token is considered independently for each request.
        This metric does not consider the fact that rejection sampling will stop at the first token that does not
        satisfy the probability requirement r < p/q.
        It only evaluates the alignment of draft model and target model.
        Shape: ``(batch_size)``
        If specified, the number of accepted token number will be added to this tensor inplace. Default is ``None``.
    maybe_output_emitted_draft_token_num: Optional[torch.Tensor]
        The number of draft tokens that are finally emitted for each request. Does not include
        the bonus token. (Thus the total number of tokens sampled for a given request is
        output_emitted_draft_token_num + 1).
        Shape: ``(batch_size)``
        If specified, the number of emitted token number will be added to this tensor inplace. Default is ``None``.
    deterministic: bool
        Whether to use deterministic kernel implementation, default is ``True``.
    generator: Optional[torch.Generator]
        A random number generator for the operation.
    seed: Optional[int]
        seed value to use for the rng during the sampling operation.
    offset: Optional[int]
        offset value to use for the rng during the sampling operation.

    Returns
    -------
    output_token_ids: torch.Tensor
        The output token indices verified by the target model, rejected samples are
        padded with ``-1``.
        Compared to input :attr:`draft_token_ids`, the output tensor has an additional
        token index at the end for the final token, if all previous tokens are accepted,
        another "bonus" token will be sampled from the target model's probability.
        Shape: (batch_size, num_speculate_tokens + 1)
    output_accepted_token_num: torch.Tensor
        The number of tokens that can be accepted if each token is considered independently for each request.
        This metric does not consider the fact that rejection sampling will stop at the first token that does not
        satisfy the probability requirement r < p/q.
        It only evaluates the alignment of draft model and target model.
        Shape: ``(batch_size)``
    output_emitted_draft_token_num: torch.Tensor
        The number of draft tokens that are finally emitted for each request. Does not include
        the bonus token. (Thus the total number of tokens sampled for a given request is
        output_emitted_draft_token_num + 1).
        Shape: ``(batch_size)``

    Examples
    --------

    >>> import torch
    >>> import flashinfer
    >>> torch.manual_seed(42)
    >>> batch_size = 1
    >>> num_speculate_tokens = 2
    >>> vocab_size = 4
    >>> draft_probs = torch.tensor([[[0.1, 0.2, 0.3, 0.4], [0.2, 0.3, 0.4, 0.1]]]).to(0)
    >>> # token 2 was sampled from draft model for the first token, and
    >>> # token 1 was sampled from draft model for the second token
    >>> draft_token_ids = torch.tensor([[2, 1]], dtype=torch.int32).to(0)
    >>> target_probs = torch.tensor([[[0.0, 0.1, 0.6, 0.3], [1.0, 0.0, 0.0, 0.0], [0.7, 0.1, 0.1, 0.1]]]).to(0)
    >>> output_token_ids, output_accepted_token_num, output_emitted_draft_token_num =\
    ...     flashinfer.sampling.chain_speculative_sampling(
    ...         draft_probs, draft_token_ids, target_probs)
    >>> # the first token is accepted, the second token is rejected and sampled from the difference
    >>> # between the target model and the draft model, the third token is padded with -1
    >>> output_token_ids
    tensor([[ 2,  0, -1]], device='cuda:0', dtype=torch.int32)
    >>> output_accepted_token_num
    tensor([1], device='cuda:0')
    >>> output_emitted_draft_token_num
    tensor([1], device='cuda:0')
    r   Nr   )r;   r   r   zerosr<   r   r}   )rx   ry   rz   r   r   r9   r   r"   r#   r~   devrv   rw   r   s                 r$   r}   r}     s    D 	A

C&.$)KS$Q$Q$Q!!$C!+3).QekRU)V)V)V&&)M&*,,GG!&
 
 68VVVr&   r   )NTNFNN)Nr   TNFNN)NNTNNN),__doc__	functoolstypesr   typingr   r   r   r   api_loggingr   jit.samplingr	   utilsr
   r   r   r   r   r    r   r   r%   cacher   r   r   r/   r   r1   r>   rG   rO   rW   r]   strr   r_   rd   top_p_renorm_probrn   top_k_renorm_probrr   r}   r7   r&   r$   <module>r      s	         ! ! ! ! ! ! ) ) ) ) ) ) ) ) ) )  ' ' ' ' ' ' - - - - - -              ,0%)" ""(" U\"" 38_	" " " "& ^ ^ ^B    9=!%6 6L6%e 3456 6 \	6 6 6 6r  '++/ ? ?L?el#? ? (	?
 ? 3-? SM? \? ? ? ?D  '++/ E E<Eel#E E (	E
 E 3-E SME \E E E EP  '++/ ] ]<]u$%] el#] 	]
 (] ] 3-] SM] \] ] ] ]@  '++/ ] ]<]s"#] el#] 	]
 (] ] 3-] SM] \] ] ] ]@  '++/ Y Y<Yu$%Y el#Y 	Y
 (Y Y 3-Y SMY \Y Y Y Yx 
 '+++/ AN ANLANs"#AN u$%AN el#	AN
 AN AN (AN AN 3-AN SMAN \AN AN AN ANH 
 '+++/ zN zN<zNs"#zN u$%zN el#	zN
 zN zN (zN zN 3-zN SMzN \zN zN zN zNz ;<;u$%; \; ; ; ;| '  F<Fs"#F \F F F FR '  DLD!&u|S'8!9D
\D D D DN 
 ?CCG+/ vW vW &.el%;	vW
 +35<*@vW vW (vW 3-vW SMvW \vW vW vW vW vW vWr&   