
    &`i                         d dl Z d dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
  e
d           G d d	e                      ZdS )
    N)List)Preprocessor)simple_hash)	PublicAPIalpha)	stabilityc                        e Zd ZdZdZdee         dedef fdZde	j
        fdZd	ee         fd
Zd	ee         fdZd Z xZS )FeatureHashera  Apply the `hashing trick <https://en.wikipedia.org/wiki/Feature_hashing>`_ to a
    table that describes token frequencies.

    :class:`FeatureHasher` creates ``num_features`` columns named ``hash_{index}``,
    where ``index`` ranges from :math:`0` to ``num_features``:math:`- 1`. The column
    ``hash_{index}`` describes the frequency of tokens that hash to ``index``.

    Distinct tokens can correspond to the same index. However, if ``num_features`` is
    large enough, then columns probably correspond to a unique token.

    This preprocessor is memory efficient and quick to pickle. However, given a
    transformed column, you can't know which tokens correspond to it. This might make it
    hard to determine which tokens are important to your model.

    .. warning::
        Sparse matrices aren't supported. If you use a large ``num_features``, this
        preprocessor might behave poorly.

    Examples:

        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import FeatureHasher

        The data below describes the frequencies of tokens in ``"I like Python"`` and
        ``"I dislike Python"``.

        >>> df = pd.DataFrame({
        ...     "I": [1, 1],
        ...     "like": [1, 0],
        ...     "dislike": [0, 1],
        ...     "Python": [1, 1]
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP

        :class:`FeatureHasher` hashes each token to determine its index. For example,
        the index of ``"I"`` is :math:`hash(\\texttt{"I"}) \pmod 8 = 5`.

        >>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8, output_column = "hashed")
        >>> hasher.fit_transform(ds)["hashed"].to_pandas().to_numpy()  # doctest: +SKIP
        array([[0, 0, 0, 2, 0, 1, 0, 0],
               [0, 0, 0, 1, 0, 1, 1, 0]])

        Notice the hash collision: both ``"like"`` and ``"Python"`` correspond to index
        :math:`3`. You can avoid hash collisions like these by increasing
        ``num_features``.

    Args:
        columns: The columns to apply the hashing trick to. Each column should describe
            the frequency of a token.
        num_features: The number of features used to represent the vocabulary. You
            should choose a value large enough to prevent hash collisions between
            distinct tokens.
        output_column: The name of the column that contains the hashed features.

    .. seealso::
        :class:`~ray.data.preprocessors.CountVectorizer`
            Use this preprocessor to generate inputs for :class:`FeatureHasher`.

        :class:`ray.data.preprocessors.HashingVectorizer`
            If your input data describes documents rather than token frequencies,
            use :class:`~ray.data.preprocessors.HashingVectorizer`.
    Fcolumnsnum_featuresoutput_columnc                 r    t                                                       || _        || _        || _        d S N)super__init__r   r   r   )selfr   r   r   	__class__s       q/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/data/preprocessors/hasher.pyr   zFeatureHasher.__init__O   s:     	 )*    dfc                 >     fd}|j         d d  j        f                             |dd          }d t           j                  D             }||                                         }t          j        t          |                    |j         d d  j	        f<   |S )Nc                     t          j        t                    j        D ]-}t	          |j                  }|xx         | |         z  cc<   .fdt          j                  D             S )Nc                 (    i | ]}d | |         S hash_ ).0ihash_countss     r   
<dictcomp>zOFeatureHasher._transform_pandas.<locals>.row_feature_hasher.<locals>.<dictcomp>c   s%    RRRAKAKKQRRRr   )collectionsdefaultdictintr   r   r   range)rowcolumnhashed_valuer   r   s      @r   row_feature_hasherz;FeatureHasher._transform_pandas.<locals>.row_feature_hasher^   s    %1#66K, 9 9*643DEEL)))S[8))))RRRRt?P9Q9QRRRRr      expand)axisresult_typec                     g | ]}d | S r   r   )r   r   s     r   
<listcomp>z3FeatureHasher._transform_pandas.<locals>.<listcomp>j   s    FFFFFFr   )
locr   applyr$   r   to_numpypdSerieslistr   )r   r   r(   feature_columnshash_columnsconcatenateds   `     r   _transform_pandaszFeatureHasher._transform_pandas\   s    	S 	S 	S 	S 	S &DL177QH 8 
 

 GFU43D-E-EFFF&|4==?? )+	$|2D2D(E(Eqqq$$$%	r   returnc                     | j         S r   )r   r   s    r   get_input_columnszFeatureHasher.get_input_columnsr   s
    |r   c                     | j         gS r   )r   r;   s    r   get_output_columnsz FeatureHasher.get_output_columnsu   s    "##r   c                 P    | j         j         d| j        d| j        d| j        dS )Nz	(columns=z, num_features=z, output_column=))r   __name__r   r   r   r;   s    r   __repr__zFeatureHasher.__repr__x   sL    ~& 5 5 5 5 -5 5!/5 5 5	
r   )rA   
__module____qualname____doc___is_fittabler   strr#   r   r2   	DataFramer8   r<   r>   rB   __classcell__)r   s   @r   r
   r
      s        > >@ L+c+ + 	+ + + + + +BL    ,49    $DI $ $ $ $
 
 
 
 
 
 
r   r
   )r!   typingr   pandasr2   ray.data.preprocessorr   ray.data.preprocessors.utilsr   ray.util.annotationsr   r
   r   r   r   <module>rO      s                  . . . . . . 4 4 4 4 4 4 * * * * * * Wq
 q
 q
 q
 q
L q
 q
 q
 q
 q
r   