
    &`i4                         d dl mZ d dlmZmZmZmZ d dlZd dl	m
Z
 d dlmZmZ d dlmZ erd dlmZ  ed	           G d
 de
                      Z ed	           G d de
                      ZdS )    Counter)TYPE_CHECKINGCallableListOptionalN)Preprocessor)simple_hashsimple_split_tokenizer)	PublicAPI)Datasetalpha)	stabilityc                        e Zd ZdZdZ	 ddddee         dedee	egee         f                  deee                  f fd	Z
d
ej        fdZd Z xZS )HashingVectorizera  Count the frequency of tokens using the
    `hashing trick <https://en.wikipedia.org/wiki/Feature_hashing>`_.

    This preprocessors creates a list column for each input column. For each row,
    the list contains the frequency counts of tokens (for CountVectorizer) or hash values
    (for HashingVectorizer). For HashingVectorizer, the list will have length
    ``num_features``. If ``num_features`` is large enough relative to the size of your
    vocabulary, then each index approximately corresponds to the frequency of a unique
    token.

    :class:`HashingVectorizer` is memory efficient and quick to pickle. However, given a
    transformed column, you can't know which tokens correspond to it. This might make it
    hard to determine which tokens are important to your model.

    .. note::

        This preprocessor transforms each input column to a
        `document-term matrix <https://en.wikipedia.org/wiki/Document-term_matrix>`_.

        A document-term matrix is a table that describes the frequency of tokens in a
        collection of documents. For example, the strings `"I like Python"` and `"I
        dislike Python"` might have the document-term matrix below:

        .. code-block::

                    corpus_I  corpus_Python  corpus_dislike  corpus_like
                0         1              1               1            0
                1         1              1               0            1

        To generate the matrix, you typically map each token to a unique index. For
        example:

        .. code-block::

                        token  index
                0        I      0
                1   Python      1
                2  dislike      2
                3     like      3

        The problem with this approach is that memory use scales linearly with the size
        of your vocabulary. :class:`HashingVectorizer` circumvents this problem by
        computing indices with a hash function:
        :math:`\texttt{index} = hash(\texttt{token})`.

    .. warning::
        Sparse matrices aren't currently supported. If you use a large ``num_features``,
        this preprocessor might behave poorly.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import HashingVectorizer
        >>>
        >>> df = pd.DataFrame({
        ...     "corpus": [
        ...         "Jimmy likes volleyball",
        ...         "Bob likes volleyball too",
        ...         "Bob also likes fruit jerky"
        ...     ]
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>>
        >>> vectorizer = HashingVectorizer(["corpus"], num_features=8)
        >>> vectorizer.fit_transform(ds).to_pandas()  # doctest: +SKIP
                             corpus
        0  [1, 0, 1, 0, 0, 0, 0, 1]
        1  [1, 0, 1, 0, 0, 0, 1, 1]
        2  [0, 0, 1, 1, 0, 2, 1, 0]

        :class:`HashingVectorizer` can also be used in append mode by providing the
        name of the output_columns that should hold the encoded values.

        >>> vectorizer = HashingVectorizer(["corpus"], num_features=8, output_columns=["corpus_hashed"])
        >>> vectorizer.fit_transform(ds).to_pandas()  # doctest: +SKIP
                               corpus             corpus_hashed
        0      Jimmy likes volleyball  [1, 0, 1, 0, 0, 0, 0, 1]
        1    Bob likes volleyball too  [1, 0, 1, 0, 0, 0, 1, 1]
        2  Bob also likes fruit jerky  [0, 0, 1, 1, 0, 2, 1, 0]

    Args:
        columns: The columns to separately tokenize and count.
        num_features: The number of features used to represent the vocabulary. You
            should choose a value large enough to prevent hash collisions between
            distinct tokens.
        tokenization_fn: The function used to generate tokens. This function
            should accept a string as input and return a list of tokens as
            output. If unspecified, the tokenizer uses a function equivalent to
            ``lambda s: s.split(" ")``.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    .. seealso::

        :class:`CountVectorizer`
            Another method for counting token frequencies. Unlike :class:`HashingVectorizer`,
            :class:`CountVectorizer` creates a feature for each unique token. This
            enables you to compute the inverse transformation.

        :class:`FeatureHasher`
            This preprocessor is similar to :class:`HashingVectorizer`, except it expects
            a table describing token frequencies. In contrast,
            :class:`FeatureHasher` expects a column containing documents.
    FNoutput_columnscolumnsnum_featurestokenization_fnr   c                    t                                                       || _        || _        |pt          | _        t          j        ||          | _        d S N)	super__init__r   r   r   r   r	   #_derive_and_validate_output_columnsr   )selfr   r   r   r   	__class__s        u/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/data/preprocessors/vectorizer.pyr   zHashingVectorizer.__init__}   sX     	(.H2H*N^
 
    dfc                    	 dt           t                   dt          f fd}t           j         j                  D ]\  }}||                              j                  }|                    |          }g }t           j	                  D ]9	|                    	fd          }d	 |_
        |                    |           :t          j        |d          j                                        ||<   |S )Ntokensreturnc                 >    fd| D             }t          |          S )Nc                 :    g | ]}t          |j                  S  )r
   r   ).0tokenr   s     r   
<listcomp>zKHashingVectorizer._transform_pandas.<locals>.hash_count.<locals>.<listcomp>   s&    WWWu[0ABBWWWr   r   )r"   hashed_tokensr   s     r   
hash_countz7HashingVectorizer._transform_pandas.<locals>.hash_count   s+    WWWWPVWWWM=)))r   c                     |          S r   r&   )countsis    r   <lambda>z5HashingVectorizer._transform_pandas.<locals>.<lambda>   s    6!9 r   hash_   axis)r   strr   zipr   r   mapr   ranger   nameappendpdconcatvaluestolist)
r   r    r+   col
output_col	tokenizedhashedhash_columnsseriesr.   s
   `        @r   _transform_pandasz#HashingVectorizer._transform_pandas   s   	*tCy 	*W 	* 	* 	* 	* 	* 	*  #4<1DEE 
	M 
	MOC3D$899I]]:..FL4,-- , ,$<$<$<$<==)akk##F++++Y|!<<<CJJLLBzNN	r   c           
          t          | j        d| j                  }| j        j         d| j        d| j        d| d| j        d
S )N__name__	(columns=z, num_features=, tokenization_fn=, output_columns=))getattrr   r   rF   r   r   r   r   fn_names     r   __repr__zHashingVectorizer.__repr__   sp    $.
D<PQQ~& 7 7 7 7 -7 7CJ7 7"17 7 7	
r   r   )rF   
__module____qualname____doc___is_fittabler   r4   intr   r   r   r:   	DataFramerD   rN   __classcell__r   s   @r   r   r      s        i iV L AE	
 /3
 
 
c
 
 "(C5$s)+;"<=	
 !c+
 
 
 
 
 
 BL    &
 
 
 
 
 
 
r   r   c                        e Zd ZdZ	 	 ddddee         deeegee         f                  dee         deee                  f fdZ	d	d
de
fdZdej        fdZd Z xZS )CountVectorizera  Count the frequency of tokens in a column of strings.

    :class:`CountVectorizer` operates on columns that contain strings. For example:

    .. code-block::

                        corpus
        0    I dislike Python
        1       I like Python

    This preprocessor creates a list column for each input column. Each list contains
    the frequency counts of tokens in order of their first appearance. For example:

    .. code-block::

                    corpus
        0    [1, 1, 1, 0]  # Counts for [I, dislike, Python, like]
        1    [1, 0, 1, 1]  # Counts for [I, dislike, Python, like]

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import CountVectorizer
        >>>
        >>> df = pd.DataFrame({
        ...     "corpus": [
        ...         "Jimmy likes volleyball",
        ...         "Bob likes volleyball too",
        ...         "Bob also likes fruit jerky"
        ...     ]
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>>
        >>> vectorizer = CountVectorizer(["corpus"])
        >>> vectorizer.fit_transform(ds).to_pandas()  # doctest: +SKIP
                             corpus
        0  [1, 0, 1, 1, 0, 0, 0, 0]
        1  [1, 1, 1, 0, 0, 0, 0, 1]
        2  [1, 1, 0, 0, 1, 1, 1, 0]

        You can limit the number of tokens in the vocabulary with ``max_features``.

        >>> vectorizer = CountVectorizer(["corpus"], max_features=3)
        >>> vectorizer.fit_transform(ds).to_pandas()  # doctest: +SKIP
              corpus
        0  [1, 0, 1]
        1  [1, 1, 1]
        2  [1, 1, 0]

        :class:`CountVectorizer` can also be used in append mode by providing the
        name of the output_columns that should hold the encoded values.

        >>> vectorizer = CountVectorizer(["corpus"], output_columns=["corpus_counts"])
        >>> vectorizer.fit_transform(ds).to_pandas()  # doctest: +SKIP
                               corpus             corpus_counts
        0      Jimmy likes volleyball  [1, 0, 1, 1, 0, 0, 0, 0]
        1    Bob likes volleyball too  [1, 1, 1, 0, 0, 0, 0, 1]
        2  Bob also likes fruit jerky  [1, 1, 0, 0, 1, 1, 1, 0]

    Args:
        columns: The columns to separately tokenize and count.
        tokenization_fn: The function used to generate tokens. This function
            should accept a string as input and return a list of tokens as
            output. If unspecified, the tokenizer uses a function equivalent to
            ``lambda s: s.split(" ")``.
        max_features: The maximum number of tokens to encode in the transformed
            dataset. If specified, only the most frequent tokens are encoded.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.
    Nr   r   r   max_featuresr   c                    t                                                       || _        |pt          | _        || _        t          j        ||          | _        d S r   )	r   r   r   r   r   rY   r	   r   r   )r   r   r   rY   r   r   s        r   r   zCountVectorizer.__init__   sX     	.H2H(*N^
 
r   datasetr   r#   c                 d      fd j                             fdd  j                    S )Nc                    	 dt           j        dt          t                   ffd}
                    |d          }d j        D             }|                    d           D ]<}|                                D ]%\  }}|D ]}||                             |           &=dt          d	t          fd
		fd|
                                D             } fdt          j        |          D             S )Nr    r#   c                 :      fdfdj         D             S )Nc                     |                               j                  }|                                }t          |          S r   )applyr   sumr   )r>   token_seriesr"   r    r   s      r   get_token_countsz\CountVectorizer._fit.<locals>.stat_fn.<locals>.get_pd_value_counts.<locals>.get_token_counts  s8    #%c7==1E#F#FL)--//F"6??*r   c                 *    i | ]}| |          gS r&   r&   )r'   r>   rc   s     r   
<dictcomp>zVCountVectorizer._fit.<locals>.stat_fn.<locals>.get_pd_value_counts.<locals>.<dictcomp>  s*    MMM..s334MMMr   )r   )r    rc   r   s   `@r   get_pd_value_countszBCountVectorizer._fit.<locals>.stat_fn.<locals>.get_pd_value_counts  sB    + + + + + +
 NMMMMMMMr   pandas)batch_formatc                 ,    i | ]}|t                      S r&   r   )r'   r>   s     r   re   z9CountVectorizer._fit.<locals>.stat_fn.<locals>.<dictcomp>  s    CCCsCCCCr   )
batch_sizecounternc                 `    t          t          |                     |                              S r   )r   dictmost_common)rk   rl   s     r   ro   z:CountVectorizer._fit.<locals>.stat_fn.<locals>.most_common  s&    tG$7$7$:$:;;<<<r   c                 2    g | ]} |j                   S r&   )rY   )r'   rk   ro   r   s     r   r)   z9CountVectorizer._fit.<locals>.stat_fn.<locals>.<listcomp>  s8        GT%677  r   c                 .    i | ]\  }} |          |S r&   r&   )r'   r>   r-   key_gens      r   re   z9CountVectorizer._fit.<locals>.stat_fn.<locals>.<dictcomp>  s7       !S& f  r   )r:   rT   r   r   map_batchesr   iter_batchesitemsupdaterS   r<   r5   )rr   rf   value_countstotal_countsbatchr>   countersrk   
top_countsro   r[   r   s   `        @r   stat_fnz%CountVectorizer._fit.<locals>.stat_fn  sz   N Ng N N N N N N #..#( /  L DCdlCCCL%22d2CC : :%*[[]] : :MC#+ : :$S)009999::=W = = = = =    +2244  J
   %(z%B%B   r   c                      |           S r   r&   )rr   r|   s    r   r/   z&CountVectorizer._fit.<locals>.<lambda>%  s    GGG$4$4 r   c                     d|  dS )Ntoken_counts(rJ   r&   )r>   s    r   r/   z&CountVectorizer._fit.<locals>.<lambda>&  s    $:C$:$:$: r   )r|   stat_key_fnr   )stat_computation_planadd_callable_statr   )r   r[   r|   s   ``@r   _fitzCountVectorizer._fit  sc    	 	 	 	 	 	> 	"444444::L 	5 	
 	
 	
 r   r    c                 P  
 g }t          | j        | j                  D ]\  }}| j        d| d         }d |                                D             }||                             | j                                      t                    }g }|D ]6
|                    
fd          }	
|	_        |	                    |	           7|r1t          j        |d          j                                        ||<   ng gt          |          z  ||<   |	                    |           |S )Nr   rJ   c                     g | ]\  }}|S r&   r&   )r'   r(   counts      r   r)   z5CountVectorizer._transform_pandas.<locals>.<listcomp>0  s    TTT~uUTTTr   c                     |          S r   r&   )valr(   s    r   r/   z3CountVectorizer._transform_pandas.<locals>.<lambda>6  s    3u: r   r1   r2   )r5   r   r   stats_ro   r6   r   r   r8   r9   r:   r;   r<   r=   len)r   r    result_columnsr>   r?   token_countssorted_tokensr@   token_columnsrC   r(   s             @r   rD   z!CountVectorizer._transform_pandas,  s:   "4<1DEE 	. 	.OC;'=s'='='=>LTT9Q9Q9S9STTTM3D$899==gFFI M& - -"'='='='=>>#$$V,,,,  0!#=q!A!A!A!H!O!O!Q!Q:"$B:!!*----	r   c           
          t          | j        d| j                  }| j        j         d| j        d| d| j        d| j        d
S )NrF   rG   rH   z, max_features=rI   rJ   )rK   r   r   rF   r   rY   r   rL   s     r   rN   zCountVectorizer.__repr__C  sp    $.
D<PQQ~& 7 7 7 7&7 77;7H7 7"17 7 7	
r   )NN)rF   rO   rP   rQ   r   r4   r   r   rS   r   r	   r   r:   rT   rD   rN   rU   rV   s   @r   rX   rX      s       G GX AE&*	
 /3
 
 
c
 "(C5$s)+;"<=
 sm	
 !c+
 
 
 
 
 
 &I &, & & & &PBL    .
 
 
 
 
 
 
r   rX   )collectionsr   typingr   r   r   r   rg   r:   ray.data.preprocessorr	   ray.data.preprocessors.utilsr
   r   ray.util.annotationsr   ray.data.datasetr   r   rX   r&   r   r   <module>r      sB         : : : : : : : : : : : :     . . . . . . L L L L L L L L * * * * * * )(((((( WW
 W
 W
 W
 W
 W
 W
 W
t W_
 _
 _
 _
 _
l _
 _
 _
 _
 _
r   