
    PiS                         d dl mZmZmZmZmZmZmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ  G d de          Z	 	 	 	 	 	 ddedededededededee         deeef         deeef         fdZdS )    )AnyCallableDictListMappingOptionalUnion)load_dataset)Dataset)truncate)PackedDataset)ModelTokenizerc                       e Zd ZdZ	 	 	 ddedededed	ee         d
e	ee
f         ddfdZd Zdede	eee         f         fdZdeee
f         de	eee         f         fdZdS )TextCompletionDataseta  
    Freeform dataset for any unstructured text corpus. Quickly load any dataset
    from Hugging Face or local disk and tokenize it for your model.

    Args:
        tokenizer (ModelTokenizer): Tokenizer used by the model that implements the ``tokenize_messages`` method.
        source (str): path to dataset repository on Hugging Face. For local datasets,
            define source as the data file type (e.g. "json", "csv", "text") and pass
            in the filepath in ``data_files``. See Hugging Face's ``load_dataset``
            (https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset.path)
            for more details.
        column (str): name of column in the sample that contains the text data. This is typically required
            for Hugging Face datasets or tabular data. For local datasets with a single column
            (e.g. unstructured txt files), use the default "text" which is used by Hugging Face datasets
            when loaded into memory. Default is "text".
        add_eos (bool): Whether to add an EOS token to the end of the sequence. Default is True.
        filter_fn (Optional[Callable]): callable used to filter the dataset prior to any pre-processing. See
            the Hugging Face `docs <https://huggingface.co/docs/datasets/v2.20.0/process#select-and-filter>`_ for more
            details.
        **load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``. See Hugging
            Face's `API ref <https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset>`_
            for more details.
    textTN	tokenizersourcecolumnadd_eos	filter_fnload_dataset_kwargsreturnc                     || _         t          |fi || _        || _        || _        |!| j                            |          | _        d S d S N)
_tokenizerr
   _data_columnr   filter)selfr   r   r   r   r   r   s          w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/datasets/_text_completion.py__init__zTextCompletionDataset.__init__)   sZ     $!&@@,?@@
 **955DJJJ !     c                 *    t          | j                  S r   )lenr   )r   s    r    __len__zTextCompletionDataset.__len__:   s    4:r"   indexc                 F    | j         |         }|                     |          S r   )r   _prepare_sample)r   r&   samples      r    __getitem__z!TextCompletionDataset.__getitem__=   s"    E"##F+++r"   r)   c                     || j                  }| j                            |d| j                  }| j        j        t          || j        j        dz
            }|                                }||dS )NT)r   add_bosr      )tokenslabels)r   r   encoder   max_seq_lenr   copy)r   r)   promptr.   r/   s        r    r(   z%TextCompletionDataset._prepare_sampleA   so    %''VT4<'XX ?&2fdo&AA&EFFF  F333r"   )r   TN)__name__
__module____qualname____doc__r   strboolr   r   r   r   r!   r%   intr   r*   r   r(    r"   r    r   r      s	        8 (,6 6!6 6 	6
 6 H%6  $CH~6 
6 6 6 6"  , ,c49n)= , , , ,4gc3h&7 4Dd3i<P 4 4 4 4 4 4r"   r   r   TFtrainNr   r   r   r   packedsplit_across_packsplitr   r   r   c           
          t          d| |||||d|}	|r-| j        t          d          t          |	| j        |          S |	S )a5  
    Build a configurable dataset from a freeform, unstructured text corpus similar
    to datasets used in pre-training. This method should be
    used to configure a custom text dataset from the yaml config instead of
    using :class:`~torchtune.datasets.TextCompletionDataset` directly, as it is made to be config friendly.

    Args:
        tokenizer (ModelTokenizer): Tokenizer used by the model that implements the ``tokenize_messages`` method.
        source (str): path to dataset repository on Hugging Face. For local datasets,
            define source as the data file type (e.g. "json", "csv", "text") and pass
            in the filepath in ``data_files``. See Hugging Face's ``load_dataset``
            (https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset.path)
            for more details.
        column (str): name of column in the sample that contains the text data. This is typically required
            for Hugging Face datasets or tabular data. For local datasets with a single column
            (e.g. unstructured txt files), use the default "text" which is used by Hugging Face datasets
            when loaded into memory. Default is "text".
        add_eos (bool): Whether to add an EOS token to the end of the sequence. Default is True.
        packed (bool): Whether or not to pack the dataset to ``max_seq_len`` prior to training. Default is False.
        split_across_pack (bool): if the last sample in a pack does not fit in ``max_seq_len``,
            split the sample into the next pack, or move it entirely to the beginning of the next pack.
            For pre-training, typically this is set to True for general text completion. For
            fine-tuning, typically this is set to False to avoid truncating sentences in instruct
            tuning. This argument is ignored if ``packed=False``. Default is True.
        split (str): ``split`` argument for ``datasets.load_dataset``. You can use this argument to load a subset
            of a given split, e.g. ``split="train[:10%]"``. Default is "train".
        filter_fn (Optional[Callable]): callable used to filter the dataset prior to any pre-processing. See
            the Hugging Face `docs <https://huggingface.co/docs/datasets/v2.20.0/process#select-and-filter>`_ for more
            details.
        **load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``.

    Examples:
        >>> from torchtune.datasets import text_completion_dataset
        >>> dataset = text_completion_dataset(
        ...   tokenizer=tokenizer,
        ...   source="allenai/c4",
        ...   column="text",
        ...   data_dir="realnewslike",
        ...   packed=False,
        ...   split="train",
        ... )

    This can also be accomplished via the yaml config::

        dataset:
            _component_: torchtune.datasets.text_completion_dataset
            source: allenai/c4
            column: text
            data_dir: realnewslike
            packed: False
            split: train

    Returns:
        Union[TextCompletionDataset, PackedDataset]: the configured :class:`~torchtune.datasets.TextCompletionDataset`
            or :class:`~torchtune.datasets.PackedDataset` if ``packed=True``

    Raises:
        ValueError: If ``packed=True`` and ``tokenizer.max_seq_len`` is not set.
    )r   r   r   r   r?   r   Nz@PackedDataset requires a max_seq_len to be set on the tokenizer.)r1   r>   r;   )r   r1   
ValueErrorr   )
r   r   r   r   r=   r>   r?   r   r   dss
             r    text_completion_datasetrC   P   s    L 
 

 
 
 
B  
 (R   I1EV
 
 
 	
 Ir"   )r   TFTr<   N)typingr   r   r   r   r   r   r	   datasetsr
   torch.utils.datar   torchtune.data._utilsr   torchtune.datasets._packedr   'torchtune.modules.transforms.tokenizersr   r   r8   r9   rC   r;   r"   r    <module>rJ      s   G F F F F F F F F F F F F F F F F F ! ! ! ! ! ! $ $ $ $ $ $ * * * * * * 4 4 4 4 4 4 B B B B B B=4 =4 =4 =4 =4G =4 =4 =4F "$(W WWW W 	W
 W W W !W  S>W  -/0W W W W W Wr"   