§
    ÇPƒi«
  ã                   óŒ   — d dl mZmZmZmZ d dlmZ d dlmZ 	 	 	 	 ddede	d	ee
         d
ee         de	dee	ef         defd„ZdS )é    )ÚAnyÚCallableÚDictÚOptional©ÚTextCompletionDataset)ÚModelTokenizerúccdv/cnn_dailymailNÚtrainÚ	tokenizerÚsourceÚmax_seq_lenÚ	filter_fnÚsplitÚload_dataset_kwargsÚreturnc                 ó*   — t          d| |d|||ddœ|¤ŽS )a  
    Support for family of datasets similar to `CNN / DailyMail <https://huggingface.co/datasets/ccdv/cnn_dailymail>`_,
    a corpus of news articles. This builder only extracts the articles and not the highlights for
    general text completion tasks.

    Args:
        tokenizer (ModelTokenizer): Tokenizer used by the model that implements the ``tokenize_messages`` method.
        source (str): path string of dataset, anything supported by Hugging Face's ``load_dataset``
            (https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset.path)
        max_seq_len (Optional[int]): Maximum number of tokens in the returned input and label token id lists.
            Default is None, disabling truncation. We recommend setting this to the highest you can fit in memory
            and is supported by the model. For example, llama2-7B supports up to 4096 for sequence length.
        filter_fn (Optional[Callable]): callable used to filter the dataset prior to any pre-processing. See
            the Hugging Face `docs <https://huggingface.co/docs/datasets/v2.20.0/process#select-and-filter>`_ for more
            details.
        split (str): ``split`` argument for ``datasets.load_dataset``. You can use this argument to load a subset
            of a given split, e.g. ``split="train[:10%]"``. Default is "train".
        **load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``.

    Returns:
        TextCompletionDataset: the configured TextCompletionDataset
    Úarticlez3.0.0)r   r   Úcolumnr   r   r   Úname© r   )r   r   r   r   r   r   s         úu/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/datasets/_cnn_dailymail.pyÚcnn_dailymail_articles_datasetr      sA   € õ> !ð ØØØØØØð ðð ð ðð ð ó    )r
   NNr   )Útypingr   r   r   r   Ú#torchtune.datasets._text_completionr   Ú'torchtune.modules.transforms.tokenizersr	   ÚstrÚintr   r   r   r   ú<module>r       sÓ   ðð 1Ð 0Ð 0Ð 0Ð 0Ð 0Ð 0Ð 0Ð 0Ð 0Ð 0Ð 0à EÐ EÐ EÐ EÐ EÐ Eà BÐ BÐ BÐ BÐ BÐ Bð
 'Ø!%Ø$(Øð+ð +Øð+àð+ð ˜#”ð+ð ˜Ô!ð	+ð
 ð+ð    S œ>ð+ð ð+ð +ð +ð +ð +ð +r   