
    Pi                           d dl mZmZmZmZmZ d dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ  G d d	e          Z G d
 de          ZdS )    )AnyCallableDictMappingOptionalN)load_dataset)Dataset)CROSS_ENTROPY_IGNORE_IDX)validate_messages)	Transformc                   z    e Zd ZdZdddedededee         deee	f         d	dfd
Z
d Zded	eee	f         fdZdS )
SFTDatasetaA  
    Primary class for creating any dataset for supervised fine-tuning either from
    Hugging Face Hub, local files, or remote files. This class supports instruct,
    chat, tool, or multimodal data for fine-tuning. At a high level, this class
    will load the data from source and apply the following pre-processing steps
    when a sample is retrieved:

    1. Dataset-specific transform. This is typically unique to each dataset and extracts
       the necessary columns into torchtune's :class:`~torchtune.data.Message` format,
       a standardized API for all model tokenizers.
    2. Model-specific transform or tokenization with optional prompt template


    All datasets are formatted into a list of :class:`~torchtune.data.Message`
    because for fine-tuning, datasets can be considered as "conversations" with the model,
    or AI assistant. Thus, we can standardize all text content as messages in a conversation assigned to
    a role:

    - ``"system"`` messages contain the system prompt
    - ``"user"`` messages contain the input prompt into the model
    - ``"assistant"`` messages are the response of the model and what you actually want
      to train for and compute loss directly against
    - ``"ipython"`` messages are the return from a tool call

    Chat datasets are multiple rounds of user-assistant messages. Instruct datasets
    are typically a single round involving a specific instruction and the model's response.
    Tool datasets are a type of chat dataset that includes ipython messages. Multimodal
    datasets are a type of chat dataset that incorporates media into the user messages.

    The :class:`~torchtune.data.Message` forms the core data unit that all tokenizer
    APIs expect. The key component of this class that ensures any dataset is transformed
    into this format is the ``message_transform``. This is a callable class that takes
    in a sample dictionary - typically a single row from the source dataset - that
    processes the sample in any configurable way to output a list of messages::

        [
            Message(
                role=<system|user|assistant|ipython>,
                content=<message>,
            ),
            ...
        ]

    For any custom dataset, use the ``message_transform`` to contain all pre-processing to
    return the list of messages.

    Any model-specific pre-processing that needs to happen can be configured with the ``model_transform``
    parameter. This is another callable class that contains any custom logic tied to the
    model you are fine-tuning and will carry over to inference. For example, text + image
    multimodal datasets requires processing the images in a way specific to the vision
    encoder being used by the model and is agnostic to the specific dataset.

    Tokenization is handled by the ``model_transform``. All
    :class:`~torchtune.modules.transforms.tokenizers.ModelTokenizer` can be treated as
    a ``model_transform`` since it uses the model-specific tokenizer to transform the
    list of messages outputted from the ``message_transform`` into tokens used by the
    model for training. Text-only datasets will simply pass the
    :class:`~torchtune.modules.transforms.tokenizers.ModelTokenizer` into ``model_transform``.
    Tokenizers handle prompt templating, if configured.

    Args:
        source (str): path to dataset repository on Hugging Face. For local datasets,
            define source as the data file type (e.g. "json", "csv", "text") and pass
            in the filepath in ``data_files``. See `Hugging Face's
            <https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset.path>`_
            ``load_dataset`` for more details.
        message_transform (Transform): callable that keys into the desired fields in the sample
            and converts text content to a list of :class:`~torchtune.data.Message`. It is expected that the final list
            of messages are stored in the ``"messages"`` key. See :ref:`message_transform_usage_label` for details.
        model_transform (Transform): callable that applies model-specific pre-processing to the sample after the list of
            messages is created from ``message_transform``. This includes tokenization and any modality-specific
            transforms. It is expected to return at minimum ``"tokens"`` and ``"mask"`` keys.
        filter_fn (Optional[Callable]): callable used to filter the dataset prior to any pre-processing. See
            the Hugging Face `docs <https://huggingface.co/docs/datasets/v2.20.0/process#select-and-filter>`_ for more
            details.
        **load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``. See Hugging
            Face's `API ref <https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset>`_
            for more details.
    N)	filter_fnsourcemessage_transformmodel_transformr   load_dataset_kwargsreturnc                    || _         || _        t          |fi || _        || j                            |          | _        t          | j         | j                  | _        d S )N)r   r   )_message_transform_model_transformr   _datafilterSFTTransform_prepare_sample)selfr   r   r   r   r   s         k/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/datasets/_sft.py__init__zSFTDataset.__init__d   sr     #4 /!&@@,?@@
 **955DJ+"5 1 
  
  
    c                 *    t          | j                  S N)lenr   )r   s    r   __len__zSFTDataset.__len__y   s    4:r   indexc                 F    | j         |         }|                     |          S r!   )r   r   )r   r$   samples      r   __getitem__zSFTDataset.__getitem__|   s"    E"##F+++r   )__name__
__module____qualname____doc__strr   r   r   r   r   r   r#   intr'    r   r   r   r      s        N Nl )-
 
 
 
 %	

 #
 H%
  $CH~
 

 
 
 
*  , ,c3h , , , , , ,r   r   c                   l    e Zd Z	 	 ddee         dee         fdZdeeef         de	eef         fdZ
dS )	r   Nr   r   c                 H    ||t          d          || _        || _        d S )NzFAt least one of message_transform or model_transform must be provided.)
ValueErrorr   r   )r   r   r   s      r   r   zSFTTransform.__init__   s<    
 $)@X   #4 /r   r&   r   c                    | j         /|                      |          }d|v rt          |d                    n|}| j        |                     |          }d|v rd|v s<d                    |                                          }d| d}t          |          t          t          j        |d         t          |d                             |d<   t          |d                   t          |d                   k    sJ n|}|S )Nmessagestokensmaskz, z-model_transform returned the following keys: z*. Must return 'tokens' and 'mask' as keys.labels)r   r   r   joinkeysr1   listnpwherer
   r"   )r   r&   transformed_sampletokenized_dictkeys_strerror_messages         r   __call__zSFTTransform.__call__   s8   ".!%!8!8!@!@///!"4Z"@AAA!' ,!223EFFN..6^3K3K99^%8%8%:%:;;LL L L  !/// (,"6*,"8, ( (N8$ ~h/00Cx8P4Q4QQQQQQ/Nr   )NN)r(   r)   r*   r   r   r   r   r,   r   r   r@   r.   r   r   r   r      s         26/3
0 
0#I.
0 "),
0 
0 
0 
0wsCx0 T#s(^      r   r   )typingr   r   r   r   r   numpyr:   datasetsr   torch.utils.datar	   torchtune.data._commonr
   torchtune.data._messagesr   torchtune.modules.transformsr   r   r   r.   r   r   <module>rH      s    : 9 9 9 9 9 9 9 9 9 9 9 9 9     ! ! ! ! ! ! $ $ $ $ $ $ ; ; ; ; ; ; 6 6 6 6 6 6 2 2 2 2 2 2k, k, k, k, k, k, k, k,\, , , , ,9 , , , , ,r   