
    Pi                     v    d dl mZmZ d dlmZ d dlmZ d dlmZ  ej	        d          Z
 G d de          ZdS )	    )ListTuple)Dataset)utils)PackedDatasetDEBUGc                   n    e Zd ZdZdee         fdZdedeee         ee         f         fdZ	defdZ
dS )	ConcatDatasetaR	  
    A dataset class for concatenating multiple sub-datasets into a single dataset. This class enables the
    unified handling of different datasets as if they were a single dataset, simplifying tasks such as
    training models on multiple sources of data simultaneously.

    The class internally manages the aggregation of different datasets and allows transparent indexing across them.
    However, it requires all constituent datasets to be fully loaded into memory, which might not be optimal for
    very large datasets.

    Upon initialization, this class computes the cumulative length of all datasets and maintains an internal mapping
    of indices to the respective datasets. This approach allows the :class:`~torchtune.datasets.ConcatDataset`
    to delegate data retrieval to the appropriate sub-dataset transparently when a particular index is accessed.

    Note:
        Using this class with very large datasets can lead to high memory consumption, as it requires all datasets to
        be loaded into memory. For large-scale scenarios, consider other strategies that might stream data on demand.

    Args:
        datasets (List[Dataset]): A list of datasets to concatenate. Each dataset must be an instance of a class
            derived from :class:`~torch.utils.data.Dataset`.

    Raises:
        ValueError: if instanse of `PackedDataset` is in `datasets`

    Examples:
        >>> dataset1 = MyCustomDataset(params1)
        >>> dataset2 = MyCustomDataset(params2)
        >>> concat_dataset = ConcatDataset([dataset1, dataset2])
        >>> print(len(concat_dataset))  # Total length of both datasets
        >>> data_point = concat_dataset[1500]  # Accesses an element from the appropriate dataset

    This can also be accomplished by passing in a list of datasets to the YAML config::

        dataset:
          - _component_: torchtune.datasets.instruct_dataset
            source: vicgalle/alpaca-gpt4
            split: train
            train_on_input: True
            packed: True
          - _component_: torchtune.datasets.grammar_dataset
            split: train[:1%]
            train_on_input: False
            packed: True

    This class primarily focuses on providing a unified interface to access elements from multiple datasets,
    enhancing the flexibility in handling diverse data sources for training machine learning models.
    datasetsc                    || _         d |D             }t          |          rt          |          st          d          t          |          | _        t          d |D                       | _        g | _        d}t          |          D ]6\  }}|t          |          z   }| j        
                    |||f           |}7d S )Nc                 8    g | ]}t          |t                    S  )
isinstancer   .0datasets     n/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/datasets/_concat.py
<listcomp>z*ConcatDataset.__init__.<locals>.<listcomp>F   s"    PPPGZ77PPP    zDConcatDataset can't process a mix of packed and non-packed datasets.c              3   4   K   | ]}t          |          V  d S N)lenr   s     r   	<genexpr>z)ConcatDataset.__init__.<locals>.<genexpr>L   s(      BBgS\\BBBBBBr   r   )	_datasetsanyall
ValueErrorpackedsum_len_indexes	enumerater   append)selfr   	is_packedcumulative_indexidxr   next_cumulative_indexs          r   __init__zConcatDataset.__init__C   s    (0PPxPPP	y>> 	#i.. 	V   )nnBBBBBBB	46 %h// 	5 	5LC$4s7||$C!M  "24I3!OPPP4	5 	5r   indexreturnc                 v    | j         D ]0\  }}}||cxk    r|k     rn | j        |         }|||z
           c S 1d S r   )r!   r   )r$   r*   startstopdataset_indexr   s         r   __getitem__zConcatDataset.__getitem__V   sl    *.- 	. 	.&E4$$$$$$$$$.7uu}---- %	. 	.r   c                     | j         S r   )r    )r$   s    r   __len__zConcatDataset.__len__\   s
    yr   N)__name__
__module____qualname____doc__r   r   r)   intr   r0   r2   r   r   r   r
   r
      s        . .`5g 5 5 5 5&. .tCy$s)/C)D . . . .      r   r
   N)typingr   r   torch.utils.datar   	torchtuner   torchtune.datasets._packedr   
get_loggerlogr
   r   r   r   <module>r>      s            $ $ $ $ $ $       4 4 4 4 4 4ewK K K K KG K K K K Kr   