
    Pi*                     r    d dl mZmZmZ d dlZd dlmZ d dlm	Z	 d dl
mZmZ d dlmZ  G d de	          ZdS )	    )DictListOptionalN)
functional)Dataset)CROSS_ENTROPY_IGNORE_IDX	PACK_TYPE)tqdmc                       e Zd ZdZdddddededed	ee         d
eddfdZddZ	defdZ
dedefdZdeddfdZdedefdZdededefdZdefdZdedeeej        f         fdZdS )PackedDataseta  
    Performs greedy sample packing on a provided dataset. This is done as a single
    preprocessing step before training begins. Shuffling is done outside of this
    class on packed samples with a ``Sampler`` as part of the dataloader. Currently,
    this only supports in-memory map-style datasets.

    The class loads, tokenizes, and packs examples on initialization - no tokenization is done during training.

    The general flow on initialization is: load tokenized sample -> add to buffer ->
    when buffer is long enough, add to ``self.packs``.

    During training, returns self.packs[idx] as input, label, attention mask, and
    position ids. The attention mask is a lower triangular block mask to prevent
    samples from cross-attending within a pack. The position ids indicate the position
    of each token relative to its sample within a pack. These are all padded to max
    sequence length, so a batch-wise collator is not needed.

    A packed sample is made up of individual smaller sequence length samples jammed together
    within ``max_seq_len``. For example, if max_seq_len is 6 and there are varied
    length samples::

        tokens = [
            [S1, S1, S1, S2, S2, pad],
            [S3, S3, S4, S4, pad, pad],
            ...,
        ]

    To prevent cross-contamination, the following mask would be returned for the
    first pack in the example::

        mask = [
            [1, 0, 0, 0, 0, 0],
            [1, 1, 0, 0, 0, 0],
            [1, 1, 1, 0, 0, 0],
            [0, 0, 0, 1, 0, 0],
            [0, 0, 0, 1, 1, 0],
            [0, 0, 0, 0, 0, 1],
        ]

    The position ids would be::

        input_pos = [
            [0, 1, 2, 0, 1, 2],
            [0, 1, 0, 1, 2, 3],
            ...,
        ]

    The identity matrix is used in the mask for pad tokens instead of a causal mask.
    For position ids for pad tokens, we simply continue to increment from the previous
    sample normally.

    Args:
        ds (Dataset): dataset to sample pack. This should return a dictionary with field
            "tokens" and "labels" containing the tokenized and label samples.
        max_seq_len (int): Maximum number of tokens to pack
        padding_idx (int): padding index for the tokenizer. Default is 0.
        max_packs (Optional[int]): Maximum number of packs. Default is None, which will create as many
            packs as possible.
        split_across_pack (bool): if the last sample in a pack does not fit in ``max_seq_len``,
            split the sample into the next pack, or move it entirely to the beginning of the next pack.
            For pre-training, typically this is set to True for general text completion. For
            fine-tuning, typically this is set to False to avoid truncating sentences in instruct
            tuning. Default is False.
    r   NF)padding_idx	max_packssplit_across_packdsmax_seq_lenr   r   r   returnc                    || _         || _        || _        || _        || _        g | _        d| _        |                                  d S )Nr   )r   r   r   r   r   packsprevious_sample_boundary_pack)selfr   r   r   r   r   s         n/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/datasets/_packed.py__init__zPackedDataset.__init__S   sJ     &&"!2&(
-.%

    c                     g g g g d}t           j                                        r<t           j                                        rt           j                                        nd}|dk    r$t          t           j                  dd          } j        D ]l}|d         |d         }}t          |          }| j        k    r" j	        st          d| d	 j         d
          |dxx         |z  cc<   |dxx         |z  cc<   |dxx          fdt          |          D             z  cc<   |dxx         |gz  cc<   t          |d                    j        k    r[                                 sG                     |          }t          |d                    j        k    r                                 G|dk    r|                                 t          |d                    _                                         r nnt          |d                   dk    r; j        t           j                   j        k     r                     |           dS dS dS )zIterate through the dataset. Use a buffer to hold samples until max_seq_len,
        then append the buffer to self.packs as a single "packed" sample. Continue
        until max_packs or end of dataset.tokenslabels	input_posseq_lensr   zPacking datasetT)totaldescdynamic_ncolsr   r   zDataset sample is too long (z > zA). Please set `split_across_pack=True` or increase `max_seq_len`.r   c                 $    g | ]}|j         z  S  )r   ).0xr   s     r   
<listcomp>z'PackedDataset._pack.<locals>.<listcomp>   s!    )W)W)W1!d.>*>)W)W)Wr   r    N)torchdistributedis_availableis_initializedget_rankr
   lenr   r   r   
ValueErrorrange_should_stop_packing_split_and_add_packupdater   r   r   	_add_pack)r   current_packrankpbarsampler   r   seq_lens   `       r   r   zPackedDataset._packf   s    	
 
  --//494E4T4T4V4VE&&((( 	
 199c$'ll1BRVWWWDg "	 "	F#H-vh/?FF
 &kkG)))$2H) U7 U Ut?O U U U   """f,""""""f,"""%%%)W)W)W)Wg)W)W)WW%%%$$$	1$$$
 L*++d.>>>1133 ?  $77EE L*++d.>>>1133 ?
 qyy -0X0F,G,GD)((**  |H%&&**N"c$*oo&F&F NN<(((((	 +*&F&Fr   c                 R    | j         t          | j                  | j         k    rdS dS )z<If max packs is set, stop packing when we reach that number.NTF)r   r.   r   r   s    r   r1   z"PackedDataset._should_stop_packing   s*     >%#dj//T^*K*K4ur   r5   c                    | j         r8| j        }| j        t          |d         dd                   z
  }|dk    r|gng }n	| j        }g }|d         d|         |d         d|         |d         d|         |d         dd         |z   d}|                     |           | j         rt          |d         |d                   n|d         d         }|d         |d         |d         |d         |d         |d         |gdS )	zSplits the current pack at the boundary, processes it, adds it to ``self.packs`` and
        returns the start of the next pack.r    Nr   r   r   r   r   )r   r   sumr   r4   r.   )r   r5   boundaryleftover_seq_lenseq_len_paddingpacknext_seq_lens          r   r2   z!PackedDataset._split_and_add_pack   sS    ! 		!'H#/#l:6NsPRs6S2T2TT4Dq4H4H/00bOO4H !O #8,YhY7"8,YhY7%k29H9=$Z0"5G	
 
 	t %.CX&xyy1222j)"- 	 #8,XYY7"8,XYY7%k2899=%	
 
 	
r   rB   c                     |                      |          }|                     || j                  }| j                            |           dS )z2Processes, pads and adds a pack to ``self.packs``.)r   N)_convert_to_tensors	_pad_packr   r   appendr   rB   s     r   r4   zPackedDataset._add_pack   sI    ''--~~d0@~AA
$r   c                 0   t          j        |d         t           j                  t          j        |d         t           j                  t          j        |d         t           j                  t          j        |d         t           j                  dS )z[Converts a pack into tensors. Pack comes in as a dict of lists and is converted to tensors.r   )dtyper   r   r    r   )r)   tensorlongrH   s     r   rE   z!PackedDataset._convert_to_tensors   su     l4>DDDl4>DDDd;&7uzJJJT*%5UZHHH	
 
 	
r   c           	         | j         t          |d                   z
  }t          j        |d         d|f|          }t          j        |d         d| j         t          |d                   z
  ft                    }|dk    r/t          j        |d         t          j        |g          g          n|d         }t          j        |d         d         dz   |d         d         | j         z   t          |d                   z
  dz             }t          j	        |d| j         dz
            }t          j        |d         |g          }	|||	|d	S )
z$Pads a pack to ``self.max_seq_len``.r   r   )valuer   r    r   r=      r   )
r   r.   Fpadr   r)   catrK   arangeclamp)
r   rB   r   num_padding_tokenspadded_tokenspadded_labelspadded_seq_lens	num_rangeclamped_num_rangepadded_input_poss
             r   rF   zPackedDataset._pad_pack   sq    "-DN0C0CCN"#
 
 
 N 3tH~#6#667*
 
 
 "A%% ItJ'7I6J)K)KLMMMj! 	 Lb!A%b!D$44s4;L7M7MMPQQ
 
	
 "K	1d6F6JKK 9d;&79J%KLL $#)'	
 
 	
r   c                 *    t          | j                  S N)r.   r   r;   s    r   __len__zPackedDataset.__len__  s    4:r   idxc                     | j         |         S r]   )r   )r   r_   s     r   __getitem__zPackedDataset.__getitem__  s    z#r   )r   N)__name__
__module____qualname____doc__r   intr   boolr   r   r1   r	   r2   r4   rE   rF   r^   r   strr)   Tensorra   r%   r   r   r   r      s       ? ?L #'"'   	
  C=   
   &>) >) >) >)@d    &
	 &
i &
 &
 &
 &
P i  D        
	 
i 
 
 
 
(
i (
c (
i (
 (
 (
 (
T    s tC,='>      r   r   )typingr   r   r   r)   torch.nnr   rP   torch.utils.datar   torchtune.data._commonr   r	   r
   r   r%   r   r   <module>rn      s    ( ' ' ' ' ' ' ' ' '  $ $ $ $ $ $ $ $ $ $ $ $ F F F F F F F F      A A A A AG A A A A Ar   