
    Pi[                        d dl mZmZmZmZmZmZ d dlZd dlm	c m
Z d dlmZ d dlmZmZ d dlmZ 	 	 ddeej                 ded	ed
ej        fdZdeeeee         f                  dedee         deeeeef         f         fdZd efdeeeee         f                  deded
eeej        f         fdZd edddfdeeeef                  dedededee         dee         d
eeej        f         fdZdee         d
eeej        f         fdZd efdeeeee         f                  deded
eej        ej        f         fdZdS )    )AnyDictListOptionalTupleUnionNpad_sequence)CROSS_ENTROPY_IGNORE_IDX	PACK_TYPE)packed_block_causal_maskF	sequencesbatch_firstpadding_valuereturnc                     t          t          d |           ||                              t          |          g          S )a  
    This function is identical to :func:`torch.nn.utils.rnn.pad_sequence`, but
    instead pads a list of variable length Tensors from the left to the length
    of the longest sequence.

    Note:
        This function returns a Tensor of size ``T x B x *`` or ``B x T x *``
        where `T` is the length of the longest sequence. This function assumes
        trailing dimensions and type of all the Tensors in sequences are same.

    Args:
        sequences (List[torch.Tensor]): list of variable length sequences.
        batch_first (bool): if ``True``, the output will be in ``B x T x *``
            format, ``T x B x *`` otherwise. Default False.
        padding_value (float): value for padded elements. Default: 0.

    Returns:
        Tensor of size ``T x B x *`` if :attr:`batch_first` is ``False``.
        Tensor of size ``B x T x *`` otherwise

    Example:
        >>> a = torch.tensor([1, 2, 3])
        >>> b = torch.tensor([4, 5, 6, 7])
        >>> c = torch.tensor([8, 9, 10, 11, 12])
        >>> left_pad_sequence([a, b, c], batch_first=True, padding_value=0)
        tensor([[ 0,  0,  1,  2,  3],
                [ 0,  4,  5,  6,  7],
                [ 8,  9, 10, 11, 12]])
    c                 0    t          j        | dg          S )Nr   dims)torchflip)xs    k/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/data/_collate.py<lambda>z#left_pad_sequence.<locals>.<lambda>2   s    ej!---     r   r   r   )r
   mapr   int)r   r   r   s      r   left_pad_sequencer      sO    D --y99#   d[!!"d##	$r   batchpad_directionkeys_to_padpadding_idxc          	      p    |dvrt          d|           t          t                    rst          d d          t                    t          |t                    rt          |                                          k    s(t          d|                                 d d          t           d                                                   k    s;t          d	 d
t           d                                                    d          fd d                                         D             } fd|D             }|dk    rt          j        j        j	        j
        nt          }D ]< |fd D             dt          |t                    r|         n|          |<   =|S )a  
    A generic padding collation function which pads ``keys_to_pad`` entries in a
    batch of sequences from the given ``pad_direction`` to the maximum sequence length for
    each entry in the batch.

    Note:
        This function assumes all batch elements which are not in ``keys_to_pad`` do not require
        any collation (see example below).

    Args:
        batch (List[Dict[str, List[int]]]): A list of dictionaries containing inputs.
        pad_direction (str): whether to pad entries from the left, or right. If ``pad_direction="right"``, we use
            :func:`torch.nn.utils.rnn.pad_sequence`, otherwise if ``pad_direction="left"``,
            we use :func:`torchtune.data.left_pad_sequence`.
        keys_to_pad (List[str]): Batch element keys to apply padding to. Should be a subset
            of keys in the batch.
        padding_idx (Union[int, Dict[str, int]]): Either a single integer padding value to apply to all
            ``keys_to_pad`` elements, or a mapping with keys identical to ``keys_to_pad`` with per-key
            padding values.

    Returns:
        torch.Tensor: The padded tensor of input ids with shape ``[batch_size, max_seq_len]``.

    Raises:
        ValueError:
            If ``pad_direction`` is not one of "left" or "right", **or**
            if ``keys_to_pad`` is empty, or is not a list, **or**
            if ``keys_to_pad`` is not a subset of keys in the batch, **or**
            if ``padding_idx`` is provided as a dictionary, but the keys are not identical to ``keys_to_pad``

    Example:
        >>> a = [1, 2, 3]
        >>> b = [4, 5, 6, 7]
        >>> c = [8, 9, 10, 11, 12]
        >>> batch = [
        >>>     {"tokens": a, "labels": 1},
        >>>     {"tokens": b, "labels": 3},
        >>>     {"tokens": c, "labels": 0},
        >>> ]
        >>> padded_collate(
        >>>     batch,
        >>>     pad_direction="left",
        >>>     keys_to_pad=["tokens"],
        >>>     padding_idx=-10
        >>> )
        {
            'labels': tensor([1, 3, 0]),
            'tokens': tensor([[-10, -10,   1,   2,   3],
                              [-10,   4,   5,   6,   7],
                              [  8,   9,  10,  11,  12]])
        }
    leftright;pad_direction should be one of 'left' or 'right' but found zMkeys_to_pad should be a list of strings with at least one element, but found !z8padding_idx was provided as a dictionary, but the keys (z#) are not the same as keys_to_pad ()r   z?keys_to_pad should be a subset of keys in the batch, but found z and z, respectively.c                     g | ]}|v|	S  r,   ).0kr"   s     r   
<listcomp>z"padded_collate.<locals>.<listcomp>   s#    EEE0D0D!0D0D0Dr   c                 T    i | ]#t          j        fd D                       $S )c                      g | ]
}|         S r,   r,   r-   r   r.   s     r   r/   z-padded_collate.<locals>.<dictcomp>.<listcomp>   s    #8#8#8QAaD#8#8#8r   r   tensor)r-   r.   r    s    @r   
<dictcomp>z"padded_collate.<locals>.<dictcomp>   s;    NNNa1el#8#8#8#8%#8#8#899NNNr   r'   c                 D    g | ]}t          j        |                   S r,   r3   r2   s     r   r/   z"padded_collate.<locals>.<listcomp>   s'    ///AU\!A$///r   Tr   )
ValueError
isinstancelistsetdictkeysr   nnutilsrnnr
   r   )r    r!   r"   r#   
batch_keysoutput_dictpad_fnr.   s   ` `    @r   padded_collaterC   8   s6   v ---Y-YY
 
 	
 k4(( 
 
j\gjjj
 
 	
 k""K+t$$ 
;##%%&&+55C;K[K[K]K] C C4?C C C   c%(--//2222KK K%(q%9%9K K K   FEEEU1X]]__EEEJNNNN:NNNK
 G## 	'' 
  
 
///////",[$"?"?PA[	
 
 
A r   
ignore_idxc                    t          d | D             d|          }t          d | D             d|          }|j        d         }|j        d         }||k    rt          j        |d||z
  f|          }n"||k    rt          j        |d||z
  f|          }|                                |                                dS )	a  Pad a batch of sequences to the longest sequence length in the batch, and
    convert integer lists to tensors.

    Args:
        batch (List[Dict[str, List[int]]]): A list of dictionaries containing input, label pairs.
        padding_idx (int): Padding index for input ids. Defaults to 0.
        ignore_idx (int): Padding index for labels. Defaults to -100.

    Returns:
        Dict[str, torch.Tensor]: Collated input and label tensors.

    Example:
        >>> token_pairs = [
        >>>    {"tokens": [1, 2, 3], "labels": [4, 5, 6]},
        >>>    {"tokens": [7,], "labels": [10,]},
        >>> ]
        >>> collated = padded_collate(
        >>>    batch=token_pairs,
        >>>    padding_idx=padding_idx,
        >>>    ignore_idx=ignore_idx,
        >>> )
        >>> collated["tokens"]
        >>> tensor([[1, 2, 3], [7, 0, 0]])
        >>> collated["labels"]
        >>> tensor([[4, 5, 6], [10, -100, -100]])
    c                 B    g | ]}t          j        |d                    S tokensr3   r-   r   s     r   r/   z&padded_collate_sft.<locals>.<listcomp>   &    222qak	"	"222r   Tr   c                 B    g | ]}t          j        |d                    S labelsr3   rI   s     r   r/   z&padded_collate_sft.<locals>.<listcomp>   rJ   r   r   valuerH   rM   )r
   shapeFpadlong)r    r#   rD   	input_idsrM   input_ids_seq_lenlabels_seq_lens          r   padded_collate_sftrY      s   > 22E222!  I
 22E222   F "+\"%N >))Q)N:;:
 
 
 
+	+	+E!223
 
 
	
  nn&&&++--@@@r   r'   pad_max_tilespad_max_imagesc                 ^   |dvrt          d|           |dk    rd | D             }t          |||          }n$|dk    rdt          d | D             d|	          i}|d         j        d
         }t	          |           }	t          d | D                       }
|||
k     rt          d|
 d|           |}
g }g }g }| D ]R}g }g }t          |d         d         |d                   D ]\  }}|j        d         }|j        \  }}||z  }|
|z
  }|dk    r||z
  nd}|dk    r||z
  nd}t          j        |ddddddd|fd          }t          j        |d||z  ||fd          }|	                    |           |	                    |           |	                    t          j        |                     |	                    t          j        |                     |	                    t          j        |d         d                              Tt          |dd	          }t          |dd	          }t          |dd	          }|                    |	|d
          }|+|j        \  } } }!t          j        |d||
z  |z  |!z
  f          }|d         ||d|d}"d|v r|d         |"d<   |"S )a$  Pad a batch of text sequences, tiled image tensors, aspect ratios,
    and cross attention masks. This can be used for both training and inference.

    ``batch`` is expected to be a list of sample dicts containing the following::
        - "tokens": List[int] of length text_seq_len, varies across samples
        - "labels": List[int] of length text_seq_len, varies across samples
        - "encoder_input": Dict[str, List[torch.Tensor]]
            - "images": List[torch.Tensor], each with shape (n_tiles, c, h, w)
            - "aspect_ratio": List[torch.Tensor], each with shape (2, ) to indicate h_ratio, w_ratio
        - "encoder_mask": List[Tensor], each with shape (text_seq_len, image_seq_len)

    Shape notation:
        - c = channel dim
        - h = height dim
        - w = weight dim

    Note:
        For each element in the batch, ``len(images) == len(encoder_mask) == len(aspect_ratio)``.

    This collater does the following:
        (1) Pad text sequence and encoder mask to the longest sequence length in the batch
        (2) Pad image tensors in the tile dimension with zeros to the largest number
            of tiles in the batch
        (3) Add empty images of zeros to samples up to max number of images in the batch
        (4) Pad aspect ratios with (1,1) for all added padding images

    Args:
        batch (List[Dict[str, Any]]): A list of sample dicts containing tokens,
            labels, images, encoder_mask, and aspect_ratio.
        padding_idx (int): Padding index for input token ids. Defaults to 0.
        ignore_idx (int): Padding index for labels. Defaults to -100.
        pad_direction (str): whether to pad entries from the left, or right. If ``pad_direction="right"``, we use
            :func:`torch.nn.utils.rnn.pad_sequence`, otherwise if ``pad_direction="left"``,
            we use :func:`torchtune.data.left_pad_sequence`. For training, we typically want to pad from the right.
            For inference, we typically want to pad from the left. Defaults to "right".
        pad_max_tiles (Optional[int]): Maximum number of tiles to pad to. If None, will pad to the largest number of tiles
            in the batch. Defaults to None.
        pad_max_images (Optional[int]): Maximum number of images to pad to. If None, will pad to the largest number of images
            in the batch. Defaults to None.

    Returns:
        Dict[str, Tensor]: Collated tokens, labels, images, encoder_mask, aspect_ratio tensors.
            - tokens: Tensor of shape (bsz, max_seq_len)
            - labels: Tensor of shape (bsz, max_seq_len)
            - images: Tensor of shape (bsz, max_num_images, max_num_tiles, c, h, w)
            - encoder_mask: Tensor of shape (bsz, max_seq_len, tokens_per_tile * max_num_tiles * max_num_images)
            - aspect_ratio: Tensor of shape (bsz, max_num_images, 2)

    Raises:
        ValueError:
            If ``pad_direction`` is not one of "left" or "right", **or**
            if pad_max_tiles is set to a value less than the largest number of tiles in an image.

    Example:
        >>> image_id = 1
        >>> tokens_per_tile = 5
        >>> c, h, w = 1, 1, 1
        >>> batch = [
        ...     {
        ...         "tokens": [1, 2, 1, 3], "labels": [4, 5, 6, 7],
        ...         "encoder_input": {
        ...             # One image with two tiles, one image with three tiles
        ...             "images": [torch.ones(2, c, h, w), torch.ones(3, c, h, w)],
        ...             "aspect_ratio": [torch.tensor([1, 2]), torch.tensor([1, 3])],
        ...         },
        ...         # Mask is shape (text_seq_len, tokens_per_tile * n_tiles)
        ...         "encoder_mask": [torch.ones(4, 5 * 2), torch.ones(4, 5 * 3)],
        ...     },
        ...     {
        ...         "tokens": [1, 4], "labels": [8, 9],
        ...         "encoder_input": {
        ...             # One image with four tiles
        ...             "images": [torch.ones(4, c, h, w)],
        ...             "aspect_ratio": [torch.tensor([2, 2])],
        ...         },
        ...         # Mask is shape (text_seq_len, tokens_per_tile * n_tiles)
        ...         "encoder_mask": [torch.ones(2, 5 * 4)],
        ...     },
        ... ]
        >>> model_inputs = padded_collate_tiled_images_and_mask(batch=batch)
        >>> print(model_inputs["tokens"])
        tensor([[1, 2, 1, 3],
                [1, 4, 0, 0]])
        >>> print(model_inputs["labels"])
        tensor([[4, 5, 6, 7],
                [8, 9, -100, -100]])
        >>> print(model_inputs["encoder_input"]["images"].shape)  # (bsz, max_num_images, max_num_tiles, c, h, w)
        torch.Size([2, 2, 4, 1, 1, 1])
        >>> print(model_inputs["encoder_mask"].shape)  # (bsz, max_text_seq_len, tokens_per_tile * max_num_tiles * max_num_images)
        torch.Size([2, 4, 40])
        >>> print(model_inputs["encoder_input"]["aspect_ratio"].shape)  # (bsz, max_num_images, 2)
        torch.Size([2, 2, 2])
        >>> print(model_inputs["encoder_input"]["images"][0, 0, ...])  # Image with two tiles got padded to four
        tensor([[[[1.]]], [[[1.]]], [[[0.]]], [[[0.]]]])
        >>> print(model_inputs["encoder_input"]["images"][0, 1, ...])  # Image with three tiles got padded to four
        tensor([[[[1.]]], [[[1.]]], [[[1.]]], [[[0.]]]])
        >>> print(model_inputs["encoder_input"]["images"][1, 0, ...])  # Image with four tiles did not get padded
        tensor([[[[1.]]], [[[1.]]], [[[1.]]], [[[1.]]]])
        >>> print(model_inputs["encoder_input"]["images"][1, 1, ...])  # Extra padding image was added to second sample
        tensor([[[[0.]]], [[[0.]]], [[[0.]]], [[[0.]]]])
    r%   r(   r'   c                 0    g | ]}|d          |d         dS )rH   rM   rQ   r,   )r-   samples     r   r/   z8padded_collate_tiled_images_and_mask.<locals>.<listcomp>Q  s7     
 
 
IOvh'6(3CDD
 
 
r   r&   rH   c                 B    g | ]}t          j        |d                    S rG   r3   rI   s     r   r/   z8padded_collate_tiled_images_and_mask.<locals>.<listcomp>Y  s&    :::qak**:::r   Tr   rN   c              3   R   K   | ]"}|d          d         D ]}|j         d         V  #dS )encoder_inputimagesr   N)rR   )r-   r^   images      r   	<genexpr>z7padded_collate_tiled_images_and_mask.<locals>.<genexpr>g  s]        O,X6   	A      r   NzMore tiles in image z, than pad_max_tiles ra   rb   encoder_maskr   rO   aspect_ratio   )rb   rf   )rH   ra   re   rM   )r7   rY   r   rR   lenmaxziprS   rT   appendr   stackr
   view)#r    r#   rD   r!   rZ   r[   	text_onlycollated_textmax_seq_lenbszmax_num_tilesbatch_imagesbatch_masksbatch_aspect_ratiosr^   sample_imagessample_masksrc   maskn_tilestext_seq_lenimage_seq_lentokens_per_tilepadding_tilesright_padding_textleft_padding_textpadded_imagepadded_maskcollated_imagescollated_maskscollated_aspect_ratiosconcat_masks_img_seq
batch_dicts#                                      r   $padded_collate_tiled_images_and_maskr      s   Z ---Y-YY
 
 	

 
 
SX
 
 
	 +9k:NN	&	 	 '::E::: )  
  )/3K
e**C       M
  =((Z}ZZ=ZZ   & LK (Y (Y?#H-vn/E
 
 !	- !	-KE4 k!nG +/*'L-+w6O)G3M.;w.F.Fl**A  /<v.E.El**1 
 5Aq!Q1m(LTUVVVL %!O3%&	 	 	 	K   ...,,,,EK667775;|44555""5;vo/F~/V#W#WXXXX #<TQRSSSO!+4qQQQN)Q  
 "&&sK<<L!$*1gu.@7JK
 
  )%2
 
 % J =  ,X6
8r   c                     t          j        d | D                       }t          j        d | D                       }t          j        d | D                       }d | D             }t          |          }||||dS )a5  Collate packed sequences into a batch. Only convert the seq lens into
    a block mask for use with attention. Tokens, labels, and input_pos are
    already padded to the same length within :class:`~torchtune.datasets.PackedDataset`.

    Args:
        batch (List[PACK_TYPE]): A list of pack dictionaries containing the following keys:
            - tokens: input token ids
            - labels: label token ids
            - input_pos: relative position ids for each sequence in pack
            - seq_lens: lengths of each sample within the pack

    Returns:
        Dict[str, torch.Tensor]: Collated input, label, input_pos, mask tensors.

    Example:
        >>> token_pairs = [
        >>>    {"tokens": [1, 2, 3, 4, 5, 6], "labels": [7, 8, 9, 10, 11, 12],
        >>>     "input_pos": [0, 1, 2, 0, 1, 0], "seq_lens": [3, 2, 1]},
        >>>    {"tokens": [13, 14, 15, 16, 17, 18], "labels": [19, 20, 21, 22, 23, 24],
        >>>     "input_pos": [0, 1, 0, 1, 0, 1], "seq_lens": [2, 2, 2]},
        >>> ]
        >>> collated = padded_collate_packed(
        >>>    batch=token_pairs,
        >>>    device=device,
        >>> )
        >>> collated["mask"]
        >>> tensor([
        >>> [[1, 0, 0, 0, 0, 0],
        >>>  [1, 1, 0, 0, 0, 0],
        >>>  [1, 1, 1, 0, 0, 0],
        >>>  [0, 0, 0, 1, 0, 0],
        >>>  [0, 0, 0, 1, 1, 0],
        >>>  [0, 0, 0, 0, 0, 1]],
        >>> [[1, 0, 0, 0, 0, 0],
        >>>  [1, 1, 0, 0, 0, 0],
        >>>  [0, 0, 1, 0, 0, 0],
        >>>  [0, 0, 1, 1, 0, 0],
        >>>  [0, 0, 0, 0, 1, 0],
        >>>  [0, 0, 0, 0, 1, 1]])
    c                     g | ]
}|d          S rG   r,   rI   s     r   r/   z)padded_collate_packed.<locals>.<listcomp>      555!!H+555r   c                     g | ]
}|d          S rL   r,   rI   s     r   r/   z)padded_collate_packed.<locals>.<listcomp>  r   r   c                     g | ]
}|d          S )	input_posr,   rI   s     r   r/   z)padded_collate_packed.<locals>.<listcomp>  s    ;;;Q{^;;;r   c                     g | ]
}|d          S seq_lensr,   rI   s     r   r/   z)padded_collate_packed.<locals>.<listcomp>  s    ---!*---r   r   )rH   rM   r   rx   )r   rl   r   )r    rH   rM   r   r   
block_masks         r   padded_collate_packedr     s    X [55u55566F[55u55566F;;U;;;<<I--u---H)  J
 	  r   c                     d | D             }d | D             }d | D             }d | D             }||z   }||z   }t          |d|          }	t          |d|          }
|	|
fS )a  Pad a batch of sequences for Direct Preference Optimization (DPO).

    This function takes a batch of sequences, where each sequence is represented
    as a dictionary with multiple key-value pairs. Each key corresponds to a different
    sequence component, such as input_ids or labels.

    Args:
        batch (List[Dict[str, List[int]]]): A list of dictionaries, where each dictionary
            represents a sequence with multiple components, 'chosen_input_ids',
            'chosen_labels', 'rejected_input_ids', and 'rejected_labels' are required.
        padding_idx (int): Padding index for input ids. Defaults to 0.
        ignore_idx (int): Padding index for labels. Defaults to -100.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: A tuple containing concatenated and padded
        input ids and labels.

    Example:
        >>> batch = [
        >>>    {'chosen_input_ids': [1, 2, 3], 'rejected_input_ids': [4, 5],
        >>>      'chosen_labels': [6, 7, 8], 'rejected_labels': [9, 10]},
        >>>    {'chosen_input_ids': [11, 12], 'rejected_input_ids': [13, 14, 15],
        >>>      'chosen_labels': [16, 17], 'rejected_labels': [18, 19, 20]},
        >>> ]
        >>> padded_collate_dpo(batch)
        >>> (tensor([[ 1,  2,  3],
        >>>          [11, 12,  0],
        >>>          [ 4,  5,  0],
        >>>          [13, 14, 15]]),
        >>>  tensor([[ 6,  7,  8],
        >>>          [16, 17, -100],
        >>>          [ 9, 10, -100],
        >>>          [18, 19, 20]]))
    c                 B    g | ]}t          j        |d                    S )chosen_input_idsr3   r-   exs     r   r/   z&padded_collate_dpo.<locals>.<listcomp>&  s(    MMMR(:%;<<MMMr   c                 B    g | ]}t          j        |d                    S )rejected_input_idsr3   r   s     r   r/   z&padded_collate_dpo.<locals>.<listcomp>'  s(    QQQR%,r*>'?@@QQQr   c                 B    g | ]}t          j        |d                    S )chosen_labelsr3   r   s     r   r/   z&padded_collate_dpo.<locals>.<listcomp>(  s'    GGG2U\"_"566GGGr   c                 B    g | ]}t          j        |d                    S )rejected_labelsr3   r   s     r   r/   z&padded_collate_dpo.<locals>.<listcomp>)  s(    KKKru|B'8$9::KKKr   Tr   r	   )r    r#   rD   r   r   r   r   to_pad_input_idsto_pad_labelsconcatenated_input_idsconcatenated_labelss              r   padded_collate_dpor     s    N NMuMMMQQ5QQQGGGGGMKKUKKKO'*<<!O3M)d+   '4z   "#666r   )Fr   )typingr   r   r   r   r   r   r   torch.nn.functionalr=   
functionalrS   torch.nn.utils.rnnr
   torchtune.data._commonr   r   !torchtune.modules.attention_utilsr   Tensorboolfloatr   strr   rC   rY   r   r   r   r,   r   r   <module>r      s   ; : : : : : : : : : : : : : : :           + + + + + + F F F F F F F F F F F F F F
 &$ &$EL!&$&$ &$ \	&$ &$ &$ &$ReS$s)^$%e e c	e
 sDcN*+e e e eT .8A 8AS$s)^$%8A8A 8A 
#u|
	8A 8A 8A 8A~ . #'$(b bS#Xbb b 	b
 C=b SMb 
#u|
b b b bJ:	?:	#u|
: : : :~ .67 67S$s)^$%6767 67 5<%&	67 67 67 67 67 67r   