
    Pis                     $   d dl mZ d dlmZmZmZmZmZmZm	Z	 d dl
mZmZ d dlmZ ed         Z G d d          Z G d d	e          Z G d
 de          Z G d de          Z G d de          Zdee         ddfdZ G d de          ZdS )    )Path)AnyDictListLiteralMappingOptionalUnion)format_content_with_images
load_image)	Transform)systemuser	assistantipythonc                      e Zd ZdZ	 	 	 ddedeeeeee	f                  f         de
de
de
f
d	Zd
eeee	f                  fdZeded
d fd            Zd
ed         fdZed
e
fd            Zed
efd            ZddZd
efdZdS )Messageai  
    This class represents individual messages in a fine-tuning dataset. It supports
    text-only content, text with interleaved images, and tool calls. The
    :class:`~torchtune.modules.transforms.tokenizers.ModelTokenizer` will tokenize
    the content of the message using ``tokenize_messages`` and attach the appropriate
    special tokens based on the flags set in this class.

    Args:
        role (Role): role of the message writer. Can be "system" for system prompts,
            "user" for human prompts, "assistant" for model responses, or "ipython"
            for tool call returns.
        content (Union[str, List[Dict[str, Any]]]): content of the message. If it is text only content,
            you can pass in a string. If it is multimodal content, pass in a list of dictionaries formatted
            as follows::

                [
                    {"type": "image", "content": <PIL.Image.Image>},
                    {"type": "text", "content": "What is in this image?"},
                ]

        masked (bool): whether the message is masked in the sample. If True, do not use
            in loss calculation. Default: False
        ipython (bool): whether the message is a tool call. Default: False
        eot (bool): whether the message corresponds to the end of a turn, where control is handed over
            to the assistant from the user or the user from the assistant. Default: True. Should be true
            in most cases except for:

            - For multiple consecutive assistant messages (i.e., tool calls
              by assistant), only the last assistant message will have ``eot=True``
            - All ipython messages (tool call returns) should set ``eot=False``.

    Note:
        Message class expects any image content to be in
        `PIL Image format <https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image>`_.
    FTrolecontentmaskedr   eotc                     || _         |                     |          | _        || _        || _        || _        |                                  d S )N)r   _convert_to_list_of_dictr   r   r   r   _validate_message)selfr   r   r   r   r   s         l/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/data/_messages.py__init__zMessage.__init__;   sM     	44W==         returnc                     t          |t                    rd|dgS t          |t                    sJ d|             |S )zUser is currently allowed to pass in a string for text-only content.
        This ensures that the content is formatted as a list of dictionaries.texttyper   z2content must be of type List[Dict[str, Any]], got )
isinstancestrlist)r   r   s     r   r   z Message._convert_to_list_of_dictK   sh     gs## 	:#8899T
 
 	J 	JIII	J 	J 
 r   dc           
           | |d         |d         |                     dd          |                     dd          |                     dd                    S )	z
        Construct a Message from a dictionary.

        Args:
            d (dict): dictionary containing the fields of the Message.

        Returns:
            Message: constructed Message.
        r   r   r   Fr   r   T)r   r   r   r   r   )get)clsr'   s     r   	from_dictzMessage.from_dictW   s^     s6iL555))EE)U++eT""
 
 
 	
r   zPIL.Image.Imagec                 $    d | j         D             S )z7
        Returns media content of the message.
        c                 6    g | ]}|d          dk    |d         S )r#   imager    .0r   s     r   
<listcomp>z%Message.get_media.<locals>.<listcomp>n   s3     
 
 
#*gfoQX>X>XGI>X>X>Xr   r   r   s    r   	get_mediazMessage.get_mediaj   s%    
 
.2l
 
 
 	
r   c                 >    t          d | j        D                       S )z=
        Returns whether the message contains media.
        c              3   .   K   | ]}|d          dk    V  dS )r#   r.   Nr/   r0   s     r   	<genexpr>z)Message.contains_media.<locals>.<genexpr>w   s+      JJ'76?g-JJJJJJr   )anyr   r4   s    r   contains_mediazMessage.contains_mediar   s#    
 JJT\JJJJJJr   c                 J    d                     d | j        D                       S )z;
        Returns text-only content of the message.
         c              3   >   K   | ]}|d          dk    |d         V  dS )r#   r!   r   Nr/   r0   s     r   r8   z'Message.text_content.<locals>.<genexpr>~   s?       
 
#*gfoQW>W>WGI>W>W>W>W
 
r   )joinr   r4   s    r   text_contentzMessage.text_contenty   s8    
 ww 
 
.2l
 
 
 
 
 	
r   Nc                     | j         r| j        rt          d| j                   | j         r*| j        dk    r!t          d| j         d| j                   d S d S )NzGMedia tokens in tool calls are not supported. Both are set in message: r   z6Only assistant messages can be tool calls. Found role z in message: )r   r:   
ValueErrorr?   r   r4   s    r   r   zMessage._validate_message   s    < 	D/ 	mZ^Zkmm   < 	DI44tttaeartt  	 	44r   c                 @    d | j         D             }d| j         d|dS )Nc                     g | ]
}|d          S r3   r/   r0   s     r   r2   z$Message.__repr__.<locals>.<listcomp>   s    GGGw	*GGGr   zMessage(role='z', content=))r   r   )r   content_onlys     r   __repr__zMessage.__repr__   s2    GG$,GGGG	GGlGGGGr   )FFT)r   N)__name__
__module____qualname____doc__Roler
   r%   r   r   r   boolr   r   classmethoddictr+   r5   propertyr:   r?   r   rF   r/   r   r   r   r      s       " "P ! !! sDc3h001! 	!
 ! ! ! ! ! 
4S#X3G 
 
 
 
 
$ 
9 
 
 
 [
$
4 12 
 
 
 
 K K K K XK 
c 
 
 
 X
   H# H H H H H Hr   r   c            
           e Zd ZdZ	 	 	 	 ddedeeeef                  dee         dee         fdZ	d	e
eef         d
e
eef         fdZdS )InputOutputToMessagesae  
    Message transform class that converts a single sample with "input" and "output" fields,
    (or equivalent fields specified in column_map) to user and assistant messages,
    respectively. This is useful for datasets that have two columns, one containing
    the user prompt string and the other containing the model response string::

        |  input          |  output          |
        |-----------------|------------------|
        | "user prompt"   | "model response" |

    Args:
        train_on_input (bool): Whether the model is trained on the user prompt or not.
            Default is False.
        column_map (Optional[Dict[str, str]]): a mapping to change the expected "input"
            and "output" column names to the actual column names in the dataset. Keys should
            be "input" and "output" and values should be the actual column names. Default is None,
            keeping the default "input" and "output" column names.
        new_system_prompt (Optional[str]): if specified, prepend a system message. This can
            serve as instructions to guide the model response. Default is None.
        image_dir (Optional[Path]): path to the directory containing the images that is prepended to all image
            paths in the dataset. For example, if ``image_dir="/home/user/dataset/"` and the sample image path
            was ``"images/1.jpg"``, the final image path that will be loaded is ``"/home/user/dataset/images/1.jpg"``.
            If None, assume images are available in current working directory or are located
            on a remote url. For text-only, leave as None. Default is None.

    Raises:
        ValueError:
            If ``column_map`` is provided and ``input`` not in ``column_map``, or
                ``output`` not in ``column_map``, **or**
            if ``image_dir`` is provided but ``image`` not in ``column_map``.
    FNtrain_on_input
column_mapnew_system_prompt	image_dirc                    || _         || _        || _        | j        gd| j        vr*t          d| j                                         d          d| j        vr*t          d| j                                         d          ndddd| _        d| j                                        vr|t          d| d	          || _        d S )
Ninputz2Expected a key of 'input' in column_map but found .outputz3Expected a key of 'output' in column_map but found r.   )rW   rY   r.   zimage_dir is specified as zO but 'image' is not in column_map. Please specify an 'image' key in column_map.)rR   rT   rS   rA   keysrU   )r   rR   rS   rT   rU   s        r   r   zInputOutputToMessages.__init__   s    -!2$?&do-- bI]I]I_I_bbb   t.. c$/J^J^J`J`ccc   /
 )08gVVDO $/..0000Y5J?Y ? ? ?  
 #r   sampler   c                 ,   d|v pd| j         v o| j         d         |v }|rv|| j         d                  }t          |t                    r0t          |          }| j        
| j        |z  }t          |          }n|}d|dd|| j         d                  dg}nd|| j         d                  dg}d|| j         d                  dg}t          d|| j         d          t          d	|d
d          g}| j        t          d| j        dd          g|z   }d|iS )Nr.   r"   r!   rW   rY   r   Tr   r   r   r   r   Fr   messages)	rS   r$   r%   r   rU   r   r   rR   rT   )r   r[   is_multimodal
image_path	pil_imager   output_contentr^   s           r   __call__zInputOutputToMessages.__call__   s   6) 
t&M4?7+Cv+M 	  	V 89J*c** '!*--
 >-!%*!<J 'z22		&	 Y77F4?73K,LMMGG
 !'6$/':R3STTUG tx/H(IJJ

 ..	    &	  
 !-!4+A$TX   	H
 H%%r   )FNNNrG   rH   rI   rJ   rL   r	   r   r%   r   r   r   r   rc   r/   r   r   rQ   rQ      s         D  %/3+/$( #  # # T#s(^, # $C=	 #
 D> #  #  #  #D2&wsCx0 2&WS#X5F 2& 2& 2& 2& 2& 2&r   rQ   c                       e Zd ZdZ	 	 	 ddedeeeef                  dee         fdZde	ee
f         d	e	ee
f         fd
ZdS )ChosenRejectedToMessagesa^  
    Transform for converting a single sample from datasets with "chosen" and "rejected" columns
    containing conversations to a list of chosen and rejected messages. For example::

        |  chosen                                |  rejected                              |
        |----------------------------------------|----------------------------------------|
        | [{"role": "user", "content": Q1},      | [{"role": "user", "content": Q1},      |
        |  {"role": "assistant", "content": A1}] |  {"role": "assistant", "content": A2}] |

    will be converted to:

    .. code-block:: python

        chosen = [
            Message(role="user", content="Q1"),
            Message(role="assistant", content="A1"),
        ]
        rejected = [
            Message(role="user", content="Q1"),
            Message(role="assistant", content="A2"),
        ]

    A single sample typically consists of a single optional system prompt and one or multiple
    turns of user and assistant messages.

    Args:
        train_on_input (bool): Whether the model is trained on the user prompt or not.
            Default is False.
        column_map (Optional[Dict[str, str]]): a mapping to change the expected
            "chosen" and "rejected" column names to the actual column names in the dataset.
            Keys should be "chosen" and "rejected" and values should be the actual column names.
            Default is None, keeping the default column names.
        new_system_prompt (Optional[str]): if specified, prepend a system message. This can
            serve as instructions to guide the model response. Setting this will OVERRIDE any system
            messages already present in the dataset. Default is None.

    Raises:
        ValueError: If ``column_map`` is provided and ``chosen`` not in ``column_map``, or
            ``rejected`` not in ``column_map``.
    FNrR   rS   rT   c                     || _         || _        |r[d|vr%t          d|                                 d          d|vr%t          d|                                 d          || _        d S ddd| _        d S )Nchosenz3Expected a key of 'chosen' in column_map but found rX   rejectedz5Expected a key of 'rejected' in column_map but found rh   ri   rR   rT   rA   rZ   _column_mapr   rR   rS   rT   s       r   r   z!ChosenRejectedToMessages.__init__3  s     -!2 	Lz)) ^*//J[J[^^^   ++ `JOOL]L]```    *D*2
KKDr   r[   r   c                 J   g }|| j         d                  D ]Z}|d         dk    r| j        |d         dk    o| j         |d<   |                    t                              |                     [g }|| j         d                  D ]Z}|d         dk    r| j        |d         dk    o| j         |d<   |                    t                              |                     [| j        8t	          d| j        dd          g|z   }t	          d| j        dd          g|z   }||d	S )
Nrh   r   r   r   r   ri   Tr]   rj   )rl   rT   rR   appendr   r+   )r   r[   chosen_messagesmessagerejected_messagess        r   rc   z!ChosenRejectedToMessages.__call__H  s   d.x89 	? 	?Gv(**t/E/Q!(K!? !'' H ""7#4#4W#=#=>>>>d.z:; 	A 	AGv(**t/E/Q!(K!? !'' H $$W%6%6w%?%?@@@@!-!4+A$TX    	 O !4+A$TX  ! "	!" *7HIIIr   FNNrG   rH   rI   rJ   rL   r	   r   r%   r   r   r   rc   r/   r   r   rf   rf   	  s        ' 'V  %/3+/	L LL T#s(^,L $C=	L L L L*JwsCx0 JWS#X5F J J J J J Jr   rf   c                       e Zd ZdZ	 	 	 	 	 ddedeeeef                  dee         dee         d	ee         f
d
Z	de
eef         de
eef         fdZdS )ShareGPTToMessagesa	  
    Convert a single chat sample adhering to the ShareGPT JSON structure to torchtune's :class:`~torchtune.data.Message`
    structure.

    A single sample typically consists of a single optional system prompt and one or multiple
    turns of user and assistant messages.

    ShareGPT follows::

        {
            "conversations": [
                {
                    "from": <system|human|gpt>,
                    "value": <message>,
                },
                ...
            ]
        }

    :class:`~torchtune.data.Message` follows::

        [
            {
                "role": <system|user|assistant>,
                "content": <message>,
            },
            ...
        ]

    Args:
        train_on_input (bool): whether the prompt should remain unmasked. For multimodal datasets, ``train_on_input``
            is always False and this value is ignored. Default: False
        column_map (Optional[Dict[str, str]]): a mapping from the expected columns ("conversations")
            to the new column names in the dataset. Key should be "conversations" and value should
            be the new column name. If None, keep the default "conversations".
            Default is None.
        new_system_prompt (Optional[str]): if specified, prepend a system message. This can
            serve as instructions to guide the model response. Setting this will OVERRIDE any system
            messages already present in the dataset. Default is None.
        image_dir (Optional[Path]): path to the directory containing the images that is prepended to all image
            paths in the dataset. For example, if ``image_dir="/home/user/dataset/"` and the sample image path
            was ``"images/1.jpg"``, the final image path that will be loaded is ``"/home/user/dataset/images/1.jpg"``.
            If None, assume images are available in current working directory or are located
            on a remote url. For text-only, leave as None. Default is None.
        image_tag (Optional[str]): placeholder tags in the text content of each message to be replaced by image
            special tokens. If images are present and this is None, then will prepend image tokens to the first
            user message in the sample by default. If text-only, this field is ignored. Default is ``"<image>"``.

    Raises:
        ValueError: If ``column_map`` is provided and ``conversations`` not in ``column_map``.
    FN<image>rR   rS   rT   rU   	image_tagc                     || _         || _        |r1d|vr%t          d|                                 d          || _        n
ddd| _        || _        || _        d S )Nconversationsz:Expected a key of 'conversations' in column_map but found rX   r.   )rz   r.   )rR   rT   rA   rZ   rl   rU   rx   )r   rR   rS   rT   rU   rx   s         r   r   zShareGPTToMessages.__init__  s     -!2 	Tj00 eQ[Q`Q`QbQbeee    *D1@7SSD""r   r[   r   c                 j   dddd}g }| j         +|                    t          d| j         dd                     d|v pd| j        v o| j        d         |v }d	}|| j        d
                  D ]}||d                  }|d         }|dk    r| j         &|dk    rc|ra|s_|| j        d                  }	| j        
| j        |	z  }	t          |	          }
| j        d|
dd|dg}nt          || j        |
g          }d}|dk    o	| j         p|}|                    t          |||                     d|iS )J  
        Return a list of Message objects from the provided sample dict.

        Args:
            sample (Mapping[str, Any]): a single data sample with "messages" field pointing
                to a list of dict messages.

        Returns:
            List[Message]: A list of messages with "role" and "content" fields.
        r   r   r   )r   humangptNTr]   r.   Frz   fromvaluer"   r!   )rx   imagesr   r   r   r^   )	rT   ro   r   rl   rU   r   rx   r   rR   )r   r[   role_mapr^   r_   image_loadedrq   r   r   r`   ra   r   s               r   rc   zShareGPTToMessages.__call__  s    'LL!-OO!4+A$TX      6) 
t''OD,<W,E,O 	
 d.?@ 	P 	PGGFO,Dg&GxD$:$Fv~~  ( (!'(8(A!BJ~1%)^j%@
 *: 6 6I~-%,CC%+@@#
 #=#&*n$-;# # #
 $(L k) ''8=  OOGwvNNNOOOOH%%r   )FNNNrw   rd   r/   r   r   rv   rv   j  s        2 2l  %/3+/$(#,# ## T#s(^,# $C=	#
 D># C=# # # #*:&wsCx0 :&WS#X5F :& :& :& :& :& :&r   rv   c                       e Zd ZdZ	 	 	 ddedeeeef                  dee         fdZde	eee
f                  d	e	eee
f                  fd
Zdeee
f         d	eee
f         fdZdS )OpenAIToMessagesa  
    Convert a single chat sample adhering to the `OpenAI chat completion <https://platform.openai.com/docs/api-reference/chat>`_
    JSON structure to torchtune's :class:`~torchtune.data.Message` structure. This supports both
    text and image messages.

    A single sample typically consists of a single optional system prompt and one or multiple
    turns of user and assistant messages.

    For example::

        {
            "messages": [
                {
                    "role": <system|user|assistant>,
                    "content": [
                        {
                            "type": "text",
                            "text": "What'''s in this image?",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": <url>,
                            },
                        },
                },
                ...
            ]
        }

    :class:`~torchtune.data.Message` follows::

        [
            {
                "role": <system|user|assistant>,
                "content": [
                    {
                        "type": "text",
                        "content": "What'''s in this image?",
                    },
                    {
                        "type": "image",
                        "content": <PIL.Image.Image>,
                    },
                ],
            },
            ...
        ]

    Args:
        train_on_input (bool): whether the prompt should remain unmasked. Default: False
        column_map (Optional[Dict[str, str]]): a mapping from the expected columns ("messages")
            to the new column names in the dataset. Key should be "messages" and value should be
            the new column name. If None, keep the default "messages".
            Default is None.
        new_system_prompt (Optional[str]): if specified, prepend a system message. This can
            serve as instructions to guide the model response. Setting this will OVERRIDE any system
            messages already present in the dataset. Default is None.

    Raises:
        ValueError: If ``column_map`` is provided and ``messages`` not in ``column_map``.
    FNrR   rS   rT   c                     || _         || _        |r2d|vr%t          d|                                 d          || _        d S ddi| _        d S )Nr^   z5Expected a key of 'messages' in column_map but found rX   rk   rm   s       r   r   zOpenAIToMessages.__init__1  st     -!2 	8++ `JOOL]L]```    *D *J7Dr   r   r   c                     g }|D ]j}|d         dk    r|                     d|d         d           -|d         dk    r1|                     dt          |d         d                   d           k|S )zPConverts a list of content dicts from the OpenAI format to the torchtune format.r#   r!   r"   	image_urlr.   url)ro   r   )r   r   converted_contentcontent_dicts       r   _convert_from_openai_contentz-OpenAIToMessages._convert_from_openai_contentB  s     # 	 	LF#v--!((#V0DEE    f%44!(( '#-l;.G.N#O#O    ! r   r[   c                    g }| j         +|                    t          d| j         dd                     || j        d                  D ]}|d         dk    r| j         |d         dk    o| j         }t          |d         t                    r|                     |d                   }n#t          |d         t                    r|d         }|                    t          |d         ||	                     d|iS )
r|   Nr   Tr]   r^   r   r   r   r   )	rT   ro   r   rl   rR   r$   r&   r   r%   )r   r[   updated_messagesrq   r   r   s         r   rc   zOpenAIToMessages.__call__U  s7    !-##!4+A$TX    
 d.z:; 	 	Gv(**t/E/Qfo4St?R;RF'),d33 -;;GI<NOOGI.44 -!),## #!      ,--r   rs   )rG   rH   rI   rJ   rL   r	   r   r%   r   r   r   r   r   rc   r/   r   r   r   r     s        = =B  %/3+/	8 88 T#s(^,8 $C=	8 8 8 8"!DcN+!	d38n	! ! ! !&".wsCx0 ".WS#X5F ". ". ". ". ". ".r   r   r^   r   Nc                    t          |           dk     r t          dt          |            d          d}t          |           D ]~\  }}|j        dk    r|dk    rt          d| d          |j        dk    r|dk    rt          d| d	|d
z
   d          |j        dk    r|dk    rt          d| d          |j        }dS )a  
    Given a list of messages, ensure that messages form a valid
    back-and-forth conversation. An error will be raised if:

    - There is a system message that's not the first message
    - There are two consecutive user messages
    - An assistant message comes before the first user message
    - The message is empty
    - Messages are shorter than length of 2 (min. one user-assistant turn)


    Args:
        messages (List[Message]): the messages to validate.

    Raises:
        ValueError: If the messages are invalid.
       z,Messages must be at least length 2, but got z	 messagesr   r   z8Assistant message before expected user message at index z in messagesz'Two consecutive user messages at index z and    r   r   zSystem message at index z1 in messages, but system messages must come firstN)lenrA   	enumerater   )r^   	last_turnirq   s       r   validate_messagesr   z  s%   ( 8}}qS3x==SSS
 
 	
 I)) ! !
7<;&&9+>+>Z1ZZZ   <6!!i6&9&9U!UU!a%UUU   <8##A_1___   L		! !r   c                   r    e Zd ZdZ	 d
dedeeeef                  fdZde	ee
f         de	ee
f         fd	ZdS )AlpacaToMessagesa|  
    Message transform class for Alpaca-style datasets with "instruction", "input", and "output"
    (or equivalent fields specified in column_map) columns. User messages are formed from the
    instruction + input columns and assistant messages are formed from the output column. Prompt
    templating is conditional on the presence of the "input" column, and thus is handled directly
    in this transform class instead of a dedicated :class:`~torchtune.data.PromptTemplate` class
    due to this custom logic.

    Args:
        train_on_input (bool): Whether the model is trained on the user prompt or not.
            Default is True.
        column_map (Optional[Dict[str, str]]): a mapping to change the expected "instruction", "input",
            and "output" column names to the actual column names in the dataset. Default is None,
            keeping the default column names.
    TNrR   rS   c                 6    || _         || _        ddd| _        d S )NzBelow is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:
zBelow is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:
)prompt_inputprompt_no_input)rR   rS   template)r   rR   rS   s      r   r   zAlpacaToMessages.__init__  s1     -$\
E
 
r   r[   r   c                    | j         pi }|                    dd          }|                    dd          }|                    dd          }||v r7||         r/| j        d                             ||         ||                   }n'| j        d                             ||                   }t	          d|| j         d	
          t	          d||         dd	
          g}d|iS )NrW   instructionrY   r   )r   rW   r   )r   r   Tr]   r   Fr^   )rS   r)   r   formatr   rR   )r   r[   rS   	key_inputkey_instruction
key_outputpromptr^   s           r   rc   zAlpacaToMessages.__call__  s   _*
NN7G44	$..FF^^Hh77
6)#4]>299"?36);L :  FF ]#45<<"?3 =  F
 ..	    z*	  
 H%%r   )TNrt   r/   r   r   r   r     s         " SW
 
"
7?S#X7O
 
 
 
$&wsCx0 &WS#X5F & & & & & &r   r   )pathlibr   typingr   r   r   r   r   r	   r
   torchtune.data._utilsr   r   torchtune.modules.transformsr   rK   r   rQ   rf   rv   r   r   r   r/   r   r   <module>r      s         E E E E E E E E E E E E E E E E E E H H H H H H H H 2 2 2 2 2 2xH xH xH xH xH xH xH xHvu& u& u& u& u&I u& u& u&p^J ^J ^J ^J ^Jy ^J ^J ^JBD& D& D& D& D& D& D& D&NF. F. F. F. F.y F. F. F.R'!7m'!	'! '! '! '!T@& @& @& @& @&y @& @& @& @& @&r   