
     `iD              
       z   d Z ddlmZmZ ddlZddlmZ ddlm	Z	m
Z
 ddlmZmZmZmZ ddlmZmZ  G d	 d
ed          Z G d ded          Zdee         dedeee                  fdZdeeee                           deee                  dededej        f
dZdedededefdZ G d de          ZdgZdS )zProcessor class for Mllama.    )OptionalUnionN   )BatchFeature)
ImageInputmake_nested_list_of_images)ImagesKwargsProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInputc                   &    e Zd ZU ee         ed<   dS )MllamaImagesKwargsmax_image_tilesN)__name__
__module____qualname__r   int__annotations__     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/mllama/processing_mllama.pyr   r      s"         c]"""""r   r   F)totalc                   &    e Zd ZU eed<   dddiiZdS )MllamaProcessorKwargsimages_kwargsimage_kwargsr      N)r   r   r   r   r   	_defaultsr   r   r   r   r       s2         %%%% 	q
IIIr   r   	input_idsimage_token_idreturnc                    fdt          |           D             }t          |          dk    rg S t          |          dk    r|d         dggS d t          |dd         |dd                   D             }|                    |d         t          |           g           |d         d         }|ddd         D ]$}|d         |d         dz
  k    r||d<   |d         }%|S )a  
    Generate a cross-attention token mask for image tokens in the input sequence.

    This function identifies the positions of image tokens in the input sequence and creates
    a mask that defines which subsequent tokens each image token should attend to.

    Args:
        input_ids (list[int]): A list of token ids representing the input sequence.
        image_token_id (int): The id of the token used to represent images in the sequence.

    Returns:
        list[list[int]]: A list of [start, end] pairs, where each pair represents the range
        of tokens an image token should attend to.

    Notes:
        - If no image tokens are present, an empty list is returned.
        - For a single image token, it attends to all subsequent tokens until the end of the sequence.
        - For multiple image tokens, each attends to tokens up to the next image token or the end of the sequence.
        - Consecutive image tokens are treated as a group and attend to all subsequent tokens together.
    c                 &    g | ]\  }}|k    |S r   r   ).0itokenr"   s      r   
<listcomp>z2get_cross_attention_token_mask.<locals>.<listcomp>@   s(    ___81euP^G^G^QG^G^G^r   r      c                     g | ]	\  }}||g
S r   r   )r&   loc1loc2s      r   r)   z2get_cross_attention_token_mask.<locals>.<listcomp>I   s     nnnZT4T4Lnnnr   N)	enumeratelenzipappend)r!   r"   image_token_locationsvision_maskslast_mask_endvision_masks    `    r   get_cross_attention_token_maskr7   *   s"   , `___y/C/C___
 !!Q&&	  !!Q&&&q)2.//nn37LSbS7QShijikikSl3m3mnnnL .r2C	NNCDDD
 !$Q'M#DDbD) ' 'q>[^a///*KN#Ar   cross_attention_token_mask	num_tilesmax_num_tileslengthc           	         t          |           }t          d | D                       }t          j        ||||ft          j                  }t          t          | |                    D ]k\  }\  }}	t          t          ||	                    D ]E\  }
\  }}t          |          dk    r*|\  }}t          ||          }|dk    r|}d|||||
d|f<   Fl|S )a  
    Convert the cross attention mask indices to a cross attention mask 4D array.

    This function takes a sparse representation of cross attention masks and converts it to a dense 4D numpy array.
    The sparse representation is a nested list structure that defines attention ranges for each image in each batch item.

    Args:
        cross_attention_token_mask (list[list[list[int]]]): A nested list structure where:
            - The outer list represents the batch dimension.
            - The middle list represents different images within each batch item.
            - The inner list contains pairs of integers [start, end] representing token ranges for each image.
        num_tiles (list[list[int]]): A nested list structure specifying the number of tiles for each image in each batch item.
        max_num_tiles (int): The maximum possible number of tiles.
        length (int): The total sequence length of the input.

    Returns:
        np.ndarray: A 4D numpy array of shape (batch_size, length, max_num_images, max_num_tiles)
            The array contains `1` where attention is allowed and `0` where it is not.

    Note:
        - Special handling is done for cases where the end token is -1, which is interpreted as attending to the end of the sequence.
    c              3   4   K   | ]}t          |          V  d S Nr0   )r&   maskss     r   	<genexpr>z?convert_sparse_cross_attention_mask_to_dense.<locals>.<genexpr>x   s(      LLULLLLLLr   )shapedtype   r+   r*   N)r0   maxnpzerosint64r/   r1   min)r8   r9   r:   r;   
batch_sizemax_num_imagescross_attention_mask
sample_idxsample_maskssample_num_tilesmask_idx	locationsmask_num_tilesstartends                  r   ,convert_sparse_cross_attention_mask_to_denserU   Z   s   : /00JLL1KLLLLLN86>=Ah  
 9B#F`bkBlBl8m8m [ [4
4\#35>s<Qa?b?b5c5c 	[ 	[1H1y.9~~""&
s#v&&"99 CYZ$ZsHo~o%UV	[  r   prompt	bos_tokenimage_tokenc                     || v r| S d}|                      |          r1| t          |          d         } |dz  }|                      |          1||z   | |  S )a\  
    Builds a string from the input prompt by adding `bos_token` if not already present.

    Args:
        prompt (`str`):
            The input prompt string.
        bos_token (`str`):
            The beginning of sentence token to be added.
        image_token (`str`):
            The image token used to identify the start of an image sequence.

    Returns:
        str: The modified prompt string with the `bos_token` added if necessary.

    Examples:
        >>> build_string_from_input("Hello world", "<begin_of_text>", "<|image|>")
        '<begin_of_text>Hello world'

        >>> build_string_from_input("<|image|>Hello world", "<begin_of_text>", "<|image|>")
        '<|image|><begin_of_text>Hello world'

        >>> build_string_from_input("<begin_of_text>Hello world", "<begin_of_text>", "<|image|>")
        '<begin_of_text>Hello world'
    r   Nr*   )
startswithr0   )rV   rW   rX   num_image_tokens_on_starts       r   build_string_from_inputr\      s    4 F !


K
(
( 'K((**+!Q&! 

K
(
( ' 55JyJ&JJJr   c                        e Zd ZdZddgZdZdZd fd	Z	 	 	 	 ddee	         d	ee
eeee         ee         f                  d
ee         defdZ	 ddZed             Z xZS )MllamaProcessora  
    Constructs a Mllama processor which wraps [`MllamaImageProcessor`] and
    [`PretrainedTokenizerFast`] into a single processor that inherits both the image processor and
    tokenizer functionalities. See the [`~MllamaProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more
    information.
    The preferred way of passing kwargs is as a dictionary per modality, see usage example below.
        ```python
        from transformers import MllamaProcessor
        from PIL import Image

        processor = MllamaProcessor.from_pretrained("meta-llama/Llama-3.2-11B-Vision")

        processor(
            images=your_pil_image,
            text=["<|image|>If I had to write a haiku for this one"],
            images_kwargs = {"size": {"height": 448, "width": 448}},
            text_kwargs = {"padding": "right"},
            common_kwargs = {"return_tensors": "pt"},
        )
        ```

    Args:
        image_processor ([`MllamaImageProcessor`]):
            The image processor is a required input.
        tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`]):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.

    image_processor	tokenizerMllamaImageProcessorPreTrainedTokenizerFastNc                 R   t          |d          s'd| _        |                    | j                  | _        n|j        | _        |j        | _        d| _        |                    | j                  | _        |j        | _        t                                          |||           d S )NrX   z	<|image|>z<|python_tag|>)chat_template)	hasattrrX   convert_tokens_to_idsr"   python_tokenpython_token_idrW   super__init__)selfr_   r`   rd   	__class__s       r   rj   zMllamaProcessor.__init__   s    y-00 	;*D"+"A"A$BR"S"SD(4D"+":D,(>>t?PQQ",)=QQQQQr   imagestextkwargsr#   c           
          ||t          d            j        t          fd j        j        i|}|d         }d|d<   |d         }|d         }	i }
|t          |t                    r|g}nDt          |t          t          f          rt          d |D                       st          d	           fd
|D             } fd|D             }|
                    dd          }  j        |fi |}                     ||dg            fd|d         D             }|
                    |           dg}|5 j                            |          }t          |          }d |D             }|t!          d |D                       r(t          d |D                       st          d          t#          |          dk    rh||k    s||k    r\|t          d          d}t#          |          t#          |          k    r	||k    rd}n||k    rd}t          d| d| d|           |8  j        |fi |}|
                    d          }|
                    |           |U|S fd|d         D             }t%          || j        j        t)          d |d         D                                  }||
d!<   |	
                    dd          }t+          |
|"          }|S )#a&	  
        Main method to prepare text(s) and image(s) to be fed as input to the model. This method forwards the `text`
        arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` arguments to
        MllamaImageProcessor's [`~MllamaImageProcessor.__call__`] if `images` is not `None`. Please refer
        to the docstring of the above two methods for more information.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                    - `'tf'`: Return TensorFlow `tf.constant` objects.
                    - `'pt'`: Return PyTorch `torch.Tensor` objects.
                    - `'np'`: Return NumPy `np.ndarray` objects.
                    - `'jax'`: Return JAX `jnp.ndarray` objects.
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            TODO: add aspect_ratio_ids and aspect_ratio_mask and cross_attention_mask
        Nz'You must specify either text or images.tokenizer_init_kwargstext_kwargsreturn_tensorsr   common_kwargsc              3   @   K   | ]}t          |t                    V  d S r>   )
isinstancestr)r&   ts     r   rA   z+MllamaProcessor.__call__.<locals>.<genexpr>  s-      =_=_UVjC>P>P=_=_=_=_=_=_r   zAInvalid input text. Please provide a string, or a list of stringsc                 D    g | ]}|                     j                  S r   )countrX   )r&   rx   rk   s     r   r)   z,MllamaProcessor.__call__.<locals>.<listcomp>  s(    HHHa(8 9 9HHHr   c                 F    g | ]}t          |j        j                  S r   )r\   rW   rX   )r&   	text_itemrk   s     r   r)   z,MllamaProcessor.__call__.<locals>.<listcomp>  s,    ooo]f+It~tGWXXooor   padding_sideimage)
modalitiesc                 D    g | ]}|                     j                  S r   )rz   r"   r&   	token_idsrk   s     r   r)   z,MllamaProcessor.__call__.<locals>.<listcomp>   s(    kkk	yt/BCCkkkr   r!   r   c                 ,    g | ]}t          |          S r   r?   )r&   samples     r   r)   z,MllamaProcessor.__call__.<locals>.<listcomp>'  s    !C!C!C&#f++!C!C!Cr   c              3   "   K   | ]
}|d k    V  dS r   Nr   r&   	batch_imgs     r   rA   z+MllamaProcessor.__call__.<locals>.<genexpr>*  s&      DDi9>DDDDDDr   c              3   "   K   | ]
}|d k    V  dS r   r   r   s     r   rA   z+MllamaProcessor.__call__.<locals>.<genexpr>*  s?       Q Q#,	QQ Q Q Q Q Qr   zaIf a batch of text is provided, there should be either no images or at least one image per samplez@No image were provided, but there are image tokens in the prompt zZMake sure to pass your images as a nested list, where each sub-list holds images per batchzhIf you activated truncation with `max_length`, increase the `max_length` so image tokens aren't cropped.z)The number of image tokens in each text (zA) should be the same as the number of provided images per batch (z). r9   c                 :    g | ]}t          |j                  S r   )r7   r"   r   s     r   r)   z,MllamaProcessor.__call__.<locals>.<listcomp>H  s4     * * *S\.y$:MNN* * *r   c              3   4   K   | ]}t          |          V  d S r>   r?   )r&   r!   s     r   rA   z+MllamaProcessor.__call__.<locals>.<genexpr>O  s(      QQi3y>>QQQQQQr   )r9   r:   r;   rL   )datatensor_type)
ValueError_merge_kwargsr   r`   init_kwargsrv   rw   listtupleallpop_check_special_mm_tokensupdater_   fetch_imagesr   anysumrU   r   rE   r   )rk   rm   rn   audiovideosro   output_kwargsrr   r   rt   r   n_images_in_text_encodingn_images_in_idsn_images_in_imagesadd_messageimage_featuresr9   r8   rL   rs   batch_features   `                      r   __call__zMllamaProcessor.__call__   s%   N <FNFGGG**!
 
"&."<
 
 
 $M2(,$%%o6%o6$$$ fv e}55 f#=_=_Z^=_=_=_:_:_ f !deeeHHHH4HHHoooojnoooD55A%t~d::k::H))$gY)OOOkkkkU]^iUjkkkOKK!!!S)66v>>F/77F!C!CF!C!C!CDD3CDDDDD S Q Q0@Q Q Q N N  !w   #$$q(("&666/M_:_:_>$%ghhh"$K-..#6F2G2GGGL^brLrLr 'C(,>>> 'Q$eDT e e@Re eWbe e  
 1T1&JJMJJN&**;77IKK''' $"2* * * *`hit`u* * *& $P*#"2BQQ8K;PQQQQQ	$ $ $  ,@D'(&**+;TBB$$NKKKr   TFc                 .     | j         j        |f||d|S )a  
        Post-process the output of the model to decode the text.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
            skip_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
            **kwargs:
                Additional arguments to be passed to the tokenizer's `batch_decode method`.

        Returns:
            `list[str]`: The decoded text.
        )skip_special_tokensclean_up_tokenization_spaces)r`   batch_decode)rk   generated_outputsr   r   ro   s        r   post_process_image_text_to_textz/MllamaProcessor.post_process_image_text_to_textX  s:    ( +t~*
 3)E
 
 	
 
 	
r   c                 v    | j         j        }| j        j        }d |D             }t          ||z   dgz             S )Nc                     g | ]
}|d k    |S )r9   r   )r&   names     r   r)   z5MllamaProcessor.model_input_names.<locals>.<listcomp>z  s$    &k&k&kW[_jWjWjtWjWjWjr   rL   )r`   model_input_namesr_   r   )rk   tokenizer_input_namesimage_processor_input_namess      r   r   z!MllamaProcessor.model_input_namess  sO     $ @&*&:&L# 'l&k8S&k&k&k#),GGKaJbbcccr   r>   )NNNN)TF)r   r   r   __doc__
attributesimage_processor_classtokenizer_classrj   r   r   r   r   r   r   r   r   r   r   r   propertyr   __classcell__)rl   s   @r   r^   r^      s!        > $[1J2/OR R R R R R (,hlv v$v uY(94	?DQbLccdev ./v 
v v v vr Y^
 
 
 
6 d d Xd d d d dr   r^   )r   typingr   r   numpyrF   feature_extraction_utilsr   image_utilsr   r   processing_utilsr	   r
   r   r   tokenization_utils_baser   r   r   r   r   r   r7   ndarrayrU   rw   r\   r^   __all__r   r   r   <module>r      s    " ! " " " " " " " "     4 4 4 4 4 4 A A A A A A A A V V V V V V V V V V V V C C C C C C C C# # # # #U # # # #    ,E    -d3i - -QUVZ[^V_Q` - - - -`-  $T$s)_ 5- DI-  -  	- 
 Z-  -  -  - `"KC "KC "Kc "Kc "K "K "K "KJLd Ld Ld Ld Ldn Ld Ld Ld^ 
r   