
     `i.                     "   d Z ddlmZ ddlmZmZmZ ddlmZ ddl	m
Z
mZmZ ddlmZmZmZmZ ddlmZmZ dd	lmZ erdd
lmZ  ej        e          ZdefdZd Z G d ded          Z G d ded          Z G d de          Z dgZ!dS )z
Processor class for IDEFICS2.
    )
accumulate)TYPE_CHECKINGOptionalUnion   )BatchFeature)
ImageInputis_valid_image
load_image)ImagesKwargsProcessingKwargsProcessorMixinUnpack)
AddedToken	TextInput)logging)PreTokenizedInputreturnc                 V    t          | t                    o|                     d          S )Nhttp)
isinstancestr
startswith)vals    /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/idefics2/processing_idefics2.pyis_urlr   )   s#    c3:CNN6$:$::    c                 >    t          |           pt          |           S N)r   r
   )elems    r   is_image_or_image_urlr!   -   s    $<</>$///r   c                   &    e Zd ZU ee         ed<   dS )Idefics2ImagesKwargsimage_seq_lenN)__name__
__module____qualname__r   int__annotations__ r   r   r#   r#   1   s"         C=     r   r#   F)totalc                   ,    e Zd ZU eed<   ddddi dZdS )Idefics2ProcessorKwargsimages_kwargsTF)add_special_tokenspaddingis_split_into_words)text_kwargsr.   N)r%   r&   r'   r#   r)   	_defaultsr*   r   r   r-   r-   5   sB         '''' #'#(
 

  IIIr   r-   c            
            e Zd ZdZddgZdZdZ	 dded	ee	         f fd
Z
d Z	 	 	 	 ddeeee         eee                  f         deedee         ed         f         dee         defdZ xZS )Idefics2Processora  
    Constructs a IDEFICS2 processor which wraps a LLama tokenizer and IDEFICS2 image processor into a single processor.

    [`IdeficsProcessor`] offers all the functionalities of [`Idefics2ImageProcessor`] and [`LlamaTokenizerFast`]. See
    the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information.

    Args:
        image_processor (`Idefics2ImageProcessor`):
            An instance of [`Idefics2ImageProcessor`]. The image processor is a required input.
        tokenizer (`PreTrainedTokenizerBase`, *optional*):
            An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input.
        image_seq_len (`int`, *optional*, defaults to 64):
            The length of the image sequence i.e. the number of <image> tokens per image in the input.
            This parameter is used to build the string from the input prompt and image tokens and should match the
            config.perceiver_config.resampler_n_latents value for the model used.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    image_processor	tokenizerIdefics2ImageProcessorAutoTokenizerN@   r$   chat_templatec                 (   t          |d          s}t          ddd          j        | _        t          ddd          j        | _        d| j        | j        gi}|                    |           |                    | j                  | _        n$|j        | _        |j        | _        |j        | _        t          ddd          | _	        |                    d| j	        gi           || _
        t                                          |||	           d S )
Nimage_tokenz<fake_token_around_image>FT)
normalizedspecialz<image>additional_special_tokensz<end_of_utterance>)r;   )hasattrr   contentfake_image_tokenr=   r/   convert_tokens_to_idsimage_token_idimage_boundary_tokenend_of_utterance_tokenr$   super__init__)selfr6   r7   r$   r;   kwargstokens_to_add	__class__s          r   rI   zIdefics2Processor.__init__Z   s    y-00 		;$./JW\fj$k$k$k$sD!))tTTT\D84;PRVRb:cdM((777"+"A"A$BR"S"SD$-$BD!(4D"+":D&01ERWae&f&f&f#$$&ADD_C`%abbb*)=QQQQQr   c                     g }|D ]t}g }|D ]X}t          |          r|                    |           't          |          r"|                    t          |                     Y|                    |           u|S r   )r
   appendr   r   )rJ   promptsprompt_imagespromptimagesr    s         r   _extract_images_from_promptsz.Idefics2Processor._extract_images_from_promptsn   s     	) 	)FF 4 4!$'' 4MM$''''D\\ 4MM*T"2"2333  ((((r   rS   textr   rK   r   c                    |t          d           | j        t          fd| j        j        i|}|d                             dd          }||n| j        }|d                             dd          }g }	i }
|>t          |t                    r|g}n?t          |t                    s*t          |d         t                    st          d	          | j
        }| j        }| ||z   | }| j        j        r
|d
z  }|d
z  }g }|D ]p}|	                    |                    |                     |                    ||          }|                    | | |           }|                    |           q | j        |fi |d         }|                     ||dg           |
                    |           t'                    rggn?t          t          t(          f          rt'          d                   r|t+          |	          t-                    k    r6t          d| dt+          |	           d| dt-                     d	          dgt          t/          |	                    z   fdt1          t-          |	                    D             nlgnht          t          t(          f          sLt          d         t          t(          f          s*t'          d         d                   st          d          d D             }|||	k    st          d|	 d| d          d D              | j        fi |d         }|
                    |           t3          |
|          S )a
  
        Processes the input prompts and returns a BatchEncoding.

        Example:

        ```python
        >>> import requests
        >>> from transformers import Idefics2Processor
        >>> from transformers.image_utils import load_image

        >>> processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b", image_seq_len=2)
        >>> processor.image_processor.do_image_splitting = False  # Force as False to simplify the example

        >>> url1 = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
        >>> url2 = "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg"

        >>> image1, image2 = load_image(url1), load_image(url2)
        >>> images = [[image1], [image2]]

        >>> text = [
        ...     "<image>In this image, we see",
        ...     "bla bla bla<image>",
        ... ]
        >>> outputs = processor(images=images, text=text, return_tensors="pt", padding=True)
        >>> input_ids = outputs.input_ids
        >>> input_tokens = processor.tokenizer.batch_decode(input_ids)
        >>> print(input_tokens)
        ['<s><fake_token_around_image><image><image><fake_token_around_image> In this image, we see', '<s> bla bla bla<fake_token_around_image><image><image><fake_token_around_image>']
        ```

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`, *optional*):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. If is of type `list[ImageInput]`, it's assumed that this is for a single prompt i.e. of batch size 1.
            text (`Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]`, *optional*):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).

                Wherever an image token, `<image>` is encountered it is expanded to
                `<fake_token_around_image>` + `<image>` * `image_seq_len` * <fake_token_around_image>`.
            return_tensors (`Union[str, TensorType]`, *optional*):
                If set, will return tensors of a particular framework. See [`PreTrainedTokenizerFast.__call__`] for more
                information.

        Nz+You must provide either `text` or `images`.tokenizer_init_kwargsr.   r$   r2   return_tensorsr   zAInvalid input text. Please provide a string, or a list of strings   image)
modalitieszThe total number of zP tokens in the prompts should be the same as the number of images passed. Found  z tokens and z images.c                 B    g | ]}|         |d z                     S )   r*   ).0icumsum_images_in_textrS   s     r   
<listcomp>z.Idefics2Processor.__call__.<locals>.<listcomp>   sF        4Q7:OPQTUPU:VVW  r   zdInvalid input images. Please provide a single image or a list of images or a list of list of images.c                 ,    g | ]}t          |          S r*   )lenr_   samples     r   rb   z.Idefics2Processor.__call__.<locals>.<listcomp>   s    !C!C!C&#f++!C!C!Cr   z!The number of images in the text z and images  z should be the same.c                 &    g | ]}d  |D             S )c                 ,    g | ]}t          |          S r*   )r   )r_   ims     r   rb   z9Idefics2Processor.__call__.<locals>.<listcomp>.<listcomp>   s    777"z"~~777r   r*   re   s     r   rb   z.Idefics2Processor.__call__.<locals>.<listcomp>   s'    MMMF77777MMMr   )tensor_type)
ValueError_merge_kwargsr-   r7   init_kwargspopr$   r   r   listrC   r=   r6   do_image_splittingrO   countreplace_check_special_mm_tokensupdater!   tuplesumrd   r   ranger   )rJ   rS   rU   audiovideosrK   output_kwargsr$   rX   n_images_in_textinputsrC   r=   	image_strprompt_stringsrf   text_inputsn_images_in_imagesimage_inputsra   s    `                 @r   __call__zIdefics2Processor.__call__z   sq   l <FNJKKK**#
 
"&."<
 
 

 &o6::?DQQ)6)BHZ&}599:JDQQ$$$ fvd++ fJtAw4L4L f !deee  $4*K+\[=-H\JZ\\I#6 #%M	"N . . ''[(A(ABBBY??+;(O=M(O(OTdQfgg%%f----($.XX=;WXXK)).+SZR[)\\\MM+&&&$V,, !(FT5M22 7LVTUY7W7W #+,,F;;(m; m m&)*:&;&;m m>Im mWZ[aWbWbm m m  
 ./C$zBR7S7S2T2T,T)    !&s+;'<'<!=!=  FF
 %XFF ve}55"6!9tUm<< .fQil;;
 !z   "D!CF!C!C!C(:>N(N(N 8HWi  
 NMfMMMF/4/YY-:XYYLMM,'''F????r   )Nr:   N)NNNN)r%   r&   r'   __doc__
attributesimage_processor_classtokenizer_classr(   r   r   rI   rT   r   r	   ro   r   r   r-   r   r   __classcell__)rM   s   @r   r5   r5   B   s;        & $[1J4%O hlR R>ARW_`cWdR R R R R R(
 
 
 OSbfH@ H@j$z"2Dj9I4JJKH@ I2DOTJ]E^^_H@ 01H@ 
H@ H@ H@ H@ H@ H@ H@ H@r   r5   N)"r   	itertoolsr   typingr   r   r   feature_extraction_utilsr   image_utilsr	   r
   r   processing_utilsr   r   r   r   tokenization_utils_baser   r   utilsr   r   
get_loggerr%   loggerboolr   r!   r#   r-   r5   __all__r*   r   r   <module>r      s    !           1 1 1 1 1 1 1 1 1 1 4 4 4 4 4 4 A A A A A A A A A A            = < < < < < < <        =<<<<<< 
	H	%	%;4 ; ; ; ;0 0 0! ! ! ! !<u ! ! ! !
 
 
 
 
.e 
 
 
 
@@ @@ @@ @@ @@ @@ @@ @@F 
r   