
     `i.                        d dl mZmZ d dlZddlmZ ddlmZ ddl	m
Z
mZmZmZmZmZ ddlmZmZ ddlmZ  e            rd	d
lmZ  G d ded          Z G d de
d          Z G d ded          Z G d de          ZdgZdS )    )OptionalUnionN   )BatchFeature)
ImageInput)ImagesKwargsMultiModalDataProcessingKwargsProcessorMixin
TextKwargsUnpack)PreTokenizedInput	TextInput)is_vision_available   )smart_resizec                       e Zd ZU eed<   dS )Emu3TextKwargsreturn_for_image_generationN)__name__
__module____qualname__bool__annotations__     |/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/emu3/processing_emu3.pyr   r       s         !%%%%%%r   r   F)totalc                   $    e Zd ZU eed<   eed<   dS )Emu3ImagesKwargsratio
image_areaN)r   r   r   strr   intr   r   r   r    r    $   s"         JJJOOOOOr   r    c                   :    e Zd ZU eed<   eed<   dddddddZd	S )
Emu3ProcessorKwargstext_kwargsimages_kwargsF)r   return_mm_token_type_idsz1:1i  )r!   r"   )r'   r(   N)r   r   r   r   r   r    	_defaultsr   r   r   r&   r&   )   sX         #### ,1(-
 

  
 
	 	IIIr   r&   c                        e Zd ZdZddgZdZdZ	 d fd	Z	 	 	 	 ddee	         d	ee
eeee         ee         f                  d
ee         defdZddZd Zde	fdZ xZS )Emu3Processora  
    Constructs a Emu3 processor which wraps a Emu3 image processor and a GPT2 tokenizer into a single
    processor.

    [`Emu3Processor`] offers all the functionalities of [`Emu3ImageProcessor`] and [`GPT2TokenizerFast`].
    See the [`~Emu3Processor.__call__`] and [`~Emu3Processor.decode`] for more information.

    Args:
        image_processor ([`Emu3ImageProcessor`]):
            The image processor is a required input.
        tokenizer ([`Emu3TokenizerFast`]):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    image_processor	tokenizer)GPT2TokenizerGPT2TokenizerFastEmu3ImageProcessorNc                    |j         | _         |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        |j	        | _	        d| _
        t                                          |||           d S )N   )chat_template)image_tokenimage_token_id	boi_tokenimage_start_token	eoi_tokenimage_end_tokenimage_wrapper_tokenfake_token_around_image	eof_token	bos_tokendownsample_ratiosuper__init__)selfr-   r.   r4   kwargs	__class__s        r   rA   zEmu3Processor.__init__M   s}     %0'6!*!4(2'0'D$",", !)=QQQQQr   imagestextrC   returnc                 0    t          |t                    r|g}n?t          |t                    s*t          |d         t                    st          d            j        t
          fd j        j        i|}|d                             dd          }|d                             dd	          }|d                             d
d	          }	|r|t          d          |s||t          d          i }
 j
         } j          j         }|s|  j        |fi |d         }
t          |
j                  }g }|D ]} j        |v rwt#          |          }|\  }}| j        z  }| j        z  }||dz   z  }| | d|  j         d|z   | }|                     j        |d          } j         | } j        |v w|                    |            fd|D             }nX|rV                     ||	 j                  \  }}| | d|  j          fd|D             }||ggt1          |          z  |
d<   |d                             dd	          }|d                             dd          }  j        |fi |d         dd	i}                     ||dg           |rYt5          j        |d                   }t5          j        |d                   }d|| j        k    <   |                                |d<   t?          i ||
|          S )a  
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to Emu3TokenizerFast's [`~Emu3TokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        r   zAInvalid input text. Please provide a string, or a list of stringstokenizer_init_kwargsr'   r   Fr(   r!   Nr"   zGYou should not provide `images` when `return_for_image_generation=True`zOYou must provide either text or images when `return_for_image_generation=False`r   *<placeholder>c                 F    g | ]}|                     d j                  S )rK   )replacer5   ).0samplerB   s     r   
<listcomp>z*Emu3Processor.__call__.<locals>.<listcomp>   s*    ccc&FNN?D4DEEcccr   c                 *    g | ]}j          |  S r   )r>   )rN   rO   image_promptrB   s     r   rP   z*Emu3Processor.__call__.<locals>.<listcomp>   s+    QQQ&t~=v=|==QQQr   image_sizesreturn_tensorsr)   image)
modalities	input_idsmm_token_type_ids)datatensor_type) 
isinstancer#   list	TypeError_merge_kwargsr&   r.   init_kwargspop
ValueErrorr8   r=   r:   r-   iterrS   r5   nextr?   r<   rM   r>   appendcalculate_generate_sizelen_check_special_mm_tokensnparray
zeros_liker6   tolistr   )rB   rE   rF   audiovideosrC   output_kwargsr   r!   r"   image_featuresimage_start_tokensimage_end_tokensrS   prompt_stringsrO   
image_sizeheightwidthimage_seq_lengthimage_placeholderrT   r)   text_inputs	array_idsrX   rR   s   `                         @r   __call__zEmu3Processor.__call__^   s*   T dC   	a6DDD$'' 	a
47C0H0H 	a_```**
 
"&."<
 
 

 '4M&B&F&FGdfk&l&l#o.227DAA"?377dKK
& 	h6+=fggg* 	pt|nooo $ 68"nDd.BDD + 	Jv/A1T1&[[M/<Z[[N~9::KN . .&&00!%k!2!2J$.MFE#t'<<F!T%::E'-';$+=  )dv  )d  )d  )dtOk  )dm|  @P  nP  )d  Rb  )d  )d%#^^D,<>OQRSSF $888F &&00 %%f----ccccTbcccDD ) 	J 88
DLabbMFE0`&``5`$B^``LQQQQQDQQQD.4e_,=D		,IN=) '}599:JDQQ#0#?#C#CD^`e#f#f $dnT__]=-I__Z^___%%dKWI%NNN# 	J[!9::I "k+.F G GBCi4+>>?/@/G/G/I/IK+,!BK!B>!BP^____r   c                 V   i }|g }|D ]j\  }}t          ||| j        j        | j        j        | j        j                  \  }}|| j        z  }|| j        z  }||dz   z  }|                    |           kdgt          |          z  }|                    ||d           t          di |S )a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.

        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        Nr   )num_image_tokensnum_image_patchesr   )
r   r-   spatial_factor
min_pixels
max_pixelsr?   rd   rf   updater	   )	rB   rS   rC   vision_datar|   rt   ru   rv   r}   s	            r   _get_num_multimodal_tokensz(Emu3Processor._get_num_multimodal_tokens   s     "!!, : : ,(7(3(3! !  4#88!66#)UQY#7  ''(89999!"c+&6&6 64D[lmmnnn,,,,,r   c                    t          t          |                    d                    \  }}||z  }||z  dz  }t          t          ||z  |z                      }t          t          ||z  |z                      }	||	fS )N:g      ?)mapr$   splitround)
rB   r!   r"   r~   ru   rt   current_areatarget_ratiotoken_heighttoken_widths
             r   re   z%Emu3Processor.calculate_generate_size   s    CS!1!122vv~"\1c95,!6!GHHII% 4~ EFFGG[((r   c                 (     | j         j        |fi |S N)r-   postprocess)rB   rE   rC   s      r   r   zEmu3Processor.postprocess   s     /t#/AA&AAAr   r   )NNNN)r   r   r   __doc__
attributestokenizer_classimage_processor_classrA   r   r   r   r   r   r\   r   r&   r   rz   r   re   r   __classcell__)rD   s   @r   r,   r,   8   s:          $[1J<O0 	R R R R R R& (,hli` i`$i` uY(94	?DQbLccdei` ,-i` 
i` i` i` i`V -  -  -  -D) ) )B* B B B B B B B Br   r,   )typingr   r   numpyrh   image_processing_utilsr   image_utilsr   processing_utilsr   r	   r
   r   r   r   tokenization_utils_baser   r   utilsr   image_processing_emu3r   r   r    r&   r,   __all__r   r   r   <module>r      s  " # " " " " " " "     2 2 2 2 2 2 % % % % % % r r r r r r r r r r r r r r r r C C C C C C C C ( ( ( ( ( (  4333333& & & & &Zu & & & &    |5    
    *%    }B }B }B }B }BN }B }B }B@ 
r   