
     `iK$                         d dl mZmZ d dlZddlmZ ddlmZ ddl	m
Z
mZmZmZ ddlmZmZ ddlmZ d	d
lmZ  G d ded          Z G d de          ZdgZdS )    )OptionalUnionN   )BatchFeature)
ImageInput)MultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)
TensorType   )AutoTokenizerc                   0    e Zd Zddddddej        dZdS )AriaProcessorKwargsF)paddingreturn_mm_token_type_ids  )max_image_sizesplit_image)text_kwargsimages_kwargsreturn_tensorsN)__name__
__module____qualname__r   PYTORCH	_defaults     |/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/aria/processing_aria.pyr   r   !   sF         (-
 

 " 
 
 %,
 
IIIr!   r   F)totalc                       e Zd ZdZddgZdZdZ	 	 	 	 ddeee	f         de
e	         de
eeeef         ef                  f fd	Z	 	 	 dd
eeeee         ee         f         de
e         dee         defdZddZed             Z xZS )AriaProcessora  
    AriaProcessor is a processor for the Aria model which wraps the Aria image preprocessor and the LLama slow tokenizer.

    Args:
        image_processor (`AriaImageProcessor`, *optional*):
            The AriaImageProcessor to use for image preprocessing.
        tokenizer (`PreTrainedTokenizerBase`, *optional*):
            An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input.
        chat_template (`str`, *optional*):
            A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string.
        size_conversion (`Dict`, *optional*):
            A dictionary indicating size conversions for images.
    image_processor	tokenizerAriaImageProcessorr   Nchat_templatesize_conversionc                     |ddd}d |                                 D             | _        |j        | _        |j        | _        ||j        |j        |_        t                                          |||           d S )N      )i  r   c                 4    i | ]\  }}t          |          |S r    )int).0kvs      r"   
<dictcomp>z*AriaProcessor.__init__.<locals>.<dictcomp>K   s$    NNNdaANNNr!   )r)   )itemsr*   image_tokenimage_token_id	pad_token	unk_tokensuper__init__)selfr&   r'   r)   r*   	__class__s        r"   r:   zAriaProcessor.__init__B   s     "$'c22ONNo6K6K6M6MNNN$0'6 Y%8%@"+"5I)=QQQQQr!   textimageskwargsreturnc                     | j         t          fd| j        j        i|}t	          |t
                    r|g}n?t	          |t                    s*t	          |d         t
                    st          d          | | j        |fi |d         }| j	        |j
        j        d                  }g }	|                    d          |z  }
|D ]D}|                    | j        j        | j        j        |
z            }|	                    |           Eni }|}	|d                             d	d          }|d                             d
d          } | j        |	fi |d         d	di}|                     |	|dg           |rYt#          j        |d                   }t#          j        |d                   }d||| j        k    <   |                                |d<   t-          i |||          S )a  
        Main method to prepare for the model one or several sequences(s) and image(s).

        Args:
            text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            images (`ImageInput`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.


        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
            `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
            `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **pixel_mask** -- Pixel mask to be fed to a model. Returned when `images` is not `None`.
        tokenizer_init_kwargsr   zAInvalid input text. Please provide a string, or a list of stringsNr   r   	num_cropsr   r   r   Fimage)
modalities	input_ids   mm_token_type_ids)datatensor_type)_merge_kwargsr   r'   init_kwargs
isinstancestrlist	TypeErrorr&   r*   pixel_valuesshapepopreplacer5   append_check_special_mm_tokensnparray
zeros_liker6   tolistr   )r;   r=   r>   audiovideosr?   output_kwargsimage_inputstokens_per_imageprompt_stringsrC   sampler   r   text_inputs	array_idsrH   s                    r"   __call__zAriaProcessor.__call__T   s=   < +*
 
"&."<
 
 
 dC   	a6DDD$'' 	a
47C0H0H 	a_```/4/YY-:XYYL#3L4M4STU4VWN$((558HHI . .(BDND^ajDjkk%%f----.
 L!N&}599:JDQQ#0#?#C#CD^`e#f#f $dn^ii}]7Siidhiii%%nkwi%XXX# 	J[!9::I "k+.F G GBCi4+>>?/@/G/G/I/IK+,!@K!@<!@n]]]]r!   c                 B    i }|t           j                            di                               |                               dd          p j        j         fd|D             } fd|D             }|                    ||d           t          di |S )a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.
        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        Nr   r   c                 8    g | ]} j         j        g |R  S r    )r&   get_number_of_image_patches)r0   
image_sizer   r;   s     r"   
<listcomp>z<AriaProcessor._get_num_multimodal_tokens.<locals>.<listcomp>   sE     ! ! ! A$@\*\m\\\! ! !r!   c                 0    g | ]}j                  |z  S r    )r*   )r0   num_patchesmax_sizer;   s     r"   ri   z<AriaProcessor._get_num_multimodal_tokens.<locals>.<listcomp>   s'    rrrQ\ 4X > Lrrrr!   )num_image_tokensnum_image_patchesr    )r   r   getupdater&   r   r   )r;   image_sizesr?   vision_datarn   rm   r   rl   s   `     @@r"   _get_num_multimodal_tokensz(AriaProcessor._get_num_multimodal_tokens   s     "/9==orRRM  ((($(()94@@gDDXDgH! ! ! ! !"-! ! !  srrrr`qrrr4D[lmmnnn,,,,,r!   c                     | j         j        }| j        j        }d |D             }t          t                              ||z                       S )Nc                     g | ]
}|d k    |S )rC   r    )r0   names     r"   ri   z3AriaProcessor.model_input_names.<locals>.<listcomp>   s$    &k&k&kW[_jWjWjtWjWjWjr!   )r'   model_input_namesr&   rO   dictfromkeys)r;   tokenizer_input_namesimage_processor_input_namess      r"   rw   zAriaProcessor.model_input_names   sR     $ @&*&:&L# 'l&k8S&k&k&k#DMM"7:U"UVVWWWr!   )NNNN)NNN)N)r   r   r   __doc__
attributesimage_processor_classtokenizer_classr   r   rN   r   rx   floatr/   r:   r   r   rO   r   r   r   r   rd   rs   propertyrw   __classcell__)r<   s   @r"   r%   r%   /   sv         $[1J0%O /3'+BFR R +,R  }	R
 "$uUCZ'8#'=">?R R R R R R* (,B^ B^I0$y/4HYCZZ[B^ $B^ ,-B^ 
B^ B^ B^ B^H- - - -4 X X XX X X X Xr!   r%   )typingr   r   numpyrW   image_processing_utilsr   image_utilsr   processing_utilsr   r	   r
   r   tokenization_utilsr   r   utilsr   autor   r   r%   __all__r    r!   r"   <module>r      s*  * # " " " " " " "     2 2 2 2 2 2 % % % % % % X X X X X X X X X X X X > > > > > > > >                      *%    KX KX KX KX KXN KX KX KX\ 
r!   