
    .`iK                         d dl mZ d dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
mZmZ d dlmZmZ d dlmZ dgZd	Z G d
 de
d          Z G d de          Z ej        de           dS )    )cached_propertyN)AutoProcessorBatchFeature)
ImageInput)ProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)convert_image_modeOvisProcessoric                   &    e Zd ZddiddddddZd	S )
OvisProcessorKwargspaddingF	   g?Tpt)max_partitioncovering_thresholdconvert_to_rgbreturn_tensors)text_kwargsimages_kwargsN)__name__
__module____qualname__	_defaults     {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/transformers_utils/processors/ovis.pyr   r   (   s>         u
 "%""	
 
	
 
IIIr   r   F)totalc            	       6    e Zd ZdZddgZg dZdZdZ	 	 	 	 	 d fd		Ze	d
             Z
	 	 ddedeez  ee         z  ee         z  dee         defdZdee         dej        fdZd Zd Zd Zd Zdej        j        fdZd Zd Zd Z e!d             Z" xZ#S )r   a  
    Constructs an Ovis processor which wraps an Ovis image processor and a Qwen2 tokenizer into a single processor.
    [`OvisProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
    [`~OvisProcessor.__call__`] and [`~OvisProcessor.decode`] for more information.
    Args:
        image_processor ([`Qwen2VLImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`Qwen2TokenizerFast`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    image_processor	tokenizer)chat_templateimage_pad_tokenimage_segment_lenAutoImageProcessorAutoTokenizerN   c                 z    d| _         || _        || _        t                                          |||           d S )Nz<image>)r$   )image_tokenr%   r&   super__init__)selfr"   r#   r$   r%   r&   kwargs	__class__s          r   r-   zOvisProcessor.__init__J   sB     %.!2)=QQQQQr   c           	      d    | j                                         | j                 }ddddddd|d}|S )	Ni8iiiiii)r+   
image_atomimage_startimage_prefiximage_col_sepimage_row_sep	image_end	image_pad)r#   	get_vocabr%   )r.   image_pad_token_idextra_special_tokenss      r   r;   z"OvisProcessor.extra_special_tokensX   sK    !^55778LM !!+	 
 	 
 $#r   imagestextr/   returnc                 `    | j         t          fd| j        j        i|}i }|g }g }g }t	          |t
                    r|n|gD ]Z}	 | j        dd|	i|d         \  }
}}|                    |
           |                    |           |                    |           [|r||d<   |wt	          |t
                    s|g}|                     |          }| 	                    d          }g }d}|D ]}||v rd|v r|t          |d                   k     r|                                }g }t          |          D ]I\  }}||k    r)|d         |         }|                    |           |dz  }4|                    |           Jt          j        |t          j        	          }nt#          d
          |                    |           |rt          j        |          }n t          j        g t          j        	          }t'          d|i          }|r
||d<   ||d<   |S t'          |          S )a  
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
        Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
            Args:
                images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                    The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                    tensor. Both channels-first and channels-last formats are supported.
                text (`str`, `list[str]`, `list[list[str]]`):
                    The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                    (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                    `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
                videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                    The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
                    tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
                return_tensors (`str` or [`~utils.TensorType`], *optional*):
                    If set, will return tensors of a particular framework. Acceptable values are:
                    - `'tf'`: Return TensorFlow `tf.constant` objects.
                    - `'pt'`: Return PyTorch `torch.Tensor` objects.
                    - `'np'`: Return NumPy `np.ndarray` objects.
                    - `'jax'`: Return JAX `jnp.ndarray` objects.
            Returns:
                [`BatchFeature`]: A [`BatchFeature`] with the following fields:
                - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
                - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
                  `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
                  `None`).
                - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
                - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
                - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
                - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
                - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
        tokenizer_init_kwargsNimager   image_placeholdersr+   r      dtypezZMismatch between the images you provided and the number of placeholder present in the text	input_ids)datapixel_valuesgridsr   )_merge_kwargsr   r#   init_kwargs
isinstancelistpreprocess_imageappend_tokenize_with_image_symbolget_token_valuelentolist	enumerateextendtorchtensorlongRuntimeErrorstackr   )r.   r<   r=   r/   output_kwargsimage_featuresprocessed_imagesimage_placeholders_listrI   rA   rH   rB   gridtokenized_batched_textimage_token_idreplaced_ids_listidx
ids_tensorids_listnew_idsitoken_idplaceholder_idsreplaced_and_tokenized_idsoutputs                            r   __call__zOvisProcessor.__call__g   s   V +*
 
"&."<
 
 
 !&(#E $.fd#;#;I& # #9N9N : ::#0#A: :60$ !''555'../ABBBT""""   O7N34 dD)) v%)%E%Ed%K%K"!11-@@N "C4 5 5
"j00,>>S0D!EFFFF#-#4#4#6#6"$ ,5X+>+> 9 9KAx'>992@AU2V$'3" !( ? ? ? #q 'x 8 8 8 8 &+\'%L%L%L

*x   "((4444  P-2[9J-K-K**-2\"EJ-O-O-O* "!;  F  ()9~&"'wM 0000r   	text_listc                     g }|D ]} fd|                      j                  D             }g }t          |          }t          |          D ]K\  }}|                    |           ||dz
  k     r(|                                         d                     L|                    |           t          j        |t          j	                  S )Nc                 H    g | ]}                     |d           j        S )F)add_special_tokens)r#   rF   ).0chunkr.   s     r   
<listcomp>z=OvisProcessor._tokenize_with_image_symbol.<locals>.<listcomp>   s<        u??I  r   rC   r+   rD   )
splitr+   rR   rT   rU   rO   rQ   rV   rW   rX   )	r.   rm   batch_token_idsr=   text_chunks	token_ids	num_chuckrg   rr   s	   `        r   rP   z)OvisProcessor._tokenize_with_image_symbol   s     	. 	.D   !ZZ(899  K IK((I%k22 J J5  '''y1}$$$$T%9%9-%H%HIII""9----|O5:>>>>r   c                     | j         j        }d|v r|d         x}}n(d|v rd|v r|d         }|d         }nt          d          ||fS )Nshortest_edgeheightwidthz3Can't parse image size from image_processor config.)r"   size
ValueError)r.   r}   r|   r{   s       r   get_image_sizezOvisProcessor.get_image_size   si    #(d""!/22EFF'T//ME(^FFRSSSu}r   c                     | j         |         S N)r;   )r.   toks     r   rQ   zOvisProcessor.get_token_value  s    (--r   c                    |                      d          |                      d          |                      d          g}|d         |d         z  dk    rt          |d                   D ]}t          |d                   D ]a}|                    |                      d                     ||d         dz
  k     r(|                    |                      d                     b||d         dz
  k     r(|                    |                      d                     |                    |                      d                     |S )	Nr3   r2   r4   r   rC   r5   r6   r7   )rQ   rangerO   )r.   r_   rB   rcs        r   construct_image_indicatorsz(OvisProcessor.construct_image_indicators  s?     //  ..  00

 7T!Wq  47^^ U UtAw Y YA&--d.B.B<.P.PQQQ47Q;*11$2F2F2W2WXXXtAw{??&--d.B.B?.S.STTT!!$"6"6{"C"CDDD!!r   c                    |                      |          }|                     d          }|                     d          }g }|D ];}|                    |           ||k    r|                    |g| j        z             <|S )Nr2   r8   )r   rQ   rO   rU   r&   )r.   r_   rB   image_atom_token_idimage_padding_token_idpadded_placeholder_tokenstokens          r   construct_image_placeholdersz*OvisProcessor.construct_image_placeholders  s    !<<TBB"22<@@!%!5!5k!B!B %'!' 	 	E%,,-CDDD+++)00+,t/EE   )(r   rA   c                     dt           j        j        f fddt          t          t          t          t          t          f                  fdd fd}|rt          d                                           }|d         |d	         k    rt          d
          |d          |          } |          }	fd|	D             }
t          |
          d	k    r|
	                    d           t          j        fd|
D             d          }                     |          }t          j        |          |t          j        |          fS )Nimgc                     | j         \  }}||k    r|x}}n5||k    r|}t          ||z  |z            }n|}t          ||z  |z            }t          ||          }j                            | |
          d         }t          j        dd||g|j        |j                  }|j	        dd          \  }}||k    r||d d d d d d d d f<   n?||k    r||z
  dz  }	||d d d d d d |	|	|z   f<   n||z
  dz  }	||d d d d |	|	|z   d d f<   |S )N)r{   r|   )r}   r   rH   rC      )rE   device   )
r}   intdictr"   
preprocessrV   zerosrE   r   shape)r   sidewh	new_width
new_heightnew_sizerH   square_values
from_indexr   r.   s             r   _preprocessz3OvisProcessor.preprocess_image.<locals>._preprocess2  s   8DAqAvv)--	JJQ 	 Q!233

!
A
 233	:Y???H/::(> ;  L
 "KAtT",*<\EX  M %1$6qrr$:!J	Y&&,8aaaAAAqqqj))i''"Y.14
  aaaAAAzJ4J'JJKK #Z/A5
  aaaJj1H$H!!!KL ! r   r>   c                 T   | j         \  }}||d         z  }||d         z  }g }t          |d                   D ]o}t          |d                   D ]W}||z  }	||z  }
||d         dz
  k    r|n|dz   |z  }||d         dz
  k    r|n|dz   |z  }|                    |	|
||f           Xp|S Nr   rC   )r}   r   rO   )r   r_   r   r   
row_height	col_width	partitionrowcolleftupperrightlowers                r   
_partitionz2OvisProcessor.preprocess_image.<locals>._partitionV  s    8DAqd1gJT!WIIT!W~~ B B a>> B BC?D*,E!$Q!!3!3AA#'Y9NE!$Q!!3!3AA#'Z9OE$$dE5%%@AAAAB r   c                     || z
  }||z
  }t          ||          t          ||          }}||k    r
||z  |z  }|}||z  S r   )maxmin)r   r   r   r   r   r   r   s          r   _covering_areaz6OvisProcessor.preprocess_image.<locals>._covering_areaf  sQ    AAq!99c!QiiqA4xxEDLq5Lr   c                 V   | j         d         | j         d         z  }g }t          ddz             D ]8}t          ddz             D ]"}||z  k    r|                    ||f           #9g }g }|D ]i} | |          }	t          fd|	D                       |z  }
|
dk    sJ |                    ||
f           |
k    r|                    ||
f           jt	          |          dk    rt          |d           d         d         S t          |d           d         d         S )Nr   rC   c                 $    g | ]} g |R  S r   r   )rq   pr   r   s     r   rs   zJOvisProcessor.preprocess_image.<locals>._get_best_grid.<locals>.<listcomp>}  s.    EEEa11D111EEEr   g      ?c                 N    | d         d         | d         d         z  | d          fS r   r   xs    r   <lambda>zHOvisProcessor.preprocess_image.<locals>._get_best_grid.<locals>.<lambda>  s&    1a1Q479JQqTE8R r   )keyc                 N    | d          | d         d         | d         d         z  fS )NrC   r   r   r   s    r   r   zHOvisProcessor.preprocess_image.<locals>._get_best_grid.<locals>.<lambda>  s&    1qtAw1a?P7Q r   )r}   r   rO   sumrR   sorted)r   r   img_areacandidate_gridsrg   j	all_grids
good_gridsr_   r   covering_ratior   r   r   r   s    `         r   _get_best_gridz6OvisProcessor.preprocess_image.<locals>._get_best_grido  s   x{SXa[0H O1ma/00 7 7q-!"344 7 7A1u--'..1v6667 IJ' > >&JsD11	EEEEE9EEEFFQ  &,,,,  $!7888!$666%%t^&<===:""j.R.RSSSTUV 
 i-Q-QRRRSTUVWXXr   RGBr   rC   z(get_image_size() returns non-square sizec                 :    g | ]}                     |          S r   )crop)rq   r   rA   s     r   rs   z2OvisProcessor.preprocess_image.<locals>.<listcomp>  s#    2221A222r   c                 (    g | ]} |          S r   r   )rq   r   r   r   s     r   rs   z2OvisProcessor.preprocess_image.<locals>.<listcomp>  s%    !L!L!Ld++dD"9"9!L!L!Lr   )dim)PILImagerM   tupler   r   r   r~   rR   insertrV   catr   rW   )r.   rA   r   r   r   r   r   sidesr_   r   cropsrH   rB   r   r   r   r   s   ```` `       @@@@r   rN   zOvisProcessor.preprocess_image*  s   "	!SY_ "	! "	! "	! "	! "	! "	! "	!H	T%S#s0B*C%D 	 	 	 	 	 	 		Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y<  	5&ue44E##%%8uQxGHHHQx~eT**Jud++	2222	222u::>>LLE"""y!L!L!L!L!Le!L!L!LRSTTT!>>tDD|L))+=u|D?Q?QQQr   c                 &     | j         j        |i |S )z
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        r#   batch_decoder.   argsr/   s      r   r   zOvisProcessor.batch_decode  s    
 +t~*D;F;;;r   c                 &     | j         j        |i |S )z
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        )r#   decoder   s      r   r   zOvisProcessor.decode  s    
 %t~$d5f555r   c                 <    | j                             |dd          S )a  
        Post-process the output of the model to decode the text.
        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
        Returns:
            `list[str]`: The decoded text.
        TF)skip_special_tokensclean_up_tokenization_spacesr   )r.   generated_outputss     r   post_process_image_text_to_textz-OvisProcessor.post_process_image_text_to_text  s,     ~** $). + 
 
 	
r   c                     | j         j        }| j        j        }t          t                              ||z                       }|dgz   S )Nsecond_per_grid_ts)r#   model_input_namesr"   rM   r   fromkeys)r.   tokenizer_input_namesimage_processor_input_namesnames_from_processors       r   r   zOvisProcessor.model_input_names  sP     $ @&*&:&L##MM/2MMNN 
  
 $';&<<<r   )NNNNr)   )NN)$r   r   r   __doc__
attributesvalid_kwargsimage_processor_classtokenizer_classr-   r   r;   r   r   r
   rM   r	   r   r   rl   strrV   
LongTensorrP   r   rQ   r   r   r   r   rN   r   r   r   propertyr   __classcell__)r0   s   @r   r   r   6   s         $[1JLLLL0%O R R R R R R $ $ _$  " %)A1 A1A1 

y/  
!"A1 ,-A1 
A1 A1 A1 A1F?T#Y ?5CS ? ? ? ? 	 	 	. . ." " "") ) )"qRyqR qR qR qRf< < <6 6 6
 
 
  = = X= = = = =r   )	functoolsr   r   rV   transformersr   r   transformers.image_utilsr   transformers.processing_utilsr   r   r	   $transformers.tokenization_utils_baser
   r   vllm.multimodal.imager   __all__	IGNORE_IDr   r   registerr   r   r   <module>r      s,  2 & % % % % % 



  4 4 4 4 4 4 4 4 / / / / / / R R R R R R R R R R M M M M M M M M 4 4 4 4 4 4
	    *%    L= L= L= L= L=N L= L= L=^   6 6 6 6 6r   