
     `il                        d dl mZ d dlmZmZ d dlZddlmZm	Z	m
Z
 ddlmZmZmZ ddlmZmZmZmZmZmZmZmZmZmZmZ ddlmZmZmZmZ  e            rd dl Z  ej!        e"          Z# ed	
          de$de$de%e&e$e$f                  fd            Z' ed
          de&e$e$f         de&e$e$f         de$de$de&e$e$f         f
d            Z(de$de$de$de$de$de)fdZ*de$de$de&e$e$f         de%e&e$e$e$e$f                  fdZ+ ed
          	 d$de&e$e$f         de$de$d e)de&e$e$f         f
d!            Z, G d" d#e          Z-d#gZ.dS )%    )	lru_cache)OptionalUnionN   )BaseImageProcessorBatchFeatureget_size_dict)convert_to_rgbresizeto_channel_dimension_format)OPENAI_CLIP_MEANOPENAI_CLIP_STDChannelDimension
ImageInputPILImageResamplinginfer_channel_dimension_formatis_scaled_imagemake_flat_list_of_imagesto_numpy_arrayvalid_imagesvalidate_preprocess_arguments)
TensorTypefilter_out_non_signature_kwargsis_vision_availablelogging
   )maxsizemin_image_tilesmax_image_tilesreturnc                     g }t          d|dz             D ]A}t          d|dz             D ]+}||z  |k    r ||z  | k    r|                    ||f           ,Bt          |d           }|S )aK  
    Computes all allowed aspect ratios for a given minimum and maximum number of input tiles.

    This function calculates all possible arrangements of tiles that can be formed
    within the constraint of the minimum and maximum number of tiles. Each arrangement is
    represented by its aspect ratio (width/height) and the corresponding tile configuration.

    Args:
        min_image_tiles (`int`):
            The minimum number of tiles allowed.
        max_image_tiles (`int`):
            The maximum number of tiles allowed.

    Returns:
        `List[Tuple[int, int]]`: A list of tuples, each tuple representing a valid (width, height)
        configuration in terms of number of tiles.

    Example:
        >>> get_all_supported_aspect_ratios(1, 4)
        [(1, 1), (1, 2), (2, 1), (1, 3), (3, 1), (1, 4), (2, 2), (4, 1)]

       c                 $    | d         | d         z  S Nr   r"    xs    /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/ovis2/image_processing_ovis2.py<lambda>z1get_all_supported_aspect_ratios.<locals>.<lambda>M   s    !qt     key)rangeappendsorted)r   r   aspect_ratioswidthheights        r(   get_all_supported_aspect_ratiosr3   /   s    0 Mq/A-.. 6 6A233 	6 	6Fv~00UV^5V5V$$eV_555	6 =.C.CDDDMr*   d   original_image_sizetarget_tile_sizec                 "   t          ||          }| \  }}|\  }}||z  }	||z  }
t          d          }d}|D ]V}|d         |d         z  }t          |	|z
            }||k     r|}|}0||k    r |
d|z  |z  |d         z  |d         z  k    r|}W|S )a  
    Given a minimum and maximum number of tiles, find the canvas with the closest aspect ratio to the
    original image aspect ratio.
    In case of tie-breaking condition when two canvases have the same aspect ratio difference, we favor the canvas with
    more tiles, until the area covered by the tiles is more than twice the target area, in order to avoid unnecessarily
    excessive tiling.
    infr"   r"   r   r"   g      ?)r3   floatabs)r5   r6   r   r   possible_tile_arrangementsoriginal_heightoriginal_widthtarget_tile_heighttarget_tile_widthaspect_ratioareabest_ratio_diff	best_gridgridgrid_aspect_ratio
ratio_diffs                   r(   get_optimal_tiled_canvasrH   R   s     "ARa!b!b&9#O^,<))!O3LO+D EllOI* 
! 
! Gd1g-(99::
''(OII?** c..1BBT!WLtTUwVVV 	r*   leftupperrightlowersidec                     || z
  }||z
  }t          ||          t          ||          }}||k    r
||z  |z  }|}||z  S )N)maxmin)rI   rJ   rK   rL   rM   whs          r(   compute_patch_covering_arearS   y   sQ    AAq!99c!QiiqA4xxEDLq5Lr*   rR   rQ   rE   c                       d         z  d         z   fdt          d                   D             S )Nr   r"   c           	          g | ]T}t          d                    D ]<}|z  |z  |d          d z
  k    rn|d z   z  |d         d z
  k    rn|d z   z  f=US )r"   r   r-   ).0rowcol	col_widthrE   rR   
row_heightrQ   s      r(   
<listcomp>z)split_image_into_grid.<locals>.<listcomp>   s     	 	 	 a>>	 	  )O*Q!##AA#'Y)>Q!##AA#'Z)?		
	 	 	 	r*   rV   )rR   rQ   rE   rZ   r[   s   ```@@r(   split_image_into_gridr]      sl    d1gJT!WI	 	 	 	 	 	 	 	 a>>	 	 	 	r*   ?
image_sizetarget_patch_sizecovering_thresholdc                 t   | \  }}||z  }t          d|          }g }g }	|D ]e}
t          |||
          }t          fd|D                       |z  }|                    |
|f           ||k    r|	                    |
|f           f|	rt	          |	d           d         S t	          |d           d         S )Nr"   c                 ,    g | ]}t          g |R  S r%   )rS   )rW   regionr`   s     r(   r\   z.get_min_tile_covering_grid.<locals>.<listcomp>   s.    dddV,HfH6GHHHdddr*   c                 N    | d         d         | d         d         z  | d          fS r$   r%   r&   s    r(   r)   z,get_min_tile_covering_grid.<locals>.<lambda>   s*    QqT!WqtAw=NQRSTQUPU<V r*   r+   r   c                 N    | d          | d         d         | d         d         z  fS )Nr"   r   r%   r&   s    r(   r)   z,get_min_tile_covering_grid.<locals>.<lambda>   s&    AaD5!A$q'AaDG:K2L r*   )r3   r]   sumr.   rP   )r_   r`   r   ra   image_heightimage_width
image_areacandidate_tile_gridsevaluated_gridssufficient_covering_grids	tile_gridtile_regionstile_covering_ratios    `           r(   get_min_tile_covering_gridrq      s
    !+L+|+J:1oNNO ") O O	,\;	RRddddWcdddeehrr 	 		+>?@@@!333%,,i9L-MNNN  Q,2V2VWWWXYZZ ?(L(LMMMaPPr*   c            &           e Zd ZdZdgZdddddej        dddddddfd	ed
ee	e
ef                  dedededededeeef         dedeeeee         f                  deeeee         f                  dededdf fdZej        ddfdej        d
e	e
ef         dedeee
ef                  deee
ef                  dej        fdZ e            dddddddddddddej        ddfded	ee         d
ee	e
ef                  dee         dee         dee         dee         dee         dee         dee         deeeee         f                  deeeee         f                  deee
ef                  dee         dedeee
ef                  dedej        j        f$d            Z	 	 	 	 d#dej        dededed eeeee	f                  dee         d!efd"Z xZS )$Ovis2ImageProcessora  
    Constructs a Ovis2 image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
            `do_resize` parameter in the `preprocess` method.
        size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
            method.
        crop_to_patches (`bool`, *optional*, defaults to `False`):
            Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
            `preprocess` method.
        min_patches (`int`, *optional*, defaults to 1):
            The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
            set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method.
        max_patches (`int`, *optional*, defaults to 12):
            The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
            set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
            overridden by the `resample` parameter in the `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
            `do_rescale` parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
            overridden by the `rescale_factor` parameter in the `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method. Can be overridden by the `do_normalize` parameter in the `preprocess` method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
            overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
            Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        use_covering_area_grid (`bool`, *optional*, defaults to `True`):
            Whether to use the covering area grid to determine the number of patches. Only has an effect if
            `crop_to_patches` is set to `True`. Can be overridden by the `use_covering_area_grid` parameter in the
            `preprocess` method.
    pixel_valuesTNFr"      gp?	do_resizesizecrop_to_patchesmin_patchesmax_patchesresample
do_rescalerescale_factordo_normalize
image_mean	image_stddo_convert_rgbuse_covering_area_gridr    c                 :    t                      j        di | ||nddd}t          |d          }|| _        || _        || _        || _        || _        || _        || _	        || _
        |	| _        |
|
nt          | _        ||nt          | _        || _        d S )Ni  r2   r1   Tdefault_to_squarer%   )super__init__r	   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   )selfrv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   kwargs	__class__s                  r(   r   zOvis2ImageProcessor.__init__   s    " 	""6"""'ttc-J-JTT:::"	.&& $,((2(>**DT&/&;,r*   imagedata_formatinput_data_formatc                     t          |          }d|vsd|vr$t          d|                                           |d         |d         f}t          |f||||d|S )a  
        Resize an image to `(size["height"], size["width"])`.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
            data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            `np.ndarray`: The resized image.
        r2   r1   zFThe `size` dictionary must contain the keys `height` and `width`. Got )rw   r{   r   r   )r	   
ValueErrorkeysr   )r   r   rw   r{   r   r   r   output_sizes           r(   r   zOvis2ImageProcessor.resize  s    F T""47$#6#6sfjfofofqfqsstttH~tG}5
#/
 
 
 
 	
r*   imagesreturn_tensorsc           
          ||n j         }||n j        }n j        n j        ||n j        }||n j        }|	|	n j        }	|
|
n j        }
||n j        }||n j	        }||n j
        }n j        t          d                               |          }t          |          }t          |          st!          d          t#          ||	|
||||           |rd |D             }d |D             }|r/t%          |d                   rt&                              d	           t+          |d                   |r2d
k    r, fd|D             }d |D             }d |D             }ndgt-          |          z  }t/          |          D ]\  }}|r                     ||          ||<   |r!                     ||         |	          ||<   |
r"                     ||         ||          ||<   t7          ||         |          ||<   t9          ||d|          }|S )a  
        Preprocess an image or batch of images.

        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Controls the size of the image after `resize`. The shortest edge of the image is resized to
                `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image
                is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest
                edge equal to `int(size["shortest_edge"] * (1333 / 800))`.
            crop_to_patches (`bool`, *optional*, defaults to `self.crop_to_patches`):
                Whether to crop the image to patches.
            min_patches (`int`, *optional*, defaults to `self.min_patches`):
                The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
                set to `True`.
            max_patches (`int`, *optional*, defaults to `self.max_patches`):
                The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
                set to `True`.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image values between [0 - 1].
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to normalize the image by if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to normalize the image by if `do_normalize` is set to `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                    - Unset: Return a list of `np.ndarray`.
                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            use_covering_area_grid (`bool`, *optional*, defaults to `True`):
                Whether to use the covering area grid to determine the number of patches. Only has an effect if
                `crop_to_patches` is set to `True`.
        NFr   zkInvalid image type. Must be of type PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or jax.ndarray.)r|   r}   r~   r   r   rv   rw   r{   c                 ,    g | ]}t          |          S r%   )r
   rW   r   s     r(   r\   z2Ovis2ImageProcessor.preprocess.<locals>.<listcomp>  s     @@@nU++@@@r*   c                 ,    g | ]}t          |          S r%   )r   r   s     r(   r\   z2Ovis2ImageProcessor.preprocess.<locals>.<listcomp>  s     <<<E.''<<<r*   r   zIt looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.r"   c           
      F    g | ]}                     |           S ))ry   rz   
patch_sizer   r   )crop_image_to_patches)rW   r   r   rz   ry   r   rw   r   s     r(   r\   z2Ovis2ImageProcessor.preprocess.<locals>.<listcomp>  sR     
 
 
  ** + +# 1+A +  
 
 
r*   c                     g | ]\  }}|S r%   r%   )rW   _rE   s      r(   r\   z2Ovis2ImageProcessor.preprocess.<locals>.<listcomp>  s    000gaT000r*   c                 "    g | ]\  }}|D ]}|S r%   r%   )rW   images_listr   r   s       r(   r\   z2Ovis2ImageProcessor.preprocess.<locals>.<listcomp>  s)    RRRQkRRUeRRRRr*   r9   )rw   r{   r   )r   scaler   )r   meanstdr   )input_channel_dim)rt   grids)datatensor_type)rv   rx   ry   rz   r{   r|   r}   r~   r   r   r   rw   r	   fetch_imagesr   r   r   r   r   loggerwarning_oncer   len	enumerater   rescale	normalizer   r   )r   r   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   r   r   r   ir   encoded_outputss   `  ` ``         ``    r(   
preprocesszOvis2ImageProcessor.preprocess6  sS   ^ "+!6IIDN	-<-H//dNb%0%<kk$BR%0%<kk$BR'388#-#9ZZt
+9+E4K^'3'?||TEV#-#9ZZt
!*!6IIDN	+9+E4K^'ttTYTU;;;""6**)&11F## 	:  
 	&!)%!		
 		
 		
 		
  	A@@@@@F =<V<<< 	/&)44 	s  
 $ >vay I I 	+{Q
 
 
 
 
 
 
 
 
 $
 
 
F 10000ERR&RRRFFHs6{{*E!&)) 	q 	qHAu r KKD8_pKqqq	 u LLvaybsLttq	  NN )#!&7	 +  q	 4F1I{^opppF1II&Ve,T,Tbpqqqr*   r^   r   ra   c                 B   |t          |          }t          |t          j        |          }|d         |d         }	}|j        dd         \  }
}|rt          |
|f|||          \  }}nt          |
|f||	f||          \  }}|	|z  }||z  }||z  }|                     |||dt          j        t          j                  }g }t          |          D ]{}||z  }||z  }||	z  ||z  |dz   |	z  |dz   |z  f}|d	|d         |d
         |d         |d         f         }t          ||t          j                  }|	                    |           |t          |          dk    r9|                     |||t          j                  }|                    d|           |||ffS )a  
        Crop the image to patches and return a list of cropped images.
        The number of patches and their grid arrangement are determined by the original image size,
        the target patch size and the minimum and maximum number of patches.
        The aspect ratio of the patches grid is chosen to be the closest to the original image aspect ratio.

        Args:
            images (`np.ndarray`):
                The image to be cropped.
            min_patches (`int`):
                The minimum number of patches to be extracted from the image.
            max_patches (`int`):
                The maximum number of patches to be extracted from the image.
            use_covering_area_grid (`bool`, *optional*, defaults to `True`):
                Whether to use the covering area grid to determine the number of patches.
            patch_size (`int`, `Tuple[int, int]`, `dict`, *optional*):
                The size of the output patches.
            data_format (`ChannelDimension`, *optional*):
                The format of the image data. If `None`, the format is inferred from the input image.
            covering_threshold (`float`, *optional*, defaults to `0.9`):
                The threshold for the covering area grid. If the covering area is less than this value, the grid is
                considered invalid.

        Returns:
            List[`PIL.Image.Image`] or List[np.ndarray]: The list of cropped images.
        Nr2   r1   )r`   r   ra   r   )r   r   r"   .r   r      )r   r   r   FIRSTshaperq   rH   r   r-   r.   r   insert)r   r   ry   rz   r   r   r   ra   patch_size_heightpatch_size_widthr=   r>   num_columnsnum_rowstarget_widthtarget_height
num_blocksresized_imageprocessed_imagesr   columnrX   boxpatch_imagethumbnail_imgs                            r(   r   z)Ovis2ImageProcessor.crop_image_to_patches  s)   H 8@@K,V5E5K[YY.8.BJwDW+*0,rss*;'! 	$> .1"3 +#5	% % %!K %= .1"$45	% %!K (+5)H4 8+
 $|<<(..4	 $ 
 
 z"" 	1 	1A_F{"C))''!//q--	C (SVc!f_c!fs1vo(MNK5k;P`PfggK##K0000  A%% KK
O_Oe (  M ##A}555(K!888r*   )TNNr^   )__name__
__module____qualname____doc__model_input_namesr   BICUBICboolr   dictstrintr   r:   listr   npndarrayr   r   r   r   r   r   PILImager   tupler   __classcell__)r   s   @r(   rs   rs      s       - -^ (( )- %'9'A,3!:>9=#'+ -  - - tCH~& - 	 -
  -  - % -  - c5j) -  - U5$u+#567 - E%e"456 -  - !% -  
! -  -  -  -  -  -L (:'A>BDH.
 .
z.
 38n.
 %	.

 eC)9$9:;.
 $E#/?*?$@A.
 
.
 .
 .
 .
` %$&& %))-*.%)%)15%)*.'+:>9=;?)-(8(>DH'+%d dd D>d tCH~&	d
 "$d c]d c]d -.d TNd !d tnd U5$u+#567d E%e"456d !sJ!78d !d  &!d" $E#/?*?$@A#d$ !%%d& 
'd d d '&dV (,8<26$'^9 ^9
^9 ^9 	^9
 !%^9 U5#t#345^9 ./^9 "^9 ^9 ^9 ^9 ^9 ^9 ^9 ^9r*   rs   )r^   )/	functoolsr   typingr   r   numpyr   image_processing_utilsr   r   r	   image_transformsr
   r   r   image_utilsr   r   r   r   r   r   r   r   r   r   r   utilsr   r   r   r   r   
get_loggerr   r   r   r   r   r3   rH   r:   rS   r]   rq   rs   __all__r%   r*   r(   <module>r      sn           " " " " " " " "     U U U U U U U U U U S S S S S S S S S S                          _ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^  JJJ 
	H	%	% 2S 3 SWX]^acf^fXgSh    D 3#sCx#CHo# # 	#
 38_# # # #Lc # c # UX ]b    S S c3h DsTWY\^aOaIbDc     3
 !$	Q Qc3hQQ Q 	Q
 38_Q Q Q Q>I9 I9 I9 I9 I9, I9 I9 I9X !
!r*   