
     `iwf                         d dl mZ d dlmZmZ d dlZddlmZm	Z	m
Z
 ddlmZmZmZ ddlmZmZmZmZmZmZmZmZmZmZmZmZ ddlmZmZmZm Z   e            rd dl!Z! e j"        e#          Z$ G d	 d
e          Z%d
gZ&dS )    )Iterable)OptionalUnionN   )BaseImageProcessorBatchFeatureget_size_dict)convert_to_rgbresizeto_channel_dimension_format)OPENAI_CLIP_MEANOPENAI_CLIP_STDChannelDimension
ImageInputPILImageResamplingget_image_sizeinfer_channel_dimension_formatis_scaled_imagemake_flat_list_of_imagesto_numpy_arrayvalid_imagesvalidate_preprocess_arguments)
TensorTypefilter_out_non_signature_kwargsis_vision_availableloggingc            #       6    e Zd ZdZdgZdddej        dddddddfdedee	e
ef                  d	ed
ededeeef         dedeeeee         f                  deeeee         f                  dee         dee         ddf fdZej        ddfdej        dee	e
ef         ef         d
edeee
ef                  deee
ef                  dej        fdZ e            ddddddddddddej        dfdedee         dee	e
ef                  d
ee         dee         dee         dee         deeeee         f                  deeeee         f                  deee
ef                  dee         deeeeeeef         f                  dee         dedeee
ef                  dej        j        f d            Z	 	 	 d dej        deeeeeef         f         deee
ef                  deee
ef                  dej        f
dZ	 	 	 	 	 	 	 d!dedee         dee         dee         deee                  deee                  dee
         dee
         fdZ	 d"dej        deeee         f         deeee         f         deee
ef                  dej        f
dZ xZ S )#JanusImageProcessora  
    Constructs a JANUS image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
            `do_resize` parameter in the `preprocess` method.
        size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
            method.
        min_size (`int`, *optional*, defaults to 14):
            The minimum allowed size for the resized image. Ensures that neither the height nor width
            falls below this value after resizing.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
            overridden by the `resample` parameter in the `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
            `do_rescale` parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
            overridden by the `rescale_factor` parameter in the `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method. Can be overridden by the `do_normalize` parameter in the `preprocess` method.
        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
            overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
            Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        do_pad (`bool`, *optional*, defaults to `True`):
            Whether to pad the image to square or not.
    pixel_valuesTN   gp?	do_resizesizemin_sizeresample
do_rescalerescale_factordo_normalize
image_mean	image_stddo_convert_rgbdo_padreturnc                 ~    t                      j        di | ||nddd}t          |d          }|| _        || _        || _        || _        || _        || _        ||nt          | _
        |	|	nt          | _        |
| _        || _        || _        |	d| _        d S t#          d |D                       | _        d S )Ni  )heightwidthTdefault_to_square)   r2   r2   c              3   :   K   | ]}t          |d z            V  dS )   N)int).0xs     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/janus/image_processing_janus.py	<genexpr>z/JanusImageProcessor.__init__.<locals>.<genexpr>   s,      )K)K1#a#g,,)K)K)K)K)K)K     )super__init__r	   r!   r"   r$   r%   r&   r'   r   r(   r   r)   r*   r+   r#   background_colortuple)selfr!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   kwargs	__class__s                r8   r=   zJanusImageProcessor.__init___   s     	""6"""'ttc-J-JTT:::"	 $,((2(>**DT&/&;, $3D!!!$))K)K
)K)K)K$K$KD!!!r:   imagedata_formatinput_data_formatc                    |t          |          }t          ||          \  }}t          ||          }	t          |d          }|d         |d         k    r!t	          d|d          d|d                    |d         }||	z  }
t          t          ||
z            | j                  t          t          ||
z            | j                  g}t          |f||||d|}|S )	an  
        Resize an image to dynamically calculated size.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`dict[str, int]` or `int`):
                The size to resize the image to. If a dictionary, it should have the keys `"height"` and `"width"`.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
            data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `None`: will be inferred from input
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            `np.ndarray`: The resized image.
        NTr0   r.   r/   z5Output height and width must be the same. Got height=z and width=)r"   r$   rD   rE   )r   r   maxr	   
ValueErrorr5   r#   r   )r@   rC   r"   r$   rD   rE   rA   r.   r/   max_sizedeltaoutput_size_nonpaddeds               r8   r   zJanusImageProcessor.resize   s    F $ >u E E&u.?@@vu%%TT:::>T']**rXrrcghocprr   H~x FUN##T]33EEM""DM22!

 
&#/
 
 
 
 r:   imagesreturn_tensorsr>   c           
      z   	 ||n j         }n j        ||n j        }n j        ||n j        }n j        		n j        	||n j        }||n j        }n j	        n j
        t          d                               |          }t          |          }t          |          st          d          t!          ||	|           |rd |D             }d |D             }|r/t#          |d                   rt$                              d	           t)          |d                   |r fd
|D             }|r fd|D             }|r fd|D             }|r	 fd|D             }fd|D             }t+          d|i|
          }|S )a`  
        Preprocess an image or batch of images.

        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`dict[str, int]`, *optional*, defaults to `self.size`):
                Controls the size of the image after `resize`. The shortest edge of the image is resized to
                `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image
                is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest
                edge equal to `int(size["shortest_edge"] * (1333 / 800))`.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image values between [0 - 1].
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to normalize the image by if `do_normalize` is set to `True`.
            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to normalize the image by if `do_normalize` is set to `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            background_color (`tuple[int, int, int]`):
                The background color to use for the padding.
            do_pad (`bool`, *optional*, defaults to `self.do_pad`):
                Whether to pad the image to square or not.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                    - Unset: Return a list of `np.ndarray`.
                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        NFr0   zkInvalid image type. Must be of type PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or jax.ndarray.)r%   r&   r'   r(   r)   r!   r"   r$   c                 ,    g | ]}t          |          S r;   )r
   r6   rC   s     r8   
<listcomp>z2JanusImageProcessor.preprocess.<locals>.<listcomp>,  s     @@@nU++@@@r:   c                 ,    g | ]}t          |          S r;   )r   rP   s     r8   rQ   z2JanusImageProcessor.preprocess.<locals>.<listcomp>/  s     <<<E.''<<<r:   r   zIt looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.c                 B    g | ]}                     |           S ))rC   r"   r$   rE   )r   )r6   rC   rE   r$   r@   r"   s     r8   rQ   z2JanusImageProcessor.preprocess.<locals>.<listcomp><  s>        %dXYjkk  r:   c                 @    g | ]}                     |           S ))rC   r>   rE   )pad_to_square)r6   rC   r>   rE   r@   s     r8   rQ   z2JanusImageProcessor.preprocess.<locals>.<listcomp>C  sI         ""%5&7 #    r:   c                 @    g | ]}                     |           S ))rC   scalerE   )rescale)r6   rC   rE   r&   r@   s     r8   rQ   z2JanusImageProcessor.preprocess.<locals>.<listcomp>M  s<        5Rcdd  r:   c                 B    g | ]}                     |           S )rC   meanstdrE   )	normalize)r6   rC   r(   r)   rE   r@   s     r8   rQ   z2JanusImageProcessor.preprocess.<locals>.<listcomp>S  s>        U^opp  r:   c                 4    g | ]}t          |           S )input_channel_dim)r   )r6   rC   rD   rE   s     r8   rQ   z2JanusImageProcessor.preprocess.<locals>.<listcomp>X  s7     
 
 
ej'{N_```
 
 
r:   r   datatensor_type)r!   r$   r%   r&   r'   r(   r)   r*   r+   r>   r"   r	   fetch_imagesr   r   rH   r   r   loggerwarning_oncer   r   )r@   rL   r!   r"   r$   r%   r&   r'   r(   r)   rM   r*   r>   r+   rD   rE   encoded_outputss   `  `` ` ``  ` `` r8   
preprocesszJanusImageProcessor.preprocess   s   L "+!6IIDN	'388#-#9ZZt
+9+E4K^'3'?||TEV#-#9ZZt
!*!6IIDN	+9+E4K^!-4;/?/K++QUQf'ttTYTU;;;""6**)&11F## 	:  
 	&!)%!		
 		
 		
 		
  	A@@@@@F =<V<<< 	/&)44 	s  
 $ >vay I I 	      #  F
  		      $  F  	     #  F
  	      #  F

 
 
 
 
nt
 
 
 '^V,DR`aaar:   r   c                 Z   t          ||          \  }}|t          j        k    r|j        d         n|j        d         }||k    r|t	          |||          n|}|S t          ||          }t          |t                    r|g}n&t          |          |k    rt          d| d          |t          j        k    ryt          j        |||f|j                  }	t          |          D ]\  }
}||	|
ddddf<   ||k    r||z
  dz  }||	dd|||z   ddf<   n||z
  dz  }||	dddd|||z   f<   nxt          j        |||f|j                  }	t          |          D ]\  }
}||	dddd|
f<   ||k    r||z
  dz  }||	|||z   ddddf<   n||z
  dz  }||	dd|||z   ddf<   |	S )a}  
        Pads an image to a square based on the longest edge.

        Args:
            image (`np.ndarray`):
                The image to pad.
            background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
                The color to use for the padding. Can be an integer for single channel or a
                tuple of integers representing for multi-channel images. If passed as integer
                in multi-channel mode, it will default to `0` in subsequent channels.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use same as the input image.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.

        Returns:
            `np.ndarray`: The padded image.
        r   Nz(background_color must have no more than z) elements to match the number of channels)dtype   )r   r   FIRSTshaper   rG   
isinstancer5   lenrH   npzerosrk   	enumerate)r@   rC   r>   rD   rE   r.   r/   num_channelsmax_dimresulticolorstarts                r8   rU   z!JanusImageProcessor.pad_to_square`  so   < 'u.?@@):>N>T)T)Tu{1~~Z_ZefhZiU?? * ,E;@QRRR 
 Lfe$$ &,, 	 01!""l22r<rrr    0 666X|Wg>ekRRRF%&677 ( (5"'q!!!QQQwv~~ 6)a/7<qqq%%&.0!!!344 5Q.6;qqq!!!UUU]2233Xw>ekRRRF%&677 ( (5"'qqq!!!Qwv~~ 6)a/7<uuv~-qqq!!!344 5Q.6;qqq%%%-/23r:   c	                 4   ||n| j         }|
d| j        z  n|}||n| j        }||n| j        }||n| j        }t          |          }t          |d         t          j        j                  rt          |          dk    r|n|d         S |t          |d                   }g }	|D ]}
t          |
          }
|r|                     |
|||          }
|rK|                     |
||          }
|
                    dd                              t           j                  }
|rC|rA|dk    r;t%          |
t&          j        |	          }
t          j                            |
          }
|	                    |
           d
|	i}|dk    r|nd}t/          ||          S )znApplies post-processing to the decoded image tokens by reversing transformations applied during preprocessing.Ng      ?r      )rC   r(   r)   rE   )rW   rE   r4   zPIL.Image.Imager_   r   ra   )r%   r&   r'   r(   r)   r   ro   PILImagerp   r   r   unnormalizerX   clipastyperq   uint8r   r   LAST	fromarrayappendr   )r@   rL   r%   r&   r'   r(   r)   rE   rM   r   rC   rb   s               r8   postprocesszJanusImageProcessor.postprocess  s    $.#9ZZt
6D6Lt222R`'3'?||TEV#-#9ZZt
!*!6IIDN	)&11fQi11 	< [[1__66&);$ >vay I I 	' 	'E"5))E ((J)_p )    <U.Teff

1c**11"(;; 3
 3~AR/R/R3E;K;Pduvvv	++E22&&&&-+9=N+N+NTX>BBBBr:   c                     d}t          |t                    r6t          |          |k    r"t          d| dt          |                     n|g|z  }t          |t                    r6t          |          |k    r"t          d| dt          |                     n|g|z  }t	          d t          ||          D                       }t	          d |D                       }|                     ||||          }|S )a~  
        Unnormalizes `image` using the mean and standard deviation specified by `mean` and `std`.
        image = (image * image_std) + image_mean
        Args:
            image (`torch.Tensor` of shape `(batch_size, num_channels, image_size, image_size)` or `(num_channels, image_size, image_size)`):
                Batch of pixel values to postprocess.
            image_mean (`float` or `Iterable[float]`):
                The mean to use for unnormalization.
            image_std (`float` or `Iterable[float]`):
                The standard deviation to use for unnormalization.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        r   zmean must have z$ elements if it is an iterable, got zstd must have c              3   (   K   | ]\  }}| |z  V  d S Nr;   )r6   r[   r\   s      r8   r9   z2JanusImageProcessor.unnormalize.<locals>.<genexpr>  s,      WWytSus{WWWWWWr:   c              3       K   | ]	}d |z  V  
dS )r{   Nr;   )r6   r\   s     r8   r9   z2JanusImageProcessor.unnormalize.<locals>.<genexpr>  s&      ;;#a#g;;;;;;r:   rZ   )ro   r   rp   rH   r?   zipr]   )r@   rC   r(   r)   rE   rt   rev_image_meanrev_image_stds           r8   r~   zJanusImageProcessor.unnormalize  s2   0 j(++ 	5:,.. !v<!v!vehisetet!v!vwww / %4Ji** 	39~~-- !t,!t!tdghqdrdr!t!tuuu . #l2IWWC
I<V<VWWWWW;;;;;;;n-Sd  
 
 r:   )r   NN)NNNNNNNr   )!__name__
__module____qualname____doc__model_input_namesr   BICUBICboolr   dictstrr5   r   floatlistr=   rq   ndarrayr   r   r   rm   r   r   r?   r|   r}   rh   rU   r   r   r~   __classcell__)rB   s   @r8   r   r   5   sM       % %N (( )-'9'A,3!:>9=)-!%"L "L"L tCH~&"L 	"L
 %"L "L c5j)"L "L U5$u+#567"L E%e"456"L !"L "L 
"L "L "L "L "L "LP (:'A>BDH? ?z? DcNC'(? %	?
 eC)9$9:;? $E#/?*?$@A? 
? ? ? ?B %$&& %))-15%)*.'+:>9=;?)-GK!%(8(>DH!Y YY D>Y tCH~&	Y
 -.Y TNY !Y tnY U5$u+#567Y E%e"456Y !sJ!78Y !Y #5eCcM.B)B#CDY Y &Y  $E#/?*?$@A!Y" 
#Y Y Y '&Y| >?>BDHH HzH  U3S=%9 9:H eC)9$9:;	H
 $E#/?*?$@AH 
H H H HZ &**.'+,0+/+/(,1C 1C1C TN1C !	1C
 tn1C T%[)1C DK(1C $C=1C !1C 1C 1C 1Cp EI+ +z+ %%01+ /0	+
 $E#/?*?$@A+ 
+ + + + + + + +r:   r   )'collections.abcr   typingr   r   numpyrq   image_processing_utilsr   r   r	   image_transformsr
   r   r   image_utilsr   r   r   r   r   r   r   r   r   r   r   r   utilsr   r   r   r   r|   
get_loggerr   re   r   __all__r;   r:   r8   <module>r      s  , % $ $ $ $ $ " " " " " " " "     U U U U U U U U U U S S S S S S S S S S                            _ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^  JJJ 
	H	%	%S S S S S, S S Sl !
!r:   