
    Pi!                         d dl Z d dlmZmZmZmZmZ d dlZd dlZd dl	m
Z
 d dlmZmZ d dlmZ d dlmZ d dlmZ  e j        e          Z G d d	          ZdS )
    N)AnyListMappingOptionalTuple)Image)find_supported_resolutionsget_canvas_best_fit)resize_with_pad)	tile_crop)
functionalc                      e Zd ZdZdddddej        ddddeee                  d	eee                  d
eee	e
e
f                           de
dee
         dej        dededdfdZ	 ddeeef         dedeeef         fdZdS )CLIPImageTransforma	  
    This class accepts images of any size and dynamically resizes, pads, normalizes and tiles it
    based on the image aspect ratio and the number of image tiles we allow.

    The algorithm will NOT distort the image to fit a certain aspect ratio, because
    that leads to a significant degradation in image quality.

    The user can choose if they want to allow upscaling by using the flag ``resize_to_max_canvas``.

    For example, if an input image is of size 300x800, and we want to allow
    a maximum of 16 image tiles, with side 224px, then:

    If ``resize_to_max_canvas=False``, then:
    best_resolution = (448, 896) -> smallest canvas, up to 16 tiles, that doesn't require downscaling
    image is NOT resized
    image is padded (300, 800) -> 448,896
    Image is tiled 2x4, for a final output shape of (8, 3, 224, 224)

    If ``resize_to_max_canvas=True``, then:
    best_resolution = (448, 1344) # canvas that allows maximum upscaling, with minimum padding, up to 16 tiles
    image is resized without distortion (300,800) -> (448, 1194) #448 is the limiting side for the resize
    image is padded (448, 1194) -> (448, 1344)
    Image is tiled 2x5, for a final output shape of (10, 3, 224, 224)

    Args:
        image_mean (Optional[List[float]]): Mean values of each channel, used for normalization.
            Should be the same used for the pre-trained model. If None, no normalization is performed. Default None.
        image_std (Optional[List[float]]): Standard deviation values of each channel, used for normalization.
            Should be the same used for the pre-trained model. If None, no normalization is performed. Default None.
        possible_resolutions (Optional[List[Tuple[int, int]]]): List of possible resolutions as tuples (height, width).
            where each tuple represents a possible canvas to fit the image into when calling ``get_canvas_best_fit``.
            If None, this will be calculated using max_num_tiles and tile_size. Default None.
        tile_size (int): Size of the tiles to divide the image into. Default 224.
        max_num_tiles (Optional[int]): Only used if possible_resolutions is NOT given.
            Maximum number of tiles to break an image into.
            This will be used to generate possible_resolutions,
            e.g. [(224, 224), (224, 448), (448, 224)] if max_num_tiles = 2 and tile_size = 224.
            Default 4.
        dtype (torch.dtype): Data type of the output image. Default torch.bfloat16.
        resample (str): Resampling method used when resizing images. Supports any enum of
            ``torchvision.transforms.InterpolationMode``, e.g. "nearest", "nearest_exact", "bilinear", "bicubic".
            Default 'bilinear'.
        resize_to_max_canvas (bool): "If True, the image will be upscaled without distortion to fit the largest possible
            resolution from possible_resolutions.
            If False, it will pick the resolution that minimizes downscaling, including no downscaling at all.
            In this case, the image will only be upscaled if it's size < tile_size. Default False.

    Examples:
        >>> image_transform = CLIPImageTransform(
        ...    image_mean=None,
        ...    image_std=None,
        ...    tile_size=224,
        ...    possible_resolutions=None,
        ...    max_num_tiles=4,
        ...    resample="bilinear",
        ...    resize_to_max_canvas=True,
        ...)
        >>> # create random image
        >>> image = (np.random.rand(100,200,3) * 255).astype(np.uint8)
        >>> image = PIL.Image.fromarray(image)
        >>> output = image_transform(image)
        >>> output['image'].shape # [num_tiles, num_channels, tile_size, tile_size]
        torch.Size([2, 3, 224, 224])
        >>> output['ar'] # image best fits the canvas 224x448
        torch.tensor([1,2])
    N      bilinearF)
image_mean	image_stdpossible_resolutions	tile_sizemax_num_tilesdtyperesampleresize_to_max_canvasr   r   r   r   r   r   r   r   returnc                   ||J d|d|            |s|rt          ||          }n|}t          j        |                              dd          | _        t
                              d| j         d           || _        |d u |d u k    sJ d|d	|            || _        || _	        |rd n|| _
        || _        t          j        j        |                                         | _        || _        t$          | _        d S )
NzUEither possible_resolutions or max_num_tiles must be given. Got possible_resolutions=z and max_num_tiles=)r   r      zFound possible_resolutions: z4. Will fit the images into the canvas with best fit.zINeed to provide both or none of image_mean and image_std. Got image_mean=z and image_std=)r	   torchtensorreshaper   loggerdebugr   meanstdmax_sizer   torchvision
transformsInterpolationModeupperr   r   r   )	selfr   r   r   r   r   r   r   r   s	            t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/models/clip/_transform.py__init__zCLIPImageTransform.__init__^   s[    !,0I0IzNbzzjwzz 1J0II $ 	8 	8#=+y$ $ $   $8 $)L1E$F$F$N$NrST$U$U!z4+Dzzz	
 	
 	
 %9! d"
 
 
jJjj^gjj
 
 
 	 !5C)
#.@AQAQR #"    sample	inferencec                    |d         }t          |t          j                  s
J d            |j        dk    r|                    d          }t	          j        |          }t	          j        || j        d          }t          || j	        | j
                  }t          ||| j        | j                  }| j        r!t	          j        || j        | j                  }|                     || j        	          }t'          j        |                              d
          | j        z  }|                    ||d           |S )a  
        Apply image decoding and transformations to the "image" field in the sample.

        Args:
            sample (Mapping[str, Any]): A sample with an "image" field containing
                a List[Message] to tokenize
            inference (bool): Whether the template is being used for inference or not.

        Returns:
            Mapping[str, Any]: The sample with an updated "image" filed and added
                "aspect_ratio" field.
        imagez Input image must be a PIL image.RGBT)r   scale)r2   r   r   )r2   target_sizer   r&   )r$   r%   )r2   r   r   )r2   aspect_ratio)
isinstancer   modeconvertFto_imageto_dtyper   r
   r   r   r   r   r&   r$   	normalizer%   r   r   r   r    r!   update)r+   r/   r0   r2   best_resolutionr6   s         r,   __call__zCLIPImageTransform.__call__   sQ    w%--QQ/QQQ- :MM%((E
5!!
5
$??? .!%!:!%!:
 
 
  ']]	
 
 
 9 	EKDI48DDDE UdnEE|O44<<R@@DNR , 	
 	
 	
 r.   )F)__name__
__module____qualname____doc__r   bfloat16r   r   floatr   intr   strboolr-   r   r   r@    r.   r,   r   r      s)       A AL -1+/@D'("^"%*/# /# /# T%[)/# DK(	/#
 'tE#s(O'<=/# /#  }/# {/# /# #/# 
/# /# /# /#d <A7 7c3h'7487	c	7 7 7 7 7 7r.   r   )loggingtypingr   r   r   r   r   r   r'   PILr   =torchtune.modules.transforms.vision_utils.get_canvas_best_fitr	   r
   9torchtune.modules.transforms.vision_utils.resize_with_padr   3torchtune.modules.transforms.vision_utils.tile_cropr   torchvision.transforms.v2r   r:   	getLoggerrA   r"   r   rJ   r.   r,   <module>rS      s    6 6 6 6 6 6 6 6 6 6 6 6 6 6                   V U U U U U I I I I I I 5 5 5 5 5 5		8	$	$l l l l l l l l l lr.   