
     `i:                     T   d dl mZmZ d dlZd dlZd dlmZ ddl	m
Z
mZ ddlmZmZmZmZ ddlmZmZ ddlmZmZ dd	lmZ dd
lmZmZmZ  ej        e          ZdZ dZ!dZ"dZ#dZ$de%d         de%e&         fdZ'de&de(e&e&f         fdZ) G d de          Z* G d de          Z+dgZ,dS )    )OptionalUnionN)
functional   )BatchFeatureget_size_dict)IMAGENET_STANDARD_MEANIMAGENET_STANDARD_STDPILImageResamplingSizeDict)UnpackVideosKwargs)
TensorTypelogging)BaseVideoProcessor)VideoMetadatagroup_videos_by_shapereorder_videoszYou are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.zgYou are provided the following series of {frame_count} frames from a {video_duration} [H:MM:SS] video.
z

z
Frame from {timestamp}:i   videostorch.Tensorreturnc                     t          d          x}}| D ]A}|                                dd         \  }}t          ||          }t          ||          }B||fS )zH
    Get the maximum height and width across all videos in a batch.
    z-infN)floatsizemax)r   
max_height	max_widthvideoheightwidths         /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/smolvlm/video_processing_smolvlm.pyget_max_height_widthr#   )   sh     #6]]*J * *

RSS),,
y))			""    resolution_max_sidec                    |                                  dd         \  }}t          t          |          }|t          ||          n|}||z  }||k    r#|}t	          ||z            }|dz  dk    r|dz  }n(||k    r"|}t	          ||z            }|dz  dk    r|dz  }t          |d          }t          |d          }||fS )a  
    Get the output size of the video after resizing given a dictionary specifying the max and min sizes.
    Args:
        video (`np.ndarray`):
            Video to resize.
        resolution_max_side (`int`):
            The longest edge of the video will be resized to this value. The shortest edge will be resized to keep the
            input aspect ratio.
    Returns:
        The output size of the video after resizing.
    r   N   r      )r   minMAX_IMAGE_SIZEr   int)r   r%   r    r!   aspect_ratios        r"   get_resize_output_image_sizer-   5   s     JJLL%MFE n.ABB0C0K#fe,,,Qd6>L#U\)**A:??aKF	%$F\)**19>>QJE^^FqMME5=r$   c                   L    e Zd ZU eeeef                  ed<   ee         ed<   dS )SmolVLMVideoProcessorInitKwargsmax_image_sizedo_padN)	__name__
__module____qualname__r   dictstrr+   __annotations__bool r$   r"   r/   r/   ]   s;         T#s(^,,,,TNr$   r/   c                       e Zd Zej        ZddiZddiZeZ	e
ZdZdZdZdZdZdZeZddgZdee         f fd	Z	 	 d*dddeded         deddf
dZ	 	 d+dddeeef         dededef
dZ	 	 	 d,dedee         deeee f                  dee         fdZ!	 d-de"d         d ed!ededed         d"ed#e d$ed%ed&eee e"e          f                  d'eee e"e          f                  d(eee#e$f                  fd)Z% xZ&S ).SmolVLMVideoProcessorlongest_edgei  il  TFpixel_valuespixel_attention_maskkwargsc                     t                      j        di | d|v rd|v r|d         |d         d<   d|v rO|d         d         | _        |d         d         | _        t	          |d         d         | j                  | _        d S d S )Nr   video_sampling
video_size
max_framesfps)default_to_squarer9   )super__init__
num_framesrD   r   rE   r   )selfr?   	__class__s     r"   rG   zSmolVLMVideoProcessor.__init__q   s    ""6""" V 0F : :5;F^F#$\2v%%$%56|DDO./6DH%f-=&>|&L`d`vwwwDIII &%r$   Nr   r   r   interpolationzF.InterpolationMode	antialiasr   c                    ||nt           j        j        }|t           j        j        k    r+t                              d           t           j        j        }|j        rt          ||j                  }n0|j	        r|j
        r|j	        |j
        f}nt          d| d          t          j        ||||          }| j        d         | j        d         f}t          j        ||||          }|S )a9  
        Resize an video to `(size["height"], size["width"])`.
        Args:
            video (`torch.Tensor`):
                Video to resize.
            size (`SizeDict`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output video.
            resample (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
                `InterpolationMode` filter to use when resizing the video e.g. `InterpolationMode.BICUBIC`.
        Returns:
            `torch.Tensor`: The resized video.
        NzYou have used fast image processor with LANCZOS resample which not yet supported for torch.Tensor. BICUBIC resample will be used as an alternative. Please fall back to image processor if you want full consistency with the original model.)r%   zHSize must contain 'height' and 'width' keys, or 'longest_edge' key. Got .)rK   rL   r<   )FInterpolationModeBILINEARLANCZOSloggerwarning_onceBICUBICr<   r-   r    r!   
ValueErrorresizer0   )rI   r   r   rK   rL   r?   new_sizemax_sizes           r"   rW   zSmolVLMVideoProcessor.resize}   s   ( *7)BH[HdA/777A  
 /7M 
	q 4$($5  HH [ 	qTZ 	qTZ0HHohlooopppQZ[[[ &~68KN8[[QZ[[[r$   r   padded_sizemax_num_framesfillreturn_pixel_maskc                    |                                 dd         }|d         |d         z
  }|d         |d         z
  }||j        d         z
  }	|dk     s|dk     rt          d| d| d          ||k    r!d|d|ddd|	g}
t          j        ||
|          }d}|rFt          j        |d	dddddf         t
          j        
          }d|d	d|d         d|d         f<   ||fS )a"  Pads the sample with empty video to the padded_size
        Args:
            video (`torch.Tensor`):
                Video to pad.
            padded_size (`tuple[int, int]`):
                Height and width to pad.
            max_num_frames (`int`):
                The maximum number of frames to which video will be padded.
            fill (`int`, *optional*):
                The value to use for the padding.
            return_pixel_mask (`bool`, *optional*, defaults to `True`):
                Whether to return a pixel mask.
        r   Nr   r(   zzPadding dimensions are negative. Please make sure that the padded size is larger than the original size. Got padded size: z, original size: rN   )r\   .dtype)r   shaperV   rO   padtorch
zeros_likeint64)rI   r   rZ   r[   r\   r]   original_sizepadding_heightpadding_widthpadding_framepadding
pixel_masks               r"   rb   zSmolVLMVideoProcessor.pad   sH   * 

RSS)$Q-*::#Aq)99&Q71 2 2b3>b bQ^b b b   K''-NAq!]SGE%t444E 
 	H)%Q111*=U[QQQJFGJs.mA..0B-2B0BBCj  r$   r(   metadatarH   rD   	skip_secsc                 j   |t          |dd          t          d          ||n| j        }||n| j        }|j        }t          t          ||d         z                      }t          ||          }|dk     rd}d}	|dz
  }
|dk    rH|d         d|z  z
  ||z  k    r3t          ||d         z            }	t          |||d         z  z
            }
t          d|	          }	t          |
|dz
            }
|	|
k    rd|dz
  }
}	t          j
        |	|
|t
                    }t          j        |          }|S )	a!  
        Video sampling function which:
            - Uses `num_frames` (if provided) or calculates it from `fps` and metadata.
            - Applies a basic center-skip if fewer frames than available, otherwise
                optionally skips `skip_secs` from both the start and end.
            - Uniformly samples the desired number of frames between the start and end indices.

        Args:
            metadata (`VideoMetadata`):
                Metadata of the video containing information about total duration, fps and total number of frames.
            num_frames (`int`, *optional*):
                Maximum number of frames to sample. Defaults to `self.num_frames`.
            fps (`int` or `float`, *optional*):
                Target frames to sample per second. Defaults to `self.fps`.
            skip_secs (`float`, *optional*, defaults to `1`):
                Number of seconds to skip from the start and end if the video is long enough.

        Returns:
            np.ndarray:
                Indices to sample video frames.
        NrD   zAsked to sample frames per second but no video metadata was provided which is required when sampling in SmolVLM. Please pass in `VideoMetadata` object or set `do_sample_frames=False`durationr(   r   r'   r_   )getattrrV   rH   rD   total_num_framesr+   roundr)   r   nplinspaceunique)rI   rl   rH   rD   rm   r?   rq   estimated_framesdesired_frames	start_idxend_idxindicess               r"   sample_framesz#SmolVLMVideoProcessor.sample_frames   su   : wx==EX  
 $.#9ZZt
_cc$(#4 uS8J+?%?@@AA -z::AN 	"Q&q==hz2Q]BzTWGWXXI788I*Y%-HHIIG9%%	g/!344!"$4q$8wI+i.LLL)G$$r$   r   do_convert_rgb	do_resize
do_rescalerescale_factordo_normalizer1   
image_mean	image_stdreturn_tensorsc           	         t          |          \  }}i }|                                D ];\  }}|r|                     |          }|r|                     |||          }|||<   <t	          ||          }t          |          \  }}i }|                                D ]$\  }}|                     |||||
|          }|||<   %t	          ||          }|	rt          |          }t          d |D                       }t          |          \  }}i }i }|                                D ]*\  }}|                     |||          \  }}|||<   |||<   +t	          ||          }t	          ||          }|rt          j
        |d          n|}d|i}|	r|	r|t          j
        |d          n||d<   t          ||          S )	N)r   rK   c              3   4   K   | ]}t          |          V  d S N)len).0r   s     r"   	<genexpr>z4SmolVLMVideoProcessor._preprocess.<locals>.<genexpr>@  s(       J JU J J J J J Jr$   )rZ   r[   r   )dimr=   r>   )tensor_type)r   itemsconvert_to_rgbrW   r   rescale_and_normalizer#   r   rb   rc   stackr   )rI   r   r|   r}   r   rK   r~   r   r   r1   r   r   r   r?   grouped_videosgrouped_videos_indexresized_videos_groupedra   stacked_videosresized_videosprocessed_videos_groupedprocessed_videospad_sizer[   processed_padded_mask_groupedpadded_masksr>   datas                               r"   _preprocessz!SmolVLMVideoProcessor._preprocess  sq     0EV/L/L,,!#%3%9%9%;%; 	; 	;!E> E!%!4!4^!D!D e!%^$Vc!d!d,:"5))'(>@TUU/D^/T/T,,#% %3%9%9%;%; 	= 	=!E>!77
NL*V_ N /=$U++)*BDXYY 	g+,<==H  J J9I J J JJJN3HIY3Z3Z0N0,.)')$)7)=)=)?)? D D%~/3xx" 08 0 0, 3A(/7C-e44-.FH\]]#12OQe#f#f CQg5;'7Q????Wg 01 	 *,8 0a8888) '(
 Dn====r$   )NT)r   T)NNr(   r   )'r2   r3   r4   r   rR   resampler   r0   r	   r   r
   r   r}   r~   r   r|   r1   do_sample_framesr/   valid_kwargsmodel_input_namesr   rG   r   r   r8   rW   tupler+   rb   r   r   r   r{   listr6   r   r   __classcell__)rJ   s   @r"   r;   r;   b   s       !)HG$D$c*N'J%IIJLNF2L')?@
x(G!H 
x 
x 
x 
x 
x 
x  :>/ // /   56	/
 / 
/ / / /l "&)! )!)! 38_)! 	)!
 )!  )! )! )! )!\ %)+/#$? ?? SM? eCJ'(	?
 C=? ? ? ?\ <@>> >>^$>> >> 	>>
 >>   56>> >> >> >> >> U5$u+#567>> E%e"456>> !sJ!78>> >> >> >> >> >> >> >>r$   r;   )-typingr   r   numpyrs   rc   torchvision.transforms.v2r   rO   image_processing_utilsr   r   image_utilsr	   r
   r   r   processing_utilsr   r   utilsr   r   video_processing_utilsr   video_utilsr   r   r   
get_loggerr2   rS   DEFAULT_SYSTEM_MESSAGEDEFAULT_VIDEO_INTRODEFAULT_MEDIA_OUTTROFRAME_TIMESTAMP_MESSAGEr*   r   r+   r#   r   r-   r/   r;   __all__r9   r$   r"   <module>r      s    # " " " " " " "      5 5 5 5 5 5 A A A A A A A A f f f f f f f f f f f f 4 4 4 4 4 4 4 4 ( ( ( ( ( ( ( ( 8 8 8 8 8 8 O O O O O O O O O O 
	H	%	% V n   5 	#n!5 	#$s) 	# 	# 	# 	#%% 38_% % % %P    l   
v> v> v> v> v>. v> v> v>r #
#r$   