
    .`i-k                        d dl Z d dlmZ d dlmZ d dlmZmZmZ d dl	Z
d dlmZ erd dlZd dlmZ d dlmZ  ee          Zdej        deeef         d	ej        fd
Zdej        ded	ej        fdZdej        ded	ej        fdZ G d d          Z e            Ze                    d           G d de                      Ze                    d           G d de                      Ze                    d           G d de                      Z e                    d           G d de                      Z!dS )    N)abstractmethod)BytesIO)TYPE_CHECKINGAnycast)init_logger)ExtensionManagerframessizereturnc                     | j         \  }}}}|\  }}t          j        ||||f| j                  }dd l}t          |           D ]\  }	}
 |j        |
||f          }|||	<   |S )Ndtyper   )shapenpemptyr   cv2	enumerateresize)r
   r   
num_frames_channels
new_height	new_widthresized_framesr   iframeresized_frames               i/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/multimodal/video.pyresize_videor       s    !'J1h J	X	ZH5V\  N JJJf%% * *5"
59j*ABB)q    size_factorc                     | j         \  }}}}t          ||z            }t          ||z            }t          | ||f          S N)r   intr    )r
   r"   r   heightwidthr   r   s          r   rescale_video_sizer(   #   sK     ,AvuaVk)**JEK'((IY 7888r!   r   c                     | j         d         }|dk    r| S t          j        d|dz
  |t                    }| |df         }|S )Nr      r   .)r   r   linspacer%   )r
   r   total_framesframe_indicessampled_framess        r   sample_frames_from_videor0   +   sO    <?LRK<!#3ZsKKKMM3./Nr!   c                      e Zd Zee	 ddededeej	        e
eef         f         fd                        Zededee         de
eef         d	edef
d
            Zedddee         d	edeej	        ee         e
eef         f         fd            Zedee         dededeej	        eee         f         fd            ZdS )VideoLoaderr*   datar   r   c                     t           r$   )NotImplementedErrorclsr3   r   kwargss       r   
load_byteszVideoLoader.load_bytes6   s
    
 "!r!   idxfailed_framesnext_target_mapr-   c                 R    |sdS |d         }|                     ||          }| |k     S )z;Check if current frame can recover the oldest failed frame.Fr   )get)r:   r;   r<   r-   oldest_failedlimits         r   _can_use_for_recoveryz!VideoLoader._can_use_for_recovery=   s:      	5%a(##M<@@U{r!   capzcv2.VideoCapturer.   c                 .   ddl }t          |                     |j                            }t          |                     |j                            }|dk    r|dk    sJ d| d|             t          |          }|r|d         nd}i }t          t          |          dz
            D ]}	||	dz            |||	         <   |||d         <   g }
g }g }i }d}t          |dz             D ]L}||v }|                                 }|s3|r0t          
                    d|           |                    |           Pt                              ||||          }|s|r|                                 \  }}|r||j        dk    r |j        ||j                  }|
                    |           |                    |           |dz  }|r:|                    d          }|||<   t                              d||||z
             |r0t          
                    d	|           |                    |           N|D ]}t          
                    d
|           |
rt)          j        |
          }n$t)          j        d||dft(          j                  }|||fS )a  
        Read frames with dynamic window forward-scan recovery.

        When a target frame fails to load, the next successfully grabbed
        frame (before the next target frame) will be used to recover it.

        Args:
            cap: OpenCV VideoCapture object
            frame_indices: Sorted list of target frame indices to load
            total_frames: Total number of frames in the video

        Returns:
            Tuple of (frames_array, valid_frame_indices, recovered_map)
            - frames_array: Array of loaded frames
            - valid_frame_indices: List of frame indices that were loaded
            - recovered_map: Dict mapping recovered_idx -> source_idx
        r   Nz Invalid video frame size: width=z	, height=r*   r+   z-Failed to grab frame %d during video loading.z-Recovered frame %d using frame %d (delay: %d)z1Failed to retrieve frame %d during video loading.z/Frame %d could not be recovered (end of video).   r   )r   r%   r>   CAP_PROP_FRAME_WIDTHCAP_PROP_FRAME_HEIGHTsetrangelengrabloggerwarningappendr2   rA   retriever   cvtColorCOLOR_BGR2RGBpopinfor   stackr   uint8)rB   r.   r-   r   r'   r&   frame_idx_setmax_frame_idxr<   kframes_listvalid_frame_indicesfailed_frames_idxrecovered_mapr   r:   is_target_frameokcan_recoverretr   	rgb_framerecovered_idx
failed_idxr
   s                            r   _read_frames_with_recoveryz&VideoLoader._read_frames_with_recoveryK   s*   . 	


CGGC45566SWWS67788qyyVaZZZGuGGvGG (Z' M**-:Ab)) +-s=))A-.. 	E 	EA0=a!e0DOM!,---9b)*)+)+')(**++ +	2 +	2C!]2O B " 2NNG   &,,S111 &;;& K  2+ 2 \\^^
U 25,a ,UC4E F FI&&y111'..s333FA" (9(=(=a(@(@7:m4K)-/	   % 2NNK   &,,S111 , 	 	JNNA     	EXk**FFXq&%328DDDF*M99r!   num_expected_framesrV   c                    dd l }t          |                     |j                            }t          |                     |j                            }t          j        |||dft
          j                  }d}g }	t          |dz             D ]}
| 	                                }|s |
|v rt                              d|
           8|
|v rh|                                 \  }}|r4 |j        ||j                  ||<   |	                    |
           |dz  }t                              d|
           t!          |	          }||k     r t                              d||z
  ||           |d |         ||	fS )Nr   rD   r   r+   zIFailed to grab frame %d during video loading. This frame will be skipped.zMFailed to retrieve frame %d during video loading. This frame will be skipped.zgVideo loading completed with %d broken/unreadable frames. Expected %d frames but only loaded %d frames.)r   r%   r>   rE   rF   r   r   rT   rH   rJ   rK   rL   rN   rO   rP   rM   rI   )rB   r.   rd   rV   r   r'   r&   r
   r   rY   r:   r]   r_   r   valid_num_framess                  r   _read_frameszVideoLoader._read_frames   s    	


CGGC45566SWWS67788.qARRR *++ 	 	CB -''NN6  
 m## \\^^
U 
 ,UC4E F FF1I'..s333FAA NN6   233111NN@#&66#    '''(*:<OOOr!   Nr*   )__name__
__module____qualname__classmethodr   bytesr%   tuplenptNDArraydictstrr   r9   staticmethodlistboolrA   rc   rG   rg    r!   r   r2   r2   5   s       ,." ""&)"	s{DcN*	+" " " ^ ["
 Cy c3h 	
 
   \ h:h:Cyh: h: 
s{DItCH~5	6	h: h: h: \h:T 1P3x1P !1P 	1P
 
s{Cc*	+1P 1P 1P \1P 1P 1Pr!   r2   identityc                   N    e Zd ZdZe	 d	dedededeeef         fd            Z	dS )
IdentityVideoLoaderaJ  IdentityVideoLoader returns raw video bytes without decoding.

    This allows the model processor to handle video decoding and
    is required for models like Kimi-K2.5 that need custom video chunk splitting.

    NOTE: This is temporary for Kimi-K2.5 testing. Remember to change back
    to opencv before release if needed.
    r*   r3   r   r8   r   c                 
    |d fS r$   rv   r6   s       r   r9   zIdentityVideoLoader.load_bytes   s     Tzr!   Nrh   )
ri   rj   rk   __doc__rl   rm   r%   r   rn   r9   rv   r!   r   ry   ry      sv              	
 
sCx   [  r!   ry   opencvc                   x    e Zd Zd Ze	 	 	 	 ddedededed	ed
ee	j
        eeef         f         fd            ZdS )OpenCVVideoBackendc                     dd l m} d }|                                D ][}|                    |          s|                    |          s,|                    |          \  }}}|dk     s|dk    r|dk     rY|} |S Nr   r+      cv2.videoio_registryvideoio_registrygetStreamBufferedBackends
hasBackendisBackendBuiltIn%getStreamBufferedBackendPluginVersionselfvrapi_prefbackendr   abiapis          r   get_cv2_video_apiz$OpenCVVideoBackend.get_cv2_video_api      ))))))3355 	 	G==)) &&w//  FFwOO377saxxC!GGHr!   r*   ,  Fr3   r   fpsmax_durationframe_recoveryr   c                    ddl } |                                             } |j        t          |          |g           }	|	                                st          d          t          |	                    |j                            }
|	                    |j	                  }|dk    r|
|z  nd}|
}|dk    rt          ||
          }|dk    r%t          |t          j        ||z                      }t          d|          }||
k    rt          t          d|                    }n4t!          j        d|
dz
  |t                    }|                                }|rU|                     |	||
          \  }}}t)          |          }|r(t*                              dt)          |                     n8t/          |          }|                     |	||t          |                    \  }}}|
||d|||
k    d}||fS )	a  
        Load video frames from bytes.

        Args:
            data: Raw video bytes
            num_frames: Target number of frames to sample (-1 for all)
            fps: Target FPS for sampling (-1 for original)
            max_duration: Maximum duration (unused in base backend)
            frame_recovery: Enable forward-scan recovery for failed frames

        Returns:
            Tuple of (frames_array, metadata_dict)
        r   NCould not open video streamr+   r   7Frame recovery: %d frames recovered using forward scan.r|   total_num_framesr   durationvideo_backendframes_indicesdo_sample_frames)r   r   VideoCapturer   isOpened
ValueErrorr%   r>   CAP_PROP_FRAME_COUNTCAP_PROP_FPSminmathfloormaxrt   rH   r   r,   tolistrc   rI   rK   rR   rG   rg   )r7   r3   r   r   r   r   r8   r   r   rB   total_frames_numoriginal_fpsr   num_frames_to_sample	frame_idxuniform_sampled_framesr
   rY   r[   rf   rU   metadatas                         r   r9   zOpenCVVideoBackend.load_bytes  sA   . 	


#%%))++cwt}}gr::||~~ 	<:;;;swws'?@@AAwws/006BQ6F6F#l22A  0>>#&z3C#D#D 77#&';TZSV=W=W#X#X "1&:;;#333U1&:;;<<II%'[#a')=S& & &" /5577I 	9<9W9WY 0: :6F'  ##677 M&&  
  	NNM<?<L<L]$8#i..= =9F$&9 !1 %1 !14D D	
 	
 xr!   N)r*   r*   r   F)ri   rj   rk   r   rl   rm   r%   ru   rn   ro   rp   rq   rr   r   r9   rv   r!   r   r~   r~     s            $Q  Q Q  Q  	Q 
 Q  Q  
s{DcN*	+Q  Q  Q  [Q  Q  Q r!   r~   opencv_dynamicc                   r    e Zd Ze	 	 	 	 ddedededed	ed
eej	        e
eef         f         fd            ZdS )OpenCVDynamicVideoBackendr*   r   r   Fr3   r   r   r   r   r   c                 Z   ddl } |                                             } |j        t          |          |g           }	|	                                st          d          t          |	                    |j                            }
|	                    |j	                  dk    r|
z  nd}|
dz
  |pt          z            dz   }||k    rOt          t          j        |z                      }t          fdt          |          D                       }nit          |z            }||
k    rt          t          |
                    }n4t!          j        d||d          }t          fd|D                       }|rU|                     |	||
          \  }}}t'          |          }|r(t(                              d	t'          |                     n;t-          |          }|                     |	|t'          |          |
dz
            \  }}}|
|d
|dd}||fS )a  
        Load video frames with dynamic sampling based on duration.

        Args:
            data: Raw video bytes
            num_frames: Not used in dynamic backend
            fps: Target FPS for sampling (default: 2)
            max_duration: Maximum video duration to process (default: 300s)
            frame_recovery: Enable forward-scan recovery for failed frames

        Returns:
            Tuple of (frames_array, metadata_dict)
        r   Nr   r+   c                 z    h | ]7}t          t          t          j        |z  z                                8S rv   r   r%   r   ceil).0r   r   rV   r   s     r   	<setcomp>z7OpenCVDynamicVideoBackend.load_bytes.<locals>.<setcomp>  sO        s49Q5E5K+L+L'M'MNN  r!   T)endpointc                 t    h | ]4}t          t          t          j        |z                                5S rv   r   )r   trV   r   s     r   r   z7OpenCVDynamicVideoBackend.load_bytes.<locals>.<setcomp>  sJ        M3ty\9I/J/J+K+KLL  r!   r   r   Fr   )r   r   r   r   r   r   r%   r>   r   r   roundr   r   sortedrH   rt   r   r,   rc   rI   rK   rR   rG   rg   )r7   r3   r   r   r   r   r8   r   r   rB   r   r   nframe_indices_listnum_samplestarget_secondsr
   rY   r[   rf   frame_indices_setr   rV   r   s      `                  @@r   r9   z$OpenCVDynamicVideoBackend.load_bytesk  s   . 	


#%%))++cwt}}gr::||~~ 	<:;;;swws'?@@AAwws/006BQ6F6F#l22A )1,Fu]\%ABBQF
 |##DJx#~..//A!'     "1XX  " " lS011K...%)%0@*A*A%B%B""!#Q+PT!U!U!U%+    !/  & &"  	9<9W9W')9: :6F'  ##677 M&&  
 !$$6 7 7<?<L<L&,>(?(?AQTUAU= =9F$&9 !1 -1 %
 
 xr!   N)r*   r   r   F)ri   rj   rk   rl   rm   r%   ru   rn   ro   rp   rq   rr   r   r9   rv   r!   r   r   r   i  s         $X  X X  X  	X 
 X  X  
s{DcN*	+X  X  X  [X  X  X r!   r   molmo2c                   4   e Zd Zd Ze	 ddedededee         fd            Zededed	ed
e	dee         dedz  fd            Z
ededz  d	edededeedz  ej        f         f
d            Ze	 ddeded
e	dedz  dee         dz  dej        fd            Zedededed
e	dedededej        fd            Ze	 	 	 	 dded
e	dz  dedededeej        ee	ef         f         fd            Ze	 ddededeej        ee	ef         f         fd            ZdS )Molmo2VideoBackendc                     dd l m} d }|                                D ][}|                    |          s|                    |          s,|                    |          \  }}}|dk     s|dk    r|dk     rY|} |S r   r   r   s          r   r   z$Molmo2VideoBackend.get_cv2_video_api  r   r!          @	video_fpssampling_fpsmax_fpsr   c                    t          |          }t          |          }t          |          }|t          d          |dk    s|dk    rt          d| d| d          ||z  dk    rt          d| d| d	          g }t          ||d
z   |          D ]5}||k    r n,||z  dk    r"|                    t	          |                     6|S )a  
        Return the subset of `video_fps` factors that remain multiples
        of `sampling_fps`.

        Examples:
            >>> get_candidate_target_fps(video_fps=6, sampling_fps=2)
            [2, 6]
            >>> get_candidate_target_fps(video_fps=5, sampling_fps=1)
            [1, 5]
            >>> get_candidate_target_fps(video_fps=2, sampling_fps=2)
            [2]
            >>> get_candidate_target_fps(video_fps=5, sampling_fps=2)
            Traceback (most recent call last):
                ...
            ValueError: sampling_fps=2 must divide video_fps=5 to produce
                consistent frame steps.
        Nzsampling_fps must be providedr   z1video_fps and sampling_fps must be positive (got z, )zsampling_fps=z must divide video_fps=.r+   )r%   r   rH   rM   float)r7   r   r   r   
candidates	candidates         r   get_candidate_target_fpsz+Molmo2VideoBackend.get_candidate_target_fps  s(   0 	NN	<((g,,<===>>\Q..5!5 5%15 5 5   |#q((QQQYQQQ   
|Y]LII 	4 	4I7""9$))!!%	"2"2333r!   
max_framesr-   frame_sample_modecandidate_target_fpsNc                     d}d}|D ]d}t          t          ||z            d          }	t          ||	z            }
|dk    rd|v r|
|k    r n|}|
}K||
k    sJ |
|k    rZ|
|k    r|}|
}e|S )z]
        Get the target fps that best spans the videoand has the most frames sampled
        r   Nr+   uniform)r   r%   )r7   r   r   r-   r   r   num_frames_sampledselected_target_fps
target_fps	step_sizenum_frames_sampled_at_fpss              r   get_target_fpsz!Molmo2VideoBackend.get_target_fps	  s     ". 	C 	CJC	J 677;;I(+L9,D(E(E%!Q&&!2221J>>E&0#%>"" *-FFFFF,z99.1CCC +5')B&""r!   r   c                     |t          j        d||dt                    }n6t          t          ||z            d          }t          j        d||          }t          |          |k    r
|d |         }||fS )Nr   F)r   r   r+   )r   r,   r%   r   arangerI   )r7   r   r-   r   r   r.   r   s          r   get_frame_times_and_chosen_fpsz1Molmo2VideoBackend.get_frame_times_and_chosen_fps1  s     &K<e3  MM C	,? ?@@!DDIIayAAM}
**)+:+6M"M11r!   r   c                    |dk    rL|J |d         }|dd          D ]}||z  |k     r n|}t          j        d|          |z  }	|	|	|k              }	|	S |dk    r|{|dz
  |z  }
|
|k     r$t          j        d||dt           j                  }	nlt          j        d|d|z            }	t          j        |	|ggd	          }	t          |	          |k    sJ n#t          j        d||dt           j                  }	|	S t          |          )
Nr   r   r+   uniform_last_frameT)numr   r           stopstepaxis)r   r   r,   float64concatenaterI   r5   )r7   r   r   r   r   r   r8   r   candidate_fpstimesr   s              r   sample_timeszMolmo2VideoBackend.sample_timesD  sb    %%'333/2L!5abb!9 - --88E,Ia,,|;E%(*+EL"666"N   (**K8d"*  EE Icq7{KKKENEH:+>QGGGEu::33333xZ$bj   L%&7888r!   r   r   c                    |dk    r=|:|dk    r.t          j        |                              t                    }n||dz
  |z  k    rCt          j        d|dz
  t          ||          d                              t                    }n_t          j        d|dz
  t          ||z                      }	t          j        |	d	                   |dz
  k    rt          j        |	|dz
  ggd
          }	t          j        |	                              t                    }|d	         |k     sJ t          |	          |k    sJ n|dk    rBt          j        d|dz
  t          ||          d                              t                    }n`|dk    rK| 
                    ||          }
|                     |||||
          }|                     ||||          \  }}nt          |          |S )Nr   r   r+   r   T)r   r   r   r   r*   r   r   )r   r   astyper%   r,   r   r   r   r   rI   r   r   r   r5   )r7   r   r   r   r   r   r   r   indicesfloat_indicesr   r   r   s                r   _sample_framesz!Molmo2VideoBackend._sample_framesn  s>     4449L1$$)$455<<SAAZ!^w666+$q(J(899!	  
 &++  !#	)A-y7233! ! !
 8M"-..2BQ2FFF$&N&)9A)=(>?a% % %M (=1188==r{%55555=))Z77777"666k 1$
$455	  
 fSkk G %''#&#?#?	<#X#X "%"4"4 !$# # ;;# 	 JAww &&7888r!   r*   r   r3   c           	      (   dd l } |                                             } |j        t          |          |g           }	|	                                st          d          t          |	                    |j                            }
|	                    |j	                  }|dk    r|
|z  nd}|nt          t          d|
                    }t          |          }|                     |	||
t          |                    \  }}}||
k    }|
||d|d}|s||d<   ||fS |                     |
||||||                                          }|                     |	t          |          t#          |          |
dz
            \  }}}|
||d|dd}||fS )	Nr   r   r|   )r   r   r   r   r   r   r+   Fr   )r   r   r   r   r   r   r%   r>   r   r   rt   rH   rG   rg   r   r   r   rI   )r7   r3   r   r   r   r   r8   r   r   rB   r   r   r   r   rU   r
   rf   rY   r   r   s                       r   load_bytes_opencvz$Molmo2VideoBackend.load_bytes_opencv  s    	


#%%))++cwt}}gr::||~~ 	<:;;;swws'?@@AAwws/006BQ6F6F#l22A$U1&67788I	NNM<?<L<L]$4c)nn= =9F$&9  03CC$4#$!)$4 H $ A-@)*8##&&
 
 &(( 	 9<8H8H	NN	NNq 	9
 9
5 "5 !1 %1 %
 
 xr!   c                 &   t          t          d z  |                    dd                     }t          t          |                    dd                    }t          t          |                    dd                    } | j        |||||fi |}|S )Nr   r   r   r   )r   rr   rQ   r%   r   )r7   r3   r   r8   r   r   r   outs           r   r9   zMolmo2VideoBackend.load_bytes  s     !tVZZ8KT-R-RSSsFJJy!4455CNA!>!>??#c#
 
 
 
 
r!   )r   r$   )Nr*   r   r   rh   )ri   rj   rk   r   rl   r   rt   r   r%   rr   r   rn   ro   rp   r   r   r   rm   rq   r   r   r9   rv   r!   r   r   r     s          
 	. .. . 	.
 
e. . . [.` %#%# %# 	%#
 %# #5k%# 
%# %# %# [%#N 2"T\2 2 	2
 2 
ut|S[(	)2 2 2 [2$  48'9 '9'9 '9 	'9
 t'9 #5kD0'9 
'9 '9 '9 ['9R ;; ; 	;
 ; ; ; ; 
; ; ; [;z  )-A  A A  :A  	A 
 A  A  
s{DcN*	+A  A  A  [A F    
 
s{DcN*	+   [  r!   r   )"r   abcr   ior   typingr   r   r   numpyr   numpy.typingro   r   vllm.loggerr   vllm.utils.registryr	   ri   rK   rp   rn   r%   r    r   r(   r0   r2   VIDEO_LOADER_REGISTRYregisterry   r~   r   r   rv   r!   r   <module>r      s                + + + + + + + + + +           JJJ # # # # # # 0 0 0 0 0 0	X		 E#s(O     9s{ 9 93; 9 9 9 9S[ c ck    sP sP sP sP sP sP sP sPl )(**  
++    +   ,+( ))b  b  b  b  b  b  b  *)b J  011Z  Z  Z  Z  Z  2 Z  Z  21Z z ))z z z z z z z *)z z zr!   