
     `i'                        d Z ddlZddlmZmZ ddlZddlZddlm	Z	 ddl
mZmZmZmZmZmZ ddlmZmZ ddlmZmZ dd	lmZmZ dd
lmZmZmZ ddlmZ  G d de          Z  eded           G d de                      Z!dgZ"dS )z#video processor class for GLM-4.1V.    N)OptionalUnion   )BatchFeature)OPENAI_CLIP_MEANOPENAI_CLIP_STDChannelDimensionPILImageResamplingSizeDictget_image_size)UnpackVideosKwargs)
TensorTypeadd_start_docstrings)BASE_VIDEO_PROCESSOR_DOCSTRINGBaseVideoProcessor)VideoMetadatagroup_videos_by_shapereorder_videos   )smart_resizec                       e Zd ZU dZeeef         ed<   dZe	e         ed<   dZ
e	e         ed<   dZe	e         ed<   dZe	ee                  ed<   dZe	ee                  ed<   dS )Glm4vVideoProcessorInitKwargsNmax_image_size
patch_sizetemporal_patch_size
merge_size
image_mean	image_std)__name__
__module____qualname__r   dictstrint__annotations__r   r   r   r   r   listfloatr        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/glm4v/video_processing_glm4v.pyr   r   '   s         %)NDcN))) $J$$$)-#--- $J$$$(,Je%,,,'+IxU$+++++r*   r   zfConstructs a fast GLM-4V image processor that dynamically resizes videos based on the original videos.aj  
        patch_size (`int`, *optional*, defaults to 14):
            The spacial patch size of the vision encoder.
        temporal_patch_size (`int`, *optional*, defaults to 2):
            The temporal patch size of the vision encoder.
        merge_size (`int`, *optional*, defaults to 2):
            The merge size of the vision encoder to llm encoder.
    c                        e Zd Zej        ZdddZddiZeZ	e
ZdZdZdZdZdZdZdZdZdZeZd	ZdZd
dgZdee         f fdZ	 d$dee         def fdZ	 d$de dee!e"e#f                  fdZ$dddej        dddddddddfde%e&j'                 de(de(dee         dede(de#de(dee!e#e%e#         f                  dee!e#e%e#         f                  dee"         d ee"         d!ee"         d"ee!e)e*f                  fd#Z+ xZ,S )%Glm4vVideoProcessori 1  i )shortest_edgelongest_edger/   T      i,     pixel_values_videosvideo_grid_thwkwargsc                      t                      j        di | | j        E| j                            dd           | j                            dd           t	          d          d S d S )Nr.   r/   :size must contain 'shortest_edge' and 'longest_edge' keys.r)   )super__init__sizeget
ValueError)selfr5   	__class__s     r+   r9   zGlm4vVideoProcessor.__init__Q   sq    ""6"""9 IMM/4008DIMM.Z^<_<_<gYZZZ ! <g<gr*   Nr:   returnc                 l    |d|vsd|vrt          d           t                      j        dd|i|S )z
        Update kwargs that need further processing before being validated
        Can be overridden by subclasses to customize the processing of kwargs.
        Nr.   r/   r7   r:   r)   )r<   r8   _further_process_kwargs)r=   r:   r5   r>   s      r+   rA   z+Glm4vVideoProcessor._further_process_kwargsX   sQ     !<!<VZ@Z@ZYZZZ.uww.CCDCFCCCr*   metadatafpsc                     t          dd          t          d          j        }||n| j        |dz
  j        pt          j        z            dz   }|| j        k    rBt          t          j	        |z                      }fdt          |          D             }nat          | j        z            }||k    rt          t          |                    }n't          j        d||d          }	fd	|	D             }t                      g }}
|D ]0}||
vr*|
                    |           |                    |           1t#          |          dz  r|                    |d
                    t          j        |          S )a  
        Args:
            metadata (`VideoMetadata`):
                Metadata of the video containing information about total duration, fps and total number of frames.
            fps (`int` or `float`, *optional*):
                Target frames to sample per second. Defaults to `self.fps`.
        Returns:
            np.ndarray:
                Indices to sample video frames.
        NrC   zAsked to sample frames per second but no video metadata was provided which is required when sampling in GLM4V. Please pass in `VideoMetadata` object or set `do_sample_frames=False`r   c                     g | ]<}t          t          t          j        |j        z  z                                =S r)   minr%   mathceilrC   ).0imax_frame_idxrB   requested_fpss     r+   
<listcomp>z5Glm4vVideoProcessor.sample_frames.<locals>.<listcomp>   sC    tttfgSDIa(,>NQ^>^4_4_0`0`aatttr*   r   T)endpointc                 ~    g | ]9}t          t          t          j        |j        z                                :S r)   rF   )rJ   trL   rB   s     r+   rN   z5Glm4vVideoProcessor.sample_frames.<locals>.<listcomp>   s=     n n nZ[]C	!hlBR8S8S4T4T!U!U n n nr*   )getattrr<   total_num_framesrC   durationroundmax_durationr%   rH   floorranger'   nplinspacesetaddappendlenarray)r=   rB   rC   r5   total_framesrU   nframe_indicesnum_samplestarget_secondsseenuniqidxrL   rM   s    `           @@r+   sample_framesz!Glm4vVideoProcessor.sample_framesf   s     wx==EX  
  0"DH$q($Omhl.J(K(Ka(Ot(((DJx-78899AttttttkpqrkskstttMMd/-?@@Kl** $U<%8%8 9 9!#Q+PT!U!U!U n n n n n_m n n nUUBd  	! 	!C$C   t99q= 	"KKR!!!x~~r*   gp?videosdo_convert_rgb	do_resizeinterpolation
do_rescalerescale_factordo_normalizer   r   r   r   r   return_tensorsc                 6   t          |          \  }}i }|                                D ]\  }}|j        \  }}}}}|||}}}|rt          ||||||z  |j        |j                  \  }}|                    ||z  |||          }|                     |t          ||          |          }|                    |||||          }|||<   t          ||          }t          |          \  }}i } i }!|                                D ]5\  }}t          |d         t          j                  \  }}|                     |||||	|
          }|}"|"j        d         |z  dk    r@|"d d dd f                             d|dz
  ddd          }#t          j        |"|#gd          }"|"j        d d	         \  }$}%}&|%|z  }%||z  ||z  }(}'|"                    |$|%||&|'|z  |||(|z  ||
  
        }"|"                    ddd
dddd	ddd
  
        }"|"                    |$|%|'z  |(z  |&|z  |z  |z            })|)| |<   |%|'|(gg|$z  |!|<   7t          | |          }*t          |!|          }!t          j        |*d          }+t          j        |!          },|+|,d}-t)          |-|          S )N)
num_framesheightwidthtemporal_factorfactor
min_pixels
max_pixels)rt   ru   )r:   rm   r   )channel_dimr   rR   )dimr               r1      	   )r3   r4   )datatensor_type)r   itemsshaper   r.   r/   viewresizer   r   r   r	   FIRSTrescale_and_normalizerepeattorchcatpermutereshapetensorr   ).r=   rj   rk   rl   r:   rm   rn   ro   rp   r   r   r   r   r   rq   r5   grouped_videosgrouped_videos_indexresized_videos_groupedr   stacked_videosBTCHWrs   rt   ru   resized_heightresized_widthresized_videosprocessed_videos_groupedprocessed_gridspatchesrepeats
batch_sizegrid_tchannelgrid_hgrid_wflatten_patchesprocessed_videosr3   r4   r   s.                                                 r+   _preprocesszGlm4vVideoProcessor._preprocess   s~   $ 0EV/L/L,,!#%3%9%9%;%; 	; 	;!E>*0MAq!Q()1aJ ]0<)!$7%
2#1#01 1 1- "0!4!4QUAq!!D!D!%"!}MMM"/ "- " "
 "0!4!4Q1nm!\!\,:"5))'(>@TUU 0E^/T/T,,#% %3%9%9%;%; %	M %	M!E>,:>!;LZjZp,q,q,q)NM "77
NL*V_ N %G }Q"55::!!!!RSS&/004G!4KQPQSTUU)Wg$6A>>>*1-*;'J22F+z9=J;VFFll#*$*$ G ooaAq!Q1aCCG%oo&(--
:ZG O />$U+'-vv&>%?*%LOE"")*BDXYY(:NOO#i(8a@@@o66#6,
 

 >BBBBr*   )N)-r    r!   r"   r
   BICUBICresampler:   r   r   r   r   r   rl   rn   rp   rk   do_sample_framesr   r   rW   r   r   valid_kwargsrs   rC   model_input_namesr   r9   r   r   r#   rA   r   r   r%   r(   ri   r'   r   Tensorboolr$   r   r   __classcell__)r>   s   @r+   r-   r-   0   s        ")H&8KLLD$&9:N!JIIJLNJLJ0LJ
C.0@A[(E!F [ [ [ [ [ [ $(D Dx D 
	D D D D D D" ,00 00 eCJ'(0 0 0 0j  $#',>,F )!:>9=$(-1$(;?aC aCU\"aC aC 	aC
 x aC *aC aC aC aC U5$u+#567aC E%e"456aC SMaC &c]aC SMaC !sJ!78aC aC aC aC aC aC aC aCr*   r-   )#__doc__rH   typingr   r   numpyrZ   r   image_processing_utilsr   image_utilsr   r   r	   r
   r   r   processing_utilsr   r   utilsr   r   video_processing_utilsr   r   video_utilsr   r   r   image_processing_glm4vr   r   r-   __all__r)   r*   r+   <module>r      s   * )  " " " " " " " "      2 2 2 2 2 2                5 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 X X X X X X X X O O O O O O O O O O 0 0 0 0 0 0, , , , ,L , , , l" }C }C }C }C }C, }C }C }C@ !
!r*   