
     `i+&                         d Z ddlZddlmZmZ ddlmZ ddlmZ ddl	m
Z
mZmZmZmZ ddlmZmZ dd	lmZ d
dlmZ  ej        e          Z G d de          ZdgZdS )zq
Processor class for InstructBLIP. Largely copy of Blip2Processor with addition of a tokenizer for the Q-Former.
    N)OptionalUnion   )BatchFeature)ProcessorMixin)
AddedTokenPaddingStrategyPreTokenizedInput	TextInputTruncationStrategy)
TensorTypelogging)
VideoInput   )AutoTokenizerc            $       |    e Zd ZdZg dZdZdZdZd fd	Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d d
e	e
         deeeee         ee         f         dedeeeef         deeeef         de	e         dede	e         de	e         dedededededede	eeef                  def"dZed             Z fdZe fd            Z xZS )!InstructBlipVideoProcessora  
    Constructs an InstructBLIPVideo processor which wraps a InstructBLIP image processor and a LLaMa/T5 tokenizer into a single
    processor.

    [`InstructBlipVideoProcessor`] offers all the functionalities of [`InstructBlipVideoImageProcessor`] and [`AutoTokenizer`]. See the
    docstring of [`~InstructBlipVideoProcessor.__call__`] and [`~InstructBlipVideoProcessor.decode`] for more information.

    Args:
        video_processor (`InstructBlipVideoVideoProcessor`):
            An instance of [`InstructBlipVideoVideoProcessor`]. The video processor is a required input.
        tokenizer (`AutoTokenizer`):
            An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input.
        qformer_tokenizer (`AutoTokenizer`):
            An instance of ['PreTrainedTokenizer`]. The Q-Former tokenizer is a required input.
        num_query_tokens (`int`, *optional*):
            Number of tokens used by the Qformer as queries, should be same as in model's config.
    )video_processor	tokenizerqformer_tokenizerAutoVideoProcessorr   Nc                     t          |d          s5t          ddd          | _        |                    | j        gd           n|j        | _        || _        t                                          |||           d S )Nvideo_tokenz<video>FT)
normalizedspecial)special_tokens)hasattrr   r   
add_tokensnum_query_tokenssuper__init__)selfr   r   r   r   kwargs	__class__s         /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/instructblipvideo/processing_instructblipvideo.pyr!   z#InstructBlipVideoProcessor.__init__?   s    y-00 	5))tTTTD  $"2!3D IIII(4D 0)5FGGGGG    TFr   imagestextadd_special_tokenspadding
truncation
max_lengthstridepad_to_multiple_ofreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_token_type_idsreturn_lengthverbosereturn_tensorsreturnc                 &   ||t          d          i }|7t          |t                    r|g}n?t          |t                    s*t          |d         t                    st          d           | j        d||||||||	|
||||||d|}|                    d          |d<   |                    d          |d	<   |
|| j        z  } | j        d||||||||	|
|||||dd|}|R| j        j	        | j        z  d
z  }|                     |d|	|
||||d	  	        |D ]fd|         D             |<   |
                    |           |,|                     ||          }|
                    |           t          ||          }|S )a%  
        This method uses [`InstructBlipVideoImageProcessor.__call__`] method to prepare image(s) or video(s) for the model, and
        [`BertTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.
        Nz3You have to specify at least one of images or text.r   zAInvalid input text. Please provide a string, or a list of strings)r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   	input_idsqformer_input_idsattention_maskqformer_attention_mask   F)r)   r/   r0   r1   r2   r3   r4   r6   c                 &    g | ]}         |z   S  r?   ).0samplekvideo_text_encodings     r%   
<listcomp>z7InstructBlipVideoProcessor.__call__.<locals>.<listcomp>   s$    'g'g'gF(;A(>(G'g'g'gr&   )r6   )tensor_typer?   )
ValueError
isinstancestrlistr   popr   r   r   contentupdater   r   )r"   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r#   encodingqformer_text_encodingtext_encodingvideo_tokensimage_encodingrB   rC   s                          @@r%   __call__z#InstructBlipVideoProcessor.__call__H   s^   4 >dlRSSS$$$ fvd++ fJtAw4L4L f !deee$:D$: %#5%%#5&;*C+E'=&;+-% %  !% %!$ -B,E,Ek,R,RH()1F1J1JK[1\1\H-. %d33
*DN #5%%#5&;*C+E'=&;+#   ! M& !#/7$:OORSS&*nn ',*?.G/I+A*?"/#' '5 
' 
'# ' h hA'g'g'g'g'gVcdeVf'g'g'gM!$$OOM***!11&1XXNOON+++nEEEr&   c                 J    | j         j        }| j        j        }ddg}||z   |z   S )Nr:   r<   )r   model_input_namesr   )r"   tokenizer_input_namesvideo_processor_input_namesqformer_input_namess       r%   rT   z,InstructBlipVideoProcessor.model_input_names   s7     $ @&*&:&L#24LM$'BBEXXXr&   c                    t           j                            |          rt          d| d          t          j        |d           t           j                            |d          }| j                            |           d| j        v }|r| j        	                    d            t                      j        |fi |}|r| xj        dgz  c_        |S )NzProvided path (z#) should be a directory, not a fileT)exist_okr   )ospathisfilerF   makedirsjoinr   save_pretrained
attributesremover    )r"   save_directoryr#   qformer_tokenizer_pathqformer_presentoutputsr$   s         r%   r_   z*InstructBlipVideoProcessor.save_pretrained   s    7>>.)) 	db~bbbccc
NT2222!#n>Q!R!R../EFFF .@ 	8O""#6777)%'').CCFCC 	5OO 344OOr&   c                      t                      j        |fi |}t          |t                    r|d         }t	          j        |d          }||_        |S )Nr   r   )	subfolder)r    from_pretrainedrG   tupler   r   )clspretrained_model_name_or_pathr#   	processorr   r$   s        r%   rh   z*InstructBlipVideoProcessor.from_pretrained   se    +EGG+,ITTVTT	 i'' 	%!!I)9:Wcvwww&7	#r&   )N)NNTFNNr   NNFFFFFTN)__name__
__module____qualname____doc__r`   video_processor_classtokenizer_classqformer_tokenizer_classr!   r   r   r   r   r
   rI   boolrH   r	   r   intr   r   rR   propertyrT   r_   classmethodrh   __classcell__)r$   s   @r%   r   r   '   s#        $ GFFJ0%O-H H H H H H (,^b#'5:;?$(,004*/+0',&+#;?#f f$f I0$y/4HYCZZ[f !	f
 tS/12f $%778f SMf f %SMf  (~f $(f %)f !%f  $f f  !f" !sJ!78#f& 
'f f f fP Y Y XY    &     [    r&   r   )rp   rZ   typingr   r   image_processing_utilsr   processing_utilsr   tokenization_utils_baser   r	   r
   r   r   utilsr   r   video_utilsr   autor   
get_loggerrm   loggerr   __all__r?   r&   r%   <module>r      s/    
			 " " " " " " " " 2 2 2 2 2 2 . . . . . .              ) ( ( ( ( ( ( ( % % % % % %             
	H	%	%m m m m m m m m` (
(r&   