
     `i"                         d Z ddlZddlmZmZ ddlmZ ddlmZ ddl	m
Z
mZmZ ddlmZmZmZ dd	lmZ d
dlmZ  ej        e          Z G d de
d          Z G d de          ZdgZdS )zq
Processor class for InstructBLIP. Largely copy of Blip2Processor with addition of a tokenizer for the Q-Former.
    N)OptionalUnion   )BatchFeature)
ImageInput)ProcessingKwargsProcessorMixinUnpack)
AddedTokenPreTokenizedInput	TextInput)logging   )AutoTokenizerc            
       ,    e Zd Zdddddddddd	i dZdS )InstructBlipProcessorKwargsTFr   )	add_special_tokenspaddingstridereturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_token_type_idsreturn_lengthverbose)text_kwargsimages_kwargsN)__name__
__module____qualname__	_defaults     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/instructblip/processing_instructblip.pyr   r   !   sF         #').*/&+%*"

 

  IIIr#   r   F)totalc            
            e Zd ZdZg dZdZdZdZd fd	Z	 	 	 	 dde	e
         deeeee         ee         f         d	ee         d
efdZed             Z fdZe fd            Z xZS )InstructBlipProcessora  
    Constructs an InstructBLIP processor which wraps a BLIP image processor and a LLaMa/T5 tokenizer into a single
    processor.

    [`InstructBlipProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`AutoTokenizer`]. See the
    docstring of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information.

    Args:
        image_processor (`BlipImageProcessor`):
            An instance of [`BlipImageProcessor`]. The image processor is a required input.
        tokenizer (`AutoTokenizer`):
            An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input.
        qformer_tokenizer (`AutoTokenizer`):
            An instance of ['PreTrainedTokenizer`]. The Q-Former tokenizer is a required input.
        num_query_tokens (`int`, *optional*):"
            Number of tokens used by the Qformer as queries, should be same as in model's config.
    )image_processor	tokenizerqformer_tokenizer)BlipImageProcessorBlipImageProcessorFastr   Nc                     t          |d          s5t          ddd          | _        |                    | j        gd           n|j        | _        || _        t                                          |||           d S )Nimage_tokenz<image>FT)
normalizedspecial)special_tokens)hasattrr   r.   
add_tokensnum_query_tokenssuper__init__)selfr(   r)   r*   r4   kwargs	__class__s         r$   r6   zInstructBlipProcessor.__init__J   s    y-00 	5))tTTTD  $"2!3D IIII(4D 0)5FGGGGGr#   imagestextr8   returnc                    ||t          d           | j        t          fd| j        j        i|}|d                             dd          }i }|ct          |t                    r|g}n?t          |t                    s*t          |d         t                    st          d           | j	        |fi |d         }	|	                    d          |d	<   |	                    d
          |d<   |d         
                    d          |d         dxx         | j        z  cc<    | j        |fi |d         }
|f| j        j        | j        z  }d|d         d<   d|d         d<   d|d         d<    | j        |fi |d         |
D ]fd|
         D             |
<   |                    |
           |) | j        |fi |d         }|                    |           t!          ||          }|S )a  
        This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and
        [`BertTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.
        Args:
            images (`ImageInput`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
        Nz,You have to specify at least images or text.tokenizer_init_kwargsr   return_tensorsr   zAInvalid input text. Please provide a string, or a list of strings	input_idsqformer_input_idsattention_maskqformer_attention_mask
max_lengthFr   r   
truncationc                 &    g | ]}         |z   S r"   r"   ).0sampleimage_text_encodingks     r$   
<listcomp>z2InstructBlipProcessor.__call__.<locals>.<listcomp>   s$    'g'g'gF(;A(>(G'g'g'gr#   r   )tensor_type)
ValueError_merge_kwargsr   r)   init_kwargspop
isinstancestrlistr*   getr4   r.   contentupdater(   r   )r7   r:   r;   audiovideosr8   output_kwargsr?   encodingqformer_text_encodingtext_encodingimage_tokensimage_encodingrI   rJ   s                @@r$   __call__zInstructBlipProcessor.__call__T   s   , >dlKLLL**'
 
"&."<
 
 
 '}599:JDQQ$$$ fvd++ fJtAw4L4L f !deee$:D$:4$`$`=Q^C_$`$`!,A,E,Ek,R,RH()1F1J1JK[1\1\H-. ]+//==Im,\:::d>SS:::*DN4PP=3OPPM!#/7$:OOEJm,-AB:?m,Y7=Bm,\:&4dn\&b&b]S`Ea&b&b#& h hA'g'g'g'g'gVcdeVf'g'g'gM!$$OOM***1T1&[[M/<Z[[NOON+++  nEEEr#   c                 J    | j         j        }| j        j        }ddg}||z   |z   S )NrA   rC   )r)   model_input_namesr(   )r7   tokenizer_input_namesimage_processor_input_namesqformer_input_namess       r$   ra   z'InstructBlipProcessor.model_input_names   s7     $ @&*&:&L#24LM$'BBEXXXr#   c                    t           j                            |          rt          d| d          t          j        |d           t           j                            |d          }| j                            |           d| j        v }|r| j        	                    d            t                      j        |fi |}|r| xj        dgz  c_        |S )NzProvided path (z#) should be a directory, not a fileT)exist_okr*   )ospathisfilerM   makedirsjoinr*   save_pretrained
attributesremover5   )r7   save_directoryr8   qformer_tokenizer_pathqformer_presentoutputsr9   s         r$   rl   z%InstructBlipProcessor.save_pretrained   s    7>>.)) 	db~bbbccc
NT2222!#n>Q!R!R../EFFF .@ 	8O""#6777)%'').CCFCC 	5OO 344OOr#   c                      t                      j        |fi |}t          |t                    r|d         }t	          j        |d          }||_        |S )Nr   r*   )	subfolder)r5   from_pretrainedrQ   tupler   r*   )clspretrained_model_name_or_pathr8   	processorr*   r9   s        r$   ru   z%InstructBlipProcessor.from_pretrained   se    +EGG+,ITTVTT	 i'' 	%!!I)9:Wcvwww&7	#r#   )N)NNNN)r   r   r    __doc__rm   image_processor_classtokenizer_classqformer_tokenizer_classr6   r   r   r   r   r   rS   r
   r   r   r_   propertyra   rl   classmethodru   __classcell__)r9   s   @r$   r'   r'   2   sC        $ GFFJL%O-H H H H H H (,^bA A$A I0$y/4HYCZZ[A 45A 
A A A AF Y Y XY    &     [    r#   r'   )rz   rg   typingr   r   image_processing_utilsr   image_utilsr   processing_utilsr   r	   r
   tokenization_utils_baser   r   r   utilsr   autor   
get_loggerr   loggerr   r'   __all__r"   r#   r$   <module>r      s=    
			 " " " " " " " " 2 2 2 2 2 2 % % % % % % H H H H H H H H H H O O O O O O O O O O                   
	H	%	%    "2%    "I I I I IN I I IX #
#r#   