
     `iM(                     >   d dl mZmZmZ ddlmZ ddlmZmZm	Z	m
Z
mZmZ ddlmZmZ  e
            rd dlmZ ddlmZ  e            rdd	lmZ  e	            r
d d
lZddlmZ  ej        e          Z e edd                     G d de                      Zd
S )    )AnyUnionoverload   )GenerationConfig)add_end_docstringsis_tf_availableis_torch_availableis_vision_availableloggingrequires_backends   )Pipelinebuild_pipeline_init_args)Image)
load_image)'TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMESN)$MODEL_FOR_VISION_2_SEQ_MAPPING_NAMEST)has_tokenizerhas_image_processorc                   j    e Zd ZdZdZdZdZdZdZ e	d          Z
 fdZddZed	eed
f         dedeeeef                  fd            Zed	eee         ed
         f         dedeeeeef                           fd            Zd	eeee         d
ed
         f         f fdZddZd Zd Z xZS )ImageToTextPipelinea  
    Image To Text pipeline using a `AutoModelForVision2Seq`. This pipeline predicts a caption for a given image.

    Unless the model you're using explicitly sets these generation parameters in its configuration files
    (`generation_config.json`), the following default values will be used:
    - max_new_tokens: 256

    Example:

    ```python
    >>> from transformers import pipeline

    >>> captioner = pipeline(model="ydshieh/vit-gpt2-coco-en")
    >>> captioner("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
    [{'generated_text': 'two birds are standing next to each other '}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This image to text pipeline can currently be loaded from pipeline() using the following task identifier:
    "image-to-text".

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?pipeline_tag=image-to-text).
    TF   )max_new_tokensc                      t                      j        |i | t          | d           |                     | j        dk    rt
          nt                     d S )Nvisiontf)super__init__r   check_model_type	frameworkr   r   )selfargskwargs	__class__s      x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/pipelines/image_to_text.pyr   zImageToTextPipeline.__init__T   sg    $)&)))$)))7;~7M7M33Sw	
 	
 	
 	
 	
    Nc                     i }i }|||d<   |||d<   |||d<   |*|d|v rt          d          |                    |           | j        
| j        |d<   | j        | j        |d<   | j        |d<   ||i fS )Nprompttimeoutr   zp`max_new_tokens` is defined both as an argument and inside `generate_kwargs` argument, please use only 1 versionassistant_model	tokenizerassistant_tokenizer)
ValueErrorupdater+   r-   r,   )r"   r   generate_kwargsr)   r*   forward_paramspreprocess_paramss          r&   _sanitize_parametersz(ImageToTextPipeline._sanitize_parameters[   s    *0h'+2i(%/=N+,&).>/.Q.Q &   !!/222+040DN,-#/*..N;'484LN01 ."44r'   inputszImage.Imager$   returnc                     d S N r"   r4   r$   s      r&   __call__zImageToTextPipeline.__call__v   s    beber'   c                     d S r7   r8   r9   s      r&   r:   zImageToTextPipeline.__call__y   s    twtwr'   c                     d|v r|                     d          }|t          d           t                      j        |fi |S )a  
        Assign labels to the image(s) passed as inputs.

        Args:
            inputs (`str`, `list[str]`, `PIL.Image` or `list[PIL.Image]`):
                The pipeline handles three types of images:

                - A string containing a HTTP(s) link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                The pipeline accepts either a single image or a batch of images.

            max_new_tokens (`int`, *optional*):
                The amount of maximum tokens to generate. By default it will use `generate` default.

            generate_kwargs (`Dict`, *optional*):
                Pass it to send all of these arguments directly to `generate` allowing full control of this function.

            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
                the call may block forever.

        Return:
            A list or a list of list of `dict`: Each result comes as a dictionary with the following key:

            - **generated_text** (`str`) -- The generated text.
        imagesNzBCannot call the image-to-text pipeline without an inputs argument!)popr.   r   r:   )r"   r4   r$   r%   s      r&   r:   zImageToTextPipeline.__call__|   sT    < vZZ))F>abbbuww11&111r'   c                    t          ||          }|t                              d           t          |t                    s t          dt          |           d          | j        j        j	        }|dk    r| 
                    || j                  }| j        dk    r|                    | j                  }|                     |d	          j        }| j        j        g|z   }t#          j        |                              d
          }|                    d|i           n|dk    rC| 
                    ||| j                  }| j        dk    r|                    | j                  }n|dk    rs| 
                    || j                  }| j        dk    r|                    | j                  }|                     || j                  }|                    |           nTt          d| d          | 
                    || j                  }| j        dk    r|                    | j                  }| j        j        j	        dk    r|d |d<   |S )N)r*   u   Passing `prompt` to the `image-to-text` pipeline is deprecated and will be removed in version 4.48 of 🤗 Transformers. Use the `image-text-to-text` pipeline insteadz&Received an invalid text input, got - zy - but expected a single string. Note also that one single text can be provided for conditional image to text generation.git)r=   return_tensorsptF)textadd_special_tokensr   	input_ids
pix2struct)r=   header_textrA   zvision-encoder-decoder)rA   zModel type z- does not support conditional text generation)r   loggerwarning_once
isinstancestrr.   typemodelconfig
model_typeimage_processorr!   todtyper,   rE   cls_token_idtorchtensor	unsqueezer/   )r"   imager)   r*   rO   model_inputsrE   text_inputss           r&   
preprocesszImageToTextPipeline.preprocess   sv   5'222W   fc**  oT&\\ o o o  
 *5JU""#335QUQ_3``>T))#/??4:#>#>L NN5NQQ[	!^89IE	!L33==a@@	##[)$<====|++#335feies3tt>T))#/??4:#>#>L777#335QUQ_3``>T))#/??4:#>#>L"nnVDNnSS##K0000 !!hz!h!h!hiii  //uT^/\\L~%%+tz:::'500V^(,L%r'   c                    d|v r?t          |d         t                    r$t          d |d         D                       rd |d<   d|vr
| j        |d<   |                    | j        j                  } | j        j        |fi ||}|S )NrE   c              3      K   | ]}|d u V  	d S r7   r8   ).0xs     r&   	<genexpr>z/ImageToTextPipeline._forward.<locals>.<genexpr>   s&      AA!AIAAAAAAr'   generation_config)rJ   listallr`   r>   rM   main_input_namegenerate)r"   rX   r0   r4   model_outputss        r&   _forwardzImageToTextPipeline._forward   s     <''<4d;; (AA|K'@AAAAA ( )-L% o55373IO/0 !!$*"<==+
+FVVlVoVVr'   c                 z    g }|D ]5}d| j                             |d          i}|                    |           6|S )Ngenerated_textT)skip_special_tokens)r,   decodeappend)r"   re   records
output_idsrecords        r&   postprocesszImageToTextPipeline.postprocess   s[    ' 	# 	#J $."7"7(, #8 # #F NN6""""r'   )NNNN)NN)__name__
__module____qualname____doc___pipeline_calls_generate_load_processor_load_image_processor_load_feature_extractor_load_tokenizerr   _default_generation_configr   r3   r   r   rK   r   ra   dictr:   rZ   rf   ro   __classcell__)r%   s   @r&   r   r   .   s        4  $O #O!1!1" " "
 
 
 
 
5 5 5 56 euS-%78eCeDQUVY[^V^Q_L`eee XewuT#Y]0C%CDwPSwX\]abfgjlogobp]qXrwww Xw"2uS$s)]DDW%WX "2 "2 "2 "2 "2 "2H1 1 1 1f  ,
 
 
 
 
 
 
r'   r   )typingr   r   r   
generationr   utilsr   r	   r
   r   r   r   baser   r   PILr   image_utilsr   models.auto.modeling_tf_autor   rT   models.auto.modeling_autor   
get_loggerrp   rH   r   r8   r'   r&   <module>r      s    ( ' ' ' ' ' ' ' ' ' ) ) ) ) ) )                5 4 4 4 4 4 4 4  )((((((? WVVVVVV QLLLPPPPPP		H	%	% ,,4UYZZZ[[D D D D D( D D \[D D Dr'   