
     `iK                     (   d dl mZmZ ddlmZ ddlmZmZ ddlm	Z	m
Z
mZmZ ddlmZmZmZ ddlmZ  e            rd dlZ G d	 d
e
d          ZdZd  ed          D             d  ed          D             z   Zd Z G d de          ZdgZdS )    )OptionalUnion   )BatchFeature)
ImageInputmake_flat_list_of_images)MultiModalDataProcessingKwargsProcessorMixinUnpack)
AddedTokenPreTokenizedInput	TextInput)is_torch_availableNc                   (    e Zd ZddidddddidZd	S )
ColPaliProcessorKwargspaddinglongestchannels_firstT)data_formatdo_convert_rgbreturn_tensorspt)text_kwargsimages_kwargscommon_kwargsN)__name__
__module____qualname__	_defaults     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/colpali/processing_colpali.pyr   r   $   sB         y
 ,"
 
 +D1	 	IIIr"   r   F)totalz<image>c                     g | ]	}d |dd
S )z<locz0>4>r!   .0is     r#   
<listcomp>r*   2   s"    555Aq555r"   i   c                     g | ]	}d |dd
S )z<segz0>3r&   r!   r'   s     r#   r*   r*   2   s"    8]8]8]Q8]8]8]r"      c                      ||z  |z   | |  dS )aZ  
    Builds a string from the input prompt and image tokens.
    For example, for the call:
    build_string_from_input(
        prompt="Prefix str"
        bos_token="<s>",
        image_seq_len=3,
        image_token="<im>",
    )
    The output will be:
    "<im><im><im><s>Initial str"
    Args:
        prompt (`list[Union[str, ImageInput]]`): The input prompt.
        bos_token (`str`): The beginning of sentence token.
        image_seq_len (`int`): The length of the image sequence.
        image_token (`str`): The image token.
        num_images (`int`): Number of images in the prompt.
    
r!   prompt	bos_tokenimage_seq_lenimage_token
num_imagess        r#   build_string_from_inputr5   5   s'    & M)J6M	M6MMMMr"   c                       e Zd ZdZddgZdZdZ	 	 	 	 	 d d	ed
ef fdZ	 	 	 	 d!de	e
         deeeee         ee         f         dee         defdZd"dZedefd            Z	 d"de	e
         dee         defdZdeeee         f         dee         defdZ	 	 	 d#deded         f         deded         f         dede	d         dedef         ddfdZ xZS )$ColPaliProcessora  
    Constructs a ColPali processor which wraps a PaliGemmaProcessor and special methods to process images and queries, as
    well as to compute the late-interaction retrieval score.

    [`ColPaliProcessor`] offers all the functionalities of [`PaliGemmaProcessor`]. See the [`~PaliGemmaProcessor.__call__`]
    for more information.

    Args:
        image_processor ([`SiglipImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`LlamaTokenizerFast`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
        visual_prompt_prefix (`str`, *optional*, defaults to `"Describe the image."`):
            A string that gets tokenized and prepended to the image tokens.
        query_prefix (`str`, *optional*, defaults to `"Question: "`):
            A prefix to be used for the query.
    image_processor	tokenizer)SiglipImageProcessorSiglipImageProcessorFast)GemmaTokenizerGemmaTokenizerFastNDescribe the image.
Question: visual_prompt_prefixquery_prefixc                    t                                          |||           t          |d          st          d          |j        | _        t          |d          s]t          t          dd          }d|gi}|                    |           |                    t                    | _	        t          | _
        n|j	        | _	        |j
        | _
        |                    t                     d|_        d|_        || _        || _        d S )	N)chat_templateimage_seq_lengthz;Image processor is missing an `image_seq_length` attribute.r3   FT)
normalizedspecialadditional_special_tokens)super__init__hasattr
ValueErrorrD   r   IMAGE_TOKENadd_special_tokensconvert_tokens_to_idsimage_token_idr3   
add_tokensEXTRA_TOKENSadd_bos_tokenadd_eos_tokenr@   rA   )	selfr8   r9   rC   r@   rA   r3   tokens_to_add	__class__s	           r#   rI   zColPaliProcessor.__init__d   s	    	)=QQQ(:;; 	\Z[[[ / @y-00 	5$[UDQQQK8;-HM((777"+"A"A+"N"ND*D"+":D(4D\***"'	"'	$8!(r"   imagestextkwargsreturnc                       j         t          fd j        j        i|}|d                             dd          }|du}||t          d          ||t          d          |' j                            |          }t          |          } j	        gt          |          z  }	d |D             } fdt          |	|          D             }
  j        |fi |d	         d
         }|d                             dd          |d         dxx          j        z  cc<     j        |
fddi|d         }i |d
|i}|r=|d                             |d         dk    d          }|                    d|i           t!          |          S |t#          |t$                    r|g}n?t#          |t&                    rt#          |d         t$                    st          d          |
 j        dz  }g }|D ]4} j        j         j        z   |z   |z   dz   }|                    |           5|d                             dd          |d         d<     j        |fddi|d         }|S dS )a	  
        Main method to prepare for the model either (1) one or several texts, either (2) one or several image(s). This method is a custom
        wrapper around the PaliGemmaProcessor's [`~PaliGemmaProcessor.__call__`] method adapted for the ColPali model. It cannot process
        both text and images at the same time.

        When preparing the text(s), this method forwards the `text` and `kwargs` arguments to LlamaTokenizerFast's
        [`~LlamaTokenizerFast.__call__`].
        When preparing the image(s), this method forwards the `images` and `kwargs` arguments to SiglipImageProcessor's
        [`~SiglipImageProcessor.__call__`].
        Please refer to the docstring of the above two methods for more information.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                number of channels, H and W are image height and width.
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        tokenizer_init_kwargsr   suffixNz&Either text or images must be providedz5Only one of text or images can be processed at a timec                 8    g | ]}|                     d           S )RGB)convert)r(   images     r#   r*   z-ColPaliProcessor.__call__.<locals>.<listcomp>   s$    ???uemmE**???r"   c                     g | ]Q\  }}t          |j        j        j        t          t          |t                    rt          |          nd           RS )   r/   )r5   r9   r1   rD   rL   
isinstancelistlen)r(   r0   
image_listrT   s      r#   r*   z-ColPaliProcessor.__call__.<locals>.<listcomp>   so     	 	 	 'FJ (!"n6"&"7 +2<Z2N2NUs:TU  	 	 	r"   r   pixel_values
max_lengthreturn_token_type_idsF	input_idstoken_type_idsr   ilabels)dataz*Text must be a string or a list of strings
   r.   2   )_merge_kwargsr   r9   init_kwargspoprK   r8   fetch_imagesr   r@   rf   zipgetrD   masked_fillupdater   rd   strre   query_augmentation_tokenr1   rA   append)rT   rW   rX   audiovideosrY   output_kwargsr]   rj   	texts_docinput_stringsrh   inputsreturn_datarm   texts_queryquerybatch_querys   `                 r#   __call__zColPaliProcessor.__call__   sC   Z +*"
 
"&."<
 
 

 }-11(DAA &d 2<FNEFFF 2TUUU)66v>>F-f55F23c&kkAI?????F	 	 	 	 +.i*@*@	 	 	M 04/YY-:XYYZhiL ]+//dCCOm,\:::d>SS:::#T^ &+  . F CVB^\BBK$ 7,88@P9QUV9VX\]]""Hf#5666[1111$$$ Ov t,, ODGS1I1I O !MNNN~6;%'K * *043DDuLvUX\\""5))))9F}9U9Y9YZfhj9k9kM-(6($. &+  . K - r"   c                     i }|C| j         gt          |          z  }dgt          |          z  }|                    ||d           t          di |S )a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (list[list[str]], *optional*):
                The input sizes formatted as (height, width) per each image.
        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        Nrc   )num_image_tokensnum_image_patchesr!   )rD   rf   rx   r	   )rT   image_sizesrY   vision_datar   r   s         r#   _get_num_multimodal_tokensz+ColPaliProcessor._get_num_multimodal_tokens   so     " $ 56[9I9II!"c+&6&6 64D[lmmnnn,,,,,r"   c                     | j         j        S )z
        Return the query augmentation token.

        Query augmentation buffers are used as reasoning buffers during inference.
        )r9   	pad_token)rT   s    r#   rz   z)ColPaliProcessor.query_augmentation_token  s     ~''r"   c                       | j         dd|i|S )a  
        Prepare for the model one or several image(s). This method is a wrapper around the `__call__` method of the ColPaliProcessor's
        [`ColPaliProcessor.__call__`].

        This method forwards the `images` and `kwargs` arguments to the image processor.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                number of channels, H and W are image height and width.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        rW   r!   r   )rT   rW   rY   s      r#   process_imageszColPaliProcessor.process_images  s"    B t}55F5f555r"   c                       | j         dd|i|S )a  
        Prepare for the model one or several texts. This method is a wrapper around the `__call__` method of the ColPaliProcessor's
        [`ColPaliProcessor.__call__`].

        This method forwards the `text` and `kwargs` arguments to the tokenizer.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
        rX   r!   r   )rT   rX   rY   s      r#   process_queriesz ColPaliProcessor.process_queries7  s"    @ t}11$1&111r"   r,   cpuquery_embeddingsztorch.Tensorpassage_embeddings
batch_sizeoutput_dtypeztorch.dtypeoutput_deviceztorch.devicec           	      8   t          |          dk    rt          d          t          |          dk    rt          d          |d         j        |d         j        k    rt          d          |d         j        |d         j        k    rt          d          ||d         j        }g }t	          dt          |          |          D ]:}g }t
          j        j        j        	                    ||||z            dd          }	t	          dt          |          |          D ]}
t
          j        j        j        	                    ||
|
|z            dd          }|
                    t          j        d	|	|                              d
          d                             d                     |
                    t          j        |d                              |                              |                     <t          j        |d          S )aZ  
        Compute the late-interaction/MaxSim score (ColBERT-like) for the given multi-vector
        query embeddings (`qs`) and passage embeddings (`ps`). For ColPali, a passage is the
        image of a document page.

        Because the embedding tensors are multi-vector and can thus have different shapes, they
        should be fed as:
        (1) a list of tensors, where the i-th tensor is of shape (sequence_length_i, embedding_dim)
        (2) a single tensor of shape (n_passages, max_sequence_length, embedding_dim) -> usually
            obtained by padding the list of tensors.

        Args:
            query_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Query embeddings.
            passage_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Passage embeddings.
            batch_size (`int`, *optional*, defaults to 128): Batch size for computing scores.
            output_dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): The dtype of the output tensor.
                If `None`, the dtype of the input embeddings is used.
            output_device (`torch.device` or `str`, *optional*, defaults to "cpu"): The device of the output tensor.

        Returns:
            `torch.Tensor`: A tensor of shape `(n_queries, n_passages)` containing the scores. The score
            tensor is saved on the "cpu" device.
        r   zNo queries providedzNo passages providedz/Queries and passages must be on the same devicez-Queries and passages must have the same dtypeNT)batch_firstpadding_valuezbnd,csd->bcnsr   )dim   rc   )rf   rK   devicedtyperangetorchnnutilsrnnpad_sequencer{   einsummaxsumcatto)rT   r   r   r   r   r   scoresr)   batch_scoresbatch_queriesjbatch_passagess               r#   score_retrievalz ColPaliProcessor.score_retrievalY  s*   @   A%%2333!""a''3444A%);A)>)EEENOOOA$(:1(=(CCCLMMM+A.4L%'q#.//<< 	] 	]A/1L!HN.;; Q^!34$VW <  M 1c"455zBB  !&!3!@!@&q1z>'9:\] "A " " ##L-PPTTYZT[[\]^bbghbii    MM%)La888;;LIILL][[\\\\yQ''''r"   )NNNr>   r?   )NNNN)N)r,   Nr   )r   r   r   __doc__
attributesimage_processor_classtokenizer_classry   rI   r   r   r   r   r   re   r   r   r   r   r   propertyrz   r   r   intr   __classcell__)rV   s   @r#   r7   r7   K   sU        ( $[1JP>O $9() )
 ") ) ) ) ) ) )@ (,^bu u$u I0$y/4HYCZZ[u /0u 
u u u un- - - -$ (# ( ( ( X( (,!6 !6$!6 /0!6 
	!6 !6 !6 !6F 2ItI./ 2 /0 2 
	 2  2  2  2L 0449>( >(^0D DE>( ".$~2F"FG>( 	>(
 }->( ^S01>( 
>( >( >( >( >( >( >( >(r"   r7   )typingr   r   feature_extraction_utilsr   image_utilsr   r   processing_utilsr	   r
   r   r   tokenization_utils_baser   r   r   r   r   r   r   rL   r   rQ   r5   r7   __all__r!   r"   r#   <module>r      s  . # " " " " " " " 4 4 4 4 4 4 ? ? ? ? ? ? ? ? X X X X X X X X X X X X O O O O O O O O O O ' ' ' ' ' '  LLL
 
 
 
 
-U 
 
 
 
 55t5558]8]RWRWX[R\R\8]8]8]]N N N,L( L( L( L( L(~ L( L( L(^
 
r"   