
     `i)M                         d dl mZmZ ddlmZ ddlmZmZ ddlm	Z	m
Z
mZmZ ddlmZmZ ddlmZ  e            rd dlZ G d	 d
e
d          Z G d de          ZdgZdS )    )OptionalUnion   )BatchFeature)
ImageInputis_valid_image)MultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)is_torch_availableNc                   (    e Zd ZddidddddidZd	S )
ColQwen2ProcessorKwargspaddinglongestchannels_firstT)data_formatdo_convert_rgbreturn_tensorspt)text_kwargsimages_kwargscommon_kwargsN)__name__
__module____qualname__	_defaults     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/colqwen2/processing_colqwen2.pyr   r   #   sB         y
 ,"
 
 +D1	 	IIIr!   r   F)totalc                       e Zd ZdZddgZdZdZ	 	 	 	 	 ddee         dee         f fd	Z		 	 	 	 d d
ee
         deeeee         ee         f         dee         defdZd!dZedefd            Z	 d!d
ee
         dee         defdZdeeee         f         dee         defdZ	 	 	 d"deded         f         deded         f         deded         dedef         ddfdZed             Z xZS )#ColQwen2Processora  
    Constructs a ColQwen2 processor which wraps a Qwen2VLProcessor and special methods to process images and queries, as
    well as to compute the late-interaction retrieval score.

    [`ColQwen2Processor`] offers all the functionalities of [`Qwen2VLProcessor`]. See the [`~Qwen2VLProcessor.__call__`]
    for more information.

    Args:
        image_processor ([`Qwen2VLImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`Qwen2TokenizerFast`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
        visual_prompt_prefix (`str`, *optional*): A string that gets tokenized and prepended to the image tokens.
        query_prefix (`str`, *optional*): A prefix to be used for the query.
    image_processor	tokenizerAutoImageProcessor)Qwen2TokenizerQwen2TokenizerFastNvisual_prompt_prefixquery_prefixc                     t                                          |||           t          |d          sdn|j        | _        t          |d          sdn|j        | _        |d}|| _        |d}|| _        d S )N)chat_templateimage_tokenz<|image_pad|>video_tokenz<|video_pad|>zf<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|><|endoftext|>zQuery: )super__init__hasattrr/   r0   r+   r,   )selfr&   r'   r.   r+   r,   kwargs	__class__s          r"   r2   zColQwen2Processor.__init__H   s     	)=QQQ29)]2S2Sn??YbYn29)]2S2Sn??YbYn' $M $8!$L(r!   imagestextr5   returnc                 &    | j         t          fd| j        j        i|}|d                             dd          }|du}||t          d          ||t          d          |t          |          r|g}nt          |t                    rt          |d                   rnZt          |t                    r6t          |d         t                    rt          |d         d                   st          d          | j	        gt          |          z  }	 | j        dd	|i|d
         }
|
d         }|| j        j        dz  }d}t          t          |	                    D ]}| j        |	|         v rW|	|                             | j        d||                                         |z  z  d          |	|<   |dz  }| j        |	|         v W|	|                             d| j                  |	|<    | j        |	fddi|d         }t#          i ||
          }|d         dddf         |d         dddf         z  }t          t%          j        |d         |                                                    }t$          j        j        j                            |d          |d<   |r=|d                             |d         dk    d          }|                    d|i           |S |t          |t6                    r|g}n?t          |t                    rt          |d         t6                    st          d          |
| j        dz  }g }|D ]$}| j        |z   |z   }|                    |           % | j        |fddi|d         }|S dS )a	  
        Main method to prepare for the model either (1) one or several texts, either (2) one or several image(s). This method is a custom
        wrapper around the Qwen2VLProcessor's [`~Qwen2VLProcessor.__call__`] method adapted for the ColQwen2 model. It cannot process
        both text and images at the same time.

        When preparing the the text(s), this method forwards the `text` and `kwargs` arguments to Qwen2TokenizerFast's
        [`~Qwen2TokenizerFast.__call__`].
        When preparing the the image(s), this method forwards the `images` and `kwargs` arguments to Qwen2VLImageProcessor's
        [`~Qwen2VLImageProcessor.__call__`].
        Please refer to the doctsring of the above two methods for more information.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                number of channels, H and W are image height and width.
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        tokenizer_init_kwargsr   suffixNz&Either text or images must be providedz5Only one of text or images can be processed at a timer   zAimages must be an image, list of images or list of list of imagesr7   r   image_grid_thw   z<|placeholder|>   return_token_type_idsF)datapixel_valuesT)batch_first	input_idstoken_type_idsilabelsz*Text must be a string or a list of strings
   r    )_merge_kwargsr   r'   init_kwargspop
ValueErrorr   
isinstancelistr+   lenr&   
merge_sizeranger/   replaceprodr   torchsplittolistnnutilsrnnpad_sequencemasked_fillupdatestrquery_augmentation_tokenr,   append)r4   r7   r8   audiovideosr5   output_kwargsr<   r@   	texts_docimage_inputsr=   merge_lengthindexitext_inputsreturn_dataoffsetsrB   rF   texts_queryqueryaugmented_querybatch_querys                           r"   __call__zColQwen2Processor.__call__]   sh   Z +*#
 
"&."<
 
 

 }-11(DAA &d 2<FNEFFF 2TUUUf%% f FD)) fnVAY.G.G f .. f:fQi3N3N fSabhijbklmbnSoSo f !deee23c&kkAI/4/``v`A_``L)*:;N)#3>As9~~.. ] ]A*il::'0|';'; ,.?>RWCXC]C]C_C_coCo.prs( (	! 
	 *il::
 $-Q<#7#78I4K[#\#\IaLL($. &+  . K ',K{,Kl,KLLLK ""23AAAqD9KHX<YZ[Z[Z[]^Z^<__G  K79I9IJJ L
 +0(.*<*I*I$ +J + +K' % 7$[1==kJZ>[_`>`bfgg""Hf#5666$$$ Ov t,, ODGS1I1I O !MNNN~6;%'K 4 4"&"3e";f"D""?3333($. &+  . K + r!   c                 @    i }|t           j                            di                               |                               dd          p j        j         fd|D             }fd|D             }|                    ||d           t          di |S )a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.
        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        Nr   rO   c                 8    g | ]} j         j        g |R  S r    )r&   get_number_of_image_patches).0
image_sizer   r4   s     r"   
<listcomp>z@ColQwen2Processor._get_num_multimodal_tokens.<locals>.<listcomp>   sE     ! ! ! A$@\*\m\\\! ! !r!   c                      g | ]
}|d z  z  S )r>   r    )rr   num_patchesrO   s     r"   rt   z@ColQwen2Processor._get_num_multimodal_tokens.<locals>.<listcomp>   s"    ddd;
A!=dddr!   )num_image_tokensnum_image_patchesr    )r   r   getr[   r&   rO   r	   )r4   image_sizesr5   vision_datarx   rw   r   rO   s   `     @@r"   _get_num_multimodal_tokensz,ColQwen2Processor._get_num_multimodal_tokens   s     "3=AA/SUVVM  (((&**<>>a$BVBaJ! ! ! ! !"-! ! !  edddRcddd4D[lmmnnn,,,,,r!   c                     | j         j        S )z
        Return the query augmentation token.

        Query augmentation buffers are used as reasoning buffers during inference.
        )r'   	pad_token)r4   s    r"   r]   z*ColQwen2Processor.query_augmentation_token   s     ~''r!   c                       | j         dd|i|S )a  
        Prepare for the model one or several image(s). This method is a wrapper around the `__call__` method of the ColQwen2Processor's
        [`ColQwen2Processor.__call__`].

        This method forwards the `images` and `kwargs` arguments to the image processor.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                number of channels, H and W are image height and width.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        r7   r    rn   )r4   r7   r5   s      r"   process_imagesz ColQwen2Processor.process_images  s"    B t}55F5f555r!   c                       | j         dd|i|S )a  
        Prepare for the model one or several texts. This method is a wrapper around the `__call__` method of the ColQwen2Processor's
        [`ColQwen2Processor.__call__`].

        This method forwards the `text` and `kwargs` arguments to the tokenizer.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
        r8   r    r   )r4   r8   r5   s      r"   process_queriesz!ColQwen2Processor.process_queries(  s"    @ t}11$1&111r!      cpuquery_embeddingsztorch.Tensorpassage_embeddings
batch_sizeoutput_dtypeztorch.dtypeoutput_deviceztorch.devicec           	      8   t          |          dk    rt          d          t          |          dk    rt          d          |d         j        |d         j        k    rt          d          |d         j        |d         j        k    rt          d          ||d         j        }g }t	          dt          |          |          D ]:}g }t
          j        j        j        	                    ||||z            dd          }	t	          dt          |          |          D ]}
t
          j        j        j        	                    ||
|
|z            dd          }|
                    t          j        d	|	|                              d
          d                             d                     |
                    t          j        |d                              |                              |                     <t          j        |d          S )a[  
        Compute the late-interaction/MaxSim score (ColBERT-like) for the given multi-vector
        query embeddings (`qs`) and passage embeddings (`ps`). For ColQwen2, a passage is the
        image of a document page.

        Because the embedding tensors are multi-vector and can thus have different shapes, they
        should be fed as:
        (1) a list of tensors, where the i-th tensor is of shape (sequence_length_i, embedding_dim)
        (2) a single tensor of shape (n_passages, max_sequence_length, embedding_dim) -> usually
            obtained by padding the list of tensors.

        Args:
            query_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Query embeddings.
            passage_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Passage embeddings.
            batch_size (`int`, *optional*, defaults to 128): Batch size for computing scores.
            output_dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): The dtype of the output tensor.
                If `None`, the dtype of the input embeddings is used.
            output_device (`torch.device` or `str`, *optional*, defaults to "cpu"): The device of the output tensor.

        Returns:
            `torch.Tensor`: A tensor of shape `(n_queries, n_passages)` containing the scores. The score
            tensor is saved on the "cpu" device.
        r   zNo queries providedzNo passages providedz/Queries and passages must be on the same devicez-Queries and passages must have the same dtypeNT)rC   padding_valuezbnd,csd->bcnsr   )dimr>   r?   )rN   rK   devicedtyperP   rS   rV   rW   rX   rY   r^   einsummaxsumcatto)r4   r   r   r   r   r   scoresrf   batch_scoresbatch_queriesjbatch_passagess               r"   score_retrievalz!ColQwen2Processor.score_retrievalJ  s*   @   A%%2333!""a''3444A%);A)>)EEENOOOA$(:1(=(CCCLMMM+A.4L%'q#.//<< 	] 	]A/1L!HN.;; Q^!34$VW <  M 1c"455zBB  !&!3!@!@&q1z>'9:\] "A " " ##L-PPTTYZT[[\]^bbghbii    MM%)La888;;LIILL][[\\\\yQ''''r!   c                 T    | j         j        }| j        j        }d |D             }||z   S )Nc                     g | ]}|d v|	S ))pixel_values_videosvideo_grid_thwr    )rr   names     r"   rt   z7ColQwen2Processor.model_input_names.<locals>.<listcomp>  s*     '
 '
 '
DHq<q<qD<q<q<qr!   )r'   model_input_namesr&   )r4   tokenizer_input_namesimage_processor_input_namess      r"   r   z#ColQwen2Processor.model_input_names  sF     $ @&*&:&L#'
 '
8'
 '
 '
# %'BBBr!   )NNNNN)NNNN)N)r   Nr   )r   r   r   __doc__
attributesimage_processor_classtokenizer_classr   r\   r2   r   r   r   r   rM   r   r   r   rn   r|   propertyr]   r   r   intr   r   __classcell__)r6   s   @r"   r%   r%   0   s        $ $[1J0>O .2&*) )
 'sm) sm) ) ) ) ) ). (,^bC C$C I0$y/4HYCZZ[C 01C 
C C C CJ- - - -4 (# ( ( ( X( (,!6 !6$!6 01!6 
	!6 !6 !6 !6F 2ItI./ 2 01 2 
	 2  2  2  2L 0449>( >(^0D DE>( ".$~2F"FG>( 	>(
 }->( ^S01>( 
>( >( >( >(@ 	C 	C X	C 	C 	C 	C 	Cr!   r%   )typingr   r   feature_extraction_utilsr   image_utilsr   r   processing_utilsr	   r
   r   r   tokenization_utils_baser   r   rW   r   rS   r   r%   __all__r    r!   r"   <module>r      s.  , # " " " " " " " 4 4 4 4 4 4 5 5 5 5 5 5 5 5 X X X X X X X X X X X X C C C C C C C C ' ' ' ' ' '  LLL
 
 
 
 
.e 
 
 
 
dC dC dC dC dC dC dC dCN 
r!   