§
     `ƒiƒ!  ã                   ó>  — d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 dd	lmZ dd
lmZmZmZ ddlmZ e G d„ de¦  «        ¦   «         Ze ed¬¦  «         G d„ de¦  «        ¦   «         ¦   «         Z ed¬¦  «         G d„ de¦  «        ¦   «         ZddgZdS )zPyTorch ColPali modelé    )Ú	dataclass)ÚOptionalN)Únn)ÚAutoModelForImageTextToTexté   )ÚCache)ÚPreTrainedModel)ÚModelOutputÚauto_docstringÚcan_return_tupleé   )ÚColPaliConfigc                   ó4   — e Zd ZU eed<   dZg ZdZdZdZ	d„ Z
dS )ÚColPaliPreTrainedModelÚconfigÚmodelTc                 ó2  — t          | j        d¦  «        r| j        j        n| j        j        j        j        }t          |t          j        t          j        f¦  «        rJ|j	        j
                             d|¬¦  «         |j         |j        j
                             ¦   «          d S d S t          |t          j        ¦  «        rS|j	        j
                             d|¬¦  «         |j        -|j	        j
        |j                                      ¦   «          d S d S d S )NÚinitializer_rangeg        )ÚmeanÚstd)Úhasattrr   r   Ú
vlm_configÚtext_configÚ
isinstancer   ÚLinearÚConv2dÚweightÚdataÚnormal_ÚbiasÚzero_Ú	EmbeddingÚpadding_idx)ÚselfÚmoduler   s      ú€/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/colpali/modeling_colpali.pyÚ_init_weightsz$ColPaliPreTrainedModel._init_weights(   s  € õ t”{Ð$7Ñ8Ô8ðFˆDŒKÔ)Ð)à”Ô'Ô3ÔEð 	õ frœy­"¬)Ð4Ñ5Ô5ð 	?ØŒMÔ×&Ò&¨C°SÐ&Ñ9Ô9Ð9ØŒ{Ð&Ø”Ô ×&Ò&Ñ(Ô(Ð(Ð(Ð(ð 'Ð&å˜¥¤Ñ-Ô-ð 	?ØŒMÔ×&Ò&¨C°SÐ&Ñ9Ô9Ð9ØÔ!Ð-Ø”Ô" 6Ô#5Ô6×<Ò<Ñ>Ô>Ð>Ð>Ð>ð	?ð 	?à-Ð-ó    N)Ú__name__Ú
__module__Ú__qualname__r   Ú__annotations__Úbase_model_prefixÚ_no_split_modulesÚ_supports_sdpaÚ_supports_flash_attnÚ_supports_flex_attnr'   © r(   r&   r   r      sN   € € € € € € àÐÐÑØÐØÐØ€NØÐØÐð?ð ?ð ?ð ?ð ?r(   r   z3
    Base class for ColPali embeddings output.
    )Úcustom_introc                   óú   — e Zd ZU dZdZeej                 ed<   dZ	eej
                 ed<   dZee         ed<   dZeeej                          ed<   dZeeej                          ed<   dZeej                 ed<   dS )	ÚColPaliForRetrievalOutputaä  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        The embeddings of the model.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    NÚlossÚ
embeddingsÚpast_key_valuesÚhidden_statesÚ
attentionsÚimage_hidden_states)r)   r*   r+   Ú__doc__r6   r   ÚtorchÚFloatTensorr,   r7   ÚTensorr8   r   r9   Útupler:   r;   r2   r(   r&   r5   r5   9   s´   € € € € € € ðð ð )-€Dˆ(5Ô$Ô
%Ð,Ð,Ñ,Ø)-€J˜œÔ&Ð-Ð-Ñ-Ø'+€OX˜e”_Ð+Ð+Ñ+Ø8<€M8˜E %Ô"3Ô4Ô5Ð<Ð<Ñ<Ø59€J˜˜uÔ0Ô1Ô2Ð9Ð9Ñ9Ø7;Ð˜ %Ô"3Ô4Ð;Ð;Ñ;Ð;Ð;r(   r5   u/  
    The ColPali architecture leverages VLMs to construct efficient multi-vector embeddings directly
    from document images (â€œscreenshotsâ€) for document retrieval. The model is trained to maximize the similarity
    between these document embeddings and the corresponding query embeddings, using the late interaction method
    introduced in ColBERT.

    Using ColPali removes the need for potentially complex and brittle layout recognition and OCR pipelines with a
    single model that can take into account both the textual and visual content (layout, charts, etc.) of a document.

    ColPali is part of the ColVision model family, which was first introduced in the following paper:
    [*ColPali: Efficient Document Retrieval with Vision Language Models*](https://huggingface.co/papers/2407.01449).
    c                   óJ  ‡ — e Zd ZdddddœZdefˆ fd„Zee	 	 	 	 	 	 dd	ee	j
                 d
ee	j                 dee	j                 dee         dee         dee         defd„¦   «         ¦   «         Zd„ Zd„ Zd„ Zd„ Zd„ Z	 	 	 ddee         dee         dedej        fd„Zˆ xZS )ÚColPaliForRetrievalzvlm.model.language_modelzvlm.model.vision_towerzvlm.model.multi_modal_projectorzvlm.lm_head)zvlm.language_model.modelzvlm.vision_towerzvlm.multi_modal_projectorzvlm.language_model.lm_headr   c                 óª  •— t          ¦   «                              |¦  «         || _        |j        j        j        | _        t          j        |j        ¦  «        | _        d„ | j        j	        pg D ¦   «         | _	        | j        j
        | _
        t          j        | j        j        j        j        | j
        ¦  «        | _        |                      ¦   «          d S )Nc                 ó   — g | ]}d |› ‘ŒS )zvlm.language_model.r2   )Ú.0Úks     r&   ú
<listcomp>z0ColPaliForRetrieval.__init__.<locals>.<listcomp>s   s!   € Ð"jÐ"jÐ"jÀÐ#<¸Ð#<Ð#<Ð"jÐ"jÐ"jr(   )ÚsuperÚ__init__r   r   r   Ú
vocab_sizer   Úfrom_configÚvlmÚ_tied_weights_keysÚembedding_dimr   r   Úhidden_sizeÚembedding_proj_layerÚ	post_init)r$   r   Ú	__class__s     €r&   rI   zColPaliForRetrieval.__init__m   sµ   ø€ Ý‰Œ×Ò˜Ñ Ô Ð ØˆŒØ Ô+Ô7ÔBˆŒå.Ô:¸6Ô;LÑMÔMˆŒØ"jÐ"jÀtÄxÔGbÐGhÐfhÐ"jÑ"jÔ"jˆÔà!œ[Ô6ˆÔÝ$&¤IØŒKÔ"Ô.Ô:ØÔñ%
ô %
ˆÔ!ð
 	ŠÑÔÐÐÐr(   NÚ	input_idsÚpixel_valuesÚattention_maskÚoutput_attentionsÚoutput_hidden_statesÚreturn_dictÚreturnc           
      ó   — ||                      | j        ¬¦  «        }||n| j        j        }||n| j        j        }||n| j        j        } | j        j        d|||dd|dœ|¤Ž}|r|j        nd }	||j	        nd }
|d         }| j
        j        j        }|  
                    |                      |¦  «        ¦  «        }||                     dd¬¦  «        z  }|||                     d¦  «        z  }t          ||j        |	|j        |
¬¦  «        S )	N)ÚdtypeT)rS   rU   rT   rW   rX   rV   r   éÿÿÿÿ)ÚdimÚkeepdim)r7   r8   r9   r:   r;   r2   )Útor[   r   rV   rW   Úuse_return_dictrL   r   r9   r;   rP   r   ÚnormÚ	unsqueezer5   r8   r:   )r$   rS   rT   rU   rV   rW   rX   ÚkwargsÚ
vlm_outputÚvlm_hidden_statesÚvlm_image_hidden_statesÚlast_hidden_statesÚ
proj_dtyper7   s                 r&   ÚforwardzColPaliForRetrieval.forward}   sl  € ð Ð#Ø'Ÿ?š?°´˜?Ñ<Ô<ˆLØ1BÐ1NÐ-Ð-ÐTXÔT_ÔTqÐð %9Ð$DÐ Ð È$Ì+ÔJjð 	ð &1Ð%<kkÀ$Ä+ÔB]ˆà#T”X”^ð 
ØØ)Ø%Ø!%ØØ/ð
ð 
ð ð
ð 
ˆ
ð 9MÐV˜JÔ4Ð4ÐRVÐØDPÐD\ *Ô"@Ð"@ÐbfÐà'¨œ]ÐØÔ.Ô5Ô;ˆ
Ø×.Ò.Ð/A×/DÒ/DÀZÑ/PÔ/PÑQÔQˆ
ð   *§/¢/°bÀ$ /Ñ"GÔ"GÑGˆ
àÐ%Ø# n×&>Ò&>¸rÑ&BÔ&BÑBˆJå(Ø!Ø&Ô6Ø+Ø!Ô,Ø 7ð
ñ 
ô 
ð 	
r(   c                 ó4   — | j                              ¦   «         S ©N)rL   Úget_input_embeddings©r$   s    r&   rl   z(ColPaliForRetrieval.get_input_embeddings°   s   € ØŒx×,Ò,Ñ.Ô.Ð.r(   c                 ó:   — | j                              |¦  «         d S rk   )rL   Úset_input_embeddings)r$   Úvalues     r&   ro   z(ColPaliForRetrieval.set_input_embeddings³   s   € ØŒ×%Ò% eÑ,Ô,Ð,Ð,Ð,r(   c                 ó4   — | j                              ¦   «         S rk   )rL   Úget_output_embeddingsrm   s    r&   rr   z)ColPaliForRetrieval.get_output_embeddings¶   s   € ØŒx×-Ò-Ñ/Ô/Ð/r(   c                 ó:   — | j                              |¦  «         d S rk   )rL   Úset_output_embeddings)r$   Únew_embeddingss     r&   rt   z)ColPaliForRetrieval.set_output_embeddings¹   s   € ØŒ×&Ò& ~Ñ6Ô6Ð6Ð6Ð6r(   c                 ó4   — | j                              ¦   «         S rk   )rL   Útie_weightsrm   s    r&   rw   zColPaliForRetrieval.tie_weights¼   s   € ØŒx×#Ò#Ñ%Ô%Ð%r(   TÚnew_num_tokensÚpad_to_multiple_ofÚmean_resizingc                 óÜ   — | j                              |||¬¦  «        }|j        | j        j        j        _        |j        | j        j        _        |j        | j         _        |j        | _        |S )N)rx   ry   rz   )rL   Úresize_token_embeddingsÚnum_embeddingsr   r   r   rJ   )r$   rx   ry   rz   Úmodel_embedss        r&   r|   z+ColPaliForRetrieval.resize_token_embeddings¿   sl   € ð ”x×7Ò7Ø)Ø1Ø'ð 8ñ 
ô 
ˆð 9EÔ8SˆŒÔÔ*Ô5Ø,8Ô,GˆŒÔÔ)Ø*Ô9ˆŒÔØ&Ô5ˆŒàÐr(   )NNNNNN)NNT)r)   r*   r+   Ú_checkpoint_conversion_mappingr   rI   r   r   r   r=   Ú
LongTensorr>   r?   Úboolr5   ri   rl   ro   rr   rt   rw   Úintr   r"   r|   Ú__classcell__)rR   s   @r&   rB   rB   W   sª  ø€ € € € € ð  %?Ø4Ø%FØ&3ð	&ð &Ð"ð˜}ð ð ð ð ð ð ð  Øð 15Ø48Ø15Ø,0Ø/3Ø&*ð/
ð /
à˜EÔ,Ô-ð/
ð ˜uÔ0Ô1ð/
ð ! ¤Ô.ð	/
ð
 $ Dœ>ð/
ð ' tœnð/
ð ˜d”^ð/
ð 
#ð/
ð /
ð /
ñ „^ñ Ôð/
ðb/ð /ð /ð-ð -ð -ð0ð 0ð 0ð7ð 7ð 7ð&ð &ð &ð
 )-Ø,0Ø"ð	ð à  œðð % SœMðð ð	ð
 
Œðð ð ð ð ð ð ð r(   rB   )r<   Údataclassesr   Útypingr   r=   r   Útransformersr   Úcache_utilsr   Úmodeling_utilsr	   Úutilsr
   r   r   Úconfiguration_colpalir   r   r5   rB   Ú__all__r2   r(   r&   ú<module>rŒ      s­  ðð Ð à !Ð !Ð !Ð !Ð !Ð !Ø Ð Ð Ð Ð Ð à €€€Ø Ð Ð Ð Ð Ð à 4Ð 4Ð 4Ð 4Ð 4Ð 4à  Ð  Ð  Ð  Ð  Ð  Ø -Ð -Ð -Ð -Ð -Ð -Ø BÐ BÐ BÐ BÐ BÐ BÐ BÐ BÐ BÐ BØ 0Ð 0Ð 0Ð 0Ð 0Ð 0ð ð?ð ?ð ?ð ?ð ?˜_ñ ?ô ?ñ „ð?ð2 Ø€ððñ ô ð
<ð <ð <ð <ð < ñ <ô <ñô ñ „ð<ð0 €ððñ ô ðkð kð kð kð kÐ0ñ kô kñô ðkð^ Øð€€€r(   