
     `iUS                     &   d Z ddlZddlmZ ddlmZmZmZ ddlZddlm	Z	 ddl
mZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZmZ ddlmZ  ej        e          Ze G d de                      Z G d de	j                  Z	 d%de	j        dej        dej        dej        deej                 de de fdZ! G d de	j                  Z" G d de	j                  Z# G d d e          Z$ G d! d"e	j                  Z% G d# d$e	j                  Z&dS )&zTPyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object    N)	dataclass)CallableOptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONS)ModelOutputcan_return_tuplelogging   )IdeficsVisionConfigc                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej        df                  ed<   dZeeej        df                  ed<   dS )IdeficsVisionModelOutputa  
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nimage_embedslast_hidden_state.hidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   tupler        v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/idefics/vision.pyr   r   '   s          * 15L(5,-44459x 12999=AM8E%"3S"89:AAA:>Ju0#567>>>>>r"   r   c                   z     e Zd Zdef fdZdej        dededej        fdZdd	ej	        d
e
dej        fdZ xZS )IdeficsVisionEmbeddingsconfigc                 z   t                                                       || _        |j        | _        |j        | _        |j        | _        t          j        t          j
        | j                            | _        t          j        |j        | j        | j        | j        d          | _        | j        | j        z  dz  | _        | j        dz   | _        t          j        | j        | j                  | _        |                     dt          j        | j                                      d          d           d S )NF)in_channelsout_channelskernel_sizestridebias   r   position_ids)r   )
persistent)super__init__r&   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferarangeexpandselfr&   	__class__s     r#   r2   z IdeficsVisionEmbeddings.__init__F   s   + + +!|EK,G,GHH!y+? 
  
  
 !Ot>1D!-1"$,t/A4>"R"R^U\$:L-M-M-T-TU\-]-]jopppppr"   
embeddingsheightwidthreturnc                    |j         d         dz
  }|                     | j                  }|j         d         dz
  }||k    r||k    r|S |dddf         }|ddddf         }|j         d         }	|| j        j        z  }
|| j        j        z  }|
dz   |dz   }}
t          j        |          }|                    dt          |          t          |          |	          }|	                    dddd          }|j
        t          j        k    }|r9t                              d           |                    t          j                  }t"          j                            ||
|z  ||z  fd	d
          }|r|                    t          j                  }t          |
          |j         d         k    st          |          |j         d         k    rJt)          dt          |
          t          |          f d|j         d         |j         d         f d          |	                    dddd                              dd|	          }t          j        |                    d          |fd          S )a#  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
        resolution images.

        Source:
        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
        r   Nr   r/   g?r   r-   zUpcasting patch_pos_embed to fp32 for interpolation since `upsample_bicubic2d_out_frame` in nn.functional.interpolate is not implemented for 'torch.bfloat16' dtype. This will result in a slight overhead.bicubicF)scale_factormodealign_cornerszNumber of patches for images (z/) don't match the shape of position embedding ()dim)shaper@   r.   r&   r6   mathsqrtreshapeintpermutedtyper   bfloat16loggerwarning_oncetofloatr   
functionalinterpolate
ValueErrorviewcat	unsqueeze)rE   rG   rH   rI   r=   	pos_embedr>   class_pos_embedpatch_pos_embedr4   num_h_patchesnum_w_patchessqrt_num_positionsfp32_upcastings                 r#   interpolate_pos_encodingz0IdeficsVisionEmbeddings.interpolate_pos_encoding]   s    !&q)A-++D,=>>	!*Q.-''FeOO#AAAqD/#AAAqrrE*$R(	$+"88!77 (5s':MC<O}!Y}55)11!S9K5L5LcRdNeNegpqq)11!Q1==(.%.@ 	>h   .00==O-33'*<<mN`>`a	 4 
 
  	A-00@@O}!6r!:::c->P>PTcTijlTm>m>mh]1C1CSEWEW0X h h0?0Eb0I?K`acKd/eh h h   *11!Q1==BB1b)TTy/33A66HaPPPPr"   Fpixel_valuesrm   c                 <   |j         \  }}}}|s<|| j        k    s|| j        k    r&t          d| d| d| j         d| j         d	          | j        j        j        }|                     |                    |                    }|                    d                              dd          }| j	        
                    |dd          }	t          j        |	|gd	          }
|r|
|                     |
||          z   }
n|
|                     | j                  z   }
|
S )
NzInput image size (*z) doesn't match model (z8). You should try to set `interpolate_pos_encoding=True`)rZ   r-   r   r/   rR   )rT   r5   rb   r<   weightrZ   r^   flatten	transposer9   rC   r   rd   rm   r@   r.   )rE   rn   rm   
batch_sizer;   rH   rI   target_dtypepatch_embedsclass_embedsrG   s              r#   forwardzIdeficsVisionEmbeddings.forward   sc   2>2D/
L&%' 	((ET_,D,D u u u% u uu u+/?u u u  
 +28++LOO,O,O,OPP#++A..88A>>+22:q"EEYl;CCC
 $ 	Q#d&C&CJPVX]&^&^^JJ#d&=&=d>O&P&PPJr"   F)r   r   r   r   r2   r   TensorrX   rm   r   boolrx   __classcell__rF   s   @r#   r%   r%   E   s        q2 q q q q q q./Q5< /Q /QUX /Q]b]i /Q /Q /Q /Qb E$5 QU bgbn        r"   r%           modulequerykeyvalueattention_maskscalingdropoutc                    t          j        ||                    dd                    |z  }|||z   }t          j                            |dt           j                                      |j                  }t          j        	                    ||| j
                  }t          j        ||          }	|	                    dd                                          }	|	|fS )Nr/   rP   )rS   rZ   )ptrainingr   r-   )r   matmulrs   r   r`   softmaxfloat32r^   rZ   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r#   eager_attention_forwardr      s     <s}}R'<'<==GL!#n4=((2U](SSVVW\WbccL=((6?([[L,|U33K''1--88::K$$r"   c                        e Zd ZdZdef fdZ	 	 	 ddej        deej                 deej                 d	ee	         d
e
ej        eej                 f         f
dZ xZS )IdeficsVisionAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr&   c                    t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        |j
        | _        d| _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      F)r1   r2   r&   r3   r4   num_attention_heads	num_headshead_dimrb   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projrD   s     r#   r2   zIdeficsVisionAttention.__init__   s   +3$.8=4>)T^;;'dn ' 'N' ' '   ]D(
/i??i??i??	$.$.AAr"   NFr   r   causal_attention_maskoutput_attentionsrJ   c           
      n   |j         \  }}}|                     |          }|                     |          }	|                     |          }
|                    ||| j        | j                                      dd          }|	                    ||| j        | j                                      dd          }	|
                    ||| j        | j                                      dd          }
| j        j	        dk    r||||z   }n||}n	|du| _
        t          }| j        j	        dk    rt          | j        j	                 } || ||	|
|| j
        | j        | j        sdn| j                  \  }}|                    |||                                          }|                     |          }|sd}||fS )z#Input shape: Batch x Time x Channelr   r-   flash_attention_2Neagerr~   )r   r   r   )rT   r   r   r   rc   r   r   rs   r&   _attn_implementationr   r   r   r   r   r   rW   r   r   )rE   r   r   r   r   rt   
seq_lengthr4   querieskeysvaluesattention_interfacer   r   s                 r#   rx   zIdeficsVisionAttention.forward   s    -:,?)
J	++m,,{{=))]++,,z:t~t}UU__`acdeeyyZOOYYZ[]^__ZT^T]SS]]^_abcc ;+/BBB).C.O!/2G!G&2!62$>DN(?;+w66"9$+:Z"[$7$7nJ#}>CC$,	%
 	%
 	%
!\ "))*j)LLWWYYmmK00  	 LL((r"   )NNF)r   r   r   r   r   r2   r   rz   r   r{   r    rx   r|   r}   s   @r#   r   r      s        GGB2 B B B B B B. 268<,1/) /)|/) !./)  (5	/)
 $D>/) 
u|Xel33	4/) /) /) /) /) /) /) /)r"   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )IdeficsVisionMLPc                    t                                                       || _        t          |j                 | _        t          j        |j        |j	                  | _
        t          j        |j	        |j                  | _        d S N)r1   r2   r&   r	   
hidden_actactivation_fnr   r   r3   intermediate_sizefc1fc2rD   s     r#   r2   zIdeficsVisionMLP.__init__  sf    #F$569V/1IJJ9V5v7IJJr"   r   rJ   c                     |                      |          }|                     |          }|                     |          }|S r   )r   r   r   )rE   r   s     r#   rx   zIdeficsVisionMLP.forward  s=    //**=99//r"   )r   r   r   r2   r   rz   rx   r|   r}   s   @r#   r   r   
  sc        K K K K KU\ el        r"   r   c                        e Zd Zdef fdZ	 d
dej        dej        dej        dee         de	ej
                 f
d	Z xZS )IdeficsVisionEncoderLayerr&   c                 D   t                                                       |j        | _        t	          |          | _        t          j        | j        |j                  | _	        t          |          | _        t          j        | j        |j                  | _        d S N)eps)r1   r2   r3   r4   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2rD   s     r#   r2   z"IdeficsVisionEncoderLayer.__init__  s    +/77<F<QRRR#F++<F<QRRRr"   Fr   r   r   r   rJ   c                     |}|                      |          }|                     ||||          \  }}||z   }|}|                     |          }|                     |          }||z   }|f}|r||fz  }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   r   r   r   )r   r   r   r   )rE   r   r   r   r   residualr   outputss           r#   rx   z!IdeficsVisionEncoderLayer.forward#  s    " !((77&*nn')"7/	 '5 '
 '
#| !=0 ((77// =0 " 	'&Gr"   ry   )r   r   r   r   r2   r   rz   r   r{   r    r   rx   r|   r}   s   @r#   r   r     s        S2 S S S S S S -2& &|& &  %|	&
 $D>& 
u 	!& & & & & & & &r"   r   c                        e Zd ZdZdef fdZe	 	 	 	 	 ddeej	                 deej	                 dee
         dee
         d	ee
         d
eeef         fd            Z xZS )IdeficsVisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`IdeficsVisionEncoderLayer`].

    Args:
        config: IdeficsVisionConfig
    r&   c                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S r!   )r   ).0_r&   s     r#   
<listcomp>z1IdeficsVisionEncoder.__init__.<locals>.<listcomp>Y  s"    $p$p$p1%>v%F%F$p$p$pr"   F)	r1   r2   r&   r   
ModuleListrangenum_hidden_layerslayersgradient_checkpointingrD   s    `r#   r2   zIdeficsVisionEncoder.__init__V  sb    m$p$p$p$pPUV\VnPoPo$p$p$pqq&+###r"   Nr   r   r   output_hidden_statesreturn_dictrJ   c                 @   ||n| j         j        }||n| j         j        }||n| j         j        }|rdnd}|rdnd}|}	t	          | j                  D ]2\  }
}|r||	fz   } ||	|||          }|d         }	|r||d         fz   }3|r||	fz   }t          |	||          S )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr!   )r   r   r   )r   r   r   )r&   r   r   use_return_dict	enumerater   r   )rE   inputs_embedsr   r   r   r   r   encoder_statesall_attentionsr   idxencoder_layerlayer_outputss                r#   rx   zIdeficsVisionEncoder.forward\  s    N 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]3=0:d%"+DK"8"8 	F 	FC# C!/=2B!B)M%"3	  M *!,M  F!/=3C2E!E 	?+}.>>N+>Vd
 
 
 	
r"   )NNNNN)r   r   r   r   r   r2   r   r   r   rz   r{   r   r    r   rx   r|   r}   s   @r#   r   r   M  s         ,2 , , , , , ,  268<,0/3&*D
 D
 !.D
  (5	D

 $D>D
 'tnD
 d^D
 
uo%	&D
 D
 D
 D
 D
 D
 D
 D
r"   r   c                        e Zd Zdef fdZ	 	 	 	 	 ddeej                 dee         dee         dee         d	ee         d
e	e
ef         fdZ xZS )IdeficsVisionTransformerr&   c                 4   t                                                       || _        |j        }t	          |          | _        t          j        ||j                  | _	        t          |          | _        t          j        ||j                  | _        d S r   )r1   r2   r&   r3   r%   rG   r   r   r   pre_layrnormr   encoderpost_layernorm)rE   r&   r4   rF   s      r#   r2   z!IdeficsVisionTransformer.__init__  s    &	1&99L8MNNN+F33 l9&:OPPPr"   NFrn   r   r   rm   r   rJ   c                    ||n| j         j        }||n| j         j        }||n| j         j        }|t	          d          |                     ||          }|                     |          }|                     ||||          }|d         }|dddddf         }	|                     |	          }	|s||	f|dd         z   S t          ||	|j
        |j                  S )z
        Returns:

        Nz You have to specify pixel_values)rm   )r   r   r   r   r   r   )r   pooler_outputr   r   )r&   r   r   r   rb   rG   r   r   r   r   r   r   )
rE   rn   r   r   rm   r   r   encoder_outputsr   pooled_outputs
             r#   rx   z IdeficsVisionTransformer.forward  s/    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]?@@@Oghh))-88,,'/!5#	 ' 
 
 ,A.)!!!Q'2++M:: 	L%}58KKK)/')7&1	
 
 
 	
r"   )NNNFN)r   r   r   r   r2   r   r   r   r{   r   r    r   rx   r|   r}   s   @r#   r   r     s        Q2 Q Q Q Q Q Q 59,0/338&*+
 +
u01+
 $D>+
 'tn	+

 #+4.+
 d^+
 
u00	1+
 +
 +
 +
 +
 +
 +
 +
r"   r   )r~   )'r   rU   dataclassesr   typingr   r   r   r   r   activationsr	   modeling_layersr
   modeling_outputsr   r   modeling_utilsr   utilsr   r   r   configuration_ideficsr   
get_loggerr   r\   r   Moduler%   rz   r_   r   r   r   r   r   r   r!   r"   r#   <module>r      s   [ Z  ! ! ! ! ! ! , , , , , , , , , ,        ! ! ! ! ! ! 9 9 9 9 9 9 K K K K K K K K 5 5 5 5 5 5         
 7 6 6 6 6 6 
	H	%	% ? ? ? ? ?{ ? ? ?:` ` ` ` `bi ` ` `V % %I%<% 
% <	%
 U\*% % % % % %.F) F) F) F) F)RY F) F) F)T    ry    / / / / / : / / /fT
 T
 T
 T
 T
29 T
 T
 T
p7
 7
 7
 7
 7
ry 7
 7
 7
 7
 7
r"   