
     `i                        d dl Zd dlmZ d dlmZmZmZ d dlZd dl	m
Z
 ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZ ddlmZ ddlmZm Z m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z)m*Z*  ed           G d de
j+                              Z,	 dEde
j+        dej-        dej-        dej-        deej-                 de.de.fdZ/ G d d e
j+                  Z0e e!d!"           G d# d$e                                  Z1 G d% d&e
j+                  Z2 G d' d(e
j+                  Z3 G d) d*e
j+                  Z4e
j5        e,d+Z6 G d, d-e          Z7 G d. d/e
j+                  Z8e! G d0 d1e                      Z9e! G d2 d3e9                      Z:e! G d4 d5e                      Z; G d6 d7e
j+                  Z<e e!d8"           G d9 d:e                                  Z= e!d;"           G d< d=e;                      Z>e e!d>"           G d? d@e                                  Z? e!dA"           G dB dCe;e                      Z@g dDZAdS )F    N)	dataclass)CallableOptionalUnion   )ACT2FN)Cache)GenerationMixin)use_kernel_forward_from_hub)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuple	torch_int)check_model_inputs   )	AutoModel   )InternVLConfigInternVLVisionConfigRMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )InternVLVisionRMSNormư>c                     t                                                       t          j        t	          j        |                    | _        || _        dS )zD
        InternVLVisionRMSNorm is equivalent to T5LayerNorm
        N)super__init__nn	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/internvl/modeling_internvl.pyr$   zInternVLVisionRMSNorm.__init__.   sD     	l5:k#:#:;; #    c                    |j         }|                    t          j                  }|                    d                              dd          }|t          j        || j        z             z  }| j        |                    |          z  S )Nr   T)keepdim)	dtypetor'   float32powmeanrsqrtr*   r)   )r+   hidden_statesinput_dtypevariances       r/   forwardzInternVLVisionRMSNorm.forward6   s|    #)%((77 $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::r0   c                 H    t          | j        j                   d| j         S )Nz, eps=)tupler)   shaper*   r+   s    r/   
extra_reprz InternVLVisionRMSNorm.extra_repr=   s&    )**II$2GIIIr0   )r!   )__name__
__module____qualname__r$   r=   rB   __classcell__r.   s   @r/   r    r    ,   sb        $ $ $ $ $ $; ; ;J J J J J J Jr0   r            modulequerykeyvalueattention_maskscalingdropoutc                    |}|}	t          j        ||                    dd                    |z  }
|$|d d d d d d d |j        d         f         }|
|z   }
t          j                            |
d          }
t          j                            |
|| j                  }
t          j        |
|	          }|                    dd          	                                }||
fS )Nr   r   r2   dim)ptrainingr   )
r'   matmul	transposer@   r%   
functionalsoftmaxrO   rU   
contiguous)rI   rJ   rK   rL   rM   rN   rO   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r/   eager_attention_forwardra   A   s     JL<z';';Aq'A'ABBWLL!$QQQ111.D
0@0D.D%DE#k1 =((2(>>L=((6?([[L,|\::K''1--88::K$$r0   c                   l     e Zd ZdZdef fdZ	 d	dej        deej                 de	e
         fdZ xZS )
InternVLVisionAttentionz+Attention Class for InternVL Vision Encoderconfigc                    t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        |j
        | _
        |j        }|j        }d| _        t          j        | j        | j        | j        z  |j                  | _        t          j        | j        | j        | j        z  |j                  | _        t          j        | j        | j        | j        z  |j                  | _        t          j        | j        | j                  | _        |dk    rt          j        |          nt          j                    | _        |rt/          | j                  nt          j                    | _        |rt/          | j                  nt          j                    | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fbiasr   )r#   r$   rd   r,   	embed_dimnum_attention_heads	num_headshead_dim
ValueErrorscaleattention_dropoutprojection_dropoutuse_qk_norm	is_causalr%   Linearattention_biasq_projk_projv_projprojection_layerDropoutIdentityr    q_normk_norm)r+   rd   proj_dropoutqk_normr.   s       r/   r$   z InternVLVisionAttention.__init___   s   +3$.8=4>)T^;;'dn ' 'N' ' '   ]D(
!'!90$ i0NU[Ujkkki0NU[Ujkkki0NU[Ujkkk "	$.$. I I>JQ>N>N"*\":":":TVT_TaTa?FY+DN;;;BKMM?FY+DN;;;BKMMr0   Nr:   rM   r[   c                    |                                 \  }}}|                     |          }|                     |          }|                     |          }	|                     |          }|                     |          }|                    ||| j        | j                  	                    dd          }|                    ||| j        | j                  	                    dd          }|	
                    ||| j        | j                  	                    dd          }	t          }
| j        j        dk    rt          | j        j                 }
 |
| |||	|f| j        sdn| j        | j        dd|\  }}|                    ||| j                  }|                     |          }|                     |          }||fS )Nr   r   eagerrH   F)rO   rN   rq   )sizert   ru   rv   rz   r{   reshaperj   rk   rW   viewra   rd   _attn_implementationr   rU   rn   rm   rh   rw   ro   )r+   r:   rM   r[   
batch_sizeseq_len_query_statesr\   r]   attention_interfacer`   r^   outputs                 r/   r=   zInternVLVisionAttention.forward{   s    "/!3!3!5!5
GQ{{=11[[//
{{=11{{<00[[,,
#++JQUQ^__iijkmnoo''
GT^T][[eefgijkk
#((Wdndm\\ffghjkll(?;+w66"9$+:Z"[$7$7
%
  $}HCC$2HJ
%
 
%
 
%
 
%
!\ "))*gt~NN&&{33((00|##r0   N)rC   rD   rE   __doc__r   r$   r'   Tensorr   r   r   r=   rF   rG   s   @r/   rc   rc   \   s        55Z3 Z Z Z Z Z Z> 26'$ '$|'$ !.'$ +,	'$ '$ '$ '$ '$ '$ '$ '$r0   rc   z7
    Class for outputs of [`InternVLVisionModel`].
    )custom_introc                       e Zd ZdZdS )$InternVLVisionModelOutputWithPoolingaF  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
        will be returned.
    N)rC   rD   rE   r    r0   r/   r   r      s           r0   r   c                   F     e Zd ZdZ fdZdej        dej        fdZ xZS )InternVLVisionPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    t                                                       |j        |j        }}|j        |j        }}|d         |d         z  |d         |d         z  z  }|d         |d         z  |d         |d         z  f}|| _        || _        || _        || _        || _        t          j	        ||||          | _
        d S )Nr   r   )kernel_sizestride)r#   r$   
image_size
patch_sizenum_channelsr,   num_patchespatch_shaper%   Conv2d
projection)	r+   rd   r   r   r   r,   r   r   r.   s	           r/   r$   z&InternVLVisionPatchEmbeddings.__init__   s    !'!2F4EJ
$*$79Kk!!}
15*Q-:VW=:XY!!}
15z!}
ST7UV$$(&&)L+:^hiiir0   pixel_valuesreturnc                 
   |j         \  }}}}|| j        k    rt          d          |                     |          }|j         d         |j         d         }}|                    d                              dd          }|||ffS )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   r   )r@   r   rl   r   flattenrW   )	r+   r   r   r   heightwidth
embeddingspatch_heightpatch_widths	            r/   r=   z%InternVLVisionPatchEmbeddings.forward   s    2>2D/
L&%4,,,w   __\22
$.$4Q$79I!9Lk''**44Q::
L+666r0   )	rC   rD   rE   r   r$   r'   r   r=   rF   rG   s   @r/   r   r      sm         j j j j j7EL 7U\ 7 7 7 7 7 7 7 7r0   r   c                        e Zd ZdZdeddf fdZdej        dededej        fd	Z		 dd
ej        de
ej                 dej        fdZ xZS )InternVLVisionEmbeddingszc
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    rd   r   Nc                    t                                                       t          j        t	          j        dd|j                            | _        |j        r3t          j        t	          j        dd|j                            | _	        nd | _	        t          |          | _        |j        | _        t          |j        t          j        j                  r|j        n|j        |j        f| _        | j        j        }|j        r6t          j        t	          j        d|dz   |j                            | _        nd | _        t          j        |j                  | _        d S )Nr   )r#   r$   r%   r&   r'   zerosr,   	cls_tokenuse_mask_token
mask_tokenr   patch_embeddingsr   
isinstancer   collectionsabcIterabler    use_absolute_position_embeddingsposition_embeddingsrx   hidden_dropout_probrO   )r+   rd   r   r.   s      r/   r$   z!InternVLVisionEmbeddings.__init__   s(   ek!Q8J&K&KLL  	# l5;q!V=O+P+PQQDOO"DO =f E E + &+[_-EFF8F#V%67 	
 +72 	,')|EK;QR?TZTf4g4g'h'hD$$'+D$z&"<==r0   r   r   r   c                    |j         d         dz
  }| j        j         d         dz
  }t          j                                        s||k    r||k    r| j        S | j        ddddf         }| j        ddddf         }|j         d         }|| j        d         z  }	|| j        d         z  }
t          |dz            }|                    d|||          }|                    dddd          }t          j
                            ||	|
fdd	
          }|                    dddd                              dd|          }t          j        ||fd          S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Nr2   r         ?r   r   bicubicF)r   modealign_cornersrR   )r@   r   r'   jit
is_tracingr   r   r   permuter%   rX   interpolater   cat)r+   r   r   r   r   num_positionsclass_pos_embedpatch_pos_embedrS   
new_height	new_widthsqrt_num_positionss               r/   interpolate_pos_encodingz1InternVLVisionEmbeddings.interpolate_pos_encoding   s|    !&q)A-06q9A= y##%% 	,+*F*F6UZ??++2111bqb592111abb59r"tq11
T_Q//	&}c'9::)11!5GI[]`aa)11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNy/?;CCCCr0   r   bool_masked_posc                    |j         \  }}}}|                     |          \  }\  }}|                                \  }	}
}|R| j                            |	|
d          }|                    d                              |          }|d|z
  z  ||z  z   }| j                            |	dd          }t          j	        ||fd          }| j
        ||                     |||          z   }|                     |          }|||ffS )Nr2   r   rR   )r@   r   r   r   expand	unsqueezetype_asr   r'   r   r   r   rO   )r+   r   r   r   r   r   r   r   r   r   r   mask_tokensw
cls_tokenss                 r/   r=   z InternVLVisionEmbeddings.forward  s   
 +01fe262G2G2U2U/
/\;!+!2!2
GQ&/00WbIIK))"--55kBBA#q1u-a?J^**:r2>>
Y
J7Q???
#/#d&C&CJPVX]&^&^^J\\*--
L+666r0   r   )rC   rD   rE   r   r   r$   r'   r   intr   r   
BoolTensorr=   rF   rG   s   @r/   r   r      s         
>3 > > > > > > >,&D5< &D &DUX &D]b]i &D &D &D &DV 7;7 7l7 "%"237 
	7 7 7 7 7 7 7 7r0   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )InternVLVisionMLPc                    t                                                       || _        t          |j                 | _        t          j        |j        |j	                  | _
        t          j        |j	        |j                  | _        d S r   )r#   r$   rd   r   
hidden_actactivation_fnr%   rr   r,   intermediate_sizefc1fc2r+   rd   r.   s     r/   r$   zInternVLVisionMLP.__init__9  sf    #F$569V/1IJJ9V5v7IJJr0   r:   r   c                     |                      |          }|                     |          }|                     |          }|S r   )r   r   r   )r+   r:   s     r/   r=   zInternVLVisionMLP.forward@  s=    //**=99//r0   )rC   rD   rE   r$   r'   r   r=   rF   rG   s   @r/   r   r   8  sc        K K K K KU\ el        r0   r   )
layer_normrms_normc                        e Zd ZdZdeddf fdZdej        dee	ej                 e	ej        ej        f         f         fdZ
 xZS )InternVLVisionLayerz?This corresponds to the Block class in the timm implementation.rd   r   Nc                    t                                                       |j        | _        d| _        t	          |          | _        t          |          | _        t          |j	                 |j
        |j                  | _        t          |j	                 |j
        |j                  | _        |j        }t          j        |t#          j        |j
                  z  d          | _        t          j        |t#          j        |j
                  z  d          | _        t          j        |j                  | _        d S )Nr   r-   T)requires_grad)r#   r$   chunk_size_feed_forwardseq_len_dimrc   	attentionr   mlpNORM2FN	norm_typer,   layer_norm_epslayernorm_beforelayernorm_afterlayer_scale_init_valuer%   r&   r'   r(   lambda_1lambda_2rx   r   rO   )r+   rd   init_valuesr.   s      r/   r$   zInternVLVisionLayer.__init__M  s   '-'E$088$V,, '(8 9&:LRXRg h h h&v'789KQWQfggg3[5:f>P3Q3Q%Qaefff[5:f>P3Q3Q%Qaefffz&"<==r0   r:   c                 $   |                      |                     |                    \  }}| j        |z  }||z   }|                     |          }|                     |          }|                     |          }| j        
| j        |z  }||z   }|S r   )r   r   r   r   r   rO   r   )r+   r:   attention_outputr   layer_outputs        r/   r=   zInternVLVisionLayer.forward\  s     #nn!!-00
 
!  =+;; )=8 ++M::xx--||L11=$=<7L $m3r0   )rC   rD   rE   r   r   r$   r'   r   r   r?   r=   rF   rG   s   @r/   r   r   J  s        II>3 > > > > > > >| 
uU\"E%,*D$EE	F       r0   r   c                   R     e Zd Zdeddf fdZdej        deee	f         fdZ
 xZS )InternVLVisionEncoderrd   r   Nc                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S r   )r   ).0ird   s     r/   
<listcomp>z2InternVLVisionEncoder.__init__.<locals>.<listcomp>|  s"    #i#i#iA$7$?$?#i#i#ir0   F)	r#   r$   rd   r%   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   s    `r/   r$   zInternVLVisionEncoder.__init__y  s`    ]#i#i#i#ivOgIhIh#i#i#ijj
&+###r0   r:   c                 L    | j         D ]} ||          }t          |          S )N)last_hidden_state)r   r   )r+   r:   layer_modules      r/   r=   zInternVLVisionEncoder.forward  s@     !J 	8 	8L(L77MM+
 
 
 	
r0   )rC   rD   rE   r   r$   r'   r   r   r?   r   r=   rF   rG   s   @r/   r   r   x  s~        ,3 , , , , , , ,	
|	
 
uo%	&	
 	
 	
 	
 	
 	
 	
 	
r0   r   c                   V     e Zd ZU eed<   dZdZdZdgZdZ	dZ
dZdZeedZ fdZ xZS )InternVLVisionPreTrainedModelrd   internvl_visionr   Tr   )r:   
attentionsc                    t                                          |           t          |t                    rl|j        j                                         |j        |j        j                                         |j         |j        j                                         dS dS t          |t                    rT|j
        j                            | j        j                   |j        j                            | j        j                   dS dS )zInitialize the weightsN)r#   _init_weightsr   r   r   datazero_r   r   r   r   fill_rd   r   r   )r+   rI   r.   s     r/   r  z+InternVLVisionPreTrainedModel._init_weights  s    f%%%f677 	K!''))) ,!&,,...)5*/5577777 65 344 	KO &&t{'IJJJO &&t{'IJJJJJ	K 	Kr0   )rC   rD   rE   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   rc   _can_record_outputsr  rF   rG   s   @r/   r  r    s             )$O&*#./N"& -- 
K K K K K K K K Kr0   r  c                        e Zd Zdeddf fdZd Z ed          e	 ddej	        d	e
ej                 deeef         fd
                        Z xZS )InternVLVisionModelrd   r   Nc                 N   t                                          |           || _        t          |          | _        t          |          | _        |j        rt          j	                    nt          j
        |j        |j                  | _        |                                  d S )Nr   )r#   r$   rd   r   r   r   encoderuse_mean_poolingr%   ry   	LayerNormr,   r   	layernorm	post_initr   s     r/   r$   zInternVLVisionModel.__init__  s       26::,V44 $4uBKMMM",vGY_e_t:u:u:u 	
 	r0   c                     | j         j        S r   )r   r   rA   s    r/   get_input_embeddingsz(InternVLVisionModel.get_input_embeddings  s    //r0   F)tie_last_hidden_statesr   r   c                     |                      ||          \  }}|                     |          }|d         }|                     |          }t          ||j        |j                  S )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        )r   r   )r   r:   r  )r   r  r  r   r:   r  )r+   r   r   embedding_outputr   encoder_outputssequence_outputs          r/   r=   zInternVLVisionModel.forward  sr     #oolOo\\!,,'788)!,..993-)7&1
 
 
 	
r0   r   )rC   rD   rE   r   r$   r  r   r   r'   r   r   r   r   r?   r   r=   rF   rG   s   @r/   r  r    s        3       0 0 0 u555 7;
 
l
 "%"23
 
u::	;	
 
 
 ^ 65
 
 
 
 
r0   r  c                   :    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZdZdS )InternVLPreTrainedModelrd    Tpast_key_valuesN)rC   rD   rE   r   r	  r
  r  _skip_keys_device_placementr  r  _can_compile_fullgraphr  r  r   r0   r/   r#  r#    sM         &*#"3N!"&r0   r#  c                   *     e Zd Zdef fdZd Z xZS )InternVLMultiModalProjectorrd   c                    t                                                       t          j        |j        j        t          d|j        z            dz  z            | _        t          j	        |j        j        t          d|j        z            dz  z  |j
        j                  | _        t          |j                 | _        t          j	        |j
        j        |j
        j                  | _        d S )Nr   r   )r#   r$   r%   r  vision_configr,   r   downsample_ratior   rr   text_configlinear_1r   projector_hidden_actactlinear_2r   s     r/   r$   z$InternVLMultiModalProjector.__init__  s    ,v';'G#aRXRiNiJjJjnoJo'opp	 ,s1v7N3N/O/OST/TTV\VhVt
 
 &56	&"4"@&BTB`aar0   c                     |                      |          }|                     |          }|                     |          }|                     |          }|S r   )r   r.  r0  r1  )r+   image_featuresr:   s      r/   r=   z#InternVLMultiModalProjector.forward  sL    77m44//m44r0   )rC   rD   rE   r   r$   r=   rF   rG   s   @r/   r)  r)    sZ        b~ b b b b b b      r0   r)  zM
    Base class for InternVL outputs, with hidden states and attentions.
    c                   8    e Zd ZU dZdZeej                 ed<   dS )InternVLModelOutputWithPasta  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)	rC   rD   rE   r   r6  r   r'   FloatTensorr	  r   r0   r/   r5  r5    s7         	 	 8<%"34;;;;;r0   r5  zx
    The InternVL model which consists of a vision backbone and a language model, without a language modeling head.
    c                   2    e Zd ZddiZdef fdZd Zd Zd Zd Z		 	 dd
e
j        deeeee         f                  dee         fdZde
j        de
j        de
j        fdZee	 	 	 	 	 	 	 	 	 ddee
j                 d
ee
j                 dee
j                 dee
j                 dee         dee
j                 deeeee         f                  dee         dee
j                 dee         deeef         fd                        Zdde
j        defdZ xZS ) InternVLModelzlanguage_model.modellanguage_modelrd   c                    t                                          |           t          j        |j                  | _        t          |          | _        t          j        |j                  | _	        | 
                                 d S r   )r#   r$   r   from_configr+  vision_towerr)  multi_modal_projectorr-  r:  r  r   s     r/   r$   zInternVLModel.__init__  sm       %1&2FGG%@%H%H"'3F4FGGr0   c                 4    | j                                         S r   )r:  r  rA   s    r/   r  z"InternVLModel.get_input_embeddings  s    "77999r0   c                 :    | j                             |           d S r   )r:  set_input_embeddingsr+   rL   s     r/   rA  z"InternVLModel.set_input_embeddings  s    0077777r0   c                     || _         d S r   r:  r+   decoders     r/   set_decoderzInternVLModel.set_decoder!  s    %r0   c                     | j         S r   rD  rA   s    r/   get_decoderzInternVLModel.get_decoder$  s    ""r0   Nr   vision_feature_layervision_feature_select_strategyc                 l   ||n| j         j        }||n| j         j        }|                    | j                  }| j         j        }|dk    r|                     |          j        }n!|                     |          j	        |         }|dk    r|ddddddf         }|j
        d         }t          |dz            }|j
        d         }	|                    |	||d          }|                     ||	          }|                    |	d|j
        d                   }|                     |          }|S )
a%  
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
            vision_feature_layer (`int` or `list[int]`):
                Layer index or list of layer indices to extract features from.
        Returns:
            vision_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        N)r4   r2   )r   defaultr   r   r   )scale_factor)rd   rJ  rK  r5   r4   r,  r=  r   vision_modelr:   r@   r   r   pixel_shuffler>  )
r+   r   rJ  rK  r[   r,  vision_featureschannelsfeature_sizer   s
             r/   get_image_featuresz InternVLModel.get_image_features'  sh   & %9$D  $+Jj 	
 .9 +*; 	'
 $TZ88;72%%"//\/JJ\OO"//\/JJXYmnO)Y66-aaaQQQh7O #(+8S=))$*1-
 *11*lLZ\]] ,,_K[,\\ *11*b/BWXZB[\\ 44_EEr0   	input_idsinputs_embedsr3  c                 \   |e| |                                  t          j        | j        j        t          j        |j                            k    }|                    d          }n|| j        j        k    }|                                }|	                    d          
                    |                              |j                  }|j        d         |j        d         z  }||                                         |                                k    rt          d| d|           |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        N)r4   devicer2   r   r   z6Image features and image tokens do not match: tokens: z, features )r  r'   tensorrd   image_token_idlongrX  allsumr   	expand_asr5   r@   numelrl   )r+   rU  rV  r3  special_image_maskn_image_tokensn_image_featuress          r/   get_placeholder_maskz"InternVLModel.get_placeholder_mask]  s/    !.2M$2K2K2M2MT[7uzR_Rfggg3 3 " "4!7!7!;!;!*dk.H!H+//11/99"==GGVVYYZgZnoo)/2^5I!5LL+,22448L8L8N8NNNvvvdtvv   "!r0   rM   position_idsr%  cache_positionr[   r   c
           	         ||n| j         j        }||n| j         j        }|d u |d uz  rt          d          | |                                 |          }|f|                     |||          }|                    |j        |j                  }| 	                    |||          }|
                    ||          } | j        d|||||	d|
}t          |j        |j        |j        |j        ||nd           S )Nz:You must specify exactly one of input_ids or inputs_embedsr   rJ  rK  )rV  r3  )rM   rd  r%  rV  re  )r   r%  r:   r  r6  r   )rd   rJ  rK  rl   r  rT  r5   rX  r4   rc  masked_scatterr:  r5  r   r%  r:   r  )r+   rU  r   rM   rd  r%  rV  rJ  rK  re  r[   r3  r`  outputss                 r/   r=   zInternVLModel.forwardu  sx     %9$D  $+Jj 	
 .9 +*; 	' -t";< 	[YZZZ 7D5577	BBM#!44)%9/M 5  N
 ,..}/C]EXYYN!%!:!:~ "; " " *889K^\\M%$% 
)%+')
 
 
 
 +%7#3!/)2>2JPT
 
 
 	
r0   r   rQ  rN  c           
      (   |                                 \  }}}}||z  dk    s	||z  dk    rt          d          |                    ||t          ||z            t          ||z                      }|                    dddd                                          }|                    |t          ||z            t          ||z            t          ||dz  z                      }|                    dddd                                          }|S )a&  Perform pixel shuffle downsampling on vision features.

        Args:
            vision_features (`torch.Tensor`):
                Input tensor of shape (batch_size, width, height, channels).
            scale_factor (`float`, *optional*, defaults to `0.5`):
                Factor by which to downsample. Default is 0.5, which halves the dimensions.

        Returns:
            vision_features (`torch.Tensor`):
                Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
        r   zKHeight and width must be divisible by scale_factor for proper downsampling.r   r   r   )r   rl   r   r   r   rZ   )r+   rQ  rN  r   r   r   rR  s          r/   rP  zInternVLModel.pixel_shuffle  s)    />.B.B.D.D+
E68L A%%)=)B)Bjkkk *..s6L#8993x,?V;W;W
 
 *11!Q1==HHJJ *..F\122C8L4M4MsS[_kmn_nSoOpOp
 

 *11!Q1==HHJJr0   NN)	NNNNNNNNN)r   ) rC   rD   rE   _checkpoint_conversion_mappingr   r$   r  rA  rG  rI  r'   r7  r   r   r   liststrrT  
LongTensorrc  r   r   r   r	   r   r   r?   r5  r=   floatrP  rF   rG   s   @r/   r9  r9    sY        '=>N%O"~      : : :8 8 8& & &# # # AE8<	4 4'4 'uS$s)^'<=4 )1	4 4 4 4l")":?:K"]b]n" " " "0  15481537+/59@D8<597
 7
E,-7
 u017
 !.	7

 u/07
 "%7
   127
 'uS$s)^'<=7
 )17
 !!127
 +,7
 
u11	27
 7
 7
 ^ 7
r! !U\ ! ! ! ! ! ! ! ! !r0   r9  zT
    Base class for InternVL causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
ee         ed<   dZeeej                          ed<   dZeeej                          ed<   dZeej                 ed<   dS )	InternVLCausalLMOutputWithPasta4  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nlosslogitsr%  r:   r  r6  )rC   rD   rE   r   rs  r   r'   r7  r	  rt  r%  r	   r:   r?   r  r6  r   r0   r/   rr  rr    s           )-D(5$
%,,,*.FHU&'...'+OXe_+++8<M8E%"345<<<59Ju0129997;%"34;;;;;r0   rr  zV
    The INTERNVL model which consists of a vision backbone and a language model.
    c            !           e Zd ZdddddZdgZdef fdZd	 Zd
 Zde	j
        fdZd Zd Z	 	 d$dej        deeeee         f                  dee         fdZed             Zed             Zed             Zee	 	 	 	 	 	 	 	 	 	 	 	 d%deej                 deej                 deej                 deej                 dee         deej                 deeeee         f                  dee         deej                 deej                 deeej        f         d eej                 d!ee          dee!e"f         fd"                        Z#	 	 	 	 	 	 d& fd#	Z$ xZ%S )' InternVLForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)z^language_model.modelz^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightrd   c                     t                                          |           t          |          | _        t	          j        |j        j        |j        j        d          | _	        | 
                                 d S )NFrf   )r#   r$   r9  modelr%   rr   r-  r,   
vocab_sizerw  r  r   s     r/   r$   z)InternVLForConditionalGeneration.__init__   se       "6**
y!3!?ASA^ejkkkr0   c                 4    | j                                         S r   )ry  r  rA   s    r/   r  z5InternVLForConditionalGeneration.get_input_embeddings  s    z..000r0   c                 :    | j                             |           d S r   )ry  rA  rB  s     r/   rA  z5InternVLForConditionalGeneration.set_input_embeddings	  s    
''.....r0   r   c                     | j         S r   )rw  rA   s    r/   get_output_embeddingsz6InternVLForConditionalGeneration.get_output_embeddings  s
    |r0   c                 :    | j                             |           d S r   )ry  rG  rE  s     r/   rG  z,InternVLForConditionalGeneration.set_decoder  s    
w'''''r0   c                 4    | j                                         S r   )ry  rI  rA   s    r/   rI  z,InternVLForConditionalGeneration.get_decoder  s    z%%'''r0   Nr   rJ  rK  c                 .     | j         j        d|||d|S )Nrg  r   )ry  rT  )r+   r   rJ  rK  r[   s        r/   rT  z3InternVLForConditionalGeneration.get_image_features  s:     -tz, 
%!5+I
 
 	
 
 	
r0   c                     | j         j        S r   )ry  r:  rA   s    r/   r:  z/InternVLForConditionalGeneration.language_model$  s    z((r0   c                     | j         j        S r   )ry  r=  rA   s    r/   r=  z-InternVLForConditionalGeneration.vision_tower(  s    z&&r0   c                     | j         j        S r   )ry  r>  rA   s    r/   r>  z6InternVLForConditionalGeneration.multi_modal_projector,  s    z//r0   r   rU  rM   rd  r%  rV  labelsre  logits_to_keepimage_sizesr[   c                    ||n| j         j        }||n| j         j        } | j        d|||||||||
|d
|}|d         }t	          |t
                    rt          | d          n|}|                     |dd|ddf                   }d}|	  | j        d||	| j         j	        j
        d|}t          |||j        |j        |j        |j                  S )ac  
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModelForImageTextToText

        >>> torch_device = "cuda"
        >>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
        >>> model = AutoModelForImageTextToText.from_pretrained(
        ...     "OpenGVLab/InternVL3-1B-hf", dtype=torch.bfloat16, device_map=torch_device
        ... )

        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
        ...             },
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
        ...             },
        ...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
        ...         ],
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
        >>> generate_ids = model.generate(**inputs, max_new_tokens=200)
        >>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
        The images depict the Statue of Liberty and the Golden Gate Bridge.
        ```N)
rU  r   rM   rd  r%  rV  rJ  rK  re  r  r   )rt  r  rz  )rs  rt  r%  r:   r  r6  r   )rd   rJ  rK  ry  r   r   slicerw  loss_functionr-  rz  rr  r%  r:   r  r6  )r+   rU  r   rM   rd  r%  rV  rJ  rK  r  re  r  r  r[   ri  r:   slice_indicesrt  rs  s                      r/   r=   z(InternVLForConditionalGeneration.forward0  sO   l %9$D  $+Jj 	
 .9 +*; 	' $* 
%)%+'!5+I)#
 
 
 
  
8B>SV8W8Wk~ot444]kmAAA}aaa,?@AA%4% f9P9[ _e D .#3!/) ' ;
 
 
 	
r0   c           	      j     t                      j        |f|||||d|}	|d         dk    r||	d<   |	S )N)r%  rV  rM   re  r  r   r   )r#   prepare_inputs_for_generation)r+   rU  r%  rV  r   rM   re  r  r[   model_inputsr.   s             r/   r  z>InternVLForConditionalGeneration.prepare_inputs_for_generation  sg     =uww<
+')))
 
 
 
 !!! ,8L(r0   rk  )NNNNNNNNNNr   N)NNNNNN)&rC   rD   rE   rl  _tied_weights_keysr   r$   r  rA  r%   Moduler~  rG  rI  r'   r7  r   r   r   rm  rn  rT  propertyr:  r=  r>  r   r   ro  r   r	   r   r   r?   rr  r=   r  rF   rG   s   @r/   rv  rv    s        "8-"?#,	& &" ++~      1 1 1/ / /ry    ( ( (( ( ( AE8<	
 
'
 'uS$s)^'<=
 )1	
 
 
 
 ) ) X) ' ' X' 0 0 X0  15481537+/59@D8<-15934.2\
 \
E,-\
 u01\
 !.	\

 u/0\
 "%\
   12\
 'uS$s)^'<=\
 )1\
 )*\
 !!12\
 c5</0\
 el+\
 +,\
 
u44	5\
 \
 \
 ^ \
B          r0   rv  )r  r  r#  r9  rv  )rH   )Bcollections.abcr   dataclassesr   typingr   r   r   r'   torch.nnr%   activationsr   cache_utilsr	   
generationr
   integrationsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   autor   configuration_internvlr   r   r  r    r   rp  ra   rc   r   r   r   r   r  r   r   r   r  r  r#  r)  r5  r9  rr  rv  __all__r   r0   r/   <module>r     s  .     ! ! ! ! ! ! , , , , , , , , , ,        ! ! ! ! ! !             ) ) ) ) ) ) 7 7 7 7 7 7 9 9 9 9 9 9 d d d d d d d d d d F F F F F F F F & & & & & & a a a a a a a a a a a a a a / / / / / /       H H H H H H H H Y''J J J J JBI J J ('J6 % %I%<% 
% <	%
 U\*% % % % % %6F$ F$ F$ F$ F$bi F$ F$ F$R   
    +E    !7 !7 !7 !7 !7BI !7 !7 !7L[7 [7 [7 [7 [7ry [7 [7 [7|    	    3H
I
I+ + + + +4 + + +\
 
 
 
 
BI 
 
 
& K K K K KO K K K< '
 '
 '
 '
 '
7 '
 '
 '
T ' ' ' ' 'o ' ' '    ")   $   
< < < < <"9 < <  <   
A A A A A+ A A 
AH   
< < < < <[ < <  <0   
u u u u u'> u u 
up  r0   