
     `ie                        d dl Zd dlmZ d dlmZmZmZ d dlZd dl	m
Z
 ddlmZ ddlmZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZ ddlmZmZmZmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/  ej0        e1          Z2	 dAde
j3        dej4        dej4        dej4        deej4                 de5de5fdZ6 G d de&          Z7 G d d e$          Z8e ed!"           G d# d$e                                  Z9 G d% d&e
j3                  Z: G d' d(e
j3                  Z; G d) d*e"          Z<e
j=        e7d+Z> G d, d-e          Z? G d. d/e
j3                  Z@e G d0 d1e                      ZAe G d2 d3eA                      ZB G d4 d5e,          ZCdZD G d6 d7e
j3                  ZE G d8 d9e+          ZF G d: d;e*          ZG G d< d=e(          ZH G d> d?e)          ZIg d@ZJdS )B    N)	dataclass)CallableOptionalUnion   )ACT2FN)Cache)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)check_model_inputs   )CLIPMLP)JanusVisionAttention)LlamaRMSNorm)LlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaModelOutputWithPastLlavaPreTrainedModel   )InternVLConfigInternVLVisionConfig        modulequerykeyvalueattention_maskscalingdropoutc                    |}|}	t          j        ||                    dd                    |z  }
|$|d d d d d d d |j        d         f         }|
|z   }
t          j                            |
d          }
t          j                            |
|| j                  }
t          j        |
|	          }|                    dd          	                                }||
fS )Nr   r   dim)ptrainingr   )
torchmatmul	transposeshapenn
functionalsoftmaxr)   r0   
contiguous)r#   r$   r%   r&   r'   r(   r)   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/internvl/modular_internvl.pyeager_attention_forwardr@   0   s     JL<z';';Aq'A'ABBWLL!$QQQ111.D
0@0D.D%DE#k1 =((2(>>L=((6?([[L,|\::K''1--88::K$$    c                       e Zd ZdS )InternVLVisionRMSNormN__name__
__module____qualname__ rA   r?   rC   rC   K           DrA   rC   c                   h     e Zd Zdef fdZ	 ddej        deej                 dee	         fdZ
 xZS )	InternVLVisionAttentionconfigc                 "   t                                          |           | `d| _        |j        }|rt          | j                  nt          j                    | _	        |rt          | j                  nt          j                    | _
        d S )NF)super__init__num_key_value_groups	is_causaluse_qk_normrC   	embed_dimr5   Identityq_normk_norm)selfrL   qk_norm	__class__s      r?   rO   z InternVLVisionAttention.__init__P   sz       % $?FY+DN;;;BKMM?FY+DN;;;BKMMrA   Nhidden_statesr'   r9   c                    |                                 \  }}}|                     |          }|                     |          }|                     |          }	|                     |          }|                     |          }|                    ||| j        | j                  	                    dd          }|                    ||| j        | j                  	                    dd          }|	
                    ||| j        | j                  	                    dd          }	t          }
| j        j        dk    rt          | j        j                 }
 |
| |||	|f| j        sdn| j        | j        dd|\  }}|                    ||| j                  }|                     |          }|                     |          }||fS )Nr   r   eagerr"   F)r)   r(   rQ   )sizeq_projk_projv_projrU   rV   reshape	num_headshead_dimr3   viewr@   rL   _attn_implementationr   r0   attention_dropoutscalerS   projection_layerprojection_dropout)rW   rZ   r'   r9   
batch_sizeseq_len_query_statesr:   r;   attention_interfacer>   r<   outputs                 r?   forwardzInternVLVisionAttention.forward[   s    "/!3!3!5!5
GQ{{=11[[//
{{=11{{<00[[,,
#++JQUQ^__iijkmnoo''
GT^T][[eefgijkk
#((Wdndm\\ffghjkll(?;+w66"9$+:Z"[$7$7
%
  $}HCC$2HJ
%
 
%
 
%
 
%
!\ "))*gt~NN&&{33((00|##rA   N)rE   rF   rG   r!   rO   r1   Tensorr   r   r   rp   __classcell__rY   s   @r?   rK   rK   O   s        	Z3 	Z 	Z 	Z 	Z 	Z 	Z 26'$ '$|'$ !.'$ +,	'$ '$ '$ '$ '$ '$ '$ '$rA   rK   z7
    Class for outputs of [`InternVLVisionModel`].
    )custom_introc                       e Zd ZdZdS )$InternVLVisionModelOutputWithPoolingaF  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
        will be returned.
    N)rE   rF   rG   __doc__rH   rA   r?   rw   rw      s           rA   rw   c                   F     e Zd ZdZ fdZdej        dej        fdZ xZS )InternVLVisionPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    t                                                       |j        |j        }}|j        |j        }}|d         |d         z  |d         |d         z  z  }|d         |d         z  |d         |d         z  f}|| _        || _        || _        || _        || _        t          j	        ||||          | _
        d S )Nr   r   )kernel_sizestride)rN   rO   
image_size
patch_sizenum_channelshidden_sizenum_patchespatch_shaper5   Conv2d
projection)	rW   rL   r~   r   r   r   r   r   rY   s	           r?   rO   z&InternVLVisionPatchEmbeddings.__init__   s    !'!2F4EJ
$*$79Kk!!}
15*Q-:VW=:XY!!}
15z!}
ST7UV$$(&&)L+:^hiiirA   pixel_valuesreturnc                 
   |j         \  }}}}|| j        k    rt          d          |                     |          }|j         d         |j         d         }}|                    d                              dd          }|||ffS )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   r   )r4   r   
ValueErrorr   flattenr3   )	rW   r   rj   r   heightwidth
embeddingspatch_heightpatch_widths	            r?   rp   z%InternVLVisionPatchEmbeddings.forward   s    2>2D/
L&%4,,,w   __\22
$.$4Q$79I!9Lk''**44Q::
L+666rA   )	rE   rF   rG   rx   rO   r1   rr   rp   rs   rt   s   @r?   rz   rz      sm         j j j j j7EL 7U\ 7 7 7 7 7 7 7 7rA   rz   c                        e Zd ZdZdeddf fdZdej        dededej        fd	Z		 dd
ej        de
ej                 dej        fdZ xZS )InternVLVisionEmbeddingszc
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    rL   r   Nc                    t                                                       t          j        t	          j        dd|j                            | _        |j        r3t          j        t	          j        dd|j                            | _	        nd | _	        t          |          | _        |j        | _        t          |j        t          j        j                  r|j        n|j        |j        f| _        | j        j        }|j        r6t          j        t	          j        d|dz   |j                            | _        nd | _        t          j        |j                  | _        d S )Nr   )rN   rO   r5   	Parameterr1   zerosr   	cls_tokenuse_mask_token
mask_tokenrz   patch_embeddingsr   
isinstancer~   collectionsabcIterabler    use_absolute_position_embeddingsposition_embeddingsDropouthidden_dropout_probr)   )rW   rL   r   rY   s      r?   rO   z!InternVLVisionEmbeddings.__init__   s(   ek!Q8J&K&KLL  	# l5;q!V=O+P+PQQDOO"DO =f E E + &+[_-EFF8F#V%67 	
 +72 	,')|EK;QR?TZTf4g4g'h'hD$$'+D$z&"<==rA   r   r   r   c                    |j         d         dz
  }| j        j         d         dz
  }t          j                                        s||k    r||k    r| j        S | j        ddddf         }| j        ddddf         }|j         d         }|| j        d         z  }	|| j        d         z  }
t          |dz            }|                    d|||          }|                    dddd          }t          j
                            ||	|
fdd	
          }|                    dddd                              dd|          }t          j        ||fd          S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Nr,   r         ?r   r   bicubicF)r]   modealign_cornersr-   )r4   r   r1   jit
is_tracingr   r   ra   permuter5   r6   interpolaterd   cat)rW   r   r   r   r   num_positionsclass_pos_embedpatch_pos_embedr.   
new_height	new_widthsqrt_num_positionss               r?   interpolate_pos_encodingz1InternVLVisionEmbeddings.interpolate_pos_encoding   s|    !&q)A-06q9A= y##%% 	,+*F*F6UZ??++2111bqb592111abb59r"tq11
T_Q//	&}c'9::)11!5GI[]`aa)11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNy/?;CCCCrA   r   bool_masked_posc                    |j         \  }}}}|                     |          \  }\  }}|                                \  }	}
}|R| j                            |	|
d          }|                    d                              |          }|d|z
  z  ||z  z   }| j                            |	dd          }t          j	        ||fd          }| j
        ||                     |||          z   }|                     |          }|||ffS )Nr,   r   r-   )r4   r   r]   r   expand	unsqueezetype_asr   r1   r   r   r   r)   )rW   r   r   rl   r   r   r   r   r   rj   rk   mask_tokensw
cls_tokenss                 r?   rp   z InternVLVisionEmbeddings.forward   s   
 +01fe262G2G2U2U/
/\;!+!2!2
GQ&/00WbIIK))"--55kBBA#q1u-a?J^**:r2>>
Y
J7Q???
#/#d&C&CJPVX]&^&^^J\\*--
L+666rA   rq   )rE   rF   rG   rx   r!   rO   r1   rr   intr   r   
BoolTensorrp   rs   rt   s   @r?   r   r      s         
>3 > > > > > > >,&D5< &D &DUX &D]b]i &D &D &D &DV 7;7 7l7 "%"237 
	7 7 7 7 7 7 7 7rA   r   c                       e Zd ZdS )InternVLVisionMLPNrD   rH   rA   r?   r   r     rI   rA   r   )
layer_normrms_normc                        e Zd ZdZdeddf fdZdej        dee	ej                 e	ej        ej        f         f         fdZ
 xZS )InternVLVisionLayerz?This corresponds to the Block class in the timm implementation.rL   r   Nc                    t                                                       |j        | _        d| _        t	          |          | _        t          |          | _        t          |j	                 |j
        |j                  | _        t          |j	                 |j
        |j                  | _        |j        }t          j        |t#          j        |j
                  z  d          | _        t          j        |t#          j        |j
                  z  d          | _        t          j        |j                  | _        d S )Nr   epsT)requires_grad)rN   rO   chunk_size_feed_forwardseq_len_dimrK   	attentionr   mlpNORM2FN	norm_typer   layer_norm_epslayernorm_beforelayernorm_afterlayer_scale_init_valuer5   r   r1   oneslambda_1lambda_2r   r   r)   )rW   rL   init_valuesrY   s      r?   rO   zInternVLVisionLayer.__init__"  s   '-'E$088$V,, '(8 9&:LRXRg h h h&v'789KQWQfggg3[5:f>P3Q3Q%Qaefff[5:f>P3Q3Q%Qaefffz&"<==rA   rZ   c                 $   |                      |                     |                    \  }}| j        |z  }||z   }|                     |          }|                     |          }|                     |          }| j        
| j        |z  }||z   }|S rq   )r   r   r   r   r   r)   r   )rW   rZ   attention_outputrl   layer_outputs        r?   rp   zInternVLVisionLayer.forward1  s     #nn!!-00
 
!  =+;; )=8 ++M::xx--||L11=$=<7L $m3rA   )rE   rF   rG   rx   r!   rO   r1   rr   r   tuplerp   rs   rt   s   @r?   r   r     s        II>3 > > > > > > >| 
uU\"E%,*D$EE	F       rA   r   c                   R     e Zd Zdeddf fdZdej        deee	f         fdZ
 xZS )InternVLVisionEncoderrL   r   Nc                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S rH   )r   ).0irL   s     r?   
<listcomp>z2InternVLVisionEncoder.__init__.<locals>.<listcomp>Q  s"    #i#i#iA$7$?$?#i#i#irA   F)	rN   rO   rL   r5   
ModuleListrangenum_hidden_layerslayergradient_checkpointingrW   rL   rY   s    `r?   rO   zInternVLVisionEncoder.__init__N  s`    ]#i#i#i#ivOgIhIh#i#i#ijj
&+###rA   rZ   c                 L    | j         D ]} ||          }t          |          S )N)last_hidden_state)r   r   )rW   rZ   layer_modules      r?   rp   zInternVLVisionEncoder.forwardT  s@     !J 	8 	8L(L77MM+
 
 
 	
rA   )rE   rF   rG   r!   rO   r1   rr   r   r   r   rp   rs   rt   s   @r?   r   r   M  s~        ,3 , , , , , , ,	
|	
 
uo%	&	
 	
 	
 	
 	
 	
 	
 	
rA   r   c                   V     e Zd ZU eed<   dZdZdZdgZdZ	dZ
dZdZeedZ fdZ xZS )InternVLVisionPreTrainedModelrL   internvl_visionr   Tr   )rZ   
attentionsc                    t                                          |           t          |t                    rl|j        j                                         |j        |j        j                                         |j         |j        j                                         dS dS t          |t                    rT|j
        j                            | j        j                   |j        j                            | j        j                   dS dS )zInitialize the weightsN)rN   _init_weightsr   r   r   datazero_r   r   r   r   fill_rL   r   r   )rW   r#   rY   s     r?   r   z+InternVLVisionPreTrainedModel._init_weightsq  s    f%%%f677 	K!''))) ,!&,,...)5*/5577777 65 344 	KO &&t{'IJJJO &&t{'IJJJJJ	K 	KrA   )rE   rF   rG   r!   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   rK   _can_record_outputsr   rs   rt   s   @r?   r   r   `  s             )$O&*#./N"& -- 
K K K K K K K K KrA   r   c                        e Zd Zdeddf fdZd Z ed          e	 ddej	        d	e
ej                 deeef         fd
                        Z xZS )InternVLVisionModelrL   r   Nc                 N   t                                          |           || _        t          |          | _        t          |          | _        |j        rt          j	                    nt          j
        |j        |j                  | _        |                                  d S )Nr   )rN   rO   rL   r   r   r   encoderuse_mean_poolingr5   rT   	LayerNormr   r   	layernorm	post_initr   s     r?   rO   zInternVLVisionModel.__init__  s       26::,V44 $4uBKMMM",vGY_e_t:u:u:u 	
 	rA   c                     | j         j        S rq   )r   r   )rW   s    r?   get_input_embeddingsz(InternVLVisionModel.get_input_embeddings  s    //rA   F)tie_last_hidden_statesr   r   c                     |                      ||          \  }}|                     |          }|d         }|                     |          }t          ||j        |j                  S )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        )r   r   )r   rZ   r   )r   r   r  rw   rZ   r   )rW   r   r   embedding_outputrl   encoder_outputssequence_outputs          r?   rp   zInternVLVisionModel.forward  sr     #oolOo\\!,,'788)!,..993-)7&1
 
 
 	
rA   rq   )rE   rF   rG   r!   rO   r  r   r   r1   rr   r   r   r   r   rw   rp   rs   rt   s   @r?   r   r     s        3       0 0 0 u555 7;
 
l
 "%"23
 
u::	;	
 
 
 ^ 65
 
 
 
 
rA   r   c                       e Zd ZdS )InternVLPreTrainedModelNrD   rH   rA   r?   r  r    rI   rA   r  c                   *     e Zd Zdef fdZd Z xZS )InternVLMultiModalProjectorrL   c                    t                                                       t          j        |j        j        t          d|j        z            dz  z            | _        t          j	        |j        j        t          d|j        z            dz  z  |j
        j                  | _        t          |j                 | _        t          j	        |j
        j        |j
        j                  | _        d S )Nr   r   )rN   rO   r5   r   vision_configr   r   downsample_ratior   Lineartext_configlinear_1r   projector_hidden_actactlinear_2r   s     r?   rO   z$InternVLMultiModalProjector.__init__  s    ,v';'G#aRXRiNiJjJjnoJo'opp	 ,s1v7N3N/O/OST/TTV\VhVt
 
 &56	&"4"@&BTB`aarA   c                     |                      |          }|                     |          }|                     |          }|                     |          }|S rq   )r   r  r  r  )rW   image_featuresrZ   s      r?   rp   z#InternVLMultiModalProjector.forward  sL    77m44//m44rA   )rE   rF   rG   r    rO   rp   rs   rt   s   @r?   r  r    sZ        b~ b b b b b b      rA   r  c                       e Zd ZdS )InternVLModelOutputWithPastNrD   rH   rA   r?   r  r    rI   rA   r  c                      e Zd Zddej        defdZ	 	 ddej        dee	e
ee
         f                  dee         fd	Zee	 	 	 	 	 	 	 	 	 dd
eej                 deej                 deej                 deej                 dee         deej                 dee	e
ee
         f                  dee         deej                 dee         de	eef         fd                        ZdS )InternVLModelr   vision_featuresscale_factorc           
      (   |                                 \  }}}}||z  dk    s	||z  dk    rt          d          |                    ||t          ||z            t          ||z                      }|                    dddd                                          }|                    |t          ||z            t          ||z            t          ||dz  z                      }|                    dddd                                          }|S )a&  Perform pixel shuffle downsampling on vision features.

        Args:
            vision_features (`torch.Tensor`):
                Input tensor of shape (batch_size, width, height, channels).
            scale_factor (`float`, *optional*, defaults to `0.5`):
                Factor by which to downsample. Default is 0.5, which halves the dimensions.

        Returns:
            vision_features (`torch.Tensor`):
                Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
        r   zKHeight and width must be divisible by scale_factor for proper downsampling.r   r   r   )r]   r   rd   r   r   r8   )rW   r  r  rj   r   r   channelss          r?   pixel_shufflezInternVLModel.pixel_shuffle  s)    />.B.B.D.D+
E68L A%%)=)B)Bjkkk *..s6L#8993x,?V;W;W
 
 *11!Q1==HHJJ *..F\122C8L4M4MsS[_kmn_nSoOpOp
 

 *11!Q1==HHJJrA   Nr   vision_feature_layervision_feature_select_strategyc                 l   ||n| j         j        }||n| j         j        }|                    | j                  }| j         j        }|dk    r|                     |          j        }n!|                     |          j	        |         }|dk    r|ddddddf         }|j
        d         }t          |dz            }|j
        d         }	|                    |	||d          }|                     ||	          }|                    |	d|j
        d                   }|                     |          }|S )
a%  
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
            vision_feature_layer (`int` or `list[int]`):
                Layer index or list of layer indices to extract features from.
        Returns:
            vision_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        N)dtyper,   )r   defaultr   r   r   )r  )rL   r"  r#  tor%  r  vision_towerr   vision_modelrZ   r4   r   ra   r!  multi_modal_projector)
rW   r   r"  r#  r9   r  r  r   feature_sizerj   s
             r?   get_image_featuresz InternVLModel.get_image_features  sh   & %9$D  $+Jj 	
 .9 +*; 	'
 $TZ88;72%%"//\/JJ\OO"//\/JJXYmnO)Y66-aaaQQQh7O #(+8S=))$*1-
 *11*lLZ\]] ,,_K[,\\ *11*b/BWXZB[\\ 44_EErA   	input_idsr'   position_idspast_key_valuesinputs_embedscache_positionr9   r   c
           	         ||n| j         j        }||n| j         j        }|d u |d uz  rt          d          | |                                 |          }|f|                     |||          }|                    |j        |j                  }| 	                    |||          }|
                    ||          } | j        d|||||	d|
}t          |j        |j        |j        |j        ||nd           S )Nz:You must specify exactly one of input_ids or inputs_embeds)r   r"  r#  )r0  r  )r'   r.  r/  r0  r1  )r   r/  rZ   r   image_hidden_statesrH   )rL   r"  r#  r   r  r,  r'  devicer%  get_placeholder_maskmasked_scatterlanguage_modelr  r   r/  rZ   r   )rW   r-  r   r'   r.  r/  r0  r"  r#  r1  r9   r  special_image_maskoutputss                 r?   rp   zInternVLModel.forward!  sx     %9$D  $+Jj 	
 .9 +*; 	' -t";< 	[YZZZ 7D5577	BBM#!44)%9/M 5  N
 ,..}/C]EXYYN!%!:!:~ "; " " *889K^\\M%$% 
)%+')
 
 
 
 +%7#3!/)2>2JPT
 
 
 	
rA   )r   )NN)	NNNNNNNNN)rE   rF   rG   r1   rr   floatr!  FloatTensorr   r   r   liststrr,  r   r   
LongTensorr	   r   r   r   r  rp   rH   rA   r?   r  r    s       ! !U\ ! ! ! ! !L AE8<	4 4'4 'uS$s)^'<=4 )1	4 4 4 4l  15481537+/59@D8<597
 7
E,-7
 u017
 !.	7

 u/07
 "%7
   127
 'uS$s)^'<=7
 )17
 !!127
 +,7
 
u11	27
 7
 7
 ^ 7
 7
 7
rA   r  c                       e Zd ZdS )InternVLCausalLMOutputWithPastNrD   rH   rA   r?   r@  r@  ]  rI   rA   r@  c                        e Zd Z fdZ xZS ) InternVLForConditionalGenerationc                  :     t                      j        di |  dS )ac  
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModelForImageTextToText

        >>> torch_device = "cuda"
        >>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
        >>> model = AutoModelForImageTextToText.from_pretrained(
        ...     "OpenGVLab/InternVL3-1B-hf", dtype=torch.bfloat16, device_map=torch_device
        ... )

        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
        ...             },
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
        ...             },
        ...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
        ...         ],
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
        >>> generate_ids = model.generate(**inputs, max_new_tokens=200)
        >>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
        The images depict the Statue of Liberty and the Golden Gate Bridge.
        ```NrH   )rN   rp   )super_kwargsrY   s    r?   rp   z(InternVLForConditionalGeneration.forwardb  s(    H 	'','''''rA   )rE   rF   rG   rp   rs   rt   s   @r?   rB  rB  a  s8        $( $( $( $( $( $( $( $( $(rA   rB  )r   r   r  r  rB  )r"   )Kcollections.abcr   dataclassesr   typingr   r   r   r1   torch.nnr5   activationsr   cache_utilsr	   modeling_layersr
   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   clip.modeling_clipr   janus.modeling_janusr   llama.modeling_llamar   llava.modeling_llavar   r   r   r   r   configuration_internvlr    r!   
get_loggerrE   loggerModulerr   r:  r@   rC   rK   rw   rz   r   r   r   r   r   r   r   r   r  INTERNVL_INPUTS_DOCSTRINGr  r  r  r@  rB  __all__rH   rA   r?   <module>r[     sq  "     ! ! ! ! ! ! , , , , , , , , , ,        ! ! ! ! ! !             9 9 9 9 9 9 K K K K K K K K F F F F F F F F & & & & & & ] ] ] ] ] ] ] ] ] ] ] ] ] ] / / / / / / ( ( ( ( ( ( 7 7 7 7 7 7 / / / / / /              I H H H H H H H 
	H	%	% % %I%<% 
% <	%
 U\*% % % % % %6	 	 	 	 	L 	 	 	3$ 3$ 3$ 3$ 3$2 3$ 3$ 3$l   
    +E    !7 !7 !7 !7 !7BI !7 !7 !7L[7 [7 [7 [7 [7ry [7 [7 [7|	 	 	 	 	 	 	 	 3H
I
I+ + + + +4 + + +\
 
 
 
 
BI 
 
 
& K K K K KO K K K< '
 '
 '
 '
 '
7 '
 '
 '
T	 	 	 	 	2 	 	 	 !     ")   $	 	 	 	 	": 	 	 	S
 S
 S
 S
 S
J S
 S
 S
l	 	 	 	 	%@ 	 	 	%( %( %( %( %('D %( %( %(P  rA   