
    .`ib:                        d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ eej        dZ G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z  G d dej                  Z!dS )    )IterableN)PretrainedConfig)	torch_int)
get_act_fn)MMEncoderAttention)Conv2dLayer)RMSNorm)ColumnParallelLinearRowParallelLinear)QuantizationConfig)default_weight_loader)rms_norm
layer_normc                   B     e Zd Z fdZdej        dej        fdZ xZS )InternS1VisionPatchEmbeddingsc                    t                                                       |j        |j        }}|j        |j        }}|d         |d         z  |d         |d         z  z  }|d         |d         z  |d         |d         z  f}|| _        || _        || _        || _        || _        t          ||||          | _	        d S )N   r   )kernel_sizestride)
super__init__
image_size
patch_sizenum_channelshidden_sizenum_patchespatch_shaper   
projection)	selfconfigr   r   r   r   r   r   	__class__s	           {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/interns1_vit.pyr   z&InternS1VisionPatchEmbeddings.__init__    s    !'!2F4EJ
$*$79Kk!!}
15qMZ]*
 "!}
15z!}
ST7UV$$(&&%+:j
 
 
    pixel_valuesreturnc                 N   |j         \  }}}}|| j        k    rt          d          |                     |                    | j        j        j                            }|j         d         |j         d         }}|                    d                              dd          }|||ffS )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.      r   )	shaper   
ValueErrorr   toweightdtypeflatten	transpose)	r   r$   
batch_sizer   heightwidth
embeddingspatch_heightpatch_widths	            r"   forwardz%InternS1VisionPatchEmbeddings.forward3   s    2>2D/
L&%4,,,?  
 __\__T_5K5Q%R%RSS
$.$4Q$79I!9Lk''**44Q::
L+666r#   )__name__
__module____qualname__r   torchTensorr6   __classcell__r!   s   @r"   r   r      s^        
 
 
 
 
&7EL 7U\ 7 7 7 7 7 7 7 7r#   r   c                        e Zd Zdef fdZdej        dededej        fdZ	 dd	ej        d
ej	        dz  dej        fdZ
 xZS )InternS1VisionEmbeddingsr    c                    t                                                       || _        t          j        t          j        dd|j                            | _        |j	        r3t          j        t          j        dd|j                            | _
        nd | _
        t          |          | _        |j        | _        t          |j        t                     r|j        n|j        |j        f| _        | j        j        }|j        r7t          j        t          j        d|dz   |j                            | _        d S d | _        d S )Nr   )r   r   r    nn	Parameterr:   zerosr   	cls_tokenuse_mask_token
mask_tokenr   patch_embeddingsr   
isinstancer   r   r    use_absolute_position_embeddingsposition_embeddings)r   r    r   r!   s      r"   r   z!InternS1VisionEmbeddings.__init__C   s    ek!Q8J&K&KLL  	# l5;q!V=O+P+PQQDOO"DO =f E E + &+X668F#V%67 	
 +72 	,')|A{Q0BCC( (D$$$ (,D$$$r#   r3   r1   r2   r%   c                    |j         d         dz
  }| j        j         d         dz
  }t          j                                        s||k    r||k    r| j        S | j        ddddf         }| j        ddddf         }|j         d         }|| j        d         z  }	|| j        d         z  }
t          |dz            }|                    d|||          }|                    dddd          }t          j
                            ||	|
fdd	
          }|                    dddd                              dd|          }t          j        ||fd          S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Nr   g      ?r(   r'   bicubicF)sizemodealign_cornersdim)r)   rJ   r:   jit
is_tracingr   r   reshapepermuterA   
functionalinterpolateviewcat)r   r3   r1   r2   r   num_positionsclass_pos_embedpatch_pos_embedrR   
new_height	new_widthsqrt_num_positionss               r"   interpolate_pos_encodingz1InternS1VisionEmbeddings.interpolate_pos_encodingZ   s    !&q)A-06q9A=
 	$$&&	,},,%++2111bqb592111abb59r"tq11
T_Q//	&}c'9::)11!#5s
 
 *11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNy/?;CCCCr#   Nr$   bool_masked_posc                    |j         \  }}}}|                     |          \  }\  }}|                                \  }	}
}|R| j                            |	|
d          }|                    d                              |          }|d|z
  z  ||z  z   }| j                            |	dd          }t          j	        ||fd          }| j
        ||                     |||          z   }|||ffS )NrL   r   rQ   )r)   rG   rN   rF   expand	unsqueezetype_asrD   r:   rZ   rJ   ra   )r   r$   rb   _r1   r2   r3   r4   r5   r0   seq_lenmask_tokensw
cls_tokenss                 r"   r6   z InternS1VisionEmbeddings.forward   s   
 +01fe262G2G2U2U/
/\;!+!2!2
GQ&/00WbIIK))"--55kBBA#q1u-a?J^**:r2>>
Y
J7Q???
#/#d&C&CFE' ' J L+666r#   N)r7   r8   r9   r   r   r:   r;   intra   
BoolTensorr6   r<   r=   s   @r"   r?   r?   B   s        ,/ , , , , , ,./D,/D03/D<?/D	/D /D /D /Dh 487 7l7 )D07 
	7 7 7 7 7 7 7 7r#   r?   c                   Z     e Zd ZdZdddededdf fdZd	ej        dej        fd
Z	 xZ
S )InternSdpaAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr   num_dummy_headsr    rr   r%   Nc                   t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          || j        z   | j        z  | _	        | j        dz  | _
        t          j        | j        | j        | j        z  |j                  | _        t          j        | j        | j        | j        z  |j                  | _        t          j        | j        | j        | j        z  |j                  | _        |j        | _        | j        rLt'          | j	        |j        | j                  | _        t'          | j	        |j        | j                  | _        t          j        | j	        | j                  | _        t1          | j        | j        | j
                  | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      )bias)epsvar_hidden_size)r   r   r    r   	embed_dimnum_attention_heads	num_headshead_dimr*   	dummy_dimscalerA   Linearattention_biasq_projk_projv_projuse_qk_normqk_normalizationr	   layer_norm_epsq_normk_normprojection_layerr   attn)r   r    rr   r!   s      r"   r   zInternSdpaAttention.__init__   s    	+3$.8=4>)T^;;'%)^' 'N' ' '   *DN:dmK]D(
iNDNT]:AV
 
 
 iNDNT]:AV
 
 
 iNDNT]:AV
 
 
 !' 2  
	!) $  DK
 ") $  DK !#	$.$. I I 't~t}djQQ			r#   xc                 >   |                      |          }|                     |          }|                     |          }| j        r*|                     |          }|                     |          }|                     |||          }|                     |          }|S )zx shape: (B, N, C))r   r   r   r   r   r   r   r   )r   r   qkvs        r"   r6   zInternSdpaAttention.forward   s     KKNNKKNNKKNN  	AAAA IIaA!!!$$r#   )r7   r8   r9   __doc__r   rm   r   r:   r;   r6   r<   r=   s   @r"   rp   rp      s        GG  !	2R 2R 2R 2R 	2R
 
2R 2R 2R 2R 2R 2Rh %,        r#   rp   c            	       `     e Zd Z	 	 d
dededz  deddf fdZdej        dej        fd	Z	 xZ
S )InternS1VisionMLPN r    quant_configprefixr%   c                     t                                                       || _        t          |j                  | _        t          |j        |j        d|| d          | _	        t          |j        |j        d|| d          | _        d S )NTz.fc1)rt   r   r   z.fc2)r   r   r    r   
hidden_actactivation_fnr
   r   intermediate_sizefc1r   fc2)r   r    r   r   r!   s       r"   r   zInternS1VisionMLP.__init__   s     	'(9::'$%???
 
 
 %$%???
 
 
r#   hidden_statesc                     |                      |          \  }}|                     |          }|                     |          \  }}|S rl   )r   r   r   )r   r   rg   s      r"   r6   zInternS1VisionMLP.forward  sG    88M22q**=9988M22qr#   )Nr   )r7   r8   r9   r   r   strr   r:   r;   r6   r<   r=   s   @r"   r   r      s         37	
 
 
 )4/
 	

 

 
 
 
 
 
2U\ el        r#   r   c                        e Zd Z	 dddddededz  deded	df
 fd
Zdddededz  dedefdZde	j
        fdZ xZS )InternS1VisionLayerNr   r   rr   r   r    r   rr   r   r%   c                L   t                                                       |                     |||| d          | _        t	          ||| d          | _        t          |j                 |j        |j	                  | _
        t          |j                 |j        |j	                  | _        |j        }t          j        |t          j        |j                  z  d          | _        t          j        |t          j        |j                  z  d          | _        d S )Nz
.attentionr   z.mlp)r   r   ru   T)requires_grad)r   r   
_init_attn	attentionr   mlpNORM2FN	norm_typer   r   layernorm_beforelayernorm_afterlayer_scale_init_valuerA   rB   r:   oneslambda_1lambda_2)r   r    r   rr   r   init_valuesr!   s         r"   r   zInternS1VisionLayer.__init__  s5    	+(((	 ) 
 
 %ooo
 
 
 !((8 9F$9!
 !
 !
  'v'78F$9 
  
  
 3%*V%7888
 
 
 %*V%7888
 
 
r#   )r   c                $    t          ||          S )Nrq   )rp   )r   r    r   rr   r   s        r"   r   zInternS1VisionLayer._init_attn4  s     #6?KKKKr#   r   c                     ||                      |                     |                    | j        z  z   }||                     |                     |                    | j        z  z   }|S rl   )r   r   r   r   r   r   )r   r   s     r"   r6   zInternS1VisionLayer.forward>  sr    
 nnT22=AABBT]RS 	 hht++M::;;dmKL 	
 r#   rl   )r7   r8   r9   r   r   rm   r   r   r   r:   r;   r6   r<   r=   s   @r"   r   r     s        37!

  !!
 !
 !
 !
 )4/!

 !
 !
 
!
 !
 !
 !
 !
 !
R L L L L )4/L
 L L L L L|       r#   r   c                   d     e Zd Z	 ddddddededz  dedz  ded	ef
 fd
Zdej	        fdZ
 xZS )InternS1VisionEncoderNr   r   num_hidden_layers_overriderr   r   r    r   r   rr   r   c                    t                                                       | _        |j        }n|}t	          j        fdt          |          D                       | _        d S )Nc           
      @    g | ]}t           d |           S )z.layer.r   )r   ).0	layer_idxr    rr   r   r   s     r"   
<listcomp>z2InternS1VisionEncoder.__init__.<locals>.<listcomp>c  sT         $ $3$88Y88	    r#   )r   r   r    num_hidden_layersrA   
ModuleListrangelayer)r   r    r   r   rr   r   r   r!   s    `` `` r"   r   zInternS1VisionEncoder.__init__P  s     	%- & 8 :]       "''8!9!9  

 




r#   inputs_embedsc                 4    |}| j         D ]} ||          }|S rl   )r   )r   r   r   encoder_layers       r"   r6   zInternS1VisionEncoder.forwardn  s/    %!Z 	9 	9M)M-88MMr#   rl   )r7   r8   r9   r   r   rm   r   r   r:   r;   r6   r<   r=   s   @r"   r   r   O  s         37

 26 
 
 
 
 )4/

 %($J
 
 
 
 
 
 
 
<U\        r#   r   c                        e Zd Z	 ddddddededz  dedz  ded	ed
df fdZd Z	 	 dde	j
        dz  de	j
        dz  d
e	j        fdZdeeee	j
        f                  d
ee         fdZ xZS )InternS1VisionModelNr   r   r   r    r   r   rr   r   r%   c                2   t                                                       || _        t          |          | _        t          |||| d          | _        |j        rt          j	                    nt          j
        |j        |j                  | _        d S )Nz.encoder)r    r   rr   r   r   )r   r   r    r?   r3   r   encoderuse_mean_poolingrA   Identity	LayerNormr   r   	layernorm)r   r    r   r   rr   r   r!   s         r"   r   zInternS1VisionModel.__init__w  s     	26::,'A+&&&	
 
 
 &MBKMMMf0f6KLLL 	r#   c                     | j         j        S rl   )r3   rG   )r   s    r"   get_input_embeddingsz(InternS1VisionModel.get_input_embeddings  s    //r#   r$   pixel_embedsc                    ||t          d          ||}n=|;|j        dk    r|                     |          \  }}nt          d|j                   |                     |          }|                     |          }|S )Nz0You have to specify pixel_values or pixel_embeds   zwrong pixel_values size: )r   )r*   ndimr3   r)   r   r   )r   r$   r   r   rg   encoder_outputss         r"   r6   zInternS1VisionModel.forward  s    
 L$8OPPP#(MM% A%%#'??<#@#@ qq !Q\=O!Q!QRRR,,],CC..99r#   weightsc                     t          |                                           }t                      }|D ]D\  }}||         }t          |dt                    } |||           |                    |           E|S )Nweight_loader)dictnamed_parameterssetgetattrr   add)r   r   params_dictloaded_paramsnameloaded_weightparamr   s           r"   load_weightsz InternS1VisionModel.load_weights  s    4002233"%%%#* 	$ 	$D-%E#E?<QRRMM%///d####r#   rl   )NN)r7   r8   r9   r   r   rm   r   r   r   r:   r;   FloatTensorr6   r   tupler   r   r<   r=   s   @r"   r   r   v  s3        37

 26 
 
 
 
 )4/

 %($J
 
 
 

 
 
 
 
 
40 0 0
 -1,0 lT) lT) 
		   *HU33D-E$F 3s8        r#   r   )"collections.abcr   r:   torch.nnrA   transformersr   transformers.utilsr   %vllm.model_executor.layers.activationr   9vllm.model_executor.layers.attention.mm_encoder_attentionr   vllm.model_executor.layers.convr   $vllm.model_executor.layers.layernormr	   !vllm.model_executor.layers.linearr
   r   'vllm.model_executor.layers.quantizationr   -vllm.model_executor.model_loader.weight_utilsr   r   r   Moduler   r?   rp   r   r   r   r    r#   r"   <module>r      sF   % $ $ $ $ $        ) ) ) ) ) ) ( ( ( ( ( ( < < < < < < X X X X X X 7 7 7 7 7 7 8 8 8 8 8 8 U U U U U U U U F F F F F F O O O O O O ,  7  7  7  7  7BI  7  7  7F`7 `7 `7 `7 `7ry `7 `7 `7FF F F F F") F F FR    	   D< < < < <") < < <~$ $ $ $ $BI $ $ $N; ; ; ; ;") ; ; ; ; ;r#   