
    .`i{                        U d dl mZmZmZ d dlmZmZ d dlmZm	Z	 d dl
Z
d dl
mZ d dlmZmZmZmZmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z"m#Z#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2m3Z3m4Z4m5Z5 d dl6m7Z7m8Z8m9Z9 d dl:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@ d dlAmBZB d dlCmDZDmEZE ddlFmGZGmHZHmIZI ddlJmKZK ddlLmMZMmNZN ddlOmPZPmQZQmRZRmSZSmTZTmUZU  G d deD          ZVd d d!d"ZWeXeYeRf         eZd#<   d$eYd%eRfd&Z[ G d' d(e=          Z\ G d) d*e;e\                   Z] G d+ d,e<e\                   Z^ G d- d.ePe                   Z_ G d/ d0ej`                  Za G d1 d2ej`                  Zb G d3 d4ej`                  Zc G d5 d6ej`                  Zd G d7 d8ej`                  Ze G d9 d:ej`                  Zf G d; d<ej`                  Zg G d= d>ej`                  Zh G d? d@ej`                  ZidAeYdBe
jj        dCe
jj        dDeXeYe
jj        f         dEe(d%e
jj        fdFZk G dG dHej`                  Zl eKdIJ           e/jm        e^e\e]K           G dL dMej`        eHeI                                  ZndS )N    )CallableIterableMapping)cached_propertypartial)	AnnotatedLiteralN)nn)BatchFeatureSiglipConfigSiglipProcessorSiglipTextConfigSiglipVisionConfig)
VllmConfig)BaseDummyOptions)divide$get_tensor_model_parallel_world_size)
get_act_fn)EncoderOnlyAttention)MMEncoderAttention)Conv2dLayer)ColumnParallelLinearQKVParallelLinearRowParallelLinear)DispatchPooler)QuantizationConfig)VocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalInputsMultiModalKwargsItemsMultiModalUUIDDict)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptIndexTargetsPromptReplacementPromptUpdate)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModalSupportsQuant)default_pooling_type)AutoWeightsLoadermaybe_prefix)VisionEncoderInfoVisionFeatureSelectStrategyVisionFeatureSelectStrategyStrget_num_selected_vision_tokensis_vit_use_data_parallelresolve_visual_encoder_outputsc                   f    e Zd ZU dZed         ed<   eej         e	dddd          f         ed<   d	S )
SiglipImagePixelInputsz
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height of each image
        - w: Width of each image
    pixel_valuestypebn   hwdataN)
__name__
__module____qualname____doc__r	   __annotations__r   torchTensorr1        u/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/siglip.pyr@   r@   H   sW           .
!!!!
EL++dAsC"@"@@
AAAAAArP   r@   fullclass)MEANALLCLS_POOLING_TYPE_TO_STRATEGYpooling_typereturnc                 `    	 t           |          S # t          $ r t          d|           d w xY w)Nz;No feature selection strategy is defined for pooling_type: )rW   KeyError
ValueError)rX   s    rQ   #_get_vision_feature_select_strategyr]   \   sV    (66   .). .
 
 	s    -c                   t    e Zd Zd Zd ZdefdZdeee	dz  f         fdZ
de	d	e	de	fd
ZdefdZde	fdZdS )SiglipProcessingInfoc                 @    | j                             t                    S N)ctxget_hf_configr   selfs    rQ   rc   z"SiglipProcessingInfo.get_hf_configi   s    x%%l333rP   c                 D    t          |                                           S ra   )SiglipEncoderInforc   rd   s    rQ   get_vision_encoder_infoz,SiglipProcessingInfo.get_vision_encoder_infol   s     !3!3!5!5666rP   kwargsc                 2     | j         j        t          fi |S ra   )rb   get_hf_processorr   )re   ri   s     rQ   rk   z%SiglipProcessingInfo.get_hf_processoro   s    (tx(CCFCCCrP   rY   Nc                 
    ddiS )Nimager2   rO   rd   s    rQ   get_supported_mm_limitsz,SiglipProcessingInfo.get_supported_mm_limitsr   s    |rP   image_widthimage_heightc                    |                                  }| j        j        j        }|J t	          |                    ||          t          |j                            S Nro   rp   )rh   rb   model_configpooler_configr<   get_num_image_tokensr]   seq_pooling_type)re   ro   rp   vision_encoder_inforu   s        rQ   rv   z)SiglipProcessingInfo.get_num_image_tokensu   so     #::<<-;(((-44') 5   00NOO
 
 	
rP   c                 x    |                                  }|                                x}}t          ||          S )N)widthheight)rh   get_image_sizer'   )re   rx   rz   r{   s       rQ   !get_image_size_with_most_featuresz6SiglipProcessingInfo.get_image_size_with_most_features   s=    "::<<,;;===uV4444rP   c                 ^    |                                  \  }}|                     ||          S rr   )r}   rv   )re   target_widthtarget_heights      rQ   get_max_image_tokensz)SiglipProcessingInfo.get_max_image_tokens   s:    &*&L&L&N&N#m(($= ) 
 
 	
rP   )rH   rI   rJ   rc   rh   objectrk   r   strintrn   rv   r'   r}   r   rO   rP   rQ   r_   r_   h   s        4 4 47 7 7D D D D DcDj)A    
 
 	

 

 
 
 
&59 5 5 5 5

c 
 
 
 
 
 
rP   r_   c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	SiglipDummyInputsBuilder	mm_countsrY   c                     dS N rO   )re   r   s     rQ   get_dummy_textz'SiglipDummyInputsBuilder.get_dummy_text   s    rrP   Nseq_len
mm_optionsc                     |                     dd          }| j                                        \  }}|r|                     d          nd }d|                     ||||          iS )Nrm   r   )rz   r{   
num_images	overrides)getinfor}   _get_dummy_images)re   r   r   r   r   r   r   image_overridess           rQ   get_dummy_mm_dataz*SiglipDummyInputsBuilder.get_dummy_mm_data   s|     ]]7A..
&*i&Q&Q&S&S#m5?I*..111T T++"$%)	 ,  
 	
rP   ra   )
rH   rI   rJ   r   r   r   r   r   r!   r   rO   rP   rQ   r   r      s        S(9 c     =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
rP   r   c                   @    e Zd Zedefd            Z	 ddddeee         z  dede	ee
f         de	ee
f         dz  d	edz  def fd
Zdedede	ee
f         de	ee
f         def
dZdede	ee
f         de	eef         fdZdede	ee
f         dedee         fdZ xZS )SiglipMultiModalProcessorrY   c                     | j                                         t          fdt          j                  D                       }|S )Nc              3   .   K   | ]}|j         v|V  d S ra   )all_special_ids).0token_id	tokenizers     rQ   	<genexpr>z;SiglipMultiModalProcessor.image_token_id.<locals>.<genexpr>   s=       
 
y888 8888
 
rP   )r   get_tokenizernextrange
vocab_size)re   dummy_token_idr   s     @rQ   image_token_idz(SiglipMultiModalProcessor.image_token_id   s^    I++--	 
 
 
 
!)"677
 
 
 
 
 rP   N)mm_uuidspromptmm_datahf_processor_mm_kwargstokenization_kwargsr   c                    |r|rt          d          |r
i |pi ddi}t                                          |||||          S )Nz|Siglip accepts text-only or image-only inputs, not both! Image-only inputs means passing an image with an empty text prompt.add_special_tokensF)r   r   r   r   r   )r\   superapply)re   r   r   r   r   r   	__class__s         rQ   r   zSiglipMultiModalProcessor.apply   s      	g 	    	#&,"#$e# #
 ww}}#9 3  
 
 	
rP   prompt_textmm_itemsc                     dS )NFrO   )re   r   r   r   r   s        rQ   _hf_processor_applies_updatesz7SiglipMultiModalProcessor._hf_processor_applies_updates   s	     urP   	hf_inputsc                 F    t          t          j        d                    S )Nrm   )rA   )dictr"   batched)re   r   r   s      rQ   _get_mm_fields_configz/SiglipMultiModalProcessor._get_mm_fields_config   s!    
 !6!>w!G!GHHHHrP   out_mm_kwargsc                 |      j         dt          f fd}t          dt          j                    |          gS )Nitem_idxc                                          dt                    }|                    |           }j                            |j        |j                  }g|z  S )Nrm   rs   )	get_itemsr&   r|   r   rv   rz   r{   )r   images
image_sizenum_image_tokensr   r   re   s       rQ   get_replacementzFSiglipMultiModalProcessor._get_prompt_updates.<locals>.get_replacement   se    ''1DEEF..x88J#y==&,:;L  >     ##&666rP   rm   )modalitytargetreplacement)r   r   r-   r,   start)re   r   r   r   r   r   s   ``   @rQ   _get_prompt_updatesz-SiglipMultiModalProcessor._get_prompt_updates   sn     ,	7c 	7 	7 	7 	7 	7 	7 	7 	7  )/11+  
 	
rP   ra   )rH   rI   rJ   r   r   r   r   listr!   r   r   r%   r#   r   r(   boolr   r   r"   r   r$   r.   r   __classcell__r   s   @rQ   r   r      s           _ <@
 /3
 
 
d3i
 $
 !(V 4	

 %S&[1D8
 %t+
 

 
 
 
 
 
@ & !(V 4	
 %S&[1 
   II !(V 4I 
++	,	I I I I
%
 !(V 4
 -	

 
l	
 
 
 
 
 
 
 
rP   r   c                   F    e Zd ZdededefdZdefdZdefdZdefdZdS )	rg   ro   rp   rY   c                0    |                                  dz  S )N   )get_patch_grid_length)re   ro   rp   s      rQ   rv   z&SiglipEncoderInfo.get_num_image_tokens  s     ))++q00rP   c                     | j         j        S ra   )vision_configr   rd   s    rQ   r|   z SiglipEncoderInfo.get_image_size      !,,rP   c                     | j         j        S ra   )r   
patch_sizerd   s    rQ   get_patch_sizez SiglipEncoderInfo.get_patch_size  r   rP   c                 \    |                                  |                                 }}||z  S ra   )r|   r   )re   r   r   s      rQ   r   z'SiglipEncoderInfo.get_patch_grid_length  s.    !%!4!4!6!68K8K8M8MJ
Z''rP   N)rH   rI   rJ   r   rv   r|   r   r   rO   rP   rQ   rg   rg     s        1 1 	1
 
1 1 1 1- - - - -- - - - -(s ( ( ( ( ( (rP   rg   c                   |     e Zd Zdef fdZdej        dededej        fdZ	 dd	ej        d
e	dej        fdZ
 xZS )SiglipVisionEmbeddingsconfigc                 "   t                                                       || _        |j        | _        |j        | _        |j        | _        t          |j        | j        | j        | j        d          | _	        | j        | j        z  dz  | _
        | j
        | _        t          j        | j        | j                  | _        |                     dt!          j        | j        t           j                                      d          d           d S )	Nvalid)in_channelsout_channelskernel_sizestridepaddingr   position_idsdtyper2   F
persistent)r   __init__r   hidden_size	embed_dimr   r   r   num_channelspatch_embeddingnum_patchesnum_positionsr
   	Embeddingposition_embeddingregister_bufferrM   arangeint64expandre   r   r   s     rQ   r   zSiglipVisionEmbeddings.__init__  s    + + +*+? 
  
  
 !Ot>1D!-"$,t/A4>"R"RL+5;???FFwOO 	 	
 	
 	
 	
 	
rP   
embeddingsr{   rz   rY   c                 B   |j         d         }| j        j        j         d         }||k    r ||k    r|                     | j                  S | j        j                            d          }|j         d         }|| j        z  }|| j        z  }	t          |dz            }
|                    d|
|
|          }|                    dddd          }t          j
                            |||	fdd	          }|                    dddd                              dd|          }|S )
Nr2   r   r   g      ?rD   r   bicubicF)sizemodealign_corners)shaper   weightr   	unsqueezer   r   reshapepermuter
   
functionalinterpolateview)re   r   r{   rz   r   r   patch_pos_embeddim
new_height	new_widthsqrt_num_positionss              rQ   interpolate_pos_encodingz/SiglipVisionEmbeddings.interpolate_pos_encoding3  s:    !&q)/6<Q?-''FeOO**4+<===18BB1EEr"t.
T_,	 !344)11!#5s
 
 *11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNrP   FrA   r  c                 X   |j         \  }}}}| j        j        j        }|                     |                    |                    }|                    d                              dd          }|r||                     |||          z  }n||                     | j	                  z  }|S )Nr   r   r2   )
r   r   r   r   toflatten	transposer  r   r   )	re   rA   r  _r{   rz   target_dtypepatch_embedsr   s	            rQ   forwardzSiglipVisionEmbeddings.forwardR  s     +01fe+28++OO,O//
 
 "))!,,66q!<<
# 	E$77
FERRRJJ$11$2CDDDJrP   )F)rH   rI   rJ   r   r   rM   rN   r   r  r   r  r   r   s   @rQ   r   r     s        
1 
 
 
 
 
 
0,03<?	   @ LQ !LDH	       rP   r   c                        e Zd Z	 ddddeez  dedz  dedee         ee	         z  ddf
 fd	Z
d
ej        deej        df         fdZ xZS )SiglipAttentionNr   prefixr   quant_configr  attn_clsrY   c                   t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        t                      }t          | j        | j        | j        || d|          | _        t          | j        | j        || d|          | _        |rd	nt                      | _        t#          | j        | j                  | _        |t&          k    r( || j        | j        | j	        | d
          | _        d S  || j        | j        | j	        | d
          | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      z	.qkv_proj)r   	head_sizetotal_num_headsr  r  
disable_tpz	.out_proj)
input_sizeoutput_sizer  r  r  r2   z.attnr  )r   r   r   r   r   num_attention_heads	num_headshead_dimr\   scaler=   r   qkv_projr   out_projr   tp_sizer   num_heads_per_partitionr   attn)re   r   r  r  r  use_data_parallelr   s         rQ   r   zSiglipAttention.__init__d  s    	+3$.8=4>)T^;;3%)^3 3 $3 3 3   ]D(
466)m N%'''(
 
 
 *~%'''(
 
 
 #NAA(L(N(N 	 (.dndl'K'K$))) ,
 '''	  DIII !,
 '''	  DIIIrP   hidden_statesc                     |                      |          \  }}|                    dd          \  }}}|                     |||          }|                     |          \  }}|dfS )z#Input shape: Batch x Time x ChannelrD   r   r  N)r   chunkr$  r!  )	re   r&  
qkv_statesr  query_states
key_statesvalue_statesoutattn_outputs	            rQ   r  zSiglipAttention.forward  sm    
 m44
A1;1A1A!1A1L1L.j,iij,??s++QD  rP   ra   rH   rI   rJ   r   r   r   r   rB   r   r   r   rM   rN   tupler  r   r   s   @rQ   r  r  c  s         37;
 ; ; ;"%55; )4/;
 ; +,t4F/GG; 
; ; ; ; ; ;z
!|
! 
u|T!	"
! 
! 
! 
! 
! 
! 
! 
!rP   r  c            	       f     e Zd Z	 	 d
deez  dedz  deddf fdZdej	        dej	        fd	Z
 xZS )	SiglipMLPNr   r   r  r  rY   c                    t                                                       || _        t                      }t	          |j                  | _        |r|                                dv rd}n|j        dz  dk    o|j	        dz  dk    }t          |j        |j	        |r|nd | d|          | _        t          |j	        |j        |r|nd | d|          | _        d S )N)bitsandbytestorchaoT@   r   z.fc1)r  r  r  z.fc2)r   r   r   r=   r   
hidden_actactivation_fnget_namer   intermediate_sizer   fc1r   fc2)re   r   r  r  r%  quantizabler   s         rQ   r   zSiglipMLP.__init__  s    	466'(9::  	L11337RRRKK
 "R'1,S1IB1NRS1S  ($)4>$???(
 
 
 %$)4>$???(
 
 
rP   r&  c                     |                      |          \  }}|                     |          }|                     |          \  }}|S ra   )r<  r9  r=  )re   r&  r  s      rQ   r  zSiglipMLP.forward  sG    88M22q**=9988M22qrP   r   )rH   rI   rJ   r   r   r   r   r   rM   rN   r  r   r   s   @rQ   r3  r3    s         37	#
 #
"%55#
 )4/#
 	#

 
#
 #
 #
 #
 #
 #
JU\ el        rP   r3  c                        e Zd Z	 ddddeez  dedz  dedee         ee	         z  ddf
 fd	Z
d
ej        deej        df         fdZ xZS )SiglipEncoderLayerNr   r  r   r  r  r  rY   c                ^   t                                                       |j        | _        t	          ||| d|          | _        t          j        | j        |j                  | _	        t          ||| d          | _        t          j        | j        |j                  | _        d S )Nz
.self_attnr  r  r  eps.mlpr  r  )r   r   r   r   r  	self_attnr
   	LayerNormlayer_norm_epslayer_norm1r3  mlplayer_norm2)re   r   r  r  r  r   s        rQ   r   zSiglipEncoderLayer.__init__  s     	+(%(((	
 
 
 <F<QRRR%???
 
 

 <F<QRRRrP   r&  c                     |}|                      |          }|                     |          \  }}||z  }|}|                     |          }|                     |          }||z  }|d fS )N)r&  )rK  rH  rM  rL  )re   r&  residualr  s       rQ   r  zSiglipEncoderLayer.forward  s|     !((77>>>FFq! ((77//!d""rP   ra   r0  r   s   @rQ   rA  rA    s         37S
 S S S"%55S )4/S
 S +,t4F/GGS 
S S S S S S4#|# 
u|T!	"# # # # # # # #rP   rA  c                        e Zd Z	 	 ddddeez  dedz  dedz  dedee	         ee
         z  d	df fd
Zdej        ded	ej        eej                 z  fdZ xZS )SiglipEncoderNr   r  r   r  num_hidden_layers_overrider  r  rY   c                    t                                                       | _        |j        }n|}t	          j        fdt          |          D                       | _        d S )Nc           	      @    g | ]}t           d |           S )z.layers.rC  )rA  )r   	layer_idxr  r   r  r  s     rQ   
<listcomp>z*SiglipEncoder.__init__.<locals>.<listcomp>  sT         #!-$99i99%	    rP   )r   r   r   num_hidden_layersr
   
ModuleListr   layers)re   r   r  rR  r  r  rW  r   s    `` `` rQ   r   zSiglipEncoder.__init__	  s     	%- & 8 :m       "''8!9!9  

 

rP   inputs_embedsreturn_all_hidden_statesc                 v    |g}|}| j         D ]'} ||          \  }}|r|                    |           (|r|S |S ra   )rY  append)re   rZ  r[  hidden_states_poolr&  encoder_layerr  s          rQ   r  zSiglipEncoder.forward'  si    
 ,_%![ 	9 	9M,}];;M1' 9"))-888 $ 	&%%rP   NN)rH   rI   rJ   r   r   r   r   r   rB   r   r   r   rM   rN   r   r   r  r   r   s   @rQ   rQ  rQ    s         3715	
 
 
 
"%55
 )4/
 %($J	
 
 +,t4F/GG
 

 
 
 
 
 
<| #' 
U\*	*	       rP   rQ  c            
            e Zd Z	 ddddededz  deddf fdZd	ej        dej        fd
Z		 dd	ej        dz  dej        dej        dz  dej        fdZ
deeeej        f                  dee         fdZ xZS )SiglipTextTransformerNr   r  r   r  r  rY   c                J   t                                                       || _        |j        }t	          |          | _        t          ||| dt                    | _        t          j
        ||j                  | _        t          j        ||j                  | _        d S )N.encoder)r   r  r  r  rD  )r   r   r   r   SiglipTextEmbeddingsr   rQ  r   encoderr
   rI  rJ  final_layer_normLinearprojection_sizehead)re   r   r  r  r   r   s        rQ   r   zSiglipTextTransformer.__init__;  s     	&	.v66$%&&&)	
 
 
 !#YF<Q R R RIi)?@@			rP   	input_idsc                 6    | j                             |          S ra   )r   token_embedding)re   rk  s     rQ   embed_input_idsz%SiglipTextTransformer.embed_input_idsS  s    ..y999rP   r   rZ  c                     |                      |||          }|                     |d          }|                     |          }|S )NFrZ  r[  )r   rf  rg  )re   rk  r   rZ  r&  last_hidden_states         rQ   r  zSiglipTextTransformer.forwardV  sT     	<OO LL'% ) 
 
 !112CDD  rP   weightsc                 |   g d}t          |                                           }t                      }|D ]\  }}|D ]>\  }}}	||vr|                    ||          }||         }
|
j        } ||
||	            n*||         }
t          |
dt                    } ||
|           |                    |           |S )N)r   q_projq)r   k_projk)r   v_projvweight_loader)r   named_parameterssetreplacer{  getattrr   add)re   rr  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr{  s               rQ   load_weightsz"SiglipTextTransformer.load_weightsf  s    "
 "
 "
 4002233"%%%#* 	$ 	$D-5K 4 41
Kd**||K<<#D) % 3e]H===#D) '@U V Ve]333d####rP   ra   )rH   rI   rJ   r   r   r   r   rM   rN   rn  r  r   r1  r}  r  r   r   s   @rQ   rb  rb  :  sA        37A
 A A A A )4/A
 A 
A A A A A A0: :%, : : : : .2	! !<$&! l! |d*	!
 
! ! ! ! HU33D-E$F 3s8        rP   rb  c            	       d     e Zd ZdZ	 	 ddededz  deddf fdZd	ej	        dej	        fd
Z
 xZS )#SiglipMultiheadAttentionPoolingHeadzMultihead Attention Pooling.Nr   r   r  r  rY   c                    t                                                       t          j        t	          j        dd|j                            | _        t          j                            |j        |j	        d          | _
        t          j        |j        |j                  | _        t          ||| d          | _        d S )Nr2   T)batch_firstrD  rF  r   r  r  )r   r   r
   	ParameterrM   randnr   probeMultiheadAttentionr  	attentionrI  rJ  	layernormr3  rL  )re   r   r  r  r   s       rQ   r   z,SiglipMultiheadAttentionPoolingHead.__init__  s     	\%+aF4F"G"GHH
44 : 5 
 
 f&8f>STTT%???
 
 
rP   hidden_statec                    |                     d          }| j                            |dd          }|                     |||          d         }|}|                     |          }|                     |          }||z  }|S )Nr   r   )r   r  r   r  r  rL  )re   r  
batch_sizer  rO  s        rQ   r  z+SiglipMultiheadAttentionPoolingHead.forward  s    !&&q))

!!*b"55~~e\<HHK~~l33xx--  rP   r   )rH   rI   rJ   rK   r   r   r   r   rM   rN   r  r   r   s   @rQ   r  r    s        &&
 37	
 
"
 )4/
 	

 

 
 
 
 
 
(EL U\        rP   r  c                   H    e Zd Z	 dddddddededz  dedz  dedz  d	ed
edz  ddf fdZe	d             Z
e	d             Zdddddej        dedee         dz  dedz  dej        f
dZdej        dej        fdZdeeeej        f                  dee         fdZ xZS )SiglipVisionTransformerNr   FrR  require_post_normr  use_headr   r  rR  r  r  r  rY   c                   t                                                       || _        |j        }t	          |          | _        t          |||| dt                    | _        |j	        }t          | j        j                  |j	        k    r-t          d| dt          | j        j                   d          |t          | j        j                  |k    }|r!t          j        ||j                  | _        nd | _        t#          |t$                    r|| _        nt)          |d          sdn|j        | _        | j        rt-          ||| d	
          nd | _        t1          | j                  | _        d S )Nrd  )r  rR  r  r  zThe original encoder only has z layers, but you requested z layers.rD  vision_use_headTz.headr  )r   r   r   r   r   r   rQ  r   rf  rW  lenrY  r\   r
   rI  rJ  post_layernorm
isinstancer   r  hasattrr  r  rj  r   maybe_layer_norm_and_apply_headlast_hs_proc)
re   r   r  rR  r  r  r  r   rW  r   s
            rQ   r   z SiglipVisionTransformer.__init__  s    	&	088$%'A&&&'
 
 
 #4t|"##f&>>>P1B P P-01D-E-EP P P   $ #DL$7 8 8<M M 	'"$,yf>S"T"T"TD"&D
 h%% 	$DMM v'899,+ M }/) '''     		 $D$HIIrP   c                 N    t          |                                           j        S ra   )r   
parametersr   rd   s    rQ   r   zSiglipVisionTransformer.dtype  s    DOO%%&&,,rP   c                 N    t          |                                           j        S ra   )r   r  devicerd   s    rQ   r  zSiglipVisionTransformer.device  s    DOO%%&&--rP   )r  select_layersfeature_select_strategyrA   r  r  r  c                    |                      ||          }|                     ||d u          }t          |d || j        j        | j        |          }|S )N)r  rp  )r  max_possible_layersr  r  )r   rf  r>   r   rW  r  )re   rA   r  r  r  r&  encoder_outputss          rQ   r  zSiglipVisionTransformer.forward  s     %= ( 
 
 ,,'%2$%> ' 
 
 9' $ =*$;
 
 
 rP   r  c                 v    | j         |                      |          }| j        |                     |          }|S )zApply the post layer norm and head if they are enabled,
        given the last hidden states tensor.

        args:
            encoder_outputs: The last hidden states from the visual encoder.
        )r  rj  )re   r  s     rQ   r  z7SiglipVisionTransformer.maybe_layer_norm_and_apply_head  s?     *"11/BBO9 "ii88OrP   rr  c                    g d}t          |                                           }t                      }t          | j        j                  }|D ]\  }}|                    d          r| j        #| j        |                    d          r@|                    d          r/t          |
                    d          d                   }||k    r|D ]>\  }	}
}|
|vr|                    |
|	          }||         }|j        } ||||            n*||         }t          |dt                    } |||           |                    |           |S )Nrt  r  rj  zencoder.layers.r   r{  )r   r|  r}  r  rf  rY  
startswithr  rj  r   splitr~  r{  r  r   r  )re   rr  r  r  r  layer_countr  r  rU  r  r  r  r  r{  s                 rQ   r  z$SiglipVisionTransformer.load_weights%  s   "
 "
 "
 4002233"%%%$,-..#* 	$ 	$D-/00 T5H5P y T__V%<%<  /00 

3 233	++5K 4 41
Kd**||K<<#D) % 3e]H===#D) '@U V Ve]333d####rP   ra   )rH   rI   rJ   r   r   r   r   r   r   propertyr   r  rM   rN   r   r:   r  r  r   r1  r}  r  r   r   s   @rQ   r  r    s        37?J
 26)- %?J ?J ?J"?J )4/?J
 %($J?J  $;?J ?J +?J 
?J ?J ?J ?J ?J ?JB - - X- . . X. */*.FJ! ! !l! #'	!
 Cy4'! "=t!C! 
! ! ! !F$|	   (HU33D-E$F (3s8 ( ( ( ( ( ( ( (rP   r  c                   8    e Zd Z	 dddddddededz  dedz  dedz  d	ed
edz  ddf fdZde	j
        fdZed             Zed             Z	 	 	 ddej        dedee         dz  dedz  dej        f
dZdeeeej        f                  dee         fdZ xZS )SiglipVisionModelNr   Fr  r   r  rR  r  r  r  rY   c                    t                                                       || _        t          ||||| d|          | _        d S )Nz.vision_model)r  rR  r  r  r  )r   r   r  r  vision_model)re   r   r  rR  r  r  r  r   s          rQ   r   zSiglipVisionModel.__init__Q  s[     	(3%'A/+++
 
 
rP   c                 $    | j         j        j        S ra   )r  r   r   rd   s    rQ   get_input_embeddingsz&SiglipVisionModel.get_input_embeddingsg  s     +;;rP   c                     | j         j        S ra   )r  r   rd   s    rQ   r   zSiglipVisionModel.dtypej  s     &&rP   c                     | j         j        S ra   )r  r  rd   s    rQ   r  zSiglipVisionModel.devicen  s     ''rP   rA   r  r  r  c                 4    |                      ||||          S )N)rA   r  r  r  )r  )re   rA   r  r  r  s        rQ   r  zSiglipVisionModel.forwardr  s-       %%='$;	 ! 
 
 	
rP   rr  c                    g d}t          |                                           }t                      }t          | j        j        j                  }|D ]\  }}|                    d          r| j        j        (| j        j	        |                    d          rJ|                    d          r/t          |                    d          d                   }||k    r|                    d          rVt          ||          }	|	D|	|v r@||	         }
t          |
dt                    } ||
|           |                    |	           |D ]>\  }}}||vr|                    ||          }||         }
|
j        } ||
||            nB||         }
t'          ||
||| j                  }
t          |
dt                    } ||
|           |                    |           |S )	Nrt  zvision_model.post_layernormzvision_model.headzvision_model.encoder.layersr  rD   )z.k_scalez.v_scalez.q_scalez.prob_scaler{  )r   r|  r}  r  r  rf  rY  r  r  rj  r   r  endswithr   r  r   r  r~  r{  maybe_swap_ffn_paramr  )re   rr  r  r  r  r  r  r  rU  remapped_namer  r{  r  r  r  s                  rQ   r  zSiglipVisionModel.load_weights  s=   "
 "
 "
 4002233"%%%$+3:;;#* 2	$ 2	$D-  =>>%4<  %-$//BU2V2V- <== 

3 233	++ }}PQQ  9$ L L ,+1M1M'6E$+0E% %M "M%777!%%m444 6L 4 41
Kd**||K<<#D) % 3e]H===#D),%T=N  !(@U V Ve]333d####rP   ra   )FNN)rH   rI   rJ   r   r   r   r   r   r   r
   Moduler  r  r   r  rM   rN   r   r:   r  r   r1  r}  r  r   r   s   @rQ   r  r  P  s        37

 26)- %
 
 
"
 )4/

 %($J
  $;
 
 +
 

 
 
 
 
 
,<bi < < < < ' ' X' ( ( X( */*.FJ
 
l
 #'
 Cy4'	

 "=t!C
 

 
 
 
>HU33D-E$F >3s8 > > > > > > > >rP   r  r  r  r  r  r  c                    |r|                                 dk    rd| vr|S t                      }t          |dd          }|                    |          |z  }|                    |          }d| v r%||k    r|                     dd          }	||	         }n(d| v r$||k    r|                     dd          }	||	         }|S )Nggufz.fc
output_dimr   z.fc1.z.fc2.)r:  r   r  r   r~  )
r  r  r  r  r  r"  r  r  weight_out_sizenew_names
             rQ   r  r    s      \2244>>5PTCTCT244Ga00J**Z((72K#((44O$;/99<<11H%	D[O;;<<11H%LrP   c            	       t     e Zd Zdef fdZ	 d	dej        dz  dej        dej        dz  dej        fdZ xZS )
re  r   c                 V   t                                                       || _        t          |j        |j                  | _        t          |j        |j                  | _        | 	                    dt          j        |j                                      d          d           d S )Nr   r   Fr   )r   r   r   r   r   r   rm  max_position_embeddingsr   r   rM   r   r   r   s     rQ   r   zSiglipTextEmbeddings.__init__  s    5v1 
  
 #9*F,>#
 #
 	L788??HH 	 	
 	
 	
 	
 	
rP   Nrk  r   rZ  rY   c                 h    ||                      |          }|                     |          }||z   }|S ra   )rm  r   )re   rk  r   rZ  position_embeddingsr   s         rQ   r  zSiglipTextEmbeddings.forward  sB       00;;M"55lCC"%88
rP   ra   )	rH   rI   rJ   r   r   rM   rN   r  r   r   s   @rQ   re  re    s        
/ 
 
 
 
 
 
, .2	 <$& l |d*	
 
       rP   re  rV   )rw   )r   dummy_inputsc                       e Zd ZdZdg diZededededz  fd            Zd	d
de	def fdZ
	 d)dej        dz  dej        dej        dz  dej        fdZdej        dej        dej        fdZ	 d)dej        dedz  dej        fdZdededz  fdZdedej        fdZdej        deej        gej        f         dej        dz  dedej        f
 fdZ	 d)ddd dej        d!edz  dej        dz  dedej        f
 fd"Zdedefd#Z	 	 d*dej        dz  d$ej        d%edz  dej        dz  dedej        fd&Zd'eeeej        f                  fd(Z xZ S )+SiglipEmbeddingModelTr   )ru  rw  ry  r   irY   Nc                 N    |                     d          rd S t          d          )Nrm   z Only image modality is supported)r  r\   )clsr   r  s      rQ   get_placeholder_strz(SiglipEmbeddingModel.get_placeholder_str  s,    w'' 	4;<<<rP   r   r  vllm_configr  c          	         t                                                       |j        j        }|j        }|| _        t          |d          rd|_        |j        }|j	        }|j
        | _        |j
        | _        |j        | _        |                     |          5  t!          ||t#          |d                    | _        d d d            n# 1 swxY w Y   |                     |d          5  t)          ||t#          |d          d           | _        d d d            n# 1 swxY w Y   |j        j        }|J || _        t/          j        |          | _        d| _        d S )	N
num_labelsr   
text_modelrG  rm   r  )r  r  r  T)r   r   rt   	hf_configr  r   r  r  text_configr   r   text_embed_dimvision_embed_dimri  text_projection_size_mark_language_modelrb  r8   r  _mark_tower_modelr  r  ru   r   for_embeddingpooler_is_text_input)	re   r  r  r   r  r  r   ru   r   s	           rQ   r   zSiglipEmbeddingModel.__init__  s   *7A"/6<(( 	" !F(,)5 - 9$/$?!&&{33 	 	3)#FL99  DO	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ##K99 	 	 7)#FN;;	! ! !D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 $0>(((*$2=AA"s$   &CCC-'D  D$'D$rk  r   rZ  c                     |                      |||          }| j                             |          }|                     ||          }|S )N)rk  r   rZ  )r  rj  _flip_sequences_by_position_ids)re   rk  r   rZ  rq  text_featuress         rQ   get_text_featuresz&SiglipEmbeddingModel.get_text_features6  sb     !OO%' , 
 

 ,,->?? <<<
 
 rP   featuresc           	         t          |          dk    r|S |dd         |dd         z
  }|dk    }t          j        t          j        dg|j                  t          j        |          d         dz   t          j        t          |          g|j                  g          }|dd         |dd         z
  }|dd         }|dd         }t          j        t          |          |j                                      |          }	t          j        t          |          |j                  }
||	         ||	         z   dz
  |
z
  }||         S )zFlip sequences so EOS token moves to first position for CLS pooling.

        SigLIP position_ids are reversed within each sequence. This method detects
        sequence boundaries and flips each sequence individually.
        r2   Nr   r   )r  )r  rM   cattensorr  wherer   repeat_interleave)re   r  r   position_diffsboundary_maskboundary_indiceslengthsstartsendssequence_idscurrent_positionsflip_indicess               rQ   r  z4SiglipEmbeddingModel._flip_sequences_by_position_idsK  s\    x==AO &abb)L",==&!+ 9aS999M**1-1c(mm_X_EEE
 
 #122&)9#2#)>>!#2#&# |LL
 
 


G
$
$ 	
 "LXxOOOl+d<.@@1DGXX%%rP   rA   r  c                 l    |t          | j        j                  }|                     |d |          }|S )N)rA   r  r  )r]   ru   rw   r  )re   rA   r  pooled_outputs       rQ   get_image_featuresz'SiglipEmbeddingModel.get_image_featurest  sP    
 #*&I"3' '# ))%$; * 
 
 rP   ri   c                     |                     dd           }|d S | j        j        j        x}}t	          d|||d          S )NrA   )rE   rF   )rB   rG   resolve_bindings)popr   r   r   r@   )re   ri   rA   
expected_h
expected_ws        rQ   _parse_and_validate_image_inputz4SiglipEmbeddingModel._parse_and_validate_image_input  s\     zz.$774"&+";"FF
Z%#-J??
 
 
 	
rP   inputsc                 <    |d         }|                      |          S )NrG   )r  )re   r  rA   s      rQ   _process_image_inputsz*SiglipEmbeddingModel._process_image_inputs  s    f~&&|444rP   rn  is_multimodalhandle_oov_mm_tokenc          	      >   t                                          ||||          }| j        }|j        d         |k     rFt	          j        ||                    |j        d         ||j        d         z
            gd          }n|j        d         |k    rt          |S )Nr  r  r2   r   r(  )r   _embed_text_input_idsr  r   rM   r  	new_emptyNotImplementedError)re   rk  rn  r  r  rZ  inputs_embeds_sizer   s          rQ   r  z*SiglipEmbeddingModel._embed_text_input_ids  s     55' 3	 6 
 
 "6q!$666!I!!++%+A.*]-@-CC  	 	 	MM  #&888%%rP   Fr  multimodal_embeddingsc                    |d u pt          |          dk    | _        ||!t                                          |          S t                                          ||||          S )Nr   )r
  r  r  )r  r  r   rn  )re   rk  r
  r  r  r   s        rQ   rn  z$SiglipEmbeddingModel.embed_input_ids  sz     "T)LS1F-G-G1-L 	 !(M,A77**9555ww&&"7' 3	 ' 
 
 	
rP   c                 R     | j         di |}|g S |                     |          }|S )NrO   )r  r  )re   ri   image_inputvision_embeddingss       rQ   embed_multimodalz%SiglipEmbeddingModel.embed_multimodal  s?    :d:DDVDDI 66{CC  rP   	positionsintermediate_tensorsc                     |t          d          | j        s|S | j        }|j        d         |k    r|d d d |f         }n|j        d         |k     rt          |                     |||          S )Nz"PP is not supported for this modelr2   )RuntimeErrorr  r  r   r  r  )re   rk  r  r  rZ  ri   r   s          rQ   r  zSiglipEmbeddingModel.forward  s      +CDDD " 	!   )q!K//)!!!\k\/:MM #k11%%%%iMJJJrP   rr  c                 V    t          | dgddg          }|                    |          S )Nz.position_idszlogit_scale.zlogit_bias.)skip_substrsignore_unexpected_prefixes)r7   r  )re   rr  loaders      rQ   r  z!SiglipEmbeddingModel.load_weights  s=    ")*(6'F
 
 
 ""7+++rP   ra   r`  )!rH   rI   rJ   is_pooling_modelpacked_modules_mappingclassmethodr   r   r  r   r   rM   rN   r  r  r:   r  r   r@   r  r  r   r   r  r3   rn  r  r/   r  r   r1  r  r   r   s   @rQ   r  r    s        (*H*H*HI=3 =3 =3: = = = [= BD &# &# &#z &#3 &# &# &# &# &# &#X .2	 <$& l |d*	
 
   *'&,'& l'& 
	'& '& '& '&X GK l "=t!C 
	   $

	$	&
 
 
 
5,B 5u| 5 5 5 5
!<! "5<.%,">?!
 |d*! "! 
! ! ! ! ! !L >B

 .2$)
 
 
<
  4d:

 |d*
 "
 

 
 
 
 
 
,! !4H ! ! ! ! <@-1K K<$&K <K 2D8	K
 |d*K K 
K K K K4,HU33D-E$F , , , , , , , ,rP   r  )ocollections.abcr   r   r   	functoolsr   r   typingr   r	   rM   r
   transformersr   r   r   r   r   vllm.configr   vllm.config.multimodalr   vllm.distributedr   r   %vllm.model_executor.layers.activationr   ;vllm.model_executor.layers.attention.encoder_only_attentionr   9vllm.model_executor.layers.attention.mm_encoder_attentionr   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   r   !vllm.model_executor.layers.poolerr   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.vocab_parallel_embeddingr   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.multimodalr    vllm.multimodal.inputsr!   r"   r#   r$   r%   vllm.multimodal.parser&   r'   r(   vllm.multimodal.processingr)   r*   r+   r,   r-   r.   vllm.sequencer/   vllm.utils.tensor_schemar0   r1   
interfacesr3   r4   r5   interfaces_baser6   utilsr7   r8   visionr9   r:   r;   r<   r=   r>   r@   rW   r   r   rL   r]   r_   r   r   rg   r  r   r  r3  rA  rQ  rb  r  r  r  rN   r  re  register_processorr  rO   rP   rQ   <module>r6     s   8 7 7 7 7 7 7 7 7 7 7 . . . . . . . . % % % % % % % %                     # " " " " " 3 3 3 3 3 3 I I I I I I I I < < < < < <      Y X X X X X 7 7 7 7 7 7         
 = < < < < < F F F F F F V V V V V V        0 / / / / /              V U U U U U U U U U                . - - - - - > > > > > > > > O O O O O O O O O O 1 1 1 1 1 1 2 2 2 2 2 2 2 2               
B 
B 
B 
B 
B\ 
B 
B 
B H H 4%C CD   		#	 	 	 	*
 *
 *
 *
 *
- *
 *
 *
Z
 
 
 
 
56JK 
 
 
4S
 S
 S
 S
 S
 78L M S
 S
 S
l( ( ( ( ()*<= ( ( (*F F F F FRY F F FRH! H! H! H! H!bi H! H! H!V* * * * *	 * * *Z*# *# *# *# *# *# *# *#Z/ / / / /BI / / /dD D D D DBI D D DN% % % % %") % % %Pd d d d dbi d d dNn n n n n	 n n nb
< < c5<'(	
 % \   0    29   F u---''	)  
y, y, y, y, y,29&8- y, y,  .-y, y, y,rP   