
    .`i              	          U d dl mZmZmZmZ d dlmZ d dlmZm	Z	 d dl
Z
d dlmZ d dlmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dl m!Z! d dl"m#Z#m$Z$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3m4Z4m5Z5m6Z6m7Z7 d dl8m9Z9m:Z:m;Z; d dl<m=Z=m>Z>m?Z?m@Z@mAZAmBZB d dlCmDZD d dlEmFZFmGZG ddlHmIZImJZJ ddlKmLZL ddlMmNZNmOZO ddlPmQZQmRZRmSZSmTZTmUZUmVZV  G d deF          ZW G d  d!eQe                   ZXd"d"d#d#d$ZYeZe[eSf         e\d%<   d&e[fd'Z] G d( d)e?          Z^ G d* d+e=e^                   Z_ G d, d-e>e^                   Z` G d. d/eja                  Zb G d0 d1eja                  Zc G d2 d3eja                  Zd G d4 d5eja                  Ze G d6 d7eja                  Zf G d8 d9eja                  Zg G d: d;eja                  Zh G d< d=eja                  Zi G d> d?eja                  Zj eLd@A           e1jk        e`e^e_B           G dC dDeja        eJe/                                  ZldS )E    )CallableIterableMappingSequence)cached_property)	AnnotatedLiteralN)BatchFeature
CLIPConfigCLIPProcessorCLIPTextConfigCLIPVisionConfig)	Attention)
VllmConfig)BaseDummyOptions)divide$get_tensor_model_parallel_world_size)
get_act_fn)MMEncoderAttention)Conv2dLayer)ColumnParallelLinearQKVParallelLinearRowParallelLinear)DispatchPooler)QuantizationConfig)VocabParallelEmbedding)default_weight_loader)SupportsQuant)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalInputsMultiModalKwargsItemsMultiModalUUIDDict)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptIndexTargetsPromptReplacementPromptUpdate)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModal)default_pooling_type)AutoWeightsLoadermaybe_prefix)VisionEncoderInfoVisionFeatureSelectStrategyVisionFeatureSelectStrategyStrget_num_selected_vision_tokensis_vit_use_data_parallelresolve_visual_encoder_outputsc                   f    e Zd ZU dZed         ed<   eej         e	dddd          f         ed<   d	S )
CLIPImagePixelInputsz
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height of each image
        - w: Width of each image
    pixel_valuestypebn   hwdataN)
__name__
__module____qualname____doc__r	   __annotations__r   torchTensorr0        s/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/clip.pyr>   r>   C   sW           .
!!!!
EL++dAsC"@"@@
AAAAAArN   r>   c                   F    e Zd ZdededefdZdefdZdefdZdefdZdS )	CLIPEncoderInfoimage_widthimage_heightreturnc                6    |                                  dz  dz   S )N   r1   )get_patch_grid_length)selfrR   rS   s      rO   get_num_image_tokensz$CLIPEncoderInfo.get_num_image_tokensQ   s      ))++q0144rN   c                     | j         j        S N)vision_config
image_sizerX   s    rO   get_image_sizezCLIPEncoderInfo.get_image_sizeY       !,,rN   c                     | j         j        S r[   )r\   
patch_sizer^   s    rO   get_patch_sizezCLIPEncoderInfo.get_patch_size\   r`   rN   c                 r    |                                  |                                 }}||z  dk    sJ ||z  S Nr   )r_   rc   )rX   r]   rb   s      rO   rW   z%CLIPEncoderInfo.get_patch_grid_length_   sD    !%!4!4!6!68K8K8M8MJ
J&!++++Z''rN   N)rF   rG   rH   intrY   r_   rc   rW   rM   rN   rO   rQ   rQ   P   s        5 5 	5
 
5 5 5 5- - - - -- - - - -(s ( ( ( ( ( (rN   rQ   fullclass)MEANALLCLSLAST_POOLING_TYPE_TO_STRATEGYpooling_typec                 `    	 t           |          S # t          $ r t          d|           d w xY w)Nz;No feature selection strategy is defined for pooling_type: )rm   KeyError
ValueError)rn   s    rO   #_get_vision_feature_select_strategyrr   n   sV    (66   .). .
 
 	s    -c                   t    e Zd Zd Zd ZdefdZdeee	dz  f         fdZ
de	d	e	de	fd
ZdefdZde	fdZdS )CLIPProcessingInfoc                 @    | j                             t                    S r[   )ctxget_hf_configr   r^   s    rO   rw   z CLIPProcessingInfo.get_hf_configy   s    x%%j111rN   c                 D    t          |                                           S r[   )rQ   rw   r^   s    rO   get_vision_encoder_infoz*CLIPProcessingInfo.get_vision_encoder_info|   s    t1133444rN   kwargsc                 2     | j         j        t          fi |S r[   )rv   get_hf_processorr   )rX   rz   s     rO   r|   z#CLIPProcessingInfo.get_hf_processor   s    (tx(AA&AAArN   rT   Nc                 
    ddiS )Nimager1   rM   r^   s    rO   get_supported_mm_limitsz*CLIPProcessingInfo.get_supported_mm_limits   s    |rN   rR   rS   c                    |                                  }| j        j        j        }|J t	          |                    ||          t          |j                            S NrR   rS   )ry   rv   model_configpooler_configr:   rY   rr   seq_pooling_type)rX   rR   rS   vision_encoder_infor   s        rO   rY   z'CLIPProcessingInfo.get_num_image_tokens   so     #::<<-;(((-44') 5   00NOO
 
 	
rN   c                 x    |                                  }|                                x}}t          ||          S )N)widthheight)ry   r_   r&   )rX   r   r   r   s       rO   !get_image_size_with_most_featuresz4CLIPProcessingInfo.get_image_size_with_most_features   s=    "::<<,;;===uV4444rN   c                 ^    |                                  \  }}|                     ||          S r   )r   rY   )rX   target_widthtarget_heights      rO   get_max_image_tokensz'CLIPProcessingInfo.get_max_image_tokens   s;    &*&L&L&N&N#m(($& ) 
 
 	
rN   )rF   rG   rH   rw   ry   objectr|   r   strrf   r   rY   r&   r   r   rM   rN   rO   rt   rt   x   s        2 2 25 5 5B B B B BcDj)A    
 
 	

 

 
 
 
&59 5 5 5 5

c 
 
 
 
 
 
rN   rt   c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	CLIPDummyInputsBuilder	mm_countsrT   c                     dS N rM   )rX   r   s     rO   get_dummy_textz%CLIPDummyInputsBuilder.get_dummy_text   s    rrN   Nseq_len
mm_optionsc                     |                     dd          }| j                                        \  }}|r|                     d          nd }d|                     ||||          iS )Nr~   r   )r   r   
num_images	overrides)getinfor   _get_dummy_images)rX   r   r   r   r   r   r   image_overridess           rO   get_dummy_mm_dataz(CLIPDummyInputsBuilder.get_dummy_mm_data   s|     ]]7A..
&*i&Q&Q&S&S#m5?I*..111T T++"$%)	 ,  
 	
rN   r[   )
rF   rG   rH   r   r   rf   r   r   r    r   rM   rN   rO   r   r      s        S(9 c     =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
rN   r   c                   @    e Zd Zedefd            Z	 ddddeee         z  dede	ee
f         de	ee
f         dz  d	edz  def fd
Zdedede	ee
f         de	ee
f         def
dZdede	ee
f         de	eef         fdZdede	ee
f         dedee         fdZ xZS )CLIPMultiModalProcessorrT   c                 R    | j                                         }d}||j        vsJ |S re   )r   get_tokenizerall_special_ids)rX   	tokenizerdummy_token_ids      rO   image_token_idz&CLIPMultiModalProcessor.image_token_id   s4    I++--	Y%>>>>>rN   N)mm_uuidspromptmm_datahf_processor_mm_kwargstokenization_kwargsr   c                    |r|rt          d          |r
i |pi ddi}t                                          |||||          S )NzzCLIP accepts text-only or image-only inputs, not both! Image-only inputs means passing an image with an empty text prompt.add_special_tokensF)r   r   r   r   r   )rq   superapply)rX   r   r   r   r   r   	__class__s         rO   r   zCLIPMultiModalProcessor.apply   s      	g 	    	#&,"#$e# #
 ww}}#9 3  
 
 	
rN   prompt_textmm_itemsc                     dS )NFrM   )rX   r   r   r   r   s        rO   _hf_processor_applies_updatesz5CLIPMultiModalProcessor._hf_processor_applies_updates   s	     urN   	hf_inputsc                 F    t          t          j        d                    S )Nr~   )r?   )dictr!   batched)rX   r   r   s      rO   _get_mm_fields_configz-CLIPMultiModalProcessor._get_mm_fields_config   s!    
 !6!>w!G!GHHHHrN   out_mm_kwargsc                 |      j         dt          f fd}t          dt          j                    |          gS )Nitem_idxc                                          dt                    }|                    |           }j                            |j        |j                  }g|z  S )Nr~   r   )	get_itemsr%   r_   r   rY   r   r   )r   imagesr]   num_image_tokensr   r   rX   s       rO   get_replacementzDCLIPMultiModalProcessor._get_prompt_updates.<locals>.get_replacement  sf    ''1DEEF..x88J#y==&,'.  >     ##&666rN   r~   )modalitytargetreplacement)r   rf   r,   r+   start)rX   r   r   r   r   r   s   ``   @rO   _get_prompt_updatesz+CLIPMultiModalProcessor._get_prompt_updates   sn     ,	7c 	7 	7 	7 	7 	7 	7 	7 	7  )/11+  
 	
rN   r[   )rF   rG   rH   r   rf   r   r   listr    r   r   r$   r"   r   r'   boolr   r
   r!   r   r#   r   r-   r   __classcell__r   s   @rO   r   r      s           _ <@
 /3
 
 
d3i
 $
 !(V 4	

 %S&[1D8
 %t+
 

 
 
 
 
 
@ & !(V 4	
 %S&[1 
   II !(V 4I 
++	,	I I I I
%
 !(V 4
 -	

 
,	
 
 
 
 
 
 
 
rN   r   c            	       t     e Zd Zdef fdZ	 d	dej        dz  dej        dej        dz  dej        fdZ xZS )
CLIPTextEmbeddingsconfigc                     t                                                       |j        }t          |j        |          | _        t          |j        |          | _        d S r[   )r   __init__hidden_sizer   
vocab_sizetoken_embeddingmax_position_embeddingsposition_embedding)rX   r   	embed_dimr   s      rO   r   zCLIPTextEmbeddings.__init__  sV    &	5f6GSS"8*I#
 #
rN   N	input_idsposition_idsinputs_embedsrT   c                     |&|t          d          |                     |          }|                     |          }||z   }|S )Nz5Either `input_ids` or `input_embeds` must be provided)rq   r   r   )rX   r   r   r   position_embeddings
embeddingss         rO   forwardzCLIPTextEmbeddings.forward!  s_        K   !00;;M"55lCC"%88
rN   r[   )	rF   rG   rH   r   r   rK   rL   r   r   r   s   @rO   r   r     s        
~ 
 
 
 
 
 
 .2	 <$& l |d*	
 
       rN   r   c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )CLIPVisionEmbeddingsr   c                    t                                                       || _        |j        | _        |j        | _        |j        | _        | j        | j        z  dk    sJ t          j        t          j
        | j                            | _        t          |j        | j        | j        | j        d          | _        | j        | j        z  dz  | _        | j        dz   | _        t          j        | j        | j                  | _        |                     dt          j        | j                                      d          d           d S )	Nr   F)in_channelsout_channelskernel_sizestridebiasrV   r1   r   )r1   )
persistent)r   r   r   r   r   r]   rb   nn	ParameterrK   randnclass_embeddingr   num_channelspatch_embeddingnum_patchesnum_positions	Embeddingr   register_bufferarangeexpand)rX   r   r   s     rO   r   zCLIPVisionEmbeddings.__init__6  s1   + + +0A5555!|EK,G,GHH*+? 
  
  
 !Ot>1D!-1"$,t/A4>"R"RL+,,33G<< 	 	
 	
 	
 	
 	
rN   r?   rT   c                    |j         d         }| j        j        j        }|                     |                    |                    }|                    d                              dd          }| j                            |dd          }t          j
        ||gd          }||                     | j                  z   }|S )Nr   )dtyperV   r1   r   dim)shaper   weightr   toflatten	transposer   r   rK   catr   r   )rX   r?   
batch_sizetarget_dtypepatch_embedsclass_embedsr   s          rO   r   zCLIPVisionEmbeddings.forwardQ  s    !'*
+28++OO,O//
 
 $++A..88A>>+22:q"EEYl;CCC
$"9"9$:K"L"LL
rN   )	rF   rG   rH   r   r   rK   rL   r   r   r   s   @rO   r   r   5  sk        
/ 
 
 
 
 
 
6EL U\        rN   r   c                   ~     e Zd Z	 ddddeez  dedz  dedee         ee	         z  ddf
 fd	Z
d
ej        fdZ xZS )CLIPAttentionNr   prefixr   quant_configr  attn_clsrT   c                   t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        t                      }t          | j        | j        | j        || d|          | _        t          | j        | j        || d|          | _        |rd	nt                      | _        t#          | j        | j                  | _        |t&          k    r( || j        | j        | j	        | d
          | _        d S  || j        | j        | j	        | d
          | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      z	.qkv_proj)r   	head_sizetotal_num_headsr  r  
disable_tpz	.out_proj)
input_sizeoutput_sizer  r  r  r1   z.attnr  )r   r   r   r   r   num_attention_heads	num_headshead_dimrq   scaler;   r   qkv_projr   out_projr   tp_sizer   num_heads_per_partitionr   attn)rX   r   r  r  r	  use_data_parallelr   s         rO   r   zCLIPAttention.__init__a  s    	+3$.8=4>)T^;;3%)^3 3 $3 3 3  
 ]D(
466)m N%'''(
 
 
 *~%'''(
 
 
 #NAA(L(N(N 	 (.dndl'K'K$))) ,
 '''	  DIII !,
 '''	  DIIIrN   hidden_statesc                     |                      |          \  }}|                    dd          \  }}}|                     |||          }|                     |          \  }}|dfS )z#Input shape: Batch x Time x ChannelrB   r   r   N)r  chunkr  r  )	rX   r  
qkv_states_query_states
key_statesvalue_statesoutattn_outputs	            rO   r   zCLIPAttention.forward  sm     m44
A1;1A1A!1A1L1L.j,iij,??s++QD  rN   r[   rF   rG   rH   r   r   r   r   r@   r   r   r   rK   rL   r   r   r   s   @rO   r  r  `  s         37:
 : : :!11: )4/:
 : y/D);$<<: 
: : : : : :x!|! ! ! ! ! ! ! !rN   r  c            	       f     e Zd Z	 	 d
deez  dedz  deddf fdZdej	        dej	        fd	Z
 xZS )CLIPMLPNr   r   r  r  rT   c                 @   t                                                       || _        t                      }t	          |j                  | _        t          |j        |j	        d|| d|          | _
        t          |j	        |j        d|| d|          | _        d S )NTz.fc1)r   r  r  r  z.fc2)r   r   r   r;   r   
hidden_actactivation_fnr   r   intermediate_sizefc1r   fc2)rX   r   r  r  r  r   s        rO   r   zCLIPMLP.__init__  s     	466'(9::'$%???(
 
 
 %$%???(
 
 
rN   r  c                     |                      |          \  }}|                     |          }|                     |          \  }}|S r[   )r+  r)  r,  )rX   r  r  s      rO   r   zCLIPMLP.forward  sG    88M22q**=9988M22qrN   r   )rF   rG   rH   r   r   r   r   r   rK   rL   r   r   r   s   @rO   r&  r&    s         37	
 
!11
 )4/
 	

 

 
 
 
 
 
:U\ el        rN   r&  c                        e Zd Z	 ddddeez  dedz  dedee         ee	         z  ddf
 fd	Z
d
ej        dej        fdZ xZS )CLIPEncoderLayerNr   r  r   r  r  r	  rT   c                F   t                                                       t          ||| d|          | _        t	          j        |j        |j                  | _        t          ||| d          | _
        t	          j        |j        |j                  | _        d S )Nz
.self_attn)r  r  r	  epsz.mlpr  r  )r   r   r  	self_attnr   	LayerNormr   layer_norm_epslayer_norm1r&  mlplayer_norm2)rX   r   r  r  r	  r   s        rO   r   zCLIPEncoderLayer.__init__  s     	&%(((	
 
 
 <(:@UVVV%???
 
 

 <(:@UVVVrN   r  c                     |}|                      |          }|                     |          \  }}||z   }|}|                     |          }|                     |          }||z   }|S )N)r  )r7  r4  r9  r8  )rX   r  residualr  s       rO   r   zCLIPEncoderLayer.forward  su     ((77>>>FFq =0 ((77// =0rN   r[   r$  r   s   @rO   r/  r/    s         37W
 W W W!11W )4/W
 W y/D);$<<W 
W W W W W W0U\ el        rN   r/  c                        e Zd ZdZ	 	 ddddeez  dedz  dedz  ded	e	e
         e	e         z  d
df fdZdej        ded
ej        eej                 z  fdZ xZS )CLIPEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self
    attention layers. Each layer is a [`CLIPEncoderLayer`].

    Args:
        config: CLIPConfig
    Nr   r  r   r  num_hidden_layers_overrider  r	  rT   c                    t                                                       | _        |j        }n|}t	          j        fdt          |          D                       | _        d S )Nc           	      @    g | ]}t           d |           S )z.layers.r   r  r  r	  )r/  ).0	layer_idxr	  r   r  r  s     rO   
<listcomp>z(CLIPEncoder.__init__.<locals>.<listcomp>  sT         !!!-$99i99%	    rN   )r   r   r   num_hidden_layersr   
ModuleListrangelayers)rX   r   r  r>  r  r	  rE  r   s    `` `` rO   r   zCLIPEncoder.__init__  s     	%- & 8 :m       "''8!9!9  

 

rN   r   return_all_hidden_statesc                 p    |g}|}| j         D ]$} ||          }|r|                    |           %|r|S |S r[   )rH  append)rX   r   rI  hidden_states_poolr  encoder_layers         rO   r   zCLIPEncoder.forward   sd    
 ,_%![ 	9 	9M)M-88M' 9"))-888 $ 	&%%rN   NN)rF   rG   rH   rI   r   r   r   rf   r   r@   r   r   r   rK   rL   r   r   r   r   r   s   @rO   r=  r=    s          3715	
 
 
 
!11
 )4/
 %($J	
 
 y/D);$<<
 

 
 
 
 
 
<| #' 
U\*	*	       rN   r=  c            
            e Zd Z	 ddddededz  deddf fdZd	ej        dej        fd
Z		 dd	ej        dz  dej        dej        dz  dej        fdZ
deeeej        f                  dee         fdZ xZS )CLIPTextTransformerNr   r  r   r  r  rT   c                   t                                                       || _        |j        }t	          |          | _        t          ||| dt                    | _        t          j
        ||j                  | _        d S )N.encoderrA  r1  )r   r   r   r   r   r   r=  r   encoderr   r5  r6  final_layer_norm)rX   r   r  r  r   r   s        rO   r   zCLIPTextTransformer.__init__4  s     	&	,V44"%&&&	
 
 
 !#%!
 !
 !
rN   r   c                 6    | j                             |          S r[   )r   r   )rX   r   s     rO   embed_input_idsz#CLIPTextTransformer.embed_input_idsN  s    ..y999rN   r   r   c                     |                      |||          }|                     |d          }|                     |          }|S )Nr   r   r   Fr   rI  )r   rS  rT  )rX   r   r   r   r  last_hidden_states         rO   r   zCLIPTextTransformer.forwardQ  sc     %' ( 
 
 !LL'%* ) 
 
 !112CDD  rN   weightsc                 |   g d}t          |                                           }t                      }|D ]\  }}|D ]>\  }}}	||vr|                    ||          }||         }
|
j        } ||
||	            n*||         }
t          |
dt                    } ||
|           |                    |           |S )N)r  q_projq)r  k_projk)r  v_projvweight_loader)r   named_parameterssetreplacerd  getattrr   add)rX   r[  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamrd  s               rO   load_weightsz CLIPTextTransformer.load_weightse  s    "
 "
 "
 4002233"%%%#* 	$ 	$D-5K 4 41
Kd**||K<<#D) % 3e]H===#D) '@U V Ve]333d####rN   r[   )rF   rG   rH   r   r   r   r   rK   rL   rV  r   r   tuplerf  rs  r   r   s   @rO   rP  rP  3  s4        37

 
 
 

 )4/

 
 

 
 
 
 
 
4: :%, : : : : .2	! !<$&! l! |d*	!
 
! ! ! !(HU33D-E$F 3s8        rN   rP  c                       e Zd Z	 ddddddededz  dedz  dedz  ded	df fd
Ze	d             Z
e	d             Zddddej        dee         dz  dedz  d	ej        fdZdeeeej        f                  d	ee         fdZ xZS )CLIPVisionTransformerNr   r>  require_post_normr  r   r  r>  rx  r  rT   c                N   t                                                       || _        |j        }t	          |          | _        t          j        ||j                  | _	        t          |||| dt                    | _        |j        }t          | j        j                  |j        k    r-t!          d| dt          | j        j                   d          |t          | j        j                  |k    }|r"t          j        ||j                  | _        d S d | _        d S )Nr1  rR  )r   r  r>  r  r	  zThe original encoder only has z layers, but you requested z layers.)r   r   r   r   r   r   r   r5  r6  pre_layrnormr=  r   rS  rE  lenrH  rq   post_layernorm)	rX   r   r  r>  rx  r  r   rE  r   s	           rO   r   zCLIPVisionTransformer.__init__  sI    	&	.v66 L8MNNN"%'A&&&'
 
 
 #4t|"##f&>>>P1B P P-01D-E-EP P P   $ #DL$7 8 8<M M 	'"$,yf>S"T"T"TD"&DrN   c                 N    t          |                                           j        S r[   )next
parametersr   r^   s    rO   r   zCLIPVisionTransformer.dtype  s    DOO%%&&,,rN   c                 N    t          |                                           j        S r[   )r~  r  devicer^   s    rO   r  zCLIPVisionTransformer.device  s    DOO%%&&--rN   select_layersfeature_select_strategyr?   r  r  c                    |                      |          }|                     |          }|                     ||d u          }t          || j        || j        j        |          }|S )NrY  )r  max_possible_layersr  )r   rz  rS  r<   r|  r   rE  )rX   r?   r  r  r  encoder_outputss         rO   r   zCLIPVisionTransformer.forward  s     55))-88 ,,'%2$%> ' 
 
 9' $ =$;
 
 
 rN   r[  c                 p   g d}t          |                                           }t                      }t          | j        j                  }|D ]\  }}|                    d          r| j        "|                    d          r/t          |	                    d          d                   }||k    rf|D ]>\  }	}
}|
|vr|
                    |
|	          }||         }|j        } ||||            n*||         }t          |dt                    } |||           |                    |           |S )Nr]  r|  zencoder.layers.rV   rd  )r   re  rf  r{  rS  rH  
startswithr|  rf   splitrg  rd  rh  r   ri  )rX   r[  rj  rk  rl  layer_countrm  rn  rC  ro  rp  rq  rr  rd  s                 rO   rs  z"CLIPVisionTransformer.load_weights  sl   "
 "
 "
 4002233"%%%$,-..#* 	$ 	$D-/00 T5H5P /00 

3 233	++5K 4 41
Kd**||K<<#D) % 3e]H===#D) '@U V Ve]333d####rN   r[   )rF   rG   rH   r   r   rf   r   r   r   propertyr   r  rK   rL   r   r8   r   r   rt  rf  rs  r   r   s   @rO   rv  rv    so        37*'
 26)-*' *' *' *' )4/*'
 %($J*'  $;*' *' 
*' *' *' *' *' *'X - - X- . . X. +/FJ  l Cy4'	
 "=t!C 
   8$HU33D-E$F $3s8 $ $ $ $ $ $ $ $rN   rv  c                        e Zd Z	 ddddddededz  dedz  dedz  ded	df fd
Z	 	 dde	j
        dee         dz  dedz  d	e	j
        fdZed             Zed             Z xZS )CLIPVisionModelNr   rw  r   r  r>  rx  r  rT   c                    t                                                       t          ||||| d          | _        d S )Nz.vision_model)r   r  r>  rx  r  )r   r   rv  vision_model)rX   r   r  r>  rx  r  r   s         rO   r   zCLIPVisionModel.__init__  sP     	1%'A/+++
 
 
rN   r?   r  r  c                 2    |                      |||          S )Nr  )r  )rX   r?   r  r  s       rO   r   zCLIPVisionModel.forward  s*       '$; ! 
 
 	
rN   c                     | j         j        S r[   )r  r   r^   s    rO   r   zCLIPVisionModel.dtype  s     &&rN   c                     | j         j        S r[   )r  r  r^   s    rO   r  zCLIPVisionModel.device  s     ''rN   r[   rN  )rF   rG   rH   r   r   rf   r   r   r   rK   rL   r   r8   r   r  r   r  r   r   s   @rO   r  r    s2        37

 26)-
 
 
 
 )4/

 %($J
  $;
 
 

 
 
 
 
 
, +/FJ	

 

l

 Cy4'

 "=t!C	


 


 

 

 

 ' ' X' ( ( X( ( ( ( (rN   r  rl   )r   )r   dummy_inputsc                   |    e Zd ZdZdg diZededededz  fd            Zd	d
de	def fdZ
	 d'dej        dz  dej        dej        dz  dej        fdZ	 d'dej        dedz  dej        fdZdededz  fdZdedej        fdZdej        deej        gej        f         dej        dz  dedej        f
 fdZ	 d'ddddej        dedz  dej        dz  dedej        f
 fd Zdedefd!Z	 	 d(dej        dz  d"ej        d#edz  dej        dz  dedej        fd$Zd%eeeej        f                  fd&Z xZS ))CLIPEmbeddingModelTr  )r^  r`  rb  r   irT   Nc                 N    |                     d          rd S t          d          )Nr~   z Only image modality is supported)r  rq   )clsr   r  s      rO   get_placeholder_strz&CLIPEmbeddingModel.get_placeholder_str.  s,    w'' 	4;<<<rN   r   r  vllm_configr  c          	      V   t                                                       |j        j        }|j        }|j        j        }|| _        || _        |j        }|j        }|j	        | _	        |j
        | _        |j
        | _        |                     |          5  t          ||t          |d                    | _        t#          j        | j        | j	        d          | _        d d d            n# 1 swxY w Y   |                     |d          5  t+          ||t          |d                    | _        t#          j        | j        | j	        d          | _        d d d            n# 1 swxY w Y   |j        j        }|J || _        t3          j        |          | _        d| _        d S )N
text_modelr3  F)r   r~   r  T)r   r   r   	hf_configr  multimodal_configr   text_configr\   projection_dimr   text_embed_dimvision_embed_dim_mark_language_modelrP  r6   r  r   Lineartext_projection_mark_tower_modelrv  r  visual_projectionr   r   for_embeddingpooler_is_text_input)
rX   r  r  r   r  r  r  r\   r   r   s
            rO   r   zCLIPEmbeddingModel.__init__5  s8   (5?"/'4F!2(,$3)5 - 9&&{33 
	 
	1)#FL99  DO
 $&9##$ $ $D 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 ##K99 
	 
	 5)#FN;;! ! !D
 &(Y%#& & &D"
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 $0>(((*$2=AA #s&   AC..C25C2AE''E+.E+r   r   r   c                 `    |                      |||          }|                     |          }|S )NrX  )r  r  )rX   r   r   r   pooled_outputtext_featuress         rO   get_text_featuresz$CLIPEmbeddingModel.get_text_featuresf  s@     %' ( 
 
 ,,];;rN   r?   r  c                     |t          | j        j                  }|                     |d |          }|                     |          }|S )N)r?   r  r  )rr   r   r   r  r  )rX   r?   r  r  image_featuress        rO   get_image_featuresz%CLIPEmbeddingModel.get_image_featuresv  sc    
 #*&I"3' '# ))%$; * 
 
 //>>rN   rz   c                     |                     dd           }|d S | j        j        j        x}}t	          d|||d          S )Nr?   )rC   rD   )r@   rE   resolve_bindings)popr   r\   r]   r>   )rX   rz   r?   
expected_h
expected_ws        rO   _parse_and_validate_image_inputz2CLIPEmbeddingModel._parse_and_validate_image_input  s\     zz.$774"&+";"FF
Z##-J??
 
 
 	
rN   inputsc                 <    |d         }|                      |          S )NrE   )r  )rX   r  r?   s      rO   _process_image_inputsz(CLIPEmbeddingModel._process_image_inputs  s    f~&&|444rN   rV  is_multimodalhandle_oov_mm_tokenc          	      >   t                                          ||||          }| j        }|j        d         |k     rFt	          j        ||                    |j        d         ||j        d         z
            gd          }n|j        d         |k    rt          |S )Nr  r  r1   r   r   )r   _embed_text_input_idsr  r   rK   r   	new_emptyNotImplementedError)rX   r   rV  r  r  r   inputs_embeds_sizer   s          rO   r  z(CLIPEmbeddingModel._embed_text_input_ids  s     55' 3	 6 
 
 "0q!$666!I!!++%+A.*]-@-CC  	 	 	MM  #&888%%rN   Fr  multimodal_embeddingsc                    |d u pt          |          dk    | _        ||!t                                          |          S t                                          ||||          S )Nr   )r  r  r  )r{  r  r   rV  )rX   r   r  r  r  r   s        rO   rV  z"CLIPEmbeddingModel.embed_input_ids  sz     "T)LS1F-G-G1-L 	
 !(M,A77**9555ww&&"7' 3	 ' 
 
 	
rN   c                 R     | j         di |}|g S |                     |          }|S )NrM   )r  r  )rX   rz   image_inputvision_embeddingss       rO   embed_multimodalz#CLIPEmbeddingModel.embed_multimodal  s?    :d:DDVDDI 66{CC  rN   	positionsintermediate_tensorsc                     |t          d          | j        s|S | j        }|j        d         |k    r|d d d |f         }n|j        d         |k     rt          |                     |||          S )Nz"PP is not supported for this modelr1   )RuntimeErrorr  r  r   r  r  )rX   r   r  r  r   rz   r   s          rO   r   zCLIPEmbeddingModel.forward  s      +CDDD " 	!   )q!K//)!!!\k\/:MM #k11%%%%iMJJJrN   r[  c                 T    t          | dgdg          }|                    |          S )Nz.position_idszlogit_scale.)skip_substrsignore_unexpected_prefixes)r5   rs  )rX   r[  loaders      rO   rs  zCLIPEmbeddingModel.load_weights  s;    ")*(6'7
 
 
 ""7+++rN   r[   rN  ) rF   rG   rH   is_pooling_modelpacked_modules_mappingclassmethodr   rf   r  r   r   rK   rL   r  r8   r  r   r>   r  r  r   r   r  r2   rV  r  r.   r   r   rt  rs  r   r   s   @rO   r  r  #  sI        (*H*H*HI=3 =3 =3: = = = [= BD /# /# /#z /#3 /# /# /# /# /# /#j .2	 <$& l |d*	
 
   & GK l "=t!C 
	   (

		$
 
 
 
5,@ 5U\ 5 5 5 5
!<! "5<.%,">?!
 |d*! "! 
! ! ! ! ! !L >B

 .2$)
 
 
<
  4d:

 |d*
 "
 

 
 
 
 
 
.! !4H ! ! ! ! <@-1K K<$&K <K 2D8	K
 |d*K K 
K K K K4,HU33D-E$F , , , , , , , ,rN   r  )mcollections.abcr   r   r   r   	functoolsr   typingr   r	   rK   torch.nnr   transformersr
   r   r   r   r   vllm.attention.layerr   vllm.configr   vllm.config.multimodalr   vllm.distributedr   r   %vllm.model_executor.layers.activationr   9vllm.model_executor.layers.attention.mm_encoder_attentionr   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   r   !vllm.model_executor.layers.poolerr   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.vocab_parallel_embeddingr   -vllm.model_executor.model_loader.weight_utilsr   %vllm.model_executor.models.interfacesr   vllm.multimodalr   vllm.multimodal.inputsr    r!   r"   r#   r$   vllm.multimodal.parser%   r&   r'   vllm.multimodal.processingr(   r)   r*   r+   r,   r-   vllm.sequencer.   vllm.utils.tensor_schemar/   r0   
interfacesr2   r3   interfaces_baser4   utilsr5   r6   visionr7   r8   r9   r:   r;   r<   r>   rQ   rm   r   r   rJ   rr   rt   r   r   Moduler   r   r  r&  r/  r=  rP  rv  r  register_processorr  rM   rN   rO   <module>r     s   B A A A A A A A A A A A A % % % % % % % % % % % % % %                     + * * * * * " " " " " " 3 3 3 3 3 3 I I I I I I I I < < < < < < X X X X X X 7 7 7 7 7 7         
 = < < < < < F F F F F F V V V V V V O O O O O O ? ? ? ? ? ? / / / / / /              V U U U U U U U U U                . - - - - - > > > > > > > > @ @ @ @ @ @ @ @ 1 1 1 1 1 1 2 2 2 2 2 2 2 2               
B 
B 
B 
B 
B< 
B 
B 
B( ( ( ( ('(89 ( ( (, H H 4%C CD   c    +
 +
 +
 +
 +
+ +
 +
 +
\
 
 
 
 
34FG 
 
 
4R
 R
 R
 R
 R
56HI R
 R
 R
l       >( ( ( ( (29 ( ( (VH! H! H! H! H!BI H! H! H!V# # # # #bi # # #L% % % % %ry % % %P7 7 7 7 7") 7 7 7tK K K K K") K K K\u u u u uBI u u up&( &( &( &( &(bi &( &( &(T v...''	'  
W, W, W, W, W,$6 W, W,  /.W, W, W,rN   