
    .`i                        U d dl Z d dlmZ d dlmZmZmZ d dlmZ d dl	m
Z
mZmZmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6m7Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZC d dlDmEZEmFZFmGZGmHZHmIZI d dlJmKZKmLZLmMZMmNZNmOZO d dlPmQZQ d dlRmSZSmTZT ddlUmVZVmWZWmXZXmYZYmZZZ ddl[m\Z\ dd l]m^Z^m_Z_m`Z`maZambZb dd!lcmdZd  e(ee          Zfd"egd#egd$egd%egd&egf
d'Zh G d( d)eS          Zi G d* d+eS          Zjeiejz  Zkeeld,<    G d- d.eS          Zm G d/ d0eS          Znemenz  Zoeeld1<    G d2 d3ejp                  Zqd4ejr        d5ejr        d6ejr        d7ejr        d8e4d9esejr        ejr        f         fd:Zt G d; d<ejp                  Zu G d= d>ejp                  Zv G d? d@ejp                  Zw G dA dBejp                  Zx G dC dDejp                  Zy G dE dFejp                  Zz G dG dHejp                  Z{dIee|ejr        f         fdJZ} G dK dLeI          Z~ G dM dNeM          Z edOeP          Z G dQ dReKe                   Z G dS dTee                   Z G dU dVeLe                   Z G dW dXejp        eY          Z e;j        eeeY           G dZ d[eeYeWeZeX                      ZdS )\    N)abstractmethod)IterableMappingSequence)partial)	AnnotatedAnyLiteral	TypeAliasTypeVar)	rearrange)PretrainedConfig)GELUActivation)BatchFeature)BaseModelOutputBaseModelOutputWithPooling)	torch_int)
VllmConfig)BaseDummyOptions)$get_tensor_model_parallel_world_size)init_logger)MMEncoderAttention)Conv2dLayer)ColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)ApplyRotaryEmb)default_weight_loadermaybe_remap_kv_scale_name)MultiModelKeys)MULTIMODAL_REGISTRY)	ImageItemModalityDataMultiModalDataDictMultiModalFeatureSpecMultiModalFieldConfigMultiModalKwargsItems	VideoItem)DictEmbeddingItems	ImageSizeModalityDataItemsMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsLoRASupportsMRoPESupportsMultiModal
SupportsPP)	SiglipMLP)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelis_pp_missing_parametermaybe_prefix)is_vit_use_data_parallelheightwidthfactor
min_pixels
max_pixelsc                     | |k     r3t                               d| |           t          ||z  | z            }|} ||k     r3t                               d||           t          | |z  |z            } |}t          | |          t	          | |          z  dk    rt          d          t          | |z            |z  }t          ||z            |z  }||z  |k    rUt          j        | |z  |z            }t          j        | |z  |z            |z  }t          j        ||z  |z            |z  }n]||z  |k     rTt          j        || |z  z            }t          j	        | |z  |z            |z  }t          j	        ||z  |z            |z  }||fS )Nz8smart_resize: height=%s < factor=%s, reset height=factorz6smart_resize: width=%s < factor=%s, reset width=factor   z]absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)})
loggerwarningroundmaxmin
ValueErrormathsqrtfloorceil)rD   rE   rF   rG   rH   h_barw_barbetas           s/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/keye.pysmart_resizerY   X   s    F	
 	
 	

 uv~/00v~~D	
 	
 	

 5011
65C...448
 
 	
 &6/""V+E%&.!!F*Eu}z!!y&5.J677
6D=6122V;
54<&011F:		#	#yv~677	&4-&011F:	%$,/0069%<    c            	           e Zd ZU dZed         ed<   eej         e	dddddh          f         ed<   eej         e	dd          f         ed	<   d
S )KeyeImagePixelInputs
    Dimensions:
        - bnp: Batch size * Number of patches
        - c: Number of channels
        - ps: Patch size
        - ni: Number of images
        - g: Grid dimensions (3 for t, h, w)
    pixel_valuestypebnp   psdynamic_dimsniimage_grid_thwN
__name__
__module____qualname____doc__r
   __annotations__r   torchTensorr6    rZ   rX   r\   r\      s           .
!!!!kk%D$eWMMMM    elKKa,@,@@AAAAAArZ   r\   c                       e Zd ZU dZed         ed<   eej         e	dd          f         ed<   eej         e	dd          f         ed<   d	S )
KeyeImageEmbeddingInputsz
    Dimensions:
        - nf: Number of image features
        - hs: Hidden size (must match the hidden size of language model
          backbone)
        - ni: Number of images
        - g: Grid dimensions (3 for t, h, w)
    image_embedsr_   nfhsre   ra   rf   Nrg   ro   rZ   rX   rq   rq      w           .
!!!!EL++dD*A*AABBBBelKKa,@,@@AAAAAArZ   rq   KeyeImageInputsc            	           e Zd ZU dZed         ed<   eej         e	dddddh          f         ed<   eej         e	dd          f         ed	<   d
S )KeyeVideoPixelInputsr]   pixel_values_videosr_   r`   ra   rb   rc   nvvideo_grid_thwNrg   ro   rZ   rX   rx   rx      s           '
(((("kk%D$eWMMMM    elKKa,@,@@AAAAAArZ   rx   c                       e Zd ZU dZed         ed<   eej         e	dd          f         ed<   eej         e	dd          f         ed<   d	S )
KeyeVideoEmbeddingInputsz
    Dimensions:
        - nf: Number of video features
        - hs: Hidden size (must match the hidden size of language model
          backbone)
        - nv: Number of videos
        - g: Grid dimensions (3 for t, h, w)
    video_embedsr_   rs   rt   rz   ra   r{   Nrg   ro   rZ   rX   r}   r}      ru   rZ   r}   KeyeVideoInputsc                        e Zd Zdef fdZ	 ddej        dedededej        f
d	Z	ddefdZ
	 	 	 ddej        dej        dz  deeeeef         eeeeef                  z           dz  dej        fdZ xZS )KeyeVisionEmbeddingsconfigc                    t                                                       || _        |j        | _        |j        | _        |j        | _        t          |j        | j        | j        | j        d          | _	        | j        | j        z  dz  | _
        | j
        | _        t                      | _        t                      | _        t          j        | j        | j                  | _        t          j        d| j                  | _        |                     dt)          j        | j                                      d          d           d S )	Nvalid)in_channelsout_channelskernel_sizestridepadding   i   position_ids)r7   F
persistent)super__init__r   hidden_size	embed_dim
image_size
patch_sizer   num_channelspatch_embeddingnum_patchesnum_positionsdictcache_position_embeddingcache_position_countnn	Embeddingposition_embeddingpacking_position_embeddingregister_bufferrm   arangeexpand)selfr   	__class__s     rX   r   zKeyeVisionEmbeddings.__init__   s"   + + +*+? 
  
  
 !Ot>1D!-(,%$(FF!"$,t/A4>"R"R*,,udn*M*M'L+,,33G<< 	 	
 	
 	
 	
 	
rZ   F
embeddingsrD   rE   is_after_patchifyreturnc                    | j         j        j        d         }| j         j                            d          }|j        d         }|r|}|}	n|| j        z  }|| j        z  }	t          |dz            }
|                    d|
|
|          }|                    dddd          }t          j	        
                    |||	fdd	          }|                    dddd                              dd|          }|S )
Nr   r   g      ?r7   ra   r   bilinearF)sizemodealign_corners)r   weightshape	unsqueezer   r   reshapepermuter   
functionalinterpolateview)r   r   rD   rE   r   r   patch_pos_embeddim
new_height	new_widthsqrt_num_positionss              rX   interpolate_pos_encodingz-KeyeVisionEmbeddings.interpolate_pos_encoding   s    /6<Q?18BB1EEr" 	1JII4?2J0I&}c'9::)11!#5s
 
 *11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNrZ      	max_cachec                    ||f}|| j         v r"| j        |xx         dz  cc<   | j         |         S t          | j                   |k    rTt          | j        | j        j                  }| j                            |           | j                             |           |                     |||d          }d| j        |<   || j         |<   |S )Nr7   )keyT)r   r   lenrO   getpopr   )r   r   hwr   gridmin_hit_gridr   s           rX   "fetch_position_embedding_lfu_cachez7KeyeVisionEmbeddings.fetch_position_embedding_lfu_cache
  s    1v4000%d+++q0+++066t,--::)-1  L %)),777)--l;;;!:::q!TRR*+!$'.@%d+!!rZ   Nr^   r   rf   c                    |                                 dk    r|                    d          }|                                 dk    rg|t          d          |j        \  }}}}}	| j        j        j        }
t          |d          }|                     |                    |
                    }|	                    d          
                    d          }|r|d}t                      }|D ]|}|\  }}}|||z  |z  z   }|||d d f         }|                     |||d	          
                    d                              |d
          }||z   }|                    |           |}}t          j        |d                              d          }n||                     |          z   }|S t          d|                                  d          )N   r      z9position_ids cannot be None when pixel_values.dim() is 5.zb l c h w -> (b l) c h wdtyper   Tr7   r   z$Unsupported pixel_values dimension: z. Expected 4 or 5.)r   r   rP   r   r   r   r   r   toflattensqueezelistr   repeatappendrm   concatr   )r   r^   r   rf   r   
batch_sizesquence_lenchannelrD   rE   target_dtypepatch_embedsr   starttmp_embeddings
image_gridtr   r   endimage_embeddingsr   s                         rX   forwardzKeyeVisionEmbeddings.forward  s    ""'11!44L""# O   "/6<L$\3MNNL//l0S0STTL%--b1199"==J' XN,F!%"0    J(GAq!!a%!)+C'1%)QQQ,'?$556F1dSS 1 '
 (8:L'L$"))*:;;;EE"\.a@@@JJ1MM

'$*I*I,*W*WW
; $$&&; ; ;  rZ   )F)r   )NNF)rh   ri   rj   r   r   rm   rn   intboolr   r   FloatTensorr   tupler   __classcell__r   s   @rX   r   r      s<       
/ 
 
 
 
 
 
B #(" "L" " 	"
  " 
" " " "H" "c " " " ", -1!&2 2'2 lT)2 U3S=1DsC}9M4NNO
	2 
2 2 2 2 2 2 2 2rZ   r   qkcossinapply_rotary_embr   c                     |                     dd          d                                         }|                     dd          d                                         } || ||          } ||||          }||fS )Nr   r   r   r   )chunk
contiguous)r   r   r   r   r   q_embedk_embeds          rX   apply_rotary_pos_emb_flashattr   R  s     ))A2)

q
!
,
,
.
.C
))A2)

q
!
,
,
.
.Cq#s++Gq#s++GGrZ   c                        e Zd ZdZ	 	 ddededz  def fdZ	 	 	 	 dd	ej	        d
ej	        dz  de
dz  deej	                 dz  deej	        ej	        f         dz  dej	        fdZ xZS )KeyeSiglipAttentionzBMulti-headed attention from 'Attention Is All You
    Need' paper.N r   quant_configprefixc           
      n   t                                                       || _        |j        }|j        | _        t	                      }|rdnt                      }|j        | _        | j        |z  dk    sJ | j        |z  | _        |j        | _	        | j	        |k    r| j	        |z  dk    sJ n|| j	        z  dk    sJ t          d| j	        |z            | _        |j        | j        z  | _        | j        | j        z  | _        | j        | j        z  | _        | j        dz  | _        t!          || j        | j        | j	        d|| d          | _        t%          |||| d          | _        t)          | j        | j        | j        | j        | d	
          | _        t-          dd          | _        d S )Nr7   r   g      Tz	.qkv_projbiasr   r   z	.out_proj)
input_sizeoutput_sizer   r   z.attn)	num_heads	head_sizescalenum_kv_headsr   )enforce_enableenable_fp32_compute)r   r   r   r   rC   r   num_attention_headstotal_num_headsr   total_num_kv_headsrN   r   head_dimq_sizekv_sizer   r   qkv_projr   out_projr   attnr   r   )r   r   r   r   r   use_data_paralleltp_sizer   s          rX   r   zKeyeSiglipAttention.__init__f  s    	(!-466(T!!.R.T.T%9#g-2222-8"("<"g--*W499999T4499994#:g#EFF*d.BBnt}4(4=8]D(
)M #%'''
 
 
 *"#%'''	
 
 
 'nm**###
 
 
	 !/ $!
 !
 !
rZ   Fhidden_statesattention_maskoutput_attentions
cu_seqlensrope_embr   c                    |                      |          \  }}|                    | j        | j        | j        gd          \  }}	}
|dd          |d d         z
                                  }|y |j        g |j        d d         | j        | j        R  } |	j        g |	j        d d         | j	        | j        R  }	 |
j        g |
j        d d         | j	        | j        R  }
n|t          d          |\  }} |j        g |j        d d         | j        | j        R  } |	j        g |	j        d d         | j	        | j        R  }	t          ||	||| j                  \  }}	 |
j        g |
j        d d         | j	        | j        R  }
|                     ||	|
||          }t          |d          }|                     |          \  }}|S )Nr   r   r7   z4cu_seqlens cannot be None when rope_emb is not None.)queryr   valuer  
max_seqlenzb s h d -> b s (h d))r  splitr  r  rN   r   r   r   r  r   rP   r   r   r
  r   r	  )r   r  r  r  r  r  qkv_r   r   vr  r   r   context_layeroutputs                   rX   r   zKeyeSiglipAttention.forward  sq    }--Q))[$,5  
 
1a
 !nz#2#6;;==
DDdnDdmDDDA "!   A
  "!   AA ! !WXXXHCDDdnDdmDDDA "!   A
 1AsCAVWWDAq "!   A 		!! " 
 
 "-1GHHMM-00	rZ   Nr   )NFNN)rh   ri   rj   rk   r   r   strr   rm   rn   r   r   r   r   r   r   s   @rX   r   r   b  s         37	6
 6
 6
 )4/6
 	6
 6
 6
 6
 6
 6
v /3).04=A7 7|7 t+7  $;	7
 &-7 el23d:7 
7 7 7 7 7 7 7 7rZ   r   c                   N     e Zd Zd
dededdf fdZd Zdedej        fd	Z	 xZ
S )SigLIPRotaryEmbedding     @r   thetar   Nc                     t                                                       || _        || _        |                                  d S N)r   r   r   r!  	rope_init)r   r   r!  r   s      rX   r   zSigLIPRotaryEmbedding.__init__  s=    
rZ   c                     d| j         t          j        d| j        dt          j                  | j        z  z  z  }|                     d|d           d S )Ng      ?r   r   r   inv_freqFr   )r!  rm   r   r   floatr   )r   r&  s     rX   r$  zSigLIPRotaryEmbedding.rope_init  sW    J5<48QekJJJTXUV
 	ZeDDDDDrZ   seqlenc                     t          j        || j        j        | j        j                  }t          j        || j                  }|S )N)devicer   )rm   r   r&  r*  r   outer)r   r(  seqfreqss       rX   r   zSigLIPRotaryEmbedding.forward  sE    l='-%
 
 

 C//rZ   )r   )rh   ri   rj   r   r'  r   r$  rm   rn   r   r   r   s   @rX   r  r    s         C  D      E E Ec el        rZ   r  c                        e Zd Z	 	 ddededz  def fdZ	 	 	 ddej        d	ej        d
e	dz  de
ej                 dz  deej        ej        f         dz  deej                 fdZ xZS )KeyeSiglipEncoderLayerNr   r   r   r   c                 \   t                                                       |j        | _        t	          j        | j        |j                  | _        t          ||| d          | _	        t	          j        | j        |j                  | _
        t          ||| d          | _        d S )Nepsz
.self_attnr   r   z.mlp)r   r   r   r   r   	LayerNormlayer_norm_epslayer_norm1r   	self_attnlayer_norm2r=   mlpr   r   r   r   r   s       rX   r   zKeyeSiglipEncoderLayer.__init__  s     	+<F<QRRR,%(((
 
 

 <F<QRRR%???
 
 
rZ   Fr  r  r  r  r  r   c                     |}|                      |          }|                     |||||          }||z   }|}|                     |          }|                     |          }||z   }|S )N)r  r  r  r  r  )r6  r7  r8  r9  )r   r  r  r  r  r  residuals          rX   r   zKeyeSiglipEncoderLayer.forward  s     !((77')/! ' 
 
 !=0 ((77// =0rZ   r  )FNN)rh   ri   rj   r   r   r  r   rm   rn   r   r   r   r   r   r   r   s   @rX   r/  r/    s         37	
 
 
 )4/
 	
 
 
 
 
 
2 */04=A |   $;	
 &- el23d: 
u 	!       rZ   r/  c                   D    e Zd Z	 	 ddededz  def fdZed             Z	 	 	 	 	 	 	 	 	 	 dde	j
        dz  dedz  dedz  dee	j
                 dz  deeeeef         eeeeef                  z           dz  de	j
        dz  de	j
        dz  dedz  dedz  dedefdZ xZS )KeyeSiglipEncoderNr   r   r   r   c                 "   t                                                       | _        j        }j        }||z  }t          j        fdt          j                  D                       | _	        t          |dz            | _        d S )Nc           	      >    g | ]}t           d |           S )z.layers.r3  )r/  ).0	layer_idxr   r   r   s     rX   
<listcomp>z.KeyeSiglipEncoder.__init__.<locals>.<listcomp>0  sQ         '!-$99i99    rZ   r   )r   r   r   r   r  r   
ModuleListrangenum_hidden_layerslayersr  rotary_pos_emb)r   r   r   r   r   r   r  r   s    ```   rX   r   zKeyeSiglipEncoder.__init__$  s     	&	.		)m      "'v'?!@!@  	
 	
 4HMBBrZ   c                     t                      }| D ]B}t          |t                     r|                    |           -|                    |           C|S r#  )r   
isinstanceextendr   )rf   tmp_image_grid_thwr   s      rX   flatten_listzKeyeSiglipEncoder.flatten_list;  sa    !VV( 	6 	6J*d++ 6"))*5555"))*5555!!rZ   Fr   visionr  r  output_hidden_statesr  rf   height_position_idswidth_position_idsuse_ropewindow_sizevision_or_textr   c                 &   |j         }|}|	du r^|                     |          }||t                      }t                      }|D ]\\  }}}t          j        ||z  |z  |          ||z  z  }||z  }||z  }|                    |           |                    |           ]t          j        |d          }t          j        |d          }t          j        ||gd          }|                                dz   }| 	                    |          }||         
                    d          }|                    dd          }|                                |                                f}nd }|}|}|J | j        D ]} ||||||          }|S )	NT)r*  r   r   r   r7   r   )r  r  r  )r*  rM  r   rm   r   r   r   stackrN   rH  r   r   r   r   rG  )r   inputs_embedsr  r  rO  r  rf   rP  rQ  rR  rS  rT  r*  r  flatten_image_grid_thw
split_hids
split_widsr   r   r   
image_pidssample_hidssample_widspidsmax_grid_sizerope_emb_max_gridr  attn_cu_seqlensencoder_layers                                rX   r   zKeyeSiglipEncoder.forwardE  s    %%t%)%6%6~%F%F"!)-@-H!VV
!VV
5 3 3GAq!!&a!eai!G!G!G1q5!QJ",/K",q.K%%k222%%k2222%*\*!%D%D%D"&+l:1&E&E&E#;$&89  D !HHJJNM $ 3 3M B B(.66q99Hq!,,H 7HHH$%%%%![ 	 	M)M"3*!  MM rZ   r  )
NNNNNNNFr   rN  )rh   ri   rj   r   r   r  r   staticmethodrM  rm   rn   r   r   r   r   r   r   r   r   s   @rX   r>  r>  #  s        37	C C C )4/C 	C C C C C C. " " \" /3)-,0043726 %#%&8 8 t+8  $;	8
 #Tk8 &-8 U3S=1DsC}9M4NNO
8 #\D08 "L4/8 +8 D[8 8 
8 8 8 8 8 8 8 8rZ   r>  c            #           e Zd Z	 	 ddededz  def fdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd
edz  dedz  dedz  dej	        dz  dej	        dz  dej	        dz  dej	        dz  dej	        dz  dej	        dz  de
ej	                 dz  dej	        dz  dedz  de
eeeef         e
eeeef                  z           dz  dedz  dedz  dedz  def"dZ xZS )KeyeSiglipVisionTransformerNr   r   r   r   c                     t                                                       || _        |j        }t	          |          | _        t          ||| d          | _        t          j	        ||j
                  | _        d S )Nz.encoderr3  r1  )r   r   r   r   r   r   r>  encoderr   r4  r5  post_layernorm)r   r   r   r   r   r   s        rX   r   z$KeyeSiglipVisionTransformer.__init__  s     	&	.v66(%&&&
 
 

 !l9&:OPPPrZ   FTr   r  rO  r   r  sample_indicesimage_indicesr   rP  rQ  r  padding_maskvision_return_embed_listrf   return_pooler_outputrR  rS  r   c                    |                      ||||          }|                     ||||||||	|
|d          }|                     |          }t                      }|t	          d          t          |j        d         dz
            D ]N}||         }||dz            }|d d ||d d f                             d          }|                    |           O|S )N)r   r   rf   rN  )rW  r  rO  r  r  rf   rR  rP  rQ  rS  rT  zHcu_seqlens cannot be None for SiglipVisionTransformer output processing.r   r7   )	r   rg  rh  r   rP   rE  r   r   r   )r   r^   r  rO  r   r  ri  rj  r   rP  rQ  r  rk  rl  rf   rm  rR  rS  r  last_hidden_statesample_hidden_stateir   r   tensors                            rX   r   z#KeyeSiglipVisionTransformer.forward  s&   * %=%)	 ( 
 
 !LL'/!5)!) 31## ) 
 
 !//0ABB"ff=   z'*Q.// 	/ 	/AqMEQU#C&qqq%)QQQ7??BBF&&v....""rZ   r  )NNFNNNNNNNNFNTFr   )rh   ri   rj   r   r   r  r   r   rm   rn   r   r   r   r   r   r   r   s   @rX   re  re    s        37	Q Q Q )4/Q 	Q Q Q Q Q Q* *.,005.2.2-1,0372604,005,0 %#%'8# 8#  $;8# #Tk	8#
 #'+8# t+8# t+8# |d*8# lT)8# #\D08# "L4/8# &-8# lT)8# #'+8# U3S=1DsC}9M4NNO
8#" #Tk#8#$ +%8#& D['8#( 
$)8# 8# 8# 8# 8# 8# 8# 8#rZ   re  c                       e Zd ZeZdZ	 	 ddededz  def fdZe	de
j        fd	            Ze	de
j        fd
            Zdej        fdZ	 	 	 	 	 	 	 	 	 	 	 dde
j        dz  dedz  dedz  dede
j        dz  dedz  deeeeef         eeeeef                  z           dz  dee
j                 dz  dedz  dedz  dedz  defdZdeeee
j        f                  dee         fdZ xZS )KeyeSiglipVisionModelr^   Nr   r   r   r   c                     t                                                       t          ||| d          | _        || _        d S )Nz.vision_modelr3  )r   r   re  vision_modelr   r:  s       rX   r   zKeyeSiglipVisionModel.__init__  sT     	7%+++
 
 

 )rZ   r   c                 8    | j         j        j        j        j        S r#  )rv  r   r   r   r   r   s    rX   r   zKeyeSiglipVisionModel.dtype  s     +;BHHrZ   c                 8    | j         j        j        j        j        S r#  )rv  r   r   r   r*  rx  s    rX   r*  zKeyeSiglipVisionModel.device  s     +;BIIrZ   c                 $    | j         j        j        S r#  )rv  r   r   rx  s    rX   get_input_embeddingsz*KeyeSiglipVisionModel.get_input_embeddings  s     +;;rZ   FTr   ri  r  rO  r   r   rl  rf   r  rm  rR  rS  c                 D    |                      |||||||||	|
||          S )N)r^   r  rO  r   r   rl  rf   ri  r  rm  rR  rS  )rv  )r   r^   ri  r  rO  r   r   rl  rf   r  rm  rR  rS  s                rX   r   zKeyeSiglipVisionModel.forward  sE        %/!5%=%%=))!!5# ! 
 
 	
rZ   weightsc                    g d}t          |                     d                    }t                      }|D ]\  }}d|v rd|v sd|v rd|v sd|v r| j        ~| j                            |          x}rb||         }t          |d	t                    }	|                                d
k    r|n|d
         } |	||           |                    |           |D ]i\  }
}}||vr|	                    ||
          }|
                    d          r||vr;t          ||           rL||         }|j        }	 |	|||            nk|
                    d          r||vr)t          ||          }|=t          ||           rO||         }t          |d	t                    }	 |	||           |                    |           |S )N))r  q_projr   )r  k_projr   )r  v_projr  F)remove_duplicatezrotary_emb.inv_freqzhead.attentionzhead.layernormzhead.mlpz
head.probeweight_loaderr   z.bias)r   named_parameterssetr   get_cache_scalegetattrr   r   addreplaceendswithrA   r  r    )r   r}  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
scale_nameparamr  
param_nameweight_nameshard_ids                rX   load_weightsz"KeyeSiglipVisionModel.load_weights  so   "
 "
 "

 400%0HHII"%%%#* 5	$ 5	$D-$,,4''+;t+C+CT!!\T%9%9 ,"/??EEE
 - $J/ '#)! ! &3%6%6%8%8A%=%=MM=QRCS  e]333!!*---
 (	4 4 d**||K<<==)) d+.E.E*466 #D) % 3e]H=====)) d+.E.E0{CC<*466 #D) '#)! !
 e]333d####rZ   r  )NNNFNFNNTFr   )rh   ri   rj   r   config_classmain_input_namer   r  r   propertyrm   r   r*  r   Moduler{  rn   r   r   r   r   r   r   r   r  r  r   r   s   @rX   rt  rt    s8       #L$O
 37	) ) ) )4/) 	) ) ) ) ) ) Iu{ I I I XI J J J J XJ<bi < < < < /3)-,0).,00504,0 %#%
 
 t+
  $;	

 #Tk
 #'
 lT)
 #'+
 U3S=1DsC}9M4NNO

 &-
 #Tk
 +
 D[
 
$
 
 
 
>>HU33D-E$F >3s8 > > > > > > > >rZ   rt  c            	            e Zd Z	 	 ddedededz  def fdZdej        e	ej                 z  d	e	e
eeef                  d
ej        e	ej                 z  fdZ xZS )	ProjectorNr   text_configvision_configr   r   c                    t                                                       || _        || _        d| _        | j        j        | j        d         z  | j        d         z  | _        t          j                            | j        j        d          | _	        t                      | _        t          | j        | j        d|| d          | _        t          | j        | j        j        d|| d	          | _        d S )
N)r   r   r   r7   gh㈵>r1  Tz	.linear_1r   z	.linear_2)r   r   r  r  merge_kernel_sizer   rm   r   r4  pre_normr   actr   linear_1r   linear_2)r   r  r  r   r   r   s        rX   r   zProjector.__init__M  s    	&*!' *$Q'($Q'( 	 **4+=+Iu*UU!##,%'''
 
 
 *(%'''
 
 
rZ   image_featuresrf   r   c           
      <   | j         \  }}t          |t          t          f          rt                      }t	          ||          D ]\  }}|                     |          }|\  }}	}
t          |d||	|z  ||
|z  |          }|                     |          \  }}|                     |          }| 	                    |          \  }}|
                    |           |S |j        d d         }|j        d         }|                    t          j        |          |          }|                     |                              d| j                  }|                     |          }|                     |          }| 	                    |          } |j        g |dR  S )Nz$(t h p1 w p2) d -> (t h w) (p1 p2 d))r   r   p1r   p2r   )r  rJ  r   r   zipr  r   r  r  r  r   r   r   npprodr   )r   r  rf   m1m2processed_featuresimage_featurer   r   r   r   r  r  dimsr   s                  rX   r   zProjector.forwardq  s   
 'BntUm44 	&!%-0-P-P 9 9)z $m < <$1a )!:2g2g! ! ! $(==#?#? q $ 7 7#'==#?#? q"))-8888%%#CRC("2&',,RWT]]C@@n55::2t?OPPm44//m44!}!,4,,,,,rZ   r  )rh   ri   rj   r   r   r  r   rm   rn   r   r   r   r   r   r   s   @rX   r  r  L  s        
 37"
 "
%"
 ("
 )4/	"

 "
 "
 "
 "
 "
 "
H$-tEL'99$- U3S=12$- 
U\*	*	$- $- $- $- $- $- $- $-rZ   r  	hf_inputsc           
          |                      dt          j        d                    }|                    d          }|                      dt          j        d                    }|                    d          }t	          t          j        d|          t          j        d|          t          j        d          t          j        d|          t          j        d|          t          j        d                    S )Nrf   )r   ra   r   r{   imagevideo)r^   rr   rf   ry   r~   r{   )r   rm   emptyr  r   r'   flat_from_sizesbatched)r  rf   image_grid_sizesr{   video_grid_sizess        rX   _keye_field_configr    s     ]]#3U[5H5HIIN%**2..]]#3U[5H5HIIN%**2..*:7DTUU*:7DTUU,4W==1A%
 
 +:7DTUU,4W==	 	 	 	rZ   c                        e Zd Zdeeej        f         ee         z  de	e
e
f         dz  f fdZdeeej        f         ee         z  de	e
e
f         dz  f fdZ xZS )KeyeMultiModalDataParserdatar   Nc                     t          |t                    rt          |dddht                    S t	                                          |          S )Nr  rr   rf   modalityrequired_fieldsfields_factory)rJ  r   r*   r  r   _parse_image_datar   r  r   s     rX   r  z*KeyeMultiModalDataParser._parse_image_data  ^     dD!! 		% "$!  2    ww((...rZ   c                     t          |t                    rt          |dddht                    S t	                                          |          S )Nr  r~   r{   r  )rJ  r   r*   r  r   _parse_video_datar  s     rX   r  z*KeyeMultiModalDataParser._parse_video_data  r  rZ   )rh   ri   rj   r   r  rm   rn   r$   r#   r,   r	   r  r)   r  r   r   s   @rX   r  r    s        /3$%Y(??/ 
38	$t	+/ / / / / /"/3$%Y(??/ 
38	$t	+/ / / / / / / / / /rZ   r  c                   0   e Zd ZdefdZdefdZdefdZdee	edz  f         fdZ
ded	ee	ef         dee	ef         fd
Zddddededededeeef         f
dZdededefdZdedededefdZdefdZdefdZdedefdZdedefdZdedefdZdS )KeyeProcessingInfor   c                     dS )Ni ro   rx  s    rX   get_max_image_sizez%KeyeProcessingInfo.get_max_image_size  s    wrZ   c                     dS )N   ro   rx  s    rX   get_max_frame_per_videoz*KeyeProcessingInfo.get_max_frame_per_video  s    rrZ   kwargsc                 &     | j         di |j        S )Nro   )get_hf_processorimage_processor)r   r  s     rX   get_image_processorz&KeyeProcessingInfo.get_image_processor  s    $t$..v..>>rZ   Nc                     d d dS Nr  r  ro   rx  s    rX   get_supported_mm_limitsz*KeyeProcessingInfo.get_supported_mm_limits  s     ---rZ   seq_len	mm_countsc                 V    |                                  |                     |          dS r  )get_max_image_tokensget_max_video_tokens)r   r  r  s      rX   get_mm_max_tokens_per_itemz-KeyeProcessingInfo.get_mm_max_tokens_per_item  s3     ..00..w77
 
 	
rZ   r7   T)
num_frames	do_resizeimage_widthimage_heightr  r  c                   ||                                  }|                                 }|j        }|j        }|j        }	d}
|r6t          ||||	z  |j        |j                  \  }}t          ||          }nt          ||          }|||
z  z   }t          ||
z  d          }|j
        |z  }|j        |z  }||z  |z  }||	dz  z  }||fS )Nr7   )rD   rE   rF   rG   rH   )rE   rD   r   )r  get_hf_configr  r   spatial_merge_sizerY   rG   rH   r+   rN   rD   rE   )r   r  r  r  r  r  	hf_configr  r   
merge_sizetemporal_patch_sizeresized_heightresized_widthpreprocessed_sizepadded_num_framesgrid_tgrid_hgrid_wr   num_vision_tokenss                       rX   _get_vision_infoz#KeyeProcessingInfo._get_vision_info  s    ""6688O&&((	!/"-
"5
 
	R,8#!!J.*5*5- - -)NM !*n U U U )L Q Q Q&6I)II&*==qAA")Z7"(J6vo.'JM: "333rZ   c                <    |                      |||          \  }}|S N)r  r  r  r  )r   r  r  r  r  num_image_tokenss         rX   get_num_image_tokensz'KeyeProcessingInfo.get_num_image_tokens  s5     #33#%+ 4 
 

  rZ   c                >    |                      ||||          \  }}|S Nr  r  r  r  r  )r   r  r  r  r  r  num_video_tokenss          rX   get_num_video_tokensz'KeyeProcessingInfo.get_num_video_tokens!  s8     #33#%!+	 4 
 
  rZ   c                     |                      |                                 |                                 d           \  }}|S r  )r  r  )r   max_image_sizer  s      rX   !get_image_size_with_most_featuresz4KeyeProcessingInfo.get_image_size_with_most_features1  sL     !11//110022  2 
 

 rZ   c                 `    |                                  \  }}|                     ||d           S r  )r  r  )r   target_widthtarget_heights      rX   r  z'KeyeProcessingInfo.get_max_image_tokens;  s>    &*&L&L&N&N#m(($&  ) 
 
 	
rZ   
max_tokensc                     |                                  \  }}d}	 |dz   }|                     |||d           }||k    rn|}(|S )Nr   Tr7   r  )r  r  )r   r   r  r  r  next_num_framesnext_max_tokenss          rX   _get_max_video_framesz(KeyeProcessingInfo._get_max_video_framesD  ss    &*&L&L&N&N#m
	)(1nO"77(** $	 8  O ++(J	) rZ   c                 l   | j                                         }|                    d          }|                    d          }|                                 |z  }|                     ||z
            }t          |t          |d          z  |                                           }t          |d          S )Nr  r  r7   )ctxget_mm_configget_limit_per_promptr  r  rO   rN   r  )r   r  	mm_config
max_images
max_videosmax_image_tokensmax_total_framesmax_frames_per_videos           rX   !get_num_frames_with_most_featuresz4KeyeProcessingInfo.get_num_frames_with_most_featuresY  s    H**,,	33G<<
33G<<
4466C55g@P6PQQ"J 2 22((** 
  

 '+++rZ   c                     |                                  \  }}|                     |||                     |          d           S r  )r  r  r  )r   r  r  r  s       rX   r  z'KeyeProcessingInfo.get_max_video_tokensg  sO    &*&L&L&N&N#m(($&==gFF 	 ) 
 
 	
rZ   )rh   ri   rj   r   r  r  objectr  r   r  r  r  r   r   r+   r  r  r  r  r  r  r  r  ro   rZ   rX   r  r    s=       C        ?F ? ? ? ?.	cDj	!. . . .


 38$
 
c		
 
 
 
 '4 '4 '4 '4 	'4
 '4 '4 
y#~	'4 '4 '4 '4R    	  
           	 
   
        	   
c 
 
 
 
     *, , , , , ,
C 
C 
 
 
 
 
 
rZ   r  _I)boundc            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	KeyeBaseDummyInputsBuilderr  r   c                     |                     dd          }|                     dd          }| j                                        }|j        }|j        }||z  ||z  z   S )Nr  r   r  )r   infor  image_tokenvideo_token)r   r  
num_images
num_videoshf_processorr  r  s          rX   get_dummy_textz)KeyeBaseDummyInputsBuilder.get_dummy_textv  s`    ]]7A..
]]7A..
y1133'3'3Z'+
*BBBrZ   Nr  
mm_optionsc                    |                     dd          }|                     dd          }| j                                        \  }}| j                            |          }|r|                     d          nd }	|r|                     d          nd }
|                     ||||	          |                     |||||
          d}|S )Nr  r   r  )rE   rD   r  	overrides)rE   rD   r  r  r   r  )r   r  r  r  _get_dummy_images_get_dummy_videos)r   r  r  r  r  r  r  r  target_num_framesimage_overridesvideo_overridesmm_datas               rX   get_dummy_mm_dataz,KeyeBaseDummyInputsBuilder.get_dummy_mm_data  s     ]]7A..
]]7A..
&*i&Q&Q&S&S#m IGGPP5?I*..111T5?I*..111T ++"$%)	 ,   ++"$,%) ,  
 
  rZ   r#  )
rh   ri   rj   r   r  r   r  r   r%   r'  ro   rZ   rX   r  r  u  s        CS(9 Cc C C C C =A	  38$ C!112T9	
 
     rZ   r  c                       e Zd ZdS )KeyeDummyInputsBuilderN)rh   ri   rj   ro   rZ   rX   r)  r)    s          rZ   r)  c            	           e Zd ZdefdZdedeeef         de	de
e         fdZdedeeef         deeef         fdZd	S )
KeyeMultiModalProcessorr   c                     t                      S r#  )r  rx  s    rX   _get_data_parserz(KeyeMultiModalProcessor._get_data_parser  s    ')))rZ   mm_itemshf_processor_mm_kwargsout_mm_kwargsc                 @  	
  | j         j        di |} | j         j        di |}| j                                         }|                                }||j                 ||j                 d
|j        dz  	dt          dt          f	
fd
fddD             S )Nr  r   item_idxr  c                     |         |          }|| d         j         }t          |t          j                  sJ t	          |                                          z  }|         g|z  S )N	_grid_thw)r  rJ  rm   rn   r   r  )r2  r  out_itemgrid_thw
num_tokensmerge_lengthr0  placeholders        rX   get_replacement_keyezIKeyeMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_keye  sm    $X.x8H8 6 6 67<Hh55555X]]__--=J)*Z77rZ   c           
      `    g | ]*}t          ||         gt          |                     +S ))r  )r  targetreplacement)r2   r   )rA  r  r:  r9  s     rX   rC  z?KeyeMultiModalProcessor._get_prompt_updates.<locals>.<listcomp>  sY     
 
 
  !#H-.#$88LLL  
 
 
rZ   ro   )
r  r  r  get_tokenizer	get_vocabr  r  r  r   r  )r   r.  r/  r0  r  r  	tokenizervocabr:  r8  r9  s      `    @@@rX   _get_prompt_updatesz+KeyeMultiModalProcessor._get_prompt_updates  s     2ty1KK4JKK7$)7QQ:PQQI++--	##%% <34<34
 

 '114	83 	8# 	8 	8 	8 	8 	8 	8 	8 	8
 
 
 
 
 /
 
 
 	
rZ   r  c                      t          |          S r#  )r  )r   r  r/  s      rX   _get_mm_fields_configz-KeyeMultiModalProcessor._get_mm_fields_config  s    
 "),,,rZ   N)rh   ri   rj   r.   r-  r-   r   r  r	   r(   r   r3   rB  r   r  r'   rD  ro   rZ   rX   r+  r+    s        *"6 * * * *!
%!
 !(S 1!
 -	!

 
,	!
 !
 !
 !
F-- !(V 4- 
++	,	- - - - - -rZ   r+  c                   f    e Zd Zg dddgdZ eddd          Zed	ed
ededz  fd            Z	ddde
def fdZe	 	 d,dedededz  dedej        f
d            Zdedeej        df         fdZ	 d-ded         deej                 dej        dz  dej        eej                 z  fdZdedefd Zdededz  fd!Z	 	 d.d"ej        d#ej        d$edz  d%ej        dz  dedej        ez  fd&Z d'ej        dej        dz  fd(Z!d)e"eeej        f                  de#e         fd*Z$de%fd+Z& xZ'S )/BaseKeyeModule)r  r  r  	gate_projup_proj)r  gate_up_projzlanguage_model.lm_head.zlanguage_model.model.)zlm_head.zmodel.)orig_to_new_prefixr  rq  r   Nc                 |    |                     d          rdS |                     d          rdS t          d          )Nr  z+<|vision_start|><|image_pad|><|vision_end|>r  z+<|vision_start|><|video_pad|><|vision_end|>z)Only image or video modality is supported)
startswithrP   )clsr  rq  s      rX   get_placeholder_strz"BaseKeyeModule.get_placeholder_str  sK    w'' 	A@@w'' 	A@@DEEErZ   r   )r   vllm_configr   c          
      Z   t                                                       |j        j        }|j        }|| _        |                     |ddh          5  t          |j        |t          |d                    | _
        |                     ||j        |t          |d                    | _        d d d            n# 1 swxY w Y   |                     |          5  t          |t          |d          dg          | _        d d d            n# 1 swxY w Y   | j        j        | _        d S )	Nr  r  visualr3  mlp_ARlanguage_modelQwen3ForCausalLM)rO  r   architectures)r   r   model_configr  r   r   _mark_tower_modelrt  r  rB   rQ  _build_projectorrR  _mark_language_modelr@   rS  make_empty_intermediate_tensors)r   rO  r   r   r   r   s        rX   r   zBaseKeyeModule.__init__  s   #.#;#E"/##K'71CDD 	 	/$)#FH55  DK
 //$)#FH55	 0  DK	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 &&{33 	 	"<'#F,<==12# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ? 	,,,s%   AB<<C C 'DDDr  r  r   c                      t          d          )NzNeed projector)NotImplementedErrorr   r  r  r   r   s        rX   rX  zBaseKeyeModule._build_projector  s     ""2333rZ   image_input.c                    t                      }t                      }t                      }dg}|d         }|j        dk    sJ t          |          D ]\  }}t          |                                                                                                                                          }	t          j	        |	          }
|
                    |	           t          j        |
          t          j	        |	dd                    z  }|
                    |           |
                    t          j        |
f|t          j                             |
                    |d         |
z              !|d         dk    rt          d	          |d
                             | j        j                  }t          j        |d                              |j                  }t          j        |t          j                                      |j                  }t          j        |d                              |j                  }|                     |||dd||dd	  	        }t          |                     ||                    }|S )Nr   rf   r   r7   r   r   r_   rr   z<Image embeddings are not supported for this processing path.r^   r   FT	r^   rf   r   rl  r   ri  r  rR  rS  r   ndim	enumerater   detachcpunumpytolistr  r  r   rm   r   fullint64rP   r_   rQ  r   r   r   r*  rr  int32rR  )r   r^  siglip_position_idsimage_grid_hwsri  r  rf   idxthaw	thw_tuplenumelimage_position_idsr^   rr   s                 rX   _process_image_inputz#BaseKeyeModule._process_image_input  s[   "ffS
$%56"a''''">22 	6 	6ICdkkmm//117799@@BBCCIGI&&E!!),,,!&e!4!4rwy}7M7M!M&&'9:::!!%*eXs%+"N"N"NOOOjnu45555v.00N   '~6;;DK<MNNL"',/B"J"J"J"M"M## # jDDDGG# J #\.a@@@CCLDWXXN;;)-0).)--% ' 
 
L !\>!J!JKKLrZ   
video_type)r~   ry   r{   ry   c                    t                      }t                      }t                      }dg}|j        dk    sJ t          |          D ]\  }}	t          |	                                                                                                                                          }
t          j	        |
          }|
                    |
           t          j        |          t          j	        |
dd                    z  }|
                    |           |
                    t          j        |f|t          j                             |
                    |d         |z              !|dk    rt          d          |                    | j        j                  }t          j        |d                              |j                  }t          j        |t          j                                      |j                  }t          j        |d                              |j                  }|                     |||d	d	||d	d
	  	        }|                     ||          }|S )Nr   r   r7   r   r   r~   z<Video embeddings are not supported for this processing path.r   Tr`  ra  )r   rs  r{   ry   rk  video_grid_hwsri  r  rm  sub_thwro  rp  video_position_idsr~   s                 rX   _process_video_embedsz$BaseKeyeModule._process_video_embedsH  sJ    #ffS
"a''''%n55 	6 	6LCgnn..2244::<<CCEEFFIGI&&E!!),,,!&e!4!4rwy}7M7M!M&&'9:::!!%*eXs%+"N"N"NOOOjnu45555''N   #6":":4;;L"M"M"',/B"J"J"J"M"M#*# # jDDDGG#* J #\.a@@@CC#* N  ;;0-0)-)--% ' 
 
L  ;;|^DDLrZ   r  c                 t    i }|D ]2}|dv rd|vr | j         di ||d<   |dv rd|vr | j        di ||d<   3|S )N)r^   rr   images)ry   r~   videosro   )_parse_and_validate_image_input_parse_and_validate_video_input)r   r  
modalities	input_keys       rX   %_parse_and_validate_multimodal_inputsz4BaseKeyeModule._parse_and_validate_multimodal_inputs|  s    
 
	V 
	VI===J..'Kt'K'U'Uf'U'U
8$DDDJ..'Kt'K'U'Uf'U'U
8$rZ   c                 
    | j         di |}|sd S d}|D ]l}|dk    r/|d         }|                     |          }|t          |          z  }|dk    r/|d         }|                     |          }|t          |          z  }m|S )Nro   rz  r{  )r  rr  r   _process_video_input)	r   r  r~  multimodal_embeddingsr  r^  r   video_inputvideo_embeddingss	            rX   embed_multimodalzBaseKeyeModule.embed_multimodal  s    ?T?II&II
 	4:<" 	A 	AH8##(2#'#<#<[#I#I %/?)@)@@%8##(2#'#<#<[#I#I %/?)@)@@%$$rZ   	input_ids	positionsintermediate_tensorsrW  c                 J    |d}| j                             ||||          }|S )aU  Run forward pass for Keye-VL.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
            positions: Flattened (concatenated) position ids corresponding to a
                batch.
                **NOTE**: If mrope is enabled (default setting for Qwen2-VL
                opensource models), the shape will be `(3, seq_len)`,
                otherwise it will be `(seq_len,)`.
            intermediate_tensors: Intermediate tensors from prior forward pass.
            inputs_embeds: Optional tensor of input embeddings.
        N)r  r  r  rW  )rS  model)r   r  r  r  rW  r  r  s          rX   r   zBaseKeyeModule.forward  s@    *  + M+11!5'	 2 
 
 rZ   r  c                 6    | j                             |          S r#  )rS  compute_logits)r   r  s     rX   r  zBaseKeyeModule.compute_logits  s     "11-@@@rZ   r}  c                 X    t          |           }|                    || j                  S )N)mapper)r>   r  hf_to_vllm_mapper)r   r}  loaders      rX   r  zBaseKeyeModule.load_weights  s+    "4((""743I"JJJrZ   c                 0    t          j        ddd          S )z+Get the module prefix in multimodal models.rS  zmlp_AR.zvisual.)rS  	connectortower_model)r!   from_string_fieldrx  s    rX   get_mm_mappingzBaseKeyeModule.get_mm_mapping  s%    /+!
 
 
 	
rZ   r  r#  )NN)(rh   ri   rj   packed_modules_mappingr?   r  classmethodr  r   rN  r   r   r   r   r   r   r  rX  r	   r   rm   rn   rr  r
   r   rx  r  r   r  r8   r  r4   r   r  r   r  r  r!   r  r   r   s   @rX   rF  rF    s4       
 
 
 

 
 &1-
 
   F3 F3 F3: F F F [F BD 
 
 
z 
3 
 
 
 
 
 
> 
 374 4%4 (4 )4/	4
 4 
4 4 4 ^4,  , elC>O8P ,  ,  ,  , d 48	2  2 AB2  U\*2  #\D0	2 
 
U\*	*2  2  2  2 hf     "% %4H44O % % % %, <@-1 < < 2D8	
 |d*  
+	+   BA|A 
	A A A AKHU33D-E$F K3s8 K K K K
 
 
 
 
 
 
 
 
rZ   rF  )r  dummy_inputsc                       e Zd Z	 	 ddedededz  dedej        f
dZd	e	de
dz  fd
Zd	e	dedz  fdZdedeej        df         fdZdee         dee         deej        ef         fdZdS )KeyeForConditionalGenerationNr   r  r  r   r   r   c                 &    t          ||||          S r#  )r  r]  s        rX   rX  z-KeyeForConditionalGeneration._build_projector  s     m\6JJJrZ   r  c                     |                     dd           }|                     dd           }|                     dd           }||d S |t          d||          S |t          d||          S d S )Nr^   rr   rf   )r_   r^   rf   )r_   rr   rf   )r   r\   rq   )r   r  r^   rr   rf   s        rX   r|  z<KeyeForConditionalGeneration._parse_and_validate_image_input  s     zz.$77zz.$77$4d;;L$84#'#)-    #+#)-    $#rZ   c                     |                     dd           }|                     dd           }|                     dd           }||d S |t          d||          S |t          d||          S d S )Nry   r~   r{   )r_   ry   r{   )r_   r~   r{   )r   rx   r}   )r   r  ry   r~   r{   s        rX   r}  z<KeyeForConditionalGeneration._parse_and_validate_video_input  s     %jj)>EEzz.$77$4d;;&<+?4*'*$7-    #+#)-    $#rZ   r  .c                     |d         }|d         }|                     dd           }t          |                     |||                    S )Nr_   r{   ry   )r   r   rx  )r   r  rs  r{   ry   s        rX   r  z1KeyeForConditionalGeneration._process_video_input  sV     !(
$%56)oo.CTJJ&&z>CVWW
 
 	
rZ   input_tokensmm_featuresc                 	   t          j        |ddh          }d |                    dg           D             }d |                    dg           D             }t          |t                    rt          |          dk    r|d         }dt          j        t          t                   z  dt          t          t                            fd} ||          }| j	        }|j
        }|j        }	|j        j        }
t          |          }t          |          }g }d}||}}d	\  }}t          ||z             D ]}|dk    r:	 |                    ||          }n4# t           $ r t          |          d
z   }Y nw xY wt          |          d
z   }|dk    r:	 |                    |	|          }n4# t           $ r t          |          d
z   }Y nw xY wt          |          d
z   }||k     r||         \  }}}|d
z  }|d
z  }|}n||         \  }}}|d
z  }|d
z  }|}|||
z  ||
z  }}}||z
  }t          |          dk    r|d                                         d
z   nd}|                    t          j        |                              d
d                              dd          |z              t          j        |                              dd
                              d||z                                                                            }t          j        |                              d
dd
                              |d|                                          } t          j        |                              d
d
d                              ||d                                          }!|                    t          j        || |!g          |z   |z              |||z  |z  z   }|t          |          k     rt          |          dk    r|d                                         d
z   nd}t          |          |z
  }|                    t          j        |                              d
d                              dd          |z              t          j        |d
                              dd          }"|"                                d
z   t          |          z
                                  }#|"|#fS )Nrf   r{   c                 6    g | ]}|                                 S ro   rg  rA  items     rX   rC  zJKeyeForConditionalGeneration.get_mrope_input_positions.<locals>.<listcomp>(       UUUD$++--UUUrZ   c                 6    g | ]}|                                 S ro   r  r  s     rX   rC  zJKeyeForConditionalGeneration.get_mrope_input_positions.<locals>.<listcomp>)  r  rZ   r   r6  r   c                    t          | t                    r t          j        | t          j                  } |                                 dk    rg S | dddf         | ddddf         }}t          j        |ddddf                   }t          j        ||gd                              |d          }|	                                S )a  
            Split grid_thw along the t dimension.

            Args:
                grid_thw: shape [N, 3] tensor or nested list of [t, h, w].

            Returns:
                List of [1, h, w] rows, repeated t times for each original row.
            r   r   Nr7   r   )
rJ  r   rm   rr  longrp  	ones_likecatrepeat_interleaverg  )r6  r   hwonesouts        rX   	split_thwzIKeyeForConditionalGeneration.get_mrope_input_positions.<locals>.split_thw.  s     (D)) D <
CCC~~1$$	QQQTNHQQQUOrA?2aaa!e9--D)T2JA...@@@JJC::<<rZ   )r   r   r7   r   ra   r   )r&   gather_kwargsr   rJ  r   r   rm   rn   r   r   image_token_idvideo_token_idr  r  rE  indexrP   rN   r   r   r   r   r  r   rV  r  r   r  )$r   r  r  r  rf   r{   r  r  r  r  r  
image_nums
frame_numsllm_pos_ids_liststremain_imagesremain_framesimage_indexvideo_indexr  ed_imageed_videor   r   r   ed
llm_grid_t
llm_grid_h
llm_grid_wtext_lenst_idxt_indexh_indexw_indexllm_positionsmrope_position_deltas$                                       rX   get_mrope_input_positionsz6KeyeForConditionalGeneration.get_mrope_input_positions  sQ   
 '4/0
 
 VUFJJ?OQS4T4TUUUUUFJJ?OQS4T4TUUUnd++ 	/N0C0Ca0G0G+A.N	 tCy 8 	 T$s)_ 	  	  	  	 , #>22K	"1"1&4G((
((
!#'1:}#' [zJ.// @	; @	;Aq  5+11."EEHH! 5 5 5"<0014HHH5 |,,q0q  5+11."EEHH! 5 5 5"<0014HHH5 |,,q0(""(51aq "(51aq " '''' %/
J
 BwH7:;K7L7Lq7P7P%b)--//!33VWF##X&&++Ar2299!R@@6I   L,,T"a[[VB
Z 788  Z((aQ
B
33	  Z((aB
J33	  ##Wgw7888CfL   j:-
::BBL!!!!7:;K7L7Lq7P7P%b)--//!33VWF<((2-H##X&&++Ar2299!R@@6I   	"2:::BB1bII - 1 1 3 3a 7#l:K:K KQQSS222s$   ;EE10E1F$$GGr  )rh   ri   rj   r   r   r  r   r  rX  r  rv   r|  r   r}  r   rm   rn   r  r   r   r&   r  ro   rZ   rX   r  r    sG        37K K%K (K )4/	K
 K 
K K K K	4	   0	4	   0	
*	
	u|S 	!	
 	
 	
 	
@33i@3 /0@3 
u|S 	!	@3 @3 @3 @3 @3 @3rZ   r  )rQ   abcr   collections.abcr   r   r   	functoolsr   typingr   r	   r
   r   r   rf  r  rm   torch.nnr   einopsr   transformersr   transformers.activationsr   %transformers.feature_extraction_utilsr   transformers.modeling_outputsr   r   transformers.utilsr   vllm.configr   vllm.config.multimodalr   vllm.distributedr   vllm.loggerr   9vllm.model_executor.layers.attention.mm_encoder_attentionr   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   2vllm.model_executor.layers.rotary_embedding.commonr   -vllm.model_executor.model_loader.weight_utilsr   r    )vllm.model_executor.models.module_mappingr!   vllm.multimodalr"   vllm.multimodal.inputsr#   r$   r%   r&   r'   r(   r)   vllm.multimodal.parser*   r+   r,   r-   r.   vllm.multimodal.processingr/   r0   r1   r2   r3   vllm.sequencer4   vllm.utils.tensor_schemar5   r6   
interfacesr8   r9   r:   r;   r<   siglipr=   utilsr>   r?   r@   rA   rB   rN  rC   rh   rK   r   rY   r\   rq   rv   rl   rx   r}   r   r  r   rn   r   r   r   r  r/  r>  re  rt  r  r  r  r  r  r  r  r)  r+  rF  register_processorr  ro   rZ   rX   <module>r     s!	          7 7 7 7 7 7 7 7 7 7       > > > > > > > > > > > > > >                  ) ) ) ) ) ) 3 3 3 3 3 3 > > > > > > U U U U U U U U ( ( ( ( ( ( " " " " " " 3 3 3 3 3 3 A A A A A A # # # # # #      8 7 7 7 7 7         
 G F F F F F             E D D D D D / / / / / /                                            . - - - - - > > > > > > > >                                 - , , , , ,	X		((( ( 	(
 ( ( ( (VB B B B B< B B B"B B B B B| B B B 24LL L L LB B B B B< B B B"B B B B B| B B B 24LL L L LF F F F F29 F F FR|| 
 
	
 % 5<%&    s s s s s") s s sl    BI   .1 1 1 1 1RY 1 1 1hZ Z Z Z Z	 Z Z ZzK# K# K# K# K#") K# K# K#\{ { { { {BI { { {|I- I- I- I- I-	 I- I- I-XsEL()   *!/ !/ !/ !/ !/3 !/ !/ !/H^
 ^
 ^
 ^
 ^
+ ^
 ^
 ^
B WT+,,,* * * * *!7!; * * *Z R Q Q Q Q78JK Q Q Q,- ,- ,- ,- ,-56HI ,- ,- ,-^|
 |
 |
 |
 |
RY 2 |
 |
 |
~ ('	'  
G3 G3 G3 G3 G3&j-G3 G3 
G3 G3 G3rZ   