
    .`iH              
       `   U d Z ddlZddlZddlmZmZmZmZ ddlm	Z	 ddl
mZmZmZmZ ddlZddlZddlmZ ddlmc mZ ddlmZ ddlmZmZ ddlmZ dd	lmZm Z  dd
l!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z(m)Z) ddl*m+Z+m,Z, ddl*m-Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5 ddl6m7Z7 ddl8m9Z9m:Z:m;Z;m<Z< ddl=m>Z> ddl?m@Z@ ddlAmBZB ddlCmDZD ddlEmFZF ddlGmHZH ddlImJZJmKZKmLZLmMZMmNZN ddlOmPZPmQZQmRZR ddlSmTZTmUZUmVZVmWZWmXZXmYZY ddlZm[Z[ ddl\m]Z]m^Z^ dd l_m`Z` d!d"lambZb d#d$lcmdZdmeZemfZfmgZgmhZh d#d%limjZj d#d&l-mkZkmlZlmmZmmnZn d#d'lompZpmqZqmrZr  e0es          Ztd(Zu G d) d*e]          Zv G d+ d,e]          Zwevewz  Zxeeyd-<    G d. d/e]          Zz G d0 d1e]          Z{eze{z  Z|eeyd2<    G d3 d4ej}                  Z~d5ed6efd7Z G d8 d9ej}                  Z G d: d;ej}                  Z G d< d=ej}                  Z G d> d?ej}                  Z G d@ dAej}                  Z G dB dCej}                  Z G dD dEeV          Z G dF dGeTe                   Z G dH dIeUe                   Z eHj        eeeJ           G dK dLej}        egeeehef                      Z eHj        eeeJ           G dM dNe                      ZdS )Oz@Inference-only GLM-4V model compatible with HuggingFace weights.    N)CallableIterableMappingSequence)partial)	AnnotatedAnyLiteral	TypeAlias)	rearrange)BatchFeatureGlm4vProcessor)Glm4vVisionConfig)Glm4vImageProcessorsmart_resize)Glm4vVideoProcessor)VideoMetadata)
VllmConfig)BaseDummyOptionsVideoDummyOptions)$get_tensor_model_parallel_world_sizeparallel_state)utils)init_logger)MMEncoderAttention)Conv2dLayerConv3dLayer)RMSNorm)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)get_rope)ApplyRotaryEmb)default_weight_loader)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFeatureSpecMultiModalFieldConfigMultiModalKwargsItems	VideoItem)	ImageSizeMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape)AttentionBackendEnum   )
SiluAndMul   )MultiModalEmbeddingsSupportsLoRASupportsMRoPESupportsMultiModal
SupportsPP)_create_qwen2vl_field_factory)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix)get_vit_attn_backendis_vit_use_data_parallel!run_dp_sharded_mrope_vision_modeliX  c                       e Zd ZU dZdZed         ed<   eej	         e
dd          f         ed<   eej	         e
dd          f         ed<   d	S )
Glm4vImagePixelInputsz
    Dimensions:
        - np: Number of patches
        - cpp: Number of channels * patch_size * patch_size
        - ni: Number of images
        - g: Grid dimensions (3 for grid_t, grid_h, grid_w)
    pixel_valuestypenpcppni   image_grid_thwN__name__
__module____qualname____doc__rN   r
   __annotations__r   torchTensorr9        v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/glm4_1v.pyrL   rL   z   s{           %3D'.
!222EL++dE*B*BBCCCCelKKa,@,@@AAAAAAr]   rL   c                       e Zd ZU dZdZed         ed<   eej	         e
dd          f         ed<   eej	         e
dd          f         ed<   d	S )
Glm4vImageEmbeddingInputsz
    Dimensions:
        - f: Number of image features (varies based on image resolution)
        - h: Hidden size (must match language model backbone)
        - n: Number of images
        - g: Grid dimensions (3 for grid_t, grid_h, grid_w)
    image_embedsrN   fhnrR   rS   NrT   r\   r]   r^   r`   r`      s{           %3D'.
!222EL++c3*?*??@@@@elKKQ,?,??@@@@@@r]   r`   Glm4vImageInputsc                       e Zd ZU dZdZed         ed<   eej	         e
dd          f         ed<   eej	         e
dd          f         ed<   d	S )
Glm4vVideoPixelInputsa  
    Dimensions:
        - np: Number of patches
        - ctpp: Number of channels * temporal_patch_size *
            patch_size * patch_size
        - f: Number of frames
        - g: Grid dimensions (3 for grid_t which is usually 1 for processed
          video, grid_h, grid_w)
    pixel_values_videosrN   rO   ctpprb   rR   video_grid_thwNrT   r\   r]   r^   rg   rg      s}           ,AD''
(@@@"5<T61J1J#JKKKKelKKQ,?,??@@@@@@r]   rg   c                       e Zd ZU dZdZed         ed<   eej	         e
dd          f         ed<   eej	         e
dd          f         ed<   d	S )
Glm4vVideoEmbeddingInputsa  
    Dimensions:
        - p: Number of video patches across all frames
        - h: Hidden size (must match language model backbone)
        - f: Number of frames
        - g: Grid dimensions (3 for grid_t which is usually 1 for processed
          video, grid_h, grid_w)
    video_embedsrN   prc   rb   rR   rj   NrT   r\   r]   r^   rl   rl      s{           %3D'.
!222EL++c3*?*??@@@@elKKQ,?,??@@@@@@r]   rl   Glm4vVideoInputsc                   X     e Zd Z	 	 	 ddededededz  def
 fd	Zd
ej	        fdZ
 xZS )Glm4vVisionMLPFN in_featureshidden_featuresbiasquant_configprefixc                    t                                                       t                      }t          ||gdz  ||| d|          | _        t          ||||| d|          | _        t                      | _        d S )Nr;   .gate_up_proj
input_sizeoutput_sizesru   rv   rw   
disable_tp
.down_projru   rv   rw   r}   )	super__init__rI   r    gate_up_projr"   	down_projr<   act_fn)selfrs   rt   ru   rv   rw   use_data_parallel	__class__s          r^   r   zGlm4vVisionMLP.__init__   s     	4666")*Q.%+++(
 
 
 +%((((
 
 
 !llr]   xc                     |                      |          \  }}|                     |          }|                     |          \  }}|S N)r   r   r   )r   r   _s      r^   forwardzGlm4vVisionMLP.forward   sB      ##1KKNN~~a  1r]   )FNrr   )rU   rV   rW   intboolr#   strr   rZ   r[   r   __classcell__r   s   @r^   rq   rq      s        
 26# ## # 	#
 )4/# # # # # # #8        r]   rq   hidden_sizetp_sizec                     ddl m}  fdt                    D             }|                    | t	          j                    j                   fd|D             }d t          | D             }t          j	        |d          }|S )	zEAll-gather the input tensor interleavely across model parallel group.r   Nc                 8    g | ]}t          j                  S r\   )rZ   
zeros_like).0r   local_tensors     r^   
<listcomp>z)all_gather_interleave.<locals>.<listcomp>   s$    OOO1(66OOOr]   )groupc                 B    g | ]}t          j        |z  d           S ))rZ   split)r   tensorr   r   s     r^   r   z)all_gather_interleave.<locals>.<listcomp>   s9       <BFK72B77  r]   c                     g | ]	}|D ]}|
S r\   r\   )r   pairr   s      r^   r   z)all_gather_interleave.<locals>.<listcomp>   s9       d <B   r]   r   dim)
torch.distributeddistributedrange
all_gatherr   get_tp_groupdevice_groupziprZ   cat)r   r   r   distgathered_tensorsgathered_tensors_splitordered_tensorsresult_tensors   ```     r^   all_gather_interleaver      s    $$$$$$OOOOgOOOOO)++8        FV    67  O Io2666Mr]   c                        e Zd Z	 	 ddededededz  deddf fd	Zd
ej        de	ej        df         fdZ
	 ddej        dej        dej        dej        dej        dz  dej        fdZ xZS )Glm4vVisionAttentionNrr   	embed_dim	num_headsprojection_sizerv   rw   returnc           
      H   t                                                       t                      }|rdnt                      | _        |rdnt          j                    | _        t          j	        ||          | _
        t          j	        || j                  | _        t          || j
        ||d||r| dn| d|          | _        t          |||| dd|          | _        t!          | j        | j
        | j
        d	z  
          | _        t%          d          | _        d S )Nr=   r   Fz	.qkv_projz.qkv)r   	head_sizetotal_num_headstotal_num_kv_headsru   rv   rw   r}   .proj)r{   output_sizerv   rw   ru   r}   g      )r   r   scaleT)enforce_enable)r   r   rI   r   r   r   get_tensor_model_parallel_ranktp_rank
dist_utilsdividehidden_size_per_attention_head!num_attention_heads_per_partitionr!   qkvr"   projr   attnr%   apply_rotary_emb)r   r   r   r   rv   rw   r   r   s          r^   r   zGlm4vVisionAttention.__init__   sb    	466"NAA(L(N(N 	 #WAA(U(W(W 	 /9.?Y/
 /
+ 2<1Bt|2
 2
. %!9%(%+7Lf''''___(

 

 

 &&!%###(
 
 
	 '<95t;
 
 
	 !/d C C Cr]   r   .c                     |j         \  }}}|                    dd          \  }}}||| j        | j        ffd|||fD             \  }}}|||fS )NrR   r;   r   c              3   ,   K   | ]} |j          V  d S r   )view)r   r   	new_shapes     r^   	<genexpr>z1Glm4vVisionAttention.split_qkv.<locals>.<genexpr>=  s,      99!6169%999999r]   )shapechunkr   r   )	r   r   seq_lenbsr   qkvr   s	           @r^   	split_qkvzGlm4vVisionAttention.split_qkv/  sz    Q ))A1)%%1a 2/	
	 :9991ay9991a!Qwr]   r   
cu_seqlensrotary_pos_emb_cosrotary_pos_emb_sin
max_seqlenc                    |                      |          \  }}|                     |          \  }}}	d |||	fD             \  }}}	|K|It          j        ||gd          }
|                     |
||          }t          j        |dd          \  }}|                     |||	||          }t          |d                                          }| 	                    |          \  }}|S )Nc              3   Z   K   | ]&}t          |d                                           V  'dS )zs b ... -> b s ...N)r   
contiguous)r   r   s     r^   r   z/Glm4vVisionAttention.forward.<locals>.<genexpr>N  s9      VVq9Q 455@@BBVVVVVVr]   r   r   r;   )querykeyvaluer   r   zb s h d -> s b (h d))
r   r   rZ   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   	qk_concat
qk_rotatedcontext_layeroutputs                 r^   r   zGlm4vVisionAttention.forward@  s    xx{{1 ..##1aVVQPQSTIVVV1a).@.L	1a&a000I.."" J
 ;z1!444DAq		!! " 
 
 "-1GHHSSUUIIm,,	r]   )Nrr   r   )rU   rV   rW   r   r#   r   r   rZ   r[   tupler   r   r   r   s   @r^   r   r      s0        372D 2D2D 2D 	2D
 )4/2D 2D 
2D 2D 2D 2D 2D 2DhU\ eEL#4E.F    . +/# #<# L# "L	#
 "L# L4'# 
# # # # # # # #r]   r   c                        e Zd Z	 	 	 ddedededeegej        f         dz  dedz  ded	df fd
Z		 dde
j        de
j        de
j        de
j        dedz  d	e
j        fdZ xZS )Glm4vVisionBlockNrr   r   r   mlp_hidden_dim
norm_layerrv   rw   r   c                 2   t                                                       |t          t          j        d          } ||          | _         ||          | _        t          ||||| d          | _        t          ||d|| d          | _
        d S )Nư>epsz.attn)r   r   r   rv   rw   Fz.mlp)ru   rv   rw   )r   r   r   nn	LayerNormnorm1norm2r   r   rq   mlp)r   r   r   r   r   rv   rw   r   s          r^   r   zGlm4vVisionBlock.__init__g  s     	 4888JZ__
Z__
(%###
 
 
	 "%???
 
 
r]   r   r   r   r   r   c                     |                      |                     |          ||||          }|                     ||          \  }}||                     |          z   }|S )Nr   r   r   r   )residual)r   r   r   r   )	r   r   r   r   r   r   x_attnx_fused_normr   s	            r^   r   zGlm4vVisionBlock.forward  sl     JJqMM!11!  
 
 "&A!?!?htxx---r]   )NNrr   r   )rU   rV   rW   r   r   r   Moduler#   r   r   rZ   r[   r   r   r   s   @r^   r   r   f  s        9=26
 

 
 	

 cUBI-.5
 )4/
 
 

 
 
 
 
 
F "& < L "L	
 "L $J 
       r]   r   c                   b     e Zd Z	 	 	 	 ddedededed	d
f
 fdZdej        d	ej        fdZ xZS )Glm4vVisionPatchEmbed   r=   rR      
patch_sizetemporal_patch_sizein_channelsr   r   Nc                     t                                                       || _        || _        || _        |||f}t          ||||d          | _        d S )NT)kernel_sizestrideru   )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   s         r^   r   zGlm4vVisionPatchEmbed.__init__  sg     	$#6 &*J
C#
 
 
			r]   r   c                     |j         \  }}|                    |d| j        | j        | j                  }|                     |                              || j                  }|S )Nr   )r   r   r   r   r   r   )r   r   LCs       r^   r   zGlm4vVisionPatchEmbed.forward  sT    w1FF1b$2DOT_UUIIaLLa!122r]   )r   r=   rR   r   )	rU   rV   rW   r   r   rZ   r[   r   r   r   s   @r^   r   r     s         #$
 

 !
 	

 
 

 
 
 
 
 
* %,        r]   r   c                   \     e Zd Z	 	 	 ddedededz  deded	df fd
Zdej	        fdZ
 xZS )Glm4vPatchMergerNFrr   d_modelcontext_dimrv   ru   rw   r   c           	         t                                                       t                      }|| _        t	          | j        | j        |d|| d|          | _        t          j        | j                  | _        t          | j        |gdz  ||| d|          | _
        t          || j        ||| d|          | _        t                      | _        t          j                    | _        d S )	NTr   )ru   gather_outputrv   rw   r}   r;   ry   rz   r~   r   )r   r   rI   r   r   r   r   r   post_projection_normr    r   r"   r   r<   r   GELUextra_activation_func)r   r  r  rv   ru   rw   r   r   s          r^   r   zGlm4vPatchMerger.__init__  s    	466"(%###(
 
 
	 %'L1A$B$B!6'%*%+++(
 
 
 +%((((
 
 
 !ll%'WYY"""r]   r   c                    |                      |          \  }}|                     |                     |                    }|                     |          \  }}|                     |          }|                     |          \  }}|S r   )r   r  r  r   r   r   )r   r   r   gate_ups       r^   r   zGlm4vPatchMerger.forward  sx    yy||1&&t'@'@'C'CDD&&q))
KK  ~~a  1r]   )NFrr   )rU   rV   rW   r   r#   r   r   r   rZ   r[   r   r   r   s   @r^   r  r    s        
 37&/ &/&/ &/ )4/	&/
 &/ &/ 
&/ &/ &/ &/ &/ &/P        r]   r  c                   :     e Zd Zdef fdZdej        fdZ xZS )Glm4vVisionEmbeddingsconfigc                    t                                                       || _        |j        | _        |j        | _        |j        | _        | j        | j        z  dz  | _        | j        | _        t          j
        | j        | j                  | _        |                     dt          j        | j                                      d          d           d S )Nr;   position_ids)r=   r   F)
persistent)r   r   r  r   r   
image_sizer   num_patchesnum_positionsr   	Embeddingposition_embeddingregister_bufferrZ   arangeexpand)r   r  r   s     r^   r   zGlm4vVisionEmbeddings.__init__  s    + + + Ot>1D!-"$,t/A4>"R"RL+,,33G<< 	 	
 	
 	
 	
 	
r]   r   c                    | j         j        }|j        d         }|j        d         }|j        }	|                    |	          |                    |	          }}|dk    rt          j        d||	|j                  }
nt          t                    r!t          j
        |	t
          j                  t          t
          j                  s!t          j
        |	t
          j                  |j        d         }t          |dz            }|                    |||                              ddd                              d                              |	t
          j                  }t%                    j        d         k    rg }g }t'          t%                              D ]~}|j        d         z  }|                    |df                             |                              |                    |df                             |                              t          j        |                              |	t
          j                  }t          j        |                              |	t
          j                  }nt          j        fdt'          t%                              D                                           |	t
          j                  }t          j        fdt'          t%                              D                                           |	t
          j                  }|                    |	t
          j                  }|                    |	t
          j                  }|dz   |z  dz  dz
  }|dz   |z  dz  dz
  }t          j        ||fd	                              d                              d          }t1          j        ||d
dd          }|                    d                              d                              dd          }|                    |j                                      |j                  }
||
z   }|S )Nr=   r   devicedtype      ?r;   c                 V    g | ]%}|d f                              |                   &S )r=   repeatr   iimage_shapeslengthss     r^   r   z1Glm4vVisionEmbeddings.forward.<locals>.<listcomp>1  4    XXXq\!Q$'..wqz::XXXr]   c                 V    g | ]%}|d f                              |                   &S )r;   r$  r&  s     r^   r   z1Glm4vVisionEmbeddings.forward.<locals>.<listcomp>4  r*  r]   r   r   bicubicFborder)modealign_cornerspadding_mode)r  weightr   r   torZ   emptyr!  
isinstancelistr   longr[   r   r   permute	unsqueezefloat32lenr   appendr%  r   stackFgrid_samplesqueeze)r   
embeddingsr)  r(  h_coordsw_coordspos_embed_weightr   	total_seqr   adapted_pos_embedorig_size_sq	orig_sizepos_embed_2dtarget_h_listtarget_w_listr'  	shape_idxtarget_htarget_wnorm_wnorm_hgridinterpolated_embed_fp32adapted_pos_embed_fp32s     ``                     r^   r   zGlm4vVisionEmbeddings.forward  sB     29&,Q/N1%	!( &[[00(++f2E2E( >> %;v5E5K! ! !
 '4(( Q,wvUZPPPlEL99 $| uz     
 ,1!4LL#-..I %%iKHHAq!!1677	  7||l0333 !# "s7||,, X XA !L$6q$9 9I!((il)C)J)J7ST:)V)VWWW!((il)C)J)J7ST:)V)VWWWW 9]3366! 7   !9]3366! 7   !9XXXXXE#g,,DWDWXXX "F%-"88  !9XXXXXE#g,,DWDWXXX "F%-"88 
  {{&{FFH{{&{FFH#~1Q6:F#~1Q6:F ;/R888BB1EEOOPQRRD '(m#%' ' '# (//22::2>>FFq!LL # !7 9 9:J:P Q Q T T!! !
  "33
r]   )	rU   rV   rW   r   r   rZ   r[   r   r   r   s   @r^   r  r    sl        
0 
 
 
 
 
 
 Z	Z Z Z Z Z Z Z Zr]   r  c                       e Zd Z	 	 	 ddedededz  deddf
 fd	Zede	j
        fd
            Z
ede	j        fd            Zde	j        dee	j        e	j        e	j        f         fdZde	j        de	j        dz  fdZde	j        de	j        eee                  z  de	j        fdZdeeee	j        f                  dee         fdZ xZS )Glm4vVisionTransformerr   Nrr   vision_confignorm_epsrv   rw   r   c                    
 t                                                       j        }j        }j        }j        }j         _        j         _        j         _        j         _        j	         _	        t          ||| j                   _        t          t          |          
 j         j        z  }	t          |	ddddi           _        t!          j        
 fdt%          |          D                        _        t)          j	        j        d	 d
           _        t/                     _        t          j        j                   _        t7          j        j	        j        j                   _        t          j        j                   _        t=          |	t?          j                                _!        d S )N)r   r   r   r   r   i    Tpartial_rotary_factorr"  )r   max_positionis_neox_stylerope_parametersc                 b    g | ]+}t          j        j        j         d |           ,S )z.blocks.)r   r   r   r   rv   rw   )r   r   r   out_hidden_size)r   	layer_idxr   rw   rv   r   rU  s     r^   r   z3Glm4vVisionTransformer.__init__.<locals>.<listcomp>{  sb     
 
 
  !("n#0#@)!-$99i99  
 
 
r]   Fz.merger)r  r  rv   ru   rw   )r   out_channelsr   r  )r   r!  )"r   r   r   r   r   depthr   r   spatial_merge_sizer]  r   patch_embedr   r   r$   rotary_pos_embr   
ModuleListr   blocksr  intermediate_sizemergerr  r@  rms_norm_epspost_conv_layernormr   
downsamplepost_layernormrH   rZ   get_default_dtypeattn_backend)r   rU  rV  rv   rw   r   r   r   r`  head_dimr   r   s   `` ``     @r^   r   zGlm4vVisionTransformer.__init__W  s)    	"-
+?#/#(4&0'2"/"B,<0! 3#(	
 
 
 W(333
#t~5&4c:	
 
 
 m
 
 
 
 
 
 
 
 "'u
 
 

 
 '!1%7%%%%
 
 
 0>>#*%=+E$
 $
 $
  &%1&6%8 3	
 
 
 &%=+E
 
 
 1)++
 
 
r]   c                 .    | j         j        j        j        S r   )rb  r   r1  r!  r   s    r^   r!  zGlm4vVisionTransformer.dtype  s    $+11r]   c                 .    | j         j        j        j        S r   )rb  r   r1  r   rp  s    r^   r   zGlm4vVisionTransformer.device  s    $+22r]   grid_thwc                 
   g }|D ]p\  }}}t          j        |                              d                              d|          }t          j        |                              d                              |d          }|                    || j        z  | j        || j        z  | j                                      dddd                                          }|                    || j        z  | j        || j        z  | j                                      dddd                                          }|                    t          j	        ||gd          
                    |d                     rt          j        |d          }|d d dd f                                         }| j                            |          \  }	}
|	|                             d          }|
|                             d          }|||fS )Nr=   r   r   r;   rR   r   )rZ   r  r8  r  reshapera  r7  flattenr;  r<  r%  r   maxrc  get_cos_sin)r   rr  pos_idstrc   whpos_idswpos_idsmax_grid_sizecossincos_combinedsin_combineds                r^   rot_pos_embz"Glm4vVisionTransformer.rot_pos_emb  s     	S 	SGAq!|A0033::2qAAH|A0033::1bAAH  00+00+	  Aq!$$    00+00+	  Aq!$$  NN5;(';DDDKKAqQQRRRR)G+++ ABB++-- &22=AAS7|++A..7|++A..\722r]   r   c                     d }| j         t          j        k    s| j         t          j        k    r'|dd          |d d         z
                                  }|S )Nr=   r   )rm  r:   
FLASH_ATTNROCM_AITER_FArv  )r   r   r   s      r^   compute_attn_mask_seqlenz/Glm4vVisionTransformer.compute_attn_mask_seqlen  sY     
!5!@@@ $8$FFF$QRR.:crc?:??AAJr]   r   c           	         t          |t                    r t          j        |t          j                  }|                    | j        | j                  }|                     |          }| 	                    |          }| 
                    |          \  }}}t          j        |d d df         |d d df         z  |d d df                                       dt          j                  }t          j        |                    d          |g          }|                    | j        d          }|                     |          }|dd          |d d	         z
                                  }|                     ||||d d df         |d d df                   }|                    d          }| j        D ]}	 |	|||||
          }|                     |          }|                    d	| j        | j        |j        d	                   }|                    dddd          }|                     |                              d	| j                  }|                     |          }|S )Nr!  r  r=   r;   r   )r   r!  T)non_blockingr   r   rR   )r4  r5  rZ   r   int32r2  r   r!  rb  ri  r  repeat_interleavecumsumr   	new_zerosr  tolistr@  r8  re  rk  r   ra  r   r7  rj  r]  rg  )
r   r   rr  r   r   image_type_idsr   r   seqlensblks
             r^   r   zGlm4vVisionTransformer.forward  sl   
 h%% 	A|HEK@@@H DD4:D66Q$$Q'' BFAQAQB
 B
>. ,QQQTNXaaad^+Xaaad^
 

&Qek&
*
* 	 Y
 4 4Q 7 7DEE
]]4;T]BB
 22:>>
abb>JssO3;;==OOw.A"6qqq!t8L
 

 KKNN; 	 	C%#5#5%  AA ""FF2t.0GQSUUIIaAq!!OOA##B(<==KKNNr]   weightsc                    g d}t          |                     d                    }t                      }|D ]\  }}|D ]>\  }}}	||vr|                    ||          }||         }
|
j        } ||
||	            n*||         }
t          |
dt                    } ||
|           |                    |           |S )N))	attn.qkv.zattn.q.r   )r  zattn.k.r   )r  zattn.v.r   )r   	gate_projr   )r   up_projr=   F)remove_duplicateweight_loader)dictnamed_parameterssetreplacer  getattrr&   add)r   r  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr  s               r^   load_weightsz#Glm4vVisionTransformer.load_weights  s    "
 "
 "
 400%0HHII"%%%#* 	$ 	$D-5K 4 41
Kd**||K<<#D) % 3e]H===#D) '@U V Ve]333d####r]   )r   Nrr   )rU   rV   rW   r   floatr#   r   r   propertyrZ   r!  r   r[   r   r  r  r5  r   r   r   r  r  r   r   s   @r^   rT  rT  V  s        26I
 I
(I
 I
 )4/	I

 I
 
I
 I
 I
 I
 I
 I
V 2u{ 2 2 2 X2 3 3 3 3 X3$3$3	u|U\5<7	8$3 $3 $3 $3L
L
 
	
 
 
 
2<2 ,d3i02 
	2 2 2 2hHU33D-E$F 3s8        r]   rT  c                      e Zd Zdeeedz  f         fdZdedefdZ	dede
fdZddd	d
dedededededeeef         fdZdefdZdededefdZdefdZdedededefdZdedefdZdedeeef         defdZdeeef         dedee         fdZdeeef         dedee         fdZdej        deeef         dej        defd ZdS )!Glm4vProcessingInfor   Nc                     d ddS )Nr=   imagevideor\   rp  s    r^   get_supported_mm_limitsz+Glm4vProcessingInfo.get_supported_mm_limits/  s    ***r]   kwargsc                 &     | j         di |j        S Nr\   )get_hf_processorimage_processorr   r  s     r^   get_image_processorz'Glm4vProcessingInfo.get_image_processor2      $t$..v..>>r]   c                 &     | j         di |j        S r  )r  video_processorr  s     r^   get_video_processorz'Glm4vProcessingInfo.get_video_processor5  r  r]      T )
num_frames	do_resizemax_image_pixelsimage_widthimage_heightr  r  r  c                v   |                                  }|j        }|j        }|j        }	|j        }
|r4t          ||
k    r|n|
||||	z  |          \  }}t          ||          }nt          ||          }|||
z  z   }t          ||
z  d          }|j        |z  }|j	        |z  }||z  |z  }||	dz  z  }||fS )N)r  heightwidthfactor
max_pixels)r  r  r=   r;   )
get_hf_configrU  r   ra  r   r   r.   rv  r  r  )r   r  r  r  r  r  	hf_configrU  r   
merge_sizer   resized_heightresized_widthpreprocessed_sizepadded_num_framesgrid_tgrid_hgrid_wr  num_vision_tokenss                       r^   _get_vision_infoz$Glm4vProcessingInfo._get_vision_info8  s    &&((	!/"-
"5
+? 	R,8 333 &:(#!!J.+- - -)NM !*n U U U )L Q Q Q '6I)II&*==qAA")Z7"(J6vo.'JM: "333r]   c                 :    |                      dd          \  }}|S )Ni r  r  r  )r   max_image_sizer   s      r^   !get_image_size_with_most_featuresz5Glm4vProcessingInfo.get_image_size_with_most_featuresa  s.     11g 2 
 
 r]   c                <    |                      ||d          \  }}|S )Ni   )r  r  r  r  )r   r  r  r   num_image_tokenss        r^   get_num_image_tokensz(Glm4vProcessingInfo.get_num_image_tokensg  s5     #33#%/ 4 
 

  r]   c                 ^    |                                  \  }}|                     ||          S )Nr  )r  r  )r   target_widthtarget_heights      r^   get_max_image_tokensz(Glm4vProcessingInfo.get_max_image_tokenst  s;    &*&L&L&N&N#m(($& ) 
 
 	
r]   c                >    |                      |||d          \  }}|S )Nr  )r  r  r  r  r  )r   r  r  r  r   num_video_tokenss         r^   get_num_video_tokensz(Glm4vProcessingInfo.get_num_video_tokens|  s8     #33#%!0	 4 
 
  r]   
max_tokensc                     |                                  \  }}d}	 |dz   }|                     |||          }||k    s|dk    rn|}-|S )Nr   Tr=   )r  r  r  )r  r  )r   r  r  r  r  next_num_framesnext_max_tokenss          r^   _get_max_video_framesz)Glm4vProcessingInfo._get_max_video_frames  sz    &*&L&L&N&N#m

	)(1nO"77(** 8  O
 ++!/C/C(J
	) r]   r   	mm_countsc                 $   |                     dd          }|                     dd          }|                                 |z  }|                     ||z
            }t          |t	          |d          z  t
                    }t	          |d          S )Nr  r   r  r=   )getr  r  minrv  _MAX_FRAMES_PER_VIDEO)r   r   r  
max_images
max_videosmax_image_tokensmax_total_framesmax_frames_per_videos           r^   !get_num_frames_with_most_featuresz5Glm4vProcessingInfo.get_num_frames_with_most_features  s    
 ]]7A..
]]7A..
4466C55g@P6PQQ"J 2 224I 
  
 '+++r]   metadatatotal_framesc                    |                                  |                    dj                  |                    d|          }|dz
  |                    dt          z            dz             }|d         }|s	|d         }n|j        k    rGt          t          j        |j        z                      }fdt          |          D             }nft          j        j        z            }||k    rt          t          |                    }n't          j        d||d	
          }	fd|	D             }t                      g }}
|D ]0}||
vr*|
                    |           |                    |           1t          |          dz  r|                    |d                    |}fd|D             }|d d d         }g }t          dt          |                    D ]}|                    ||                    |S )Nfpstotal_num_framesr=   durationdo_sample_framesframes_indicesc                     g | ]<}t          t          t          j        |z  j        z                                =S r\   )r  r   mathceilr  )r   r'  max_frame_idx	video_fpsr  s     r^   r   zCGlm4vProcessingInfo._get_video_second_idx_glm4v.<locals>.<listcomp>  sY     ! ! !
 	 %DIa)mo6I&IJJKK ! ! !r]   r   T)endpointc                 t    g | ]4}t          t          t          j        |z                                5S r\   )r  r   r  r  )r   ry  r  r   s     r^   r   zCGlm4vProcessingInfo._get_video_second_idx_glm4v.<locals>.<listcomp>  sI     % % % M3tyY/G/G+H+HII% % %r]   r   c                 4    g | ]}t          |z            S r\   r   r   idxr   s     r^   r   zCGlm4vProcessingInfo._get_video_second_idx_glm4v.<locals>.<listcomp>  %    JJJSCi00JJJr]   r;   )r  r  r  roundmax_durationr   r  floorr   r5  rO   linspacer  r  r;  r:  )r   r  r  meta_framesr  r  frame_indicesrd   num_samplestarget_secondsseenuniqr  full_second_idxstimestamps_listselected_timestampsr  r   r  s                   @@@r^   _get_video_second_idx_glm4vz/Glm4vProcessingInfo._get_video_second_idx_glm4v  sk    2244LL(;<<	ll#5|DD#a<<
E-)2K,L,Lq,PQQ#$67 	$%56MM?777
8o.A#ABBCC! ! ! ! ! !
 #1XX! ! ! "/">AT"TUU+--$({););$<$<MM%'[8[4& & &N% % % % %!/% % %M
 UUBd  	! 	!C$C   t99q= 	"KKR!!!JJJJMJJJ*33Q3/ C0011 	= 	=C&&s';<<<<""r]   c                 r   |                                  }|d         |                    d|          }|dz
  }|                    dt          |z            dz             }|                    dd          }|s
|d         }nddd	d
}	d}
d}t          ||          }|dk    r	|	d         }n|dk    r	|	d         }n|	d         }t	          |dd          }t          ||z  |z            }t          ||
          }dz  fdt          |          D             }t          |          }||k     r3t          j        d|dz
  |t
                    	                                }nLg }d}d||z  z  }t          |          D ]0}||         |k    r"||z  }|
                    |           ||k    r n1t          |          |k     rit          |          dk    rdt          |dz
  d          }}n|d         |d         }}t          j        |||t
                    	                                }nEt          |          |k    r2t          j        d|dz
  |t
                    	                                }t                      g }}|D ]0}||vr*|                    |           |
                    |           1t          |          dz  r|
                    |d                    |}fd|D             }|d d d         }g }t          t          |                    D ]}|
                    ||                    |S )Nr  r  r=   r  r  Tr  rR   r"  )   ,  `	  i  r  r  r  r   c                     g | ]}|z  S r\   r\   )r   r'  duration_per_frames     r^   r   zDGlm4vProcessingInfo._get_video_second_idx_glm46v.<locals>.<listcomp>   s    MMMQ!00MMMr]   r   r  g        r   c                 4    g | ]}t          |z            S r\   r  r  s     r^   r   zDGlm4vProcessingInfo._get_video_second_idx_glm46v.<locals>.<listcomp>'  r  r]   r;   )r  r  r  r  r  r   r   rO   r  r  r;  r:  rv  r  r  )r   r  r  r  r  r  r  r  r  DYNAMIC_FPS_THRESMAX_FRAME_COUNT_DYNAMICMAX_DURATIONeffective_duration
target_fpsr   	extract_t
timestamps
max_secondcurrent_secondinv_fpsframe_indexstartendr  r  r  r  r  r  r  r   s                                @@r^   _get_video_second_idx_glm46vz0Glm4vProcessingInfo._get_video_second_idx_glm46v  s    2244UO	ll#5|DD#a<<
E-)2K,L,Lq,PQQ#<<(:DAA /	$%56MM%&Qc : :&)#L!$X|!<!<!R''.r2

#s**.s3

.t4
")/;PRS"T"T.;>QQRRII'>??I!"YMMMM%:L:LMMMJXJY&& "{Q	! ! !&((  !#!$2Z?@#(#5#5 " "K!+..@@&'1%,,[999)Z77!E=!!I--}%%**!"Ca$;$;3EE!.q!1=3D3E "E3	 M M M T T V V]##i// "{Q	! ! !&((  UUBd  	! 	!C$C   t99q= 	"KKR!!!JJJJMJJJ*33Q3/ _--.. 	= 	=C&&s';<<<<""r]   video_arrayrr  c                 l   |                                  }|                                 |j        }|                                 }|j        }|j        }|j        }	|j        }
|j        dz  }t          |t          j                  sJ t          |t                    r#|                     |t          |                    n"|                     |t          |                    }t          |t                    rdndfd|D             }|\  }}}t!          ||z            |z  }g }|                    |	           |D ]_}|                    |           |                    |j        g|z             |                    |           |                    |           `|                    |
           |S )Nr;   z{}z{:.1f} secondsc                 d    g | ],}                                         |          d           -S )F)add_special_tokens)encodeformat)r   r'  timestamp_format	tokenizers     r^   r   zDGlm4vProcessingInfo._construct_video_placeholder.<locals>.<listcomp>I  sK     
 
 
 -44Q77ERR
 
 
r]   )r  get_tokenizerr  r  image_start_token_idimage_end_token_idvideo_start_token_idvideo_end_token_idr  r4  rZ   r[   r   r  r:  r*  r   r;  extendvideo_token_id)r   r+  r  rr  hf_processorr  r  boi_token_ideoi_token_idbov_token_ideov_token_idmerge_lengthr#  frames_idx_tokenTHWnum_tokens_per_frameplaceholder	frame_idxr1  r2  s                       @@r^   _construct_video_placeholderz0Glm4vProcessingInfo._construct_video_placeholder.  s    ,,..&&((	&6&&((	 5 3 5 3&114(EL11111 ,77OD,,Xs;7G7GHHH228S=M=MNN 	 |^<<RDDBR 	
 
 
 
 

 
 
 1a"1q5zz\9<((() 	* 	*I|,,, ;<?SSTTT|,,,y))))<(((r]   ) rU   rV   rW   r   r   r   r  objectr   r  r   r  r   r   r.   r  r  r  r  r  r  r  r  r	   r5  r  r*  rO   ndarrayrZ   r[   rG  r\   r]   r^   r  r  .  s       +cDj)A + + + +?F ?7J ? ? ? ??F ?7J ? ? ? ?  3'4 '4 '4 '4 	'4
 '4 '4 '4 
y#~	'4 '4 '4 '4R9        	 
 
       
c 
 
 
 
    	 
   
            &,, 38$, 
	, , , , 1#S#X1#691#	c1# 1# 1# 1#fK#S#XK#69K#	cK# K# K# K#Z*Z* sCx.* ,	*
 
* * * * * *r]   r  c                       e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	ddd	ed
ededede
dz  dee         fdZdS )Glm4vDummyInputsBuilderr  r   c                 f   |                     dd          }|                     dd          }| j                                        }| j                                        }| j                                        }|j        }|j        |j        |j        g}|	                    |          }	||z  |	|z  z   S )Nr  r   r  )
r  infor  r  r3  image_tokenr6  r9  r7  decode)
r   r  
num_images
num_videosr  r:  r2  rN  video_token_idsvideo_tokens
             r^   get_dummy_textz&Glm4vDummyInputsBuilder.get_dummy_text\  s    ]]7A..
]]7A..
I++--	y1133I++--	'3*'(

  &&77Z'+
*BBBr]   Nr   
mm_optionsc                    |                     dd          }|                     dd          }| j                                        \  }}| j                            ||          }|r|                     d          nd }	|r|                     d          nd }
|                     ||||	          |                     |||||
          dS )Nr  r   r  )r  r  rP  	overrides)r  r  r  rQ  rW  r  )r  rM  r  r  _get_dummy_images_get_dummy_videos)r   r   r  rU  rP  rQ  r  r  target_num_framesimage_overridesvideo_overridess              r^   get_dummy_mm_dataz)Glm4vDummyInputsBuilder.get_dummy_mm_datan  s     ]]7A..
]]7A..
&*i&Q&Q&S&S#m IGGY
 
 6@I*..111T5?I*..111T ++"$%)	 ,   ++"$,%) ,  
 
 	
r]   )rW  r  r  r  rQ  rW  c                   |r|j         rA|j         |k    r!t                              d|j         |           t          ||j                   }|j        rA|j        |k    r!t                              d|j        |           t          ||j                  }|j        rA|j        |k    r!t                              d|j        |           t          ||j                  }t          |d          }t          j        |||dfdt          j	                  }g }t          |          D ]P}d|dz  |d	 t          |          D             d
dd}	|                                |	f}
|                    |
           Q|S )Nz]video.num_frames override (%d) exceeds model's maximum number of frames (%d), will be ignoredzMvideo.width override (%d) exceeds model's maximum width (%d), will be ignoredzOvideo.height override (%d) exceeds model's maximum height (%d), will be ignoredr;   rR      r  g       @c                     g | ]}|S r\   r\   )r   r'  s     r^   r   z=Glm4vDummyInputsBuilder._get_dummy_videos.<locals>.<listcomp>  s    "@"@"@1"@"@"@r]   opencvF)r  r  r  r  video_backendr  )r  loggerwarningr  r  r  rv  rO   fulluint8r   copyr;  )r   r  r  r  rQ  rW  r  video_itemsr'  video_metadata
video_items              r^   rY  z)Glm4vDummyInputsBuilder._get_dummy_videos  s     	7# C'*44NNI!,"	   !Y-ABB
 4?U**NN>!	   E9?33 7#f,,NN?!(	   VY%566Q''
UFA628LLLz"" 
	+ 
	+A&,$."@"@eJ.?.?"@"@"@!)$) N  **,,7Jz****r]   r   )rU   rV   rW   r   r   r   rT  r   r)   r]  r   r5  r-   rY  r\   r]   r^   rK  rK  [  s       CS(9 Cc C C C C, =A	
 

 38$
 C!112T9	

 

 
 
 
P /35 5 5 5 	5
 5 5 %t+5 
i5 5 5 5 5 5r]   rK  c            
            e Zd ZdefdZdedeeef         deeef         deeef         def
 fdZ	ded	eeef         deee
f         fd
Zded	eeef         dedee         fdZ xZS )Glm4vMultiModalProcessorr   c                 "    t          d          S )NT)video_needs_metadata)r0   rp  s    r^   _get_data_parserz)Glm4vMultiModalProcessor._get_data_parser  s    #>>>>r]   promptmm_data	mm_kwargs
tok_kwargsc                 <   t          |          } | j        j        di |}d|v rt          |d         t                    rt          |d                   dk    rog }g }|                    dg           D ]}|\  }	t          di |}
                    dd          |
d<   t                      }|	gg|d<   dgt          di fdD             gg|d<   t                      
                    d||
|          }|                    d	          }|j        |||j        k    <   |j                            |          d         }|                    d|d
          }|                    |d                    |                    |d                    t          t#          j        |          t#          j        |                    }nt                      }t                      
                    ||||          }t          |fi |}t'          |          S )Nvideosr   r  Tc                 *    i | ]}|v||         S r\   r\   )r   r   r  unuse_metadatas     r^   
<dictcomp>z?Glm4vMultiModalProcessor._call_hf_processor.<locals>.<dictcomp>  s4       $%#$N#:#: !"8A;#:#:#:r]   ri  +<|begin_of_video|><|video|><|end_of_video|>)rp  rq  rr  rs  	input_idsr=   rj   rh   )rh   rj   r\   )r  rM  r  r4  r5  r:  popr  r   r   _call_hf_processorr9  image_token_idr2  batch_decoder  r;  rZ   r   r   )r   rp  rq  rr  rs  	processorvideo_grid_thw_lstpixel_values_videos_lstitemr+  video_mm_kwargsvideo_mm_datavideo_outputsrz  video_placeholderprocessed_outputscombined_outputsr  rw  r   s                    @@r^   r|  z+Glm4vMultiModalProcessor._call_hf_processor  s    w--.DI.;;;;	 78,d33  GH%&&**!#&(#Hb11 +U +U(,%X #'"3"3"3"36>ll&7 7 23 !%,7=/h'"4!5 &      )1   
3./ !& : :H)-)	 !; ! ! *--k::	, )y'??@ %.$7$D$DY$O$OPQ$R!A%  #))-8H*IJJJ'..}=R/STTTT $)I.E$F$F$y);<<  MM
 !FFM!GG66!	 7 
 
  
 

 
 ,---r]   	hf_inputshf_processor_mm_kwargsc                 t     t          | j                                        j        j                  |          S r   )rC   rM  r  rU  ra  )r   r  r  s      r^   _get_mm_fields_configz.Glm4vMultiModalProcessor._get_mm_fields_config  s=    

,I##%%3F
 

  	r]   mm_itemsout_mm_kwargsc                        j         j        d	i |  j         j        d	i |}|j        dz  dt          ffd}dt          f fd}t          dj        |          t          dd|          gS )
Nr;   item_idxc                     d         |          }|d         j         }t          |t          j                  sJ t	          |                                          z  }j        g|z  S )Nr  rS   )datar4  rZ   r[   r   prodr}  )r  out_itemrr  
num_tokensr:  r?  r  s       r^   get_image_replacement_glm4vzQGlm4vMultiModalProcessor._get_prompt_updates.<locals>.get_image_replacement_glm4v2  sc    $W-h7H 016Hh55555X]]__--=J /0:==r]   c                    d         |          }|d         j         }t          |t          j                  sJ d         |          \  }}	j                            |||          }t          j        |j                  S )Nr  rj   )embed_token_id)	r  r4  rZ   r[   rM  rG  r6   select_token_idr9  )
r  r  rr  r  r  rE  r:  r  r  r   s
         r^   get_video_replacement_glm4vzQGlm4vMultiModalProcessor._get_prompt_updates.<locals>.get_video_replacement_glm4v:  s    $W-h7H 016Hh55555&w/9OE8)@@x K '6+:   r]   r  )modalitytargetreplacementr  ry  r\   )rM  r  r  r  r   r4   rN  )	r   r  r  r  r  r  r  r:  r?  s	   `` `   @@r^   _get_prompt_updatesz,Glm4vMultiModalProcessor._get_prompt_updates'  s     2ty1KK4JKK7$)7QQ:PQQ&114	># 	> 	> 	> 	> 	> 	> 	> 	>	# 	 	 	 	 	 	 	 	 	  #/7  
  D7  
 	
r]   )rU   rV   rW   r0   ro  r   r   rH  r   r|  r+   r  r/   r	   r,   r   r5   r  r   r   s   @r^   rl  rl    s<       ?"6 ? ? ? ?Q.Q. f%Q. 3;'	Q.
 CK(Q. 
Q. Q. Q. Q. Q. Q.f !(V 4 
++	,	   ,
%,
 !(S 1,
 -	,

 
,	,
 ,
 ,
 ,
 ,
 ,
 ,
 ,
r]   rl  )rM  dummy_inputsc                   p    e Zd Zg ddgdZ edddd          Zd	Zed
ede	dedz  fd            Z
dddedef fdZdededz  fdZdededz  fdZdedeej        df         fdZdedeej        df         fdZdedefdZdededz  fdZdee	         dee         deej        e	f         fd Z	 	 d/d!ej        d"ej        d#edz  d$ej        dz  dedej        ez  fd%Zd&ej        dej        dz  fd'Z d(e!eeej        f                  de"e         fd)Z#de$fd*Z%d+e	de	fd,Z&d-e	de	fd.Z' xZ(S )0Glm4vForConditionalGenerationq_projk_projv_projr   qkv_projr   zlanguage_model.lm_head.zlanguage_model.model.visual.)zlm_head.zmodel.language_model.zmodel.visual.)orig_to_new_prefixTr  r'  r   Nc                 |    |                     d          rdS |                     d          rdS t          d          )Nr  z+<|begin_of_image|><|image|><|end_of_image|>r  ry  z)Only image or video modality is supported)
startswith
ValueError)clsr  r'  s      r^   get_placeholder_strz1Glm4vForConditionalGeneration.get_placeholder_strr  sK    w'' 	A@@w'' 	A@@DEEEr]   rr   )rw   vllm_configrw   c          
         t                                                       |j        j        }|j        }|j        j        }|| _        || _        |j        dk    | _        | 	                    |ddh          5  t          |j        t          |dd          |t          |d                    | _        d d d            n# 1 swxY w Y   |j        dk    rd	g}n|j        d
k    rdg}nd }|                     |          5  t#          ||j        t          |d          |          | _        d d d            n# 1 swxY w Y   | j        j        | _        d S )Nr  r  r  rh  gh㈵>visual)rV  rv   rw   glm4vGlm4ForCausalLM	glm4v_moeGlm4MoeForCausalLMlanguage_model)r  r  rw   architectures)r   r   model_configr  rv   multimodal_configr  mm_encoder_tp_moder   _mark_tower_modelrT  rU  r  rG   r  
model_type_mark_language_modelrF   text_configr  make_empty_intermediate_tensors)r   r  rw   r  rv   r  r  r   s          r^   r   z&Glm4vForConditionalGeneration.__init__{  s   )3"/'4F!2!2!E!O##K'71CDD 	 	0$ >>)#FH55	  DK	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ''./MM+--12MM M&&{33 	 	"<' ,#F,<==+	# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ? 	,,,s$   7;B>>CC>,D66D:=D:r  c                     |                     dd           }|                     dd           }|                     dd           }||d S |t          d||          S |t          d||          S d S )NrM   ra   rS   )rN   rM   rS   )rN   ra   rS   )r{  rL   r`   )r   r  rM   ra   rS   s        r^   _parse_and_validate_image_inputz=Glm4vForConditionalGeneration._parse_and_validate_image_input  s     zz.$77zz.$77$4d;;L$84#(#)-    #,#)-    $#r]   c                     |                     dd           }|                     dd           }|                     dd           }||d S |t          d||          S |t          d||          S d S )Nrh   rm   rj   )rN   rh   rj   )rN   rm   rj   )r{  rg   rl   )r   r  rh   rm   rj   s        r^   _parse_and_validate_video_inputz=Glm4vForConditionalGeneration._parse_and_validate_video_input  s     %jj)>EEzz.$77$4d;;&<+?4*(*$7-    #,#)-    $#r]   image_input.c                    |d         }|j         dk    sJ |d         dk    r&|d                             | j        j                  }nm|d                             | j        j                  }| j        r*t          | j        ||                                d          S |                     ||          }| j        j        }|                    d	          |z  |z                                  }|	                    |          S )
NrS   r;   rN   ra   rM   rope_3d	rope_typerr  r   
ndimrN   r  r!  r   rJ   r  ra  r  r   )r   r  rr  ra   rM   r  sizess          r^   _process_image_inputz2Glm4vForConditionalGeneration._process_image_input  s    /0}!!!!v.00&~6;;DK<MNNLL&~6;;DK<MNNL% L8Kx/@/@I     ${{<({KK[3
r""j0J>FFHH!!%(((r]   video_inputc                    |d         }|j         dk    sJ |d         dk    r&|d                             | j        j                  }nm|d                             | j        j                  }| j        r*t          | j        ||                                d          S |                     ||          }| j        j        }|                    d	          |z  |z                                  }|	                    |          S )
Nrj   r;   rN   rm   rh   r  r  r  r   r  )r   r  rr  rm   rh   r  r  s          r^   _process_video_inputz2Glm4vForConditionalGeneration._process_video_input  s    /0}!!!!v.00&~6;;DK<MNNLL"-.C"D"I"I!# # % S8K'OO%%'	     ${{+>{RR [3
r""j0J>FFHH!!%(((r]   c                 t    i }|D ]2}|dv rd|vr | j         di ||d<   |dv rd|vr | j        di ||d<   3|S )N)rM   ra   r  )rh   rm   r  r\   )r  r  )r   r  mm_input_by_modality	input_keys       r^   %_parse_and_validate_multimodal_inputszCGlm4vForConditionalGeneration._parse_and_validate_multimodal_inputs   s    !   	 	I===#7770T0T 1 11 1$W- DDD#7770T0T 1 11 1$W- $#r]   c                      | j         di |}|sd S d}|D ]d}||         }|dk    r'|                     |          }|t          |          z  }|dk    r'|                     |          }|t          |          z  }e|S )Nr\   r  r  )r  r  r   r  )r   r  r  multimodal_embeddingsr  multimodal_inputimage_embeddingsvideo_embeddingss           r^   embed_multimodalz.Glm4vForConditionalGeneration.embed_multimodal  s    ItISSFSS# 	4 ;= - 	A 	AH3H=7""#'#<#<=M#N#N %/?)@)@@%7""#'#<#<=M#N#N %/?)@)@@%$$r]   input_tokensmm_featuresc           	      
   t          j        |ddh          }d |                    dg           D             }d |                    dg           D             }| j        }|j        }|j        }|j        }	|j        j        }
g }|s|r(g }d}|D ]h}||k    rd}n||	k    rd}||k    r|du r|	                    d           3||k    r|du r|	                    d           S|	                    d	           ig }t          j        t          |          d
           D ]K\  }}t          |          }|d         d         }|d         d         dz   }|	                    |||f           Ld}d}|D ]?\  }}}t          |          dk    r|d                                         dz   nd}|dk    r=||         \  }}}|||
z  ||
z  } }}t!          j        |                              dd                              d|| z                                            }!t!          j        |                              ddd                              |d|                                           }"t!          j        |                               ddd                              ||d                                          }#|	                    t!          j        |!|"|#g          |z              |dz  }}|dk    rb|g||         dd          R \  }}}|||
z  ||
z  } }}t-          |          D ]!}$t!          j        |$                              dd                              d|| z                                            }!t!          j        |                              ddd                              dd|                                           }"t!          j        |                               ddd                              d|d                                          }#|	                    t!          j        |!|"|#g          |z              #|dz  }|dz  }||z
  }%|	                    t!          j        |%                              dd                              dd          |z              d}An^t          |          }%|	                    t!          j        |%                              dd                              dd                     t!          j        |d                              dd          }&|&                                dz   t          |          z
                                  }'|&|'fS )NrS   rj   c                 6    g | ]}|                                 S r\   r  r   r  s     r^   r   zKGlm4vForConditionalGeneration.get_mrope_input_positions.<locals>.<listcomp>4       UUUD$++--UUUr]   c                 6    g | ]}|                                 S r\   r  r  s     r^   r   zKGlm4vForConditionalGeneration.get_mrope_input_positions.<locals>.<listcomp>5  r  r]   FTr  r  textc                     | d         S )Nr=   r\   )r   s    r^   <lambda>zIGlm4vForConditionalGeneration.get_mrope_input_positions.<locals>.<lambda>P  s
    qt r]   r   r   r=   rR   r   )r*   gather_kwargsr  r  r}  r6  r7  rU  ra  r;  	itertoolsgroupby	enumerater5  r:  rv  rZ   r  r   r  ru  r<  r   r   r   rt  r  )(r   r  r  r  rS   rj   r  r}  r6  r7  ra  llm_pos_ids_listinput_token_typevideo_check_flgtokeninput_type_groupr   
group_iter
group_liststart_index	end_indexvideo_frame_nummm_data_idxmodality_type	start_idxend_idxst_idxry  rc   rz  
llm_grid_t
llm_grid_h
llm_grid_wt_indexh_indexw_indext_idxtext_lenllm_positionsmrope_position_deltas(                                           r^   get_mrope_input_positionsz7Glm4vForConditionalGeneration.get_mrope_input_positions+  s   
 '4/0
 
 VUFJJ?OQS4T4TUUUUUFJJ?OQS4T4TUUUK	"1(=&9&4G!# l	V^ l	V*,#O% 4 4000&*OO000&+O^++/U2J2J$++G4444~--Ot4K4K$++G4444$++F3333;=#,#4*++^^$ $ G GZ "*--
(mA.&rN1-1	 ''k9(EFFFFOK5E M( M(1y'69:J6K6Ka6O6O$R(,,..22UV  !G++,[9GAq!//// -7
J Z00b!J$;<< 	  Z00aQ
B
;; 	  Z00aB
J;; 	  %++Wgw$?@@6I    1$KK"g--''4QRR8 GAq!
 //// -7
J "'z!2!2  !L//!T"a[[#VB
Z(?@@$WYY	   "L44!T!R^^#VAr:66$WYY	   "L44!T!Q^^#VAz266$WYY	   )//!K'7(CDDvM     1$K#q(OO  '2H$++X..33Ar::AA!RHH6Q   '(OO[M(` <((H##EL$:$:$?$?2$F$F$M$MaQS$T$TUUU	"2:::BB1bII - 1 1 3 3a 7#l:K:K KQQSS222r]   rz  	positionsintermediate_tensorsinputs_embedsc                 J    |d}| j                             ||||          }|S )a  Run forward pass for GLM-4V.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
            positions: Flattened (concatenated) position ids corresponding to a
                batch.
                **NOTE**: If mrope is enabled (default setting for GLM-4V
                opensource models), the shape will be `(3, seq_len)`,
                otherwise it will be `(seq_len,).
            intermediate_tensors: Optional intermediate tensors for pipeline
                parallelism.
            inputs_embeds: Optional pre-computed input embeddings.
            **kwargs: Additional keyword arguments.
        N)rz  r  r  r  )r  model)r   rz  r  r  r  r  hidden_statess          r^   r   z%Glm4vForConditionalGeneration.forward  s@    .  + M+11!5'	 2 
 
 r]   r	  c                 6    | j                             |          S r   )r  compute_logits)r   r	  s     r^   r  z,Glm4vForConditionalGeneration.compute_logits  s     "11-@@@r]   r  c                 X    t          |           }|                    || j                  S )N)mapper)rD   r  hf_to_vllm_mapper)r   r  loaders      r^   r  z*Glm4vForConditionalGeneration.load_weights  s+    "4((""743I"JJJr]   c                 0    t          j        ddd          S )z<
        Get the module prefix in multimodal models
        zlanguage_model.modelzvisual.merger.r  )r  	connectortower_model)r'   from_string_fieldrp  s    r^   get_mm_mappingz,Glm4vForConditionalGeneration.get_mm_mapping  s'     /1&!
 
 
 	
r]   r  c                 4    | j         j        j        }||dz  z  S Nr;   r  rU  ra  )r   r  r  s      r^   get_num_mm_encoder_tokensz7Glm4vForConditionalGeneration.get_num_mm_encoder_tokens  s      [.A
:q=11r]   r  c                 4    | j         j        j        }||dz  z  S r  r  )r   r  r  s      r^   get_num_mm_connector_tokensz9Glm4vForConditionalGeneration.get_num_mm_connector_tokens  s      [.A
 Z]33r]   )NN))rU   rV   rW   packed_modules_mappingrE   r  supports_encoder_tp_dataclassmethodr   r   r  r   r   rH  re   r  ro   r  r   rZ   r[   r  r  r  r  r>   r  r5  r*   r  r7   r   r  r   r  r  r'   r  r  r  r   r   s   @r^   r  r  V  s       
 
 

 ((  &1%<&
 
    $F3 F3 F3: F F F [F BD #
 #
 #
z #
3 #
 #
 #
 #
 #
 #
J	D	    0	D	    0)+)	u|S 	!) ) ) )*)+)	u|S 	!) ) ) )6$f $ $ $ $ $,% %4H44O % % % %*C33iC3 /0C3 
u|S 	!	C3 C3 C3 C3R <@-1   <  <  2D8	 
 |d*    
+	+       DA|A 
	A A A AKHU33D-E$F K3s8 K K K K
 
 
 
 
22 
2 2 2 244 
4 4 4 4 4 4 4 4r]   r  c                        e Zd Zg dddgdZdS ) Glm4vMoeForConditionalGenerationr  r  r  r  N)rU   rV   rW   r  r\   r]   r^   r  r    s9        
 
 
 

 
r]   r  )rX   r  r  collections.abcr   r   r   r   	functoolsr   typingr   r	   r
   r   numpyrO   rZ   torch.nnr   torch.nn.functional
functionalr=  einopsr   transformersr   r   -transformers.models.glm4v.configuration_glm4vr   0transformers.models.glm4v.image_processing_glm4vr   r   0transformers.models.glm4v.video_processing_glm4vr   transformers.video_utilsr   vllm.configr   vllm.config.multimodalr   r   vllm.distributedr   r   r   r   vllm.loggerr   9vllm.model_executor.layers.attention.mm_encoder_attentionr   vllm.model_executor.layers.convr   r   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r    r!   r"   'vllm.model_executor.layers.quantizationr#   +vllm.model_executor.layers.rotary_embeddingr$   2vllm.model_executor.layers.rotary_embedding.commonr%   -vllm.model_executor.model_loader.weight_utilsr&   )vllm.model_executor.models.module_mappingr'   vllm.multimodalr(   vllm.multimodal.inputsr)   r*   r+   r,   r-   vllm.multimodal.parser.   r/   r0   vllm.multimodal.processingr1   r2   r3   r4   r5   r6   vllm.sequencer7   vllm.utils.tensor_schemar8   r9   #vllm.v1.attention.backends.registryr:   layers.activationr<   
interfacesr>   r?   r@   rA   rB   qwen2_vlrC   rD   rE   rF   rG   visionrH   rI   rJ   rU   rc  r  rL   r`   re   rY   rg   rl   ro   r   rq   r   r   r   r   r   r  r  rT  r  rK  rl  register_processorr  r  r\   r]   r^   <module>rF     s  6 G F F      A A A A A A A A A A A A       5 5 5 5 5 5 5 5 5 5 5 5                           5 5 5 5 5 5 5 5 K K K K K K        Q P P P P P 2 2 2 2 2 2 " " " " " " F F F F F F F F Q Q Q Q Q Q Q Q 0 0 0 0 0 0 # # # # # #      E D D D D D D D 8 8 8 8 8 8            G F F F F F @ @ @ @ @ @      P O O O O O D D D D D D / / / / / /              W V V V V V V V V V                . - - - - - > > > > > > > > D D D D D D * * * * * *              4 3 3 3 3 3                     
X		  
B B B B BL B B BA A A A A A A A 46OO ) O O OA A A A AL A A A"A A A A A A A A  46OO ) O O O
! ! ! ! !RY ! ! !HS 3    *i i i i i29 i i iX0 0 0 0 0ry 0 0 0f    BI   :/ / / / /ry / / /dk k k k kBI k k k\U U U U URY U U Upj j j j j, j j jZ	i i i i i45HI i i iXL
 L
 L
 L
 L
67JK L
 L
 L
^ ('	(  
W4 W4 W4 W4 W4I!<]W4 W4 
W4t ('	(  
    'D   
  r]   