
    .`i                        d Z ddlZddlZddlmZmZmZmZ ddlm	Z	 ddl
mZmZmZ ddlZddlZddlmZ ddlmc mZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lm Z  ddlm!Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8m9Z9m:Z:m;Z; ddl<m=Z=m>Z>m?Z? ddl@mAZAmBZBmCZCmDZDmEZE ddlFmGZG ddlHmIZImJZJ ddlKmLZL ddlMmNZN ddlOmPZPmQZQmRZRmSZSmTZT ddl!mUZUmVZVmWZW ddlXmYZY  e$eZ          Z[d e\d!e\fd"Z] G d# d$ej^                  Z_ G d% d&ej^                  Z` G d' d(ej^                  Za G d) d*ej^                  Zb G d+ d,ej^                  Zc G d- d.ej^                  Zd G d/ d0eI          ZeeeZf G d1 d2eI          ZgegZhd3e\eiz  d4e\d5e\fd6Zjd3e\eiz  d4e\d5e\fd7Zkd3e\eiz  d4e\d5e\fd8Zl	 	 	 dLd<e\d=e\d4e\d>e\d?e\f
d@Zm G dA dBej^                  Zn G dC dDeC          Zo G dE dFeBeo                   Zp G dG dHeAeo                   Zq e6jr        epeoeqI           G dJ dKej^        eSeQeTeR                      ZsdS )MzBInference-only Ernie VL model compatible with HuggingFace weights.    N)CallableIterableMappingSequence)partial)	AnnotatedAnyLiteral)	rearrange)BatchFeature)
VllmConfig)BaseDummyOptionsVideoDummyOptions)parallel_state)utils)init_logger)	QuickGELU)MMEncoderAttention)RMSNorm)ColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)ApplyRotaryEmb)default_weight_loader)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFeatureSpecMultiModalFieldConfigMultiModalKwargsItems)	ImageSizeMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)TensorSchemaTensorShape)AttentionBackendEnum   )Ernie4_5_VLMoeForCausalLM)MultiModalEmbeddingsSupportsLoRASupportsMRoPESupportsMultiModal
SupportsPP)AutoWeightsLoaderWeightsMappermaybe_prefix)get_vit_attn_backendhidden_sizetp_sizec                     ddl m}  fdt                    D             }|                    | t	          j                    j                   fd|D             }d t          | D             }t          j	        |d          }|S )	zEAll-gather the input tensor interleavely across model parallel group.r   Nc                 8    g | ]}t          j                  S  )torch
zeros_like).0_local_tensors     y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/ernie45_vl.py
<listcomp>z)all_gather_interleave.<locals>.<listcomp>b   s$    OOO1(66OOO    )groupc                 B    g | ]}t          j        |z  d           S ))r=   split)r?   tensorr8   r9   s     rB   rC   z)all_gather_interleave.<locals>.<listcomp>g   s9       <BFK72B77  rD   c                     g | ]	}|D ]}|
S r<   r<   )r?   pairrI   s      rB   rC   z)all_gather_interleave.<locals>.<listcomp>j   s9       d <B   rD   rG   dim)
torch.distributeddistributedrange
all_gatherr   get_tp_groupdevice_groupzipr=   cat)rA   r8   r9   distgathered_tensorsgathered_tensors_splitordered_tensorsresult_tensors   ```     rB   all_gather_interleaver[   ^   s    $$$$$$OOOOgOOOOO,n.I.K.K.X        FV    67  O Io2666MrD   c                        e Zd ZdZ	 	 ddededededz  ded	df fd
Zdej	        d	e
ej	        df         fdZ	 ddej	        dej	        dej	        dej	        dz  d	ej	        f
dZ xZS )Ernie4_5_VisionAttentionz)VisionAttention using VLLM framework APIsN 	embed_dim	num_headsprojection_sizequant_configprefixreturnc           
         t                                                       t          j                    | _        t          j                    | _        t          j        ||          | _	        t          j        || j                  | _
        t          || j	        ||d|| d          | _        t          |||| d          | _        t          | j
        | j	        | j	        dz  | d          | _        t#          dd	          | _        d S )
NTz.qkv)r8   	head_sizetotal_num_headstotal_num_kv_headsbiasrb   rc   z.proj)
input_sizeoutput_sizerb   rc   g      .attn)r`   rf   scalerc   )enforce_enableenable_fp32_compute)super__init__r   $get_tensor_model_parallel_world_sizer9   get_tensor_model_parallel_ranktp_rank
dist_utilsdividehidden_size_per_attention_head!num_attention_heads_per_partitionr   qkvr   projr   attnr   apply_rotary_emb)selfr_   r`   ra   rb   rc   	__class__s         rB   rq   z!Ernie4_5_VisionAttention.__init__t   s6    	%JLL%DFF.8.?Y/
 /
+ 2<1Bt|2
 2
. %!9%(%???
 
 
 &&!%###	
 
 
	 '<95t;###	
 
 
	 !/ $!
 !
 !
rD   ry   .c                   	 |j         \  }}}| j        dk    r t          || j        j        | j                  }|                    dd          \  }}}| j        dk    rbt          t          j        | j                  } ||          | j	                 } ||          | j	                 } ||          | j	                 }||| j
        | j        f		fd|||fD             \  }}}|||fS )Nr-         rL   )num_partitionsc              3   ,   K   | ]} |j          V  d S N)view)r?   x	new_shapes     rB   	<genexpr>z5Ernie4_5_VisionAttention.split_qkv.<locals>.<genexpr>   s,      99!6169%999999rD   )shaper9   r[   ry   r8   chunkr   ru   split_tensor_along_last_dimrt   rx   rw   )
r}   ry   seq_lenbsr@   qkvsplitterr   s
            @rB   	split_qkvz"Ernie4_5_VisionAttention.split_qkv   s
   Q<!'TX-A4<PPC ))A1)%%1a <!6t|  H DL)ADL)ADL)A 2/	
	 :9991ay9991a!QwrD   r   
cu_seqlensrotary_pos_emb
max_seqlenc                    |                      |          \  }}|                     |          \  }}}d |||fD             \  }}}|mt          j        ||gd          }	|                     |	|                                |                                          }
t          j        |
dd          \  }}|                     |||||          }t          |d          
                                }|                     |          \  }}|S )Nc              3   Z   K   | ]&}t          |d                                           V  'dS )zs b ... -> b s ...N)r   
contiguous)r?   r   s     rB   r   z3Ernie4_5_VisionAttention.forward.<locals>.<genexpr>   s9      VVq9Q 455@@BBVVVVVVrD   r   rL   r   )querykeyvaluer   r   zb s h d -> s b (h d))ry   r   r=   rU   r|   cossinr   r{   r   r   rz   )r}   r   r   r   r   r@   r   r   r   	qk_concat
qk_rotatedoutputcontext_layers                rB   forwardz Ernie4_5_VisionAttention.forward   s     xx{{1 ..##1aVVQPQSTIVVV1a%	1a&a000I..""$$""$$ J
 ;z1!444DAq!!  
 
 "&*@AALLNNIIm,,	rD   )Nr^   r   )__name__
__module____qualname____doc__intr   strrq   r=   Tensortupler   r   __classcell__r~   s   @rB   r]   r]   q   s       33 37-
 -
-
 -
 	-

 )4/-
 -
 
-
 -
 -
 -
 -
 -
^U\ eEL#4E.F    B +/! !<! L! 	!
 L4'! 
! ! ! ! ! ! ! !rD   r]   c                   |     e Zd Zeddfdededeej                 dedz  de	f
 fdZ
d	ej        d
ej        fdZ xZS )Ernie4_5_VisionMLPNr^   in_featureshidden_features	act_layerrb   rc   c                     t                                                       t          |||| d          | _         |            | _        t          |||| d          | _        d S )Nz.fc1)rb   rc   z.fc2)rp   rq   r   fc1actr   fc2)r}   r   r   r   rb   rc   r~   s         rB   rq   zErnie4_5_VisionMLP.__init__   s~     	'%???	
 
 
 9;;$%???	
 
 
rD   r   rd   c                     |                      |          \  }}|                     |          }|                     |          \  }}|S r   )r   r   r   )r}   r   
x_parallelr@   s       rB   r   zErnie4_5_VisionMLP.forward   s@    
AXXj))
xx
##1rD   )r   r   r   r   r   typennModuler   r   rq   r=   r   r   r   r   s   @rB   r   r      s        
 &/26
 

 
 	?	

 )4/
 
 
 
 
 
 
. %,        rD   r   c                        e Zd Zedddfdedededeej                 de	egej        f         dz  de
dz  d	ed
df fdZ	 ddej        dej        dej        dej        dz  d
ej        f
dZ xZS )Ernie4_5_VisionBlockNr^   rM   r`   	mlp_ratior   
norm_layerrb   rc   rd   c                 V   t                                                       |t          t          j        d          } ||          | _         ||          | _        t          ||z            }t          ||||| d          | _	        t          ||||| d          | _        d S )Nư>epsrl   )r_   r`   ra   rb   rc   .mlp)r   rb   rc   )rp   rq   r   r   	LayerNormnorm1norm2r   r]   r{   r   mlp)
r}   rM   r`   r   r   r   rb   rc   mlp_hidden_dimr~   s
            rB   rq   zErnie4_5_VisionBlock.__init__  s     	 4888JZ__
Z__
S9_--,%###
 
 
	 &%???
 
 
rD   hidden_statesr   r   r   c                     ||                      |                     |          |||          z   }||                     |                     |                    z   }|S )Nr   r   r   )r{   r   r   r   )r}   r   r   r   r   s        rB   r   zErnie4_5_VisionBlock.forward%  se     &		JJ}%%!)!	 )2 )
 )
 
 &M1J1J(K(KKrD   r   )r   r   r   r   r   floatr   r   r   r   r   r   rq   r=   r   r   r   r   s   @rB   r   r     s        &/8<26 
  
 
  
 	 

 	? 
 cUBI-.5 
 )4/ 
  
 
 
  
  
  
  
  
N +/ | L 	
 L4' 
       rD   r   c            	       ^     e Zd Z	 	 	 	 ddedededd	f fd
Zdej        dej        fdZ xZS )Ernie4_5_VisionPatchEmbed   r      r^   
patch_sizein_channelsr_   rd   Nc                     t                                                       || _        || _        || _        t          j        ||z  |z  |d          | _        d S )NF)ri   )rp   rq   r   r   r_   r   Linearrz   )r}   r   r   r_   rc   r~   s        rB   rq   z"Ernie4_5_VisionPatchEmbed.__init__7  s^     	$&"I*$z195
 
 
			rD   r   c                 |    | j         j        j        }|                    |          }|                      |          }|S r   )rz   weightdtypeto)r}   r   target_dtypes      rB   r   z!Ernie4_5_VisionPatchEmbed.forwardG  s9    y'-%((66		-00rD   )r   r   r   r^   )	r   r   r   r   rq   r=   r   r   r   r   s   @rB   r   r   6  s         
 

 
 	
 

 
 
 
 
 
 U\ el        rD   r   c                   H     e Zd Zd	dededdf fdZdedej        fdZ xZ	S )
Ernie4_5_VisionRotaryEmbedding     @rM   thetard   Nc                     t                                                       d|t          j        d|dt          j                  |z  z  z  | _        d S )Ng      ?r   r   )startendstepr   )rp   rq   r=   arangefloat32inv_freq)r}   rM   r   r~   s      rB   rq   z'Ernie4_5_VisionRotaryEmbedding.__init__P  sM    eLqcGGG#M
 
rD   seqlenc                     t          j        || j        j        | j        j                  }t          j        || j                  }|S )Ndevicer   )inputvec2)r=   r   r   r   r   outer)r}   r   seqfreqss       rB   r   z&Ernie4_5_VisionRotaryEmbedding.forwardV  sF    l4=/t}7J
 
 
 #DM:::rD   )r   )
r   r   r   r   r   rq   r=   r   r   r   r   s   @rB   r   r   O  s{        
 
C 
 
D 
 
 
 
 
 
c el        rD   r   c            	       &    e Zd Z	 	 	 ddededz  deddf fdZedej	        fd	            Z	edej
        fd
            Z
dej        dej        fdZdej        dej        dz  fdZ	 ddej        dej        dej        fdZdee         fdZ xZS )Ernie4_5_VisionTransformerr   Nr^   norm_epsrb   rc   rd   c                    t                                                       |j        }|j        }|j        }|j        }|j        |j        }	|j        |j	        || _        | _        | _        t          || d          | _        t          t          j        |          z  }
t          |
dz            | _        t          j        fdt%          |	          D                       | _        |k    s
J d            t          j        |d          | _        t+          |
t-          j                              | _        d S )	Nz.patch_embed)r   r   r_   rc   r   r   c                 D    g | ]}t           d |           S )z.blocks.)rM   r`   r   r   rb   rc   )r   )r?   	layer_idxr_   r   r   r`   rc   rb   s     rB   rC   z7Ernie4_5_VisionTransformer.__init__.<locals>.<listcomp>  sZ     
 
 
  %!'')!-$99i99  
 
 
rD   z5vit's config.hidden must be equal to config.embed_dimr   )rf   r   )rp   rq   r   spatial_merge_sizer   r8   r_   depthr`   r   r   patch_embedr   r   r   r   r   
ModuleListrP   blockslnr7   r=   get_default_dtypeattn_backend)r}   vision_configr   rb   rc   r   r   r   r8   r   head_dimr_   r   r   r`   r~   s      ``      @@@@rB   rq   z#Ernie4_5_VisionTransformer.__init___  s    	"-
*=#/#/!+	#!+	!+	"4""4!#***	
 
 
 R\x888
	)<X]KKm
 
 
 
 
 
 
 
 
 "'u
 
 

 
 i'''C ('' ,{5550)++
 
 
rD   c                 .    | j         j        j        j        S r   )r   rz   r   r   r}   s    rB   r   z Ernie4_5_VisionTransformer.dtype  s    $+11rD   c                 .    | j         j        j        j        S r   )r   rz   r   r   r   s    rB   r   z!Ernie4_5_VisionTransformer.device  s    $+22rD   grid_thwc                    g }|D ]p\  }}}t          j        |                              d                              d|          }t          j        |                              d                              |d          }|                    || j        z  | j        || j        z  | j                                      dddd                                          }|                    || j        z  | j        || j        z  | j                                      dddd                                          }|                    t          j	        ||gd          
                    |d                     rt          j        |d          }|d d dd f                                         }|                     |          }	|	|                             d          }
|
S )Nr-   rG   r   r   r   rL   )r=   r   	unsqueezeexpandreshaper   permuteflattenappendstackrepeatrU   maxr   )r}   r  pos_idsthwhpos_idswpos_idsmax_grid_sizerotary_pos_emb_fullr   s              rB   rot_pos_embz&Ernie4_5_VisionTransformer.rot_pos_emb  s    	S 	SGAq!|A0033::2qAAH|A0033::1bAAH  00+00+	  Aq!$$    00+00+	  Aq!$$  NN5;(';DDDKKAqQQRRRR)G+++ ABB++--"11-@@,W5==a@@rD   r   c                     d }| j         t          j        k    s| j         t          j        k    r'|dd          |d d         z
                                  }|S )Nr-   rG   )r   r,   
FLASH_ATTNROCM_AITER_FAr  )r}   r   r   s      rB   compute_attn_mask_seqlenz3Ernie4_5_VisionTransformer.compute_attn_mask_seqlen  sW    
!5!@@@ $8$FFF$QRR.:crc?:??AAJrD   r   r   c                    |                      |          }|                     |          }|                    |j                  }t	          j        |d d df         |d d df         z  |d d df                                       dt          j                  }|                    d          }|dk    r&t	          j	        |||g          }|d         |z   |d<   nt	          j	        ||g          }|j
        dk    r|                    d          }|                     |          }t          | j                  D ]\  }}	 |	||||          }|                     |          }
|
j
        d	k    r|
                    d          }
|
S )
Nr-   r   r   )rM   r   rG   rL   r   r   )r   r  r   r   r=   repeat_interleavecumsumint32	new_zerosrU   ndimr  r  	enumerater   r   squeeze)r}   r   r  num_padr   r   zerosr   iblkfinal_outputs              rB   r   z"Ernie4_5_VisionTransformer.forward  s    ((77))(33'**=+?@@,QQQTNXaaad^+Xaaad^
 

&Qek&
*
* 	 $$Q''Q;;E:u#=>>J'^g5JrNNE:#677J "")333::M 22:>>
,, 	 	FAsC%-%	  MM ww}--!!'//A/66LrD   c                     t          |                     d                    }t                      }|D ]D\  }}||         }t          |dt                    } |||           |                    |           E|S NF)remove_duplicateweight_loaderdictnamed_parameterssetgetattrr   addr}   weightsparams_dictloaded_paramsnameloaded_weightparamr*  s           rB   load_weightsz'Ernie4_5_VisionTransformer.load_weights  s    400%0HHII"%%%#* 	$ 	$D-%E#E?<QRRMM%///d####rD   )r   Nr^   r   )r   r   r   r   r   r   rq   propertyr=   r   r   r   r  r  r   r.  r8  r   r   s   @rB   r   r   ^  s~        266
 6
 6
 )4/	6

 6
 
6
 6
 6
 6
 6
 6
p 2u{ 2 2 2 X2 3 3 3 3 X3EL U\    @5< ELSWDW     LM' '"\'5:\'	' ' ' 'R	s3x 	 	 	 	 	 	 	 	rD   r   c                       e Zd ZU dZed         ed<   eej         e	dd          f         ed<   eej         e	dd          f         ed<   d	S )
Ernie4_5_VLImagePixelInputsz
    Dimensions:
        - np: The total number of patches over each image over each prompt in
              the batch
        - ni: Number of images
        - cps: Number of channels * patch_size * patch_size
    pixel_valuesr   npcpsnir   image_grid_thwN
r   r   r   r   r
   __annotations__r   r=   r   r+   r<   rD   rB   r<  r<     sw           .
!!!!EL++dE*B*BBCCCCelKKa,@,@@AAAAAArD   r<  c                       e Zd ZU dZed         ed<   eej         e	dd          f         ed<   eej         e	dd          f         ed<   d	S )
Ernie4_5_VLVideoPixelInputsz
    Dimensions:
        - np: The total number of patches over each image over each prompt in
              the batch
        - ni: Number of images
        - cps: Number of channels * temporal_patch_size * patch_size *
              patch_size
    pixel_values_videosr   r>  r?  r@  r   video_grid_thwNrB  r<   rD   rB   rE  rE    sx           '
(((("5<T51I1I#IJJJJelKKa,@,@@AAAAAArD   rE  numberfactorrd   c                 ,    t          | |z            |z  S r   )roundrH  rI  s     rB   round_by_factorrM  &  s    &!!F**rD   c                 6    t          j        | |z            |z  S r   )mathceilrL  s     rB   ceil_by_factorrQ  *  s    9Vf_%%..rD   c                 6    t          j        | |z            |z  S r   )rO  floorrL  s     rB   floor_by_factorrT  .  s    :fvo&&//rD      @     heightwidth
min_pixels
max_pixelsc                 ,   d}t          | |          t          | |          z  |k    rm| |k    r2t          |t          ||                    }t          ||z  |          }n1t          |t          | |                    }t          ||z  |          }|} |}t          |t          | |                    }t          |t          ||                    }	||	z  |k    rAt	          j        | |z  |z            }
t          | |
z  |          }t          ||
z  |          }	nI||	z  |k     r@t	          j        || |z  z            }
t          | |
z  |          }t          ||
z  |          }	|||	z  k    s	||	z  |k    rt          d| d|	 d| d| d	          ||	fS )N   zInvalid h_bar=z, w_bar=z': h_bar * w_bar must be >= min_pixels (z) and <= max_pixels (z).)r  minrM  rT  rO  sqrtrQ  
ValueError)rX  rY  rI  rZ  r[  	MAX_RATIO	new_width
new_heighth_barw_barbetas              rB   smart_resizerg  2  s    I
65C...::E>>FOE6$B$BCCI(Y)>GGJJV_VV%D%DEEJ'
Y(>GGI7788Ev6677Eu}z!!y&5.J677v66f55		#	#yv~677v}f55ut|V44EEM!!UU]Z%?%?1U 1 1E 1 14>1 1",1 1 1
 
 	
 %<rD   c                   z     e Zd Z	 d
deddf fdZd Zd Zdeeee	j
        f                  dee         fd	Z xZS ) VariableResolutionResamplerModelr^   rc   rd   Nc                 z   t                                                       || _        || _        || _        || _        || _        |j        | _        | j        | j        z  | j        z  | _        | j        | j        z  | j        z  | j        z  | _	        t          | j        | j        ddt          |dd           | d          | _        t          j                    | _        t          | j        | j        ddt          |dd           | d          | _        t          j        | j        d          | _        | j        rt          | j	        | j        ddt          |dd           | d          | _        t          j                    | _        t          | j        | j        ddt          |dd           | d	          | _        t          j        | j        d          | _        t          | j        | j        ddt          |dd           | d
          | _        t1          |t          |dd                    | _        d S )NTrb   z.spatial_linear1)ri   gather_outputrb   rc   z.spatial_linear2r   r   z.temporal_linear1z.temporal_linear2r   rms_norm_eps)r8   r   )rp   rq   in_dimout_dimconfigspatial_conv_sizetemporal_conv_sizeuse_temporal_convspatial_dimtemporal_dimr   r/  spatial_linear1r   GELUspatial_geluspatial_linear2r   spatial_normtemporal_linear1temporal_gelutemporal_linear2temporal_normr   r   
after_norm)r}   rm  rn  rp  rq  ro  rc   r~   s          rB   rq   z)VariableResolutionResamplerModel.__init__[  s{    	!2"4!'!9  ;)??$BXX K$%$% %& 	  4 >>... 
  
  
 GII3 >>... 
  
  
 L)9tDDD! 	J$8! "$V^TBB 333% % %D! "$D$8  "$V^TBB 333% % %D! "$d.>D!I!I!ID'L >>???
 
 
 "WV^T%J%J
 
 
rD   c                 T    |j         \  }}|                    d||dz  z  g          }|S )NrG   r   )r   r  )r}   r   rp  SCs        rB   spatial_conv_reshapez5VariableResolutionResamplerModel.spatial_conv_reshape  s3    w1IIr1 11 45677rD   c                       fd}d fd	} fd} fd} ||          } j         r |||          } ||          } ||          }|S )Nc                                          | j                  }                     |           \  } }                    |           }                     |           \  } }                    |           } | S r   )r  rp  ru  rw  rx  ry  r   r@   r}   s     rB   fwd_spatialz=VariableResolutionResamplerModel.forward.<locals>.fwd_spatial  st    ))!T-CDDA''**DAq!!!$$A''**DAq!!!$$AHrD   Fc           
         |                                                                 }|d d df         |d d dd f         }}|                    d          j        dz  z  }|                    d          j        dz  z  }t	          j        |j        |j                  }d|d<   |                                d d         |dd <   g }	t          |||          D ]Q\  }
}}t          d|
d          D ]9}|	                    t	          j        |||z  z   ||dz   |z  z                        :Rt          j        t	          j        |	d                                        | j                  }	g }t          |||          D ]Y\  }
}}t          |
dk    rdnd|
d          D ]9}|                    t	          j        |||z  z   ||dz   |z  z                        :Zt          j        t	          j        |d                                        | j                  }t          j        | d|	          }t          j        | d|          }t          j        ||gd          } | S )	Nr   r-   rG   r   r   )axis)rM   indexrL   )cpunumpyprodrp  r>  emptysizer   r  rT   rP   r  r   r=   rI   concatenater   r   index_selectconcat)r   r  	to_tensorgrid_thw_cpugrid_tgrid_hwgrid_hw_after_convtokens_per_img_or_vidbatch_offsetslice_offsetstemporoal_sizespatial_sizeb_offsettemp_offsetslice_offsets2x_timestep_1x_timestep_2r}   s                    rB   fwd_placeholderzAVariableResolutionResamplerModel.forward.<locals>.fwd_placeholder  s   #<<>>//11L*111a40,qqq!""u2EGF!(b!1!1d6La6O!P$0$5$5b$9$9d>TVW>W$X!8%*2G2M  L  LO4;;==crcBLM:=*L; ; 	 	6h $)NA#>#>  K!((	$|'CC$a<'GG     "LB)O)O)OPPSS M  N:=*L; ;  6h $)'!++AANA$ $  K #))	$|'CC$a<'GG     #\".b*Q*Q*QRRUU N !-aQmLLLL -aQnMMMLlL9rBBBAHrD   c                                          |           \  } }                    |           }                     |           \  } }                    |           } | S r   )rz  r{  r|  r}  r  s     rB   fwd_temporalz>VariableResolutionResamplerModel.forward.<locals>.fwd_temporal  s\    ((++DAq""1%%A((++DAq""1%%AHrD   c                 b                         |           \  } }                    |           } | S r   )r   r~  r  s     rB   fwd_mlpz9VariableResolutionResamplerModel.forward.<locals>.fwd_mlp  s,    88A;;DAq""AHrD   )F)rr  )r}   r   r  r  r  r  r  s   `      rB   r   z(VariableResolutionResamplerModel.forward  s    	 	 	 	 	/	 /	 /	 /	 /	 /	b	 	 	 	 		 	 	 	 	
 KNN! 	 8,,AQAGAJJrD   r2  c                     t          |                     d                    }t                      }|D ]I\  }}||vr
||         }t          |dt                    } |||           |                    |           J|S r(  r+  r1  s           rB   r8  z-VariableResolutionResamplerModel.load_weights  s    400%0HHII"%%%#* 	$ 	$D-;&&%E#E?<QRRMM%///d####rD   r^   )r   r   r   r   rq   r  r   r   r   r=   r   r.  r8  r   r   s   @rB   ri  ri  Z  s         S
 S
 S
 
S
 S
 S
 S
 S
 S
j  
M M M^HU33D-E$F 3s8        rD   ri  c                   p   e Zd Zd ZdefdZdefdZdeee	dz  f         fdZ
de	d	eee	f         deee	f         fd
Zdddde	de	de	dededz  deee	f         fdZde	de	dedz  de	fdZde	de	de	dedz  de	f
dZdefdZde	fdZde	de	fdZde	d	eee	f         de	fdZde	d	eee	f         de	fdZdS )Ernie4_5_VLProcessingInfoc                 $    | j         j        j        S r   )ctxmodel_config	hf_configr   s    rB   get_hf_configz'Ernie4_5_VLProcessingInfo.get_hf_config  s    x$..rD   kwargsc                 *     | j         j        dddi|S )Nuse_fastTr<   )r  get_hf_processorr}   r  s     rB   r  z*Ernie4_5_VLProcessingInfo.get_hf_processor  s"    (tx(AA$A&AAArD   c                 &     | j         di |j        S )Nr<   )r  image_processorr  s     rB   get_image_processorz-Ernie4_5_VLProcessingInfo.get_image_processor  s    $t$..v..>>rD   rd   Nc                     d d dS Nimagevideor<   r   s    rB   get_supported_mm_limitsz1Ernie4_5_VLProcessingInfo.get_supported_mm_limits  s    ---rD   r   	mm_countsc                 `    |                                  }|                     ||          }||dS r  )get_max_image_tokensget_max_video_tokens)r}   r   r  max_image_tokensmax_video_tokenss        rB   get_mm_max_tokens_per_itemz4Ernie4_5_VLProcessingInfo.get_mm_max_tokens_per_item  s;    
  446644WiHH)4DEEErD   r-   T)
num_frames	do_resizeimage_widthimage_heightr  r  r  c                   ||                                  }|                                 }|j        }|j        }|j        }	|j        }
|r6t          ||||	z  |j        |j                  \  }}t          ||          }nt          ||          }t          ||
z  d          }|j        |z  }|j        |z  }||z  |z  }||	dz  z  }||fS )N)rX  rY  rI  rZ  r[  )rY  rX  r-   r   )r  r  r   r   rp  rq  rg  rZ  r[  r!   r  rX  rY  )r}   r  r  r  r  r  r  r   r   rp  rq  resized_heightresized_widthpreprocessed_sizer  grid_hgrid_wnum_patchesnum_vision_tokenss                      rB   _get_vision_infoz*Ernie4_5_VLProcessingInfo._get_vision_info(  s    ""6688O&&((	!/"-
%7&9 
	R,8#!!$55*5*5- - -)NM !*n U U U )L Q Q QZ#55q99")Z7"(J6vo.',=q,@A "333rD   c                <    |                      |||          \  }}|S Nr  r  r  r  )r}   r  r  r  r@   num_image_tokenss         rB   get_num_image_tokensz.Ernie4_5_VLProcessingInfo.get_num_image_tokensO  s5     #33#%+ 4 
 

  rD   c                >    |                      ||||          \  }}|S Nr  r  r  r  r  )r}   r  r  r  r  r@   num_video_tokenss          rB   get_num_video_tokensz.Ernie4_5_VLProcessingInfo.get_num_video_tokens]  s8     #33#%!+	 4 
 
  rD   c                 <    |                      ddd           \  }}|S )Ni r  r  )r}   max_image_sizer@   s      rB   !get_image_size_with_most_featuresz;Ernie4_5_VLProcessingInfo.get_image_size_with_most_featuresm  s2     11   2 
 

 rD   c                 d    |                                  \  }}|                     ||d           }|S r  )r  r  )r}   target_widthtarget_heightr  s       rB   r  z.Ernie4_5_VLProcessingInfo.get_max_image_tokensu  sD    &*&L&L&N&N#m44$&  5 
 

  rD   
max_tokensc                     |                                  \  }}d}	 |dz   }|                     |||d           }||k    rn|}(|dz  dk    r|dz  }|S )Nr   Tr-   r  r   )r  r  )r}   r  r  r  r  next_num_framesnext_max_tokenss          rB   _get_max_video_framesz/Ernie4_5_VLProcessingInfo._get_max_video_frames  s    &*&L&L&N&N#m
	)(1nO"77(** $	 8  O ++(J	) >Q!OJrD   c                     |                     dd          }|                     dd          }|                                 |z  }|                     ||z
            }|t          |d          z  }t          |d          S )Nr  r   r  r-   r   )getr  r  r  )r}   r   r  
max_images
max_videosr  max_total_framesmax_frames_per_videos           rB   !get_num_frames_with_most_featuresz;Ernie4_5_VLProcessingInfo.get_num_frames_with_most_features  s    
 ]]7A..
]]7A..
4466C55g@P6PQQ/3z13E3EE'+++rD   c                     |                                  \  }}|                     |||                     ||          d           S r  )r  r  r  )r}   r   r  r  r  s        rB   r  z.Ernie4_5_VLProcessingInfo.get_max_video_tokens  sS    
 '+&L&L&N&N#m(($&==gyQQ 	 ) 
 
 	
rD   )r   r   r   r  objectr  r  r   r   r   r  r  boolr	   r   r!   r  r  r  r  r  r  r  r  r<   rD   rB   r  r    s       / / /B B B B B?F ? ? ? ?.cDj)A . . . .FF 38$F 
c		F F F F %4 %4 %4 %4 	%4
 %4 %4 t%4 
y#~	%4 %4 %4 %4N    	 
 t  
           	 
   t  
        9     c             2,, 38$, 
	, , , ,

 38$
 
	
 
 
 
 
 
rD   r  c            
           e Zd ZdefdZdej        dedej        fdZde	de
e	ef         de
e	ef         de
e	ef         def
d	Zd
ede
e	ef         dedee         fdZdede
e	ef         de
e	ef         fdZdS )Ernie4_5VLMultiModalProcessorrd   c                 "    t          d          S )NT)video_needs_metadata)r#   r   s    rB   _get_data_parserz.Ernie4_5VLMultiModalProcessor._get_data_parser  s    #!%
 
 
 	
rD   r=  	mm_kwargsc                    | j                                         }|j        } | j         j        di |}t	          j        |j        t          j                                      g d          }t	          j        |j	        t          j                                      g d          }t	          j        |j
        t          j                  }|j        dz  }	|                    ddg                              |	d          }|                    ddg                              |	d          }|                                s|                                }|                                s|                                }||                    t          j                  z  |z
  |z  }|                    |j                  }|S )Nr  )r-   r   r-   r-   r   r  rG   r<   )infor  r   r  r=   rI   
image_meanr   r  	image_stdrescale_factorr   r!  r  is_contiguousr   r   r   )
r}   r=  r  r  r   r  image_mean_tensorimage_std_tensorr  patch_size_squareds
             rB   _pixel_values_normz0Ernie4_5VLMultiModalProcessor._pixel_values_norm  s   
 I++--	!/7$)7DD)DD!L&em
 
 

',,,

 	 !<%U]
 
 

',,,

 	 *%-
 
 
 +5q8-55r2h??QQ
 
 ,33RH==OO
 
 !..00 	? 1 < < > >--// 	=/::<< \__U];;;>OO $y77rD   promptmm_data
tok_kwargsc           	         d|vrZd|vrV|dk    rP| j                                         }|                    |          }t          t	          |g          d          }|S d|vrg |d<   d|vrg |d<    | j         j        di |}t          |dd          }	|d         r1|	s/t                              d	           d
 |d         D             |d<   | j         j	        
                    |t	          |g|d         |d                   t	          di ||          }
|
|
d         }||                     ||          |
d<   t          |
                                          D ]}|
|         |
|= |dk    r|
d         }|
d         }|d d df         dk    }||         |
d<   ||          |
d<   |
d                             d                                          }|d |         |
d<   ||d          |
d<   |
d= |
S )Nimagesvideosr^   )	input_idspt)tensor_typesupports_video_metadataFzgHF processor doesn't support video metadata. Timestamps will NOT be rendered. Please upgrade the model.c                 L    g | ]!}t          |t                    r|d          n|"S r9  )
isinstancer   )r?   r   s     rB   rC   zDErnie4_5VLMultiModalProcessor._call_hf_processor.<locals>.<listcomp>  s<     ! ! !89
1e,,3!!! ! !rD   )textr  r  r  r   r-   rG  rA  rL   r=  rF  r<   )r  get_tokenizerencoder   r,  r  r/  loggerwarning_oncer  call_hf_processorr  listkeysr  sum)r}   r  r  r  r  	tokenizer
prompt_idstokenizer_outputhf_processorr  processor_outputr=  r   r  pixel_values_allmaskimage_patch_nums                    rB   _call_hf_processorz0Ernie4_5VLMultiModalProcessor._call_hf_processor  s    7""xw'>'>6R<<	//11I"))&11J+
|,,,$      $#7"" "GH7"" "GH 2ty1>>I>>")3U#
 #
 8 	%< 	M  ! !=DX=N! ! !GH  9=::vhwx'8ARSSS++9+
++
 
 '+H5L'-1-D-D ). . * ,113344 3 3#C(0(-*$$/
;H'7'A$ $AAAqD>A-D9A$$%569A4%$%56()9:??A?FFJJLL $ 8H((8$^4 ?O'((?$%:; )2rD   mm_itemshf_processor_mm_kwargsout_mm_kwargsc                      | j         j        d
i |ddddddj        dz  dt          dt          ffdfd	dD             S )Nz<|image@placeholder|>z<|video@placeholder|>r  z<|IMAGE_PLACEHOLDER|>r   item_idxmodalityc                 <   |         |          }|| d         j         }t          |t          j                  sJ |dk    r-t	          |                                          j        z  z  }n$t	          |                                          z  }|         |z  S )N	_grid_thwr  )datar  r=   r   r   r  rq  )	r$  r%  out_itemr  
num_tokensafter_placeholderr  merge_lengthr"  s	        rB   get_replacement_ernie45vlzTErnie4_5VLMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_ernie45vlB  s    $X.x8H8 6 6 67<Hh555557""((#67#$ 
 !11\A
$X.;;rD   c           
      ^    g | ])}t          ||         t          |                     *S ))r%  )r%  targetreplacement)r'   r   )r?   r%  before_placeholderr-  s     rB   rC   zEErnie4_5VLMultiModalProcessor._get_prompt_updates.<locals>.<listcomp>P  sV     
 
 
  !)(3#$=QQQ  
 
 
rD   r<   )r  r  rp  r   r   )	r}   r   r!  r"  r+  r1  r-  r  r,  s	      `@@@@@rB   _get_prompt_updatesz1Ernie4_5VLMultiModalProcessor._get_prompt_updates-  s     2ty1KK4JKK -,
 
 -,
 
 $5q8	< 	<s 	< 	< 	< 	< 	< 	< 	< 	< 	<
 
 
 
 
 /
 
 
 	
rD   	hf_inputsc                    |                     dt          j        d                    }|                    d          }|                     dt          j        d                    }|                    d          }t	          t          j        d|          t          j        d          t          j        d|          t          j        d                    S )NrA  )r   r   rG   rG  r  r  )r=  rA  rF  rG  )r  r=   r  r  r,  r   flat_from_sizesbatched)r}   r3  r!  rA  image_grid_sizesrG  video_grid_sizess          rB   _get_mm_fields_configz3Ernie4_5VLMultiModalProcessor._get_mm_fields_configY  s    
 #'7V9L9LMM)..r22"'7V9L9LMM)..r22.>)  18AA 5 E)! ! 18AA	
 	
 	
 		
rD   N)r   r   r   r#   r  r=   r   r  r  r   r   r   r  r"   r	   r    r   r(   r2  r   r9  r<   rD   rB   r  r    s[       
"6 
 
 
 

#l# # 
	# # # #JK K  f%K  3;'	K 
 CK(K  
K  K  K  K Z*
%*
 !(S 1*
 -	*

 
,	*
 *
 *
 *
X

 !(V 4
 
++	,	
 
 
 
 
 
rD   r  c                       e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	ddd	ed
ededede
dz  f
dZdS )Ernie4_5_VLDummyInputsBuilderr  rd   c                     |                     dd          }|                     dd          }d}t          |          D ]}|d|dz    dz  }t          |          D ]}|d|dz    d	z  }|S )
Nr  r   r  r^   zPicture r-   z2:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>zVideo z2:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>)r  rP   )r}   r  
num_images
num_videosr  r$  s         rB   get_dummy_textz,Ernie4_5_VLDummyInputsBuilder.get_dummy_textq  s    ]]7A..
]]7A..
z"" 	 	AT1q5TTTFF z"" 	Y 	YAXq1uXXXXFFrD   Nr   
mm_optionsc                    |                     dd          }|                     dd          }| j                                        \  }}| j                            ||          }|r|                     d          nd }	|r|                     d          nd }
|                     ||||	          |                     |||||
          dS )Nr  r   r  )rY  rX  r=  	overrides)rY  rX  r  r>  rB  r  )r  r  r  r  _get_dummy_images_get_dummy_videos)r}   r   r  r@  r=  r>  r  r  target_num_framesimage_overridesvideo_overridess              rB   get_dummy_mm_dataz/Ernie4_5_VLDummyInputsBuilder.get_dummy_mm_data~  s     ]]7A..
]]7A..
&*i&Q&Q&S&S#m IGGY
 
 6@I*..111T5?I*..111T ++"$%)	 ,   ++"$,%) ,  
 
 	
rD   )rB  rY  rX  r  r>  rB  c                   |r|j         rA|j         |k    r!t                              d|j         |           t          ||j                   }|j        rA|j        |k    r!t                              d|j        |           t          ||j                  }|j        rA|j        |k    r!t                              d|j        |           t          ||j                  }t          |d          }t          j        |||dfdt          j	                  }g }t          |          D ]P}d|dz  |d	 t          |          D             d
dd}	|                                |	f}
|                    |
           Q|S )Nz]video.num_frames override (%d) exceeds model's maximum number of frames (%d), will be ignoredzMvideo.width override (%d) exceeds model's maximum width (%d), will be ignoredzOvideo.height override (%d) exceeds model's maximum height (%d), will be ignoredr   r      r  g       @c                     g | ]}|S r<   r<   )r?   r$  s     rB   rC   zCErnie4_5_VLDummyInputsBuilder._get_dummy_videos.<locals>.<listcomp>  s    "@"@"@1"@"@"@rD   opencvF)fpsdurationtotal_num_framesframes_indicesvideo_backenddo_sample_frames)r  r  warningr^  rY  rX  r  r>  fulluint8rP   copyr  )r}   rY  rX  r  r>  rB  r  video_itemsr$  video_metadata
video_items              rB   rD  z/Ernie4_5_VLDummyInputsBuilder._get_dummy_videos  s     	7# C'*44NNI!,"	   !Y-ABB
 4?U**NN>!	   E9?33 7#f,,NN?!(	   VY%566Q''
UFA628LLLz"" 
	+ 
	+A&,$."@"@eJ.?.?"@"@"@!)$) N  **,,7Jz****rD   r   )r   r   r   r   r   r   r?  r   r   rH  r   rD  r<   rD   rB   r;  r;  p  s        S(9 c    " =A	
 

 38$
 C!112T9	

 

 
 
 
P /34 4 4 4 	4
 4 4 %t+4 4 4 4 4 4rD   r;  )r  dummy_inputsc                       e Zd Zg dddgdZ eddddd	d
ddddd          Zededededz  fd            Z	d:de
deddf fdZdej        dej        dz  fdZdej        dej        dej        fdZdej        ddfd Zd!ee         d"ee         deej        ef         fd#Zd$ededz  fd%Zd$ededz  fd&Zd'edeej        d(f         fd)Zd*edeej        d(f         fd+Zd$edefd,Zd$ededz  fd-Z	 d;dd.d/dej        d0edz  d1ej        dz  d2e dej        f
 fd3Z!	 	 d<dej        d4ej        d5e"dz  d6ej        dz  fd7Z#d8e$eeej        f                  de%e         fd9Z& xZ'S )=&Ernie4_5_VLMoeForConditionalGeneration)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projzlanguage_model.lm_head.zlanguage_model.model.zresampler_model.)zlm_head.zmodel.z%language_model.model.resampler_model.zspatial_linear1.zspatial_linear2.zspatial_norm.ztemporal_linear1.ztemporal_linear2.ztemporal_norm.)zspatial_linear.0.zspatial_linear.2.zspatial_linear.3.ztemporal_linear.0.ztemporal_linear.2.ztemporal_linear.3.)orig_to_new_prefixorig_to_new_substrr%  r$  rd   Nc                 |    |                     d          rdS |                     d          rdS t          d          )Nr  z1<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>r  z1<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>z)Only image or video modality is supported)
startswithr`  )clsr%  r$  s      rB   get_placeholder_strz:Ernie4_5_VLMoeForConditionalGeneration.get_placeholder_str  sK    w'' 	GFFw'' 	GFFDEEErD   r^   vllm_configrc   c                 P   t                                                       |j        j        }|j        }|j        j        }|| _        || _        |                     |ddh          5  t          |j	        t          |dd          |t          |d                    | _        t          | j        j        | j        j        | j        j        | j        j        | j        t          |d                    | _        d d d            n# 1 swxY w Y   |                     |          5  t)          |t          |d	          
          | _        d d d            n# 1 swxY w Y   d | _        | j        j        | _        t          | j        dd           rd | j        j        t          | j        dd           t          | j        dd           t          | j        dd           t          | j        dd           fD             }t3          j        |t2          j                  | _        d S d | _        d S )Nr  r  rl  r   vision_model)r   rb   rc   resampler_model)ro  rc   language_model)rj  rc   im_patch_idc                     g | ]}||S r   r<   )r?   token_ids     rB   rC   zCErnie4_5_VLMoeForConditionalGeneration.__init__.<locals>.<listcomp>+  s,     
  
  
  '  (''rD   image_start_token_idimage_end_token_idvideo_start_token_idvideo_end_token_idr  )rp   rq   r  r  rb   multimodal_configro  _mark_tower_modelr   r   r/  r6   rl  ri  pixel_hidden_sizer8   rp  rq  rm  _mark_language_modelr.   rn  visual_token_maskmake_empty_intermediate_tensorsro  r=   rI   long_visual_token_ids_tensor_cache)r}   rj  rc   ro  rb   rv  visual_token_idsr~   s          rB   rq   z/Ernie4_5_VLMoeForConditionalGeneration.__init__  s   )3"/'4F!2##K'71CDD 	 	 :$ >>)#FN;;	! ! !D $D-'-.{#F,=>>$ $ $D 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	  &&{33 	 	";'#F,<==# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 "&? 	, 4;t44 	7
  
  K+DK)?FFDK)=tDDDK)?FFDK)=tDD!
  
  
  38, 
3 3 3D/// 37D///s%   'BDD
D#%EEEr   c                 6    | j                             |          S )zcompute logits)rn  compute_logits)r}   r   s     rB   r  z5Ernie4_5_VLMoeForConditionalGeneration.compute_logits<  s    
 "11-@@@rD   r=  r  c                    |||dk             }|                                 dz  dk    r%t          d|                                  d          |                    dd          }t          j        t          j        |d d dd f         |d d df         d          g dd          }|                     ||          }|S )	Nr   r   zgrid_thw has z6 elements after filtering,which is not divisible by 3.rG   r-   )r-   r   r   r   )r   )numelr`  r  Fpadr=   r  rl  )r}   r=  r  image_featuress       rB   _vision_forwardz6Ernie4_5_VLMoeForConditionalGeneration._vision_forwardC  s    
 1-H~~!#q(( 3HNN$4$4 3 3 3    ''A..Hu'ABB!!!Q$KK  H
 **<BBrD   r  c                     | j         	d| _        dS | j                             |j        |j                  }t          j        ||                              dd          | _        dS )z@Set mask for visual tokens (image/video patches and delimiters).Nr   rG   r-   )r}  rz  r   r   r   r=   isinr  )r}   r  visual_token_ids_tensors      rB   _set_visual_token_maskz=Ernie4_5_VLMoeForConditionalGeneration._set_visual_token_maskY  st    .6%)D"F"&"E"H"H#/ #I #
 #

 "'I7N!O!O!W!W"
 "
rD   input_tokensmm_featuresc           	      
   t          j        |ddh          }d |                    dg           D             }d |                    dg           D             }| j        }|j        }|j        }|j        }	|j        }
|j        }g }|s|rg }d}|D ]h}||k    rd}n||	k    rd}||k    r|du r|	                    d           3||k    r|du r|	                    d           S|	                    d	           ig }t          j        t          |          d
           D ]K\  }}t          |          }|d         d         }|d         d         dz   }|	                    |||f           Ld}d}|D ]6\  }}}t          |          dk    r|d                                         dz   nd}|dk    r=||         \  }}}|||
z  ||
z  }!} }t!          j        |                              dd                              d| |!z                                            }"t!          j        |                               ddd                              |d|!                                          }#t!          j        |!                              ddd                              || d                                          }$|	                    t!          j        |"|#|$g          |z              |dz  }}|dk    rY||         \  }}}||z  ||
z  ||
z  }!} }t-          |          D ]!}%t!          j        |%                              dd                              d| |!z                                            }"t!          j        |                               ddd                              dd|!                                          }#t!          j        |!                              ddd                              d| d                                          }$|	                    t!          j        |"|#|$g          |z              #|dz  }|dz  }||z
  }&|	                    t!          j        |&                              dd                              dd          |z              d}8n^t          |          }&|	                    t!          j        |&                              dd                              dd                     t!          j        |d                              dd          }'|'                                dz   t          |          z
                                  }(|'|(fS )NrA  rG  c                 6    g | ]}|                                 S r<   tolistr?   items     rB   rC   zTErnie4_5_VLMoeForConditionalGeneration.get_mrope_input_positions.<locals>.<listcomp>q       UUUD$++--UUUrD   c                 6    g | ]}|                                 S r<   r  r  s     rB   rC   zTErnie4_5_VLMoeForConditionalGeneration.get_mrope_input_positions.<locals>.<listcomp>r  r  rD   FTr  r  r  c                     | d         S )Nr-   r<   )r   s    rB   <lambda>zRErnie4_5_VLMoeForConditionalGeneration.get_mrope_input_positions.<locals>.<lambda>  s
    qt rD   r   rG   r-   r   rL   )r   gather_kwargsr  ro  ro  rt  ru  rp  rq  r  	itertoolsgroupbyr   r  lenr  r=   r   r   r  r  r	  rP   rI   rU   r  r  ))r}   r  r  r  rA  rG  r  image_token_idrt  ru  rp  rq  llm_pos_ids_listinput_token_typevideo_check_flgtokeninput_type_groupr   
group_iter
group_liststart_index	end_indexvideo_frame_nummm_data_idxmodality_type	start_idxend_idxst_idxr  r  r  
llm_grid_t
llm_grid_h
llm_grid_wt_indexh_indexw_indext_idxtext_lenllm_positionsmrope_position_deltas)                                            rB   get_mrope_input_positionsz@Ernie4_5_VLMoeForConditionalGeneration.get_mrope_input_positionsh  s   
 '4/0
 
 VUFJJ?OQS4T4TUUUUUFJJ?OQS4T4TUUUK	".(=&9%7&9!# i	V^ i	V*,#O% 4 4000&*OO000&+O^++/U2J2J$++G4444~--Ot4K4K$++G4444$++F3333;=#,#4*++^^$ $ G GZ "*--
(mA.&rN1-1	 ''k9(EFFFFOK5E J( J(1y'69:J6K6Ka6O6O$R(,,..22UV  !G++,[9GAq!.... -7
J Z00b!J$;<< 	  Z00aQ
B
;; 	  Z00aB
J;; 	  %++Wgw$?@@6I    1$KK"g--,[9GAq!//.... -7
J "'z!2!2  !L//!T"a[[#VB
Z(?@@$WYY	   "L44!T!R^^#VAr:66$WYY	   "L44!T!Q^^#VAz266$WYY	   )//!K'7(CDDvM     1$K#q(OO  '2H$++X..33Ar::AA!RHH6Q   '(OOUJ(Z <((H##EL$:$:$?$?2$F$F$M$MaQS$T$TUUU	"2:::BB1bII - 1 1 3 3a 7#l:K:K KQQSS222rD   r  c                     |                     dd           }|                     dd           }|d S |t          d||          S d S )Nr=  rA  )r   r=  rA  )popr<  )r}   r  r=  rA  s       rB   _parse_and_validate_image_inputzFErnie4_5_VLMoeForConditionalGeneration._parse_and_validate_image_input  sd     zz.$77$4d;;4#.#)-    $#rD   c                     |                     dd           }|                     dd           }|d S |t          d||          S d S )NrF  rG  )r   rF  rG  )r  rE  )r}   r  rF  rG  s       rB   _parse_and_validate_video_inputzFErnie4_5_VLMoeForConditionalGeneration._parse_and_validate_video_input  sf     %jj)>EE$4d;;&4*.*$7-    +*rD   image_input.c                 l   |d         }|j         dk    sJ |d                             | j        j                  }|                     ||          }|                     ||          }| j        j        }|                    d          |z  |z  }|                    |	                                          S )NrA  r   r=  r=  r  rG   )
r  r   rl  r   r  rm  r   r  rH   r  )r}   r  r  r=  r  image_embeds
merge_sizesizess           rB   _process_image_inputz;Ernie4_5_VLMoeForConditionalGeneration._process_image_input  s     /0}!!!!">2778I8OPP--% . 
 
 ++NHEE&9
b!!Z/:=!!%,,..111rD   video_inputc                    |d         }|j         dk    sJ |d                             | j        j                  }|                     ||          }|                     ||          }| j        j        }|                    d          | j        j	        z  |z  |z  }|
                    |                                          S )NrG  r   rF  r  rG   )r  r   rl  r   r  rm  r   r  ro  rq  rH   r  )r}   r  r  rF  video_featuresvideo_embedsr  r  s           rB   _process_video_inputz;Ernie4_5_VLMoeForConditionalGeneration._process_video_input  s     /0}!!!!)*?@EE#
 
 --,x . 
 
 ++NHEE&9
]]2$+"@@ 	 !!%,,..111rD   c                 t    i }|D ]2}|dv rd|vr | j         di ||d<   |dv rd|vr | j        di ||d<   3|S )N)r=  r  r  )rF  r  r  r<   )r  r  )r}   r  
modalities	input_keys       rB   %_parse_and_validate_multimodal_inputszLErnie4_5_VLMoeForConditionalGeneration._parse_and_validate_multimodal_inputs3  s    
   
	V 
	VI===J..'Kt'K'U'Uf'U'U
8$DDDJ..'Kt'K'U'Uf'U'U
8$rD   c                 
    | j         di |}|sd S d}|D ]l}|dk    r/|d         }|                     |          }|t          |          z  }|dk    r/|d         }|                     |          }|t          |          z  }m|S )Nr<   r  r  )r  r  r   r  )	r}   r  r  multimodal_embeddingsr%  r  image_embeddingsr  video_embeddingss	            rB   embed_multimodalz7Ernie4_5_VLMoeForConditionalGeneration.embed_multimodalF  s    ?T?II&II
 	4 ;= # 	A 	AH8##(2#'#<#<[#I#I %/?)@)@@%8##(2#'#<#<[#I#I %/?)@)@@%$$rD   F)is_multimodalhandle_oov_mm_tokenr  r  r  c                    |(t          |          dk    r|                     |           ||!t                                          |          S t                                          ||||          S )Nr   )r  r  r  )r  r  rp   embed_input_ids)r}   r  r  r  r  r~   s        rB   r  z6Ernie4_5_VLMoeForConditionalGeneration.embed_input_ids]  s     !,5J1K1Ka1O1O''	222 !(M,A77**9555ww&&"7' 3	 ' 
 
 	
rD   	positionsintermediate_tensorsinputs_embedsc                    ||||d}| j         | j         j        d         |j        d         k    r|j        d         | j         j        d         z
  }t          j        || j         j        d         f| j         j        | j         j                  }t          j        | j         |gd          | _         |                    d| j         i           d | _          | j        j	        di ||}	|	S )N)r  r  r  r  r   r-   )r   r   rL   rz  r<   )
rz  r   r=   r#  r   r   rU   updatern  model)
r}   r  r  r  r  r  forward_kwargspadding_lenr  r   s
             rB   r   z.Ernie4_5_VLMoeForConditionalGeneration.forwards  s    #"$8*	
 
 !-%+A.-2Ea2HHH+1!4t7M7STU7VVk $"8">q"AB0618  
 */D4JC3PVW)X)X)X&!!#68N"OPPP%)D"1+1 
 


 

 rD   r2  c                 X    t          |           }|                    || j                  S )N)mapper)r4   r8  hf_to_vllm_mapper)r}   r2  loaders      rB   r8  z3Ernie4_5_VLMoeForConditionalGeneration.load_weights  s+    "4((""743I"JJJrD   r  r   )NN)(r   r   r   packed_modules_mappingr5   r  classmethodr   r   ri  r   rq   r=   r   r  r  r  r  r   r   r  r  Ernie4_5_VLImageInputsr  Ernie4_5_VLVideoInputsr  r  r  r,  r  r/   r  r  r  r)   r   r   r.  r8  r   r   s   @rB   r\  r\    s       
 
 
 

 
 &1- 6H
 
 "4!3!0"5"5"2
 
  & F3 F3 F3: F F F [F37 37J 37 37T 37 37 37 37 37 37jA|A 
	A A A Al , 
	   ,
 
 
 
 
 
A33iA3 /0A3 
u|S 	!	A3 A3 A3 A3F	$	&    	$	&    212	u|S 	!2 2 2 2"212	u|S 	!2 2 2 2.f     &% %4H44O % % % %4 >B

 .2$)
 
 
<
  4d:

 |d*
 "
 

 
 
 
 
 
4 <@-1" "<" <" 2D8	"
 |d*" " " "HKHU33D-E$F K3s8 K K K K K K K KrD   r\  )rU  rV  rW  )tr   r  rO  collections.abcr   r   r   r   	functoolsr   typingr   r	   r
   r  r>  r=   torch.nnr   torch.nn.functional
functionalr  einopsr   transformersr   vllm.configr   vllm.config.multimodalr   r   vllm.distributedr   r   ru   vllm.loggerr   %vllm.model_executor.layers.activationr   9vllm.model_executor.layers.attention.mm_encoder_attentionr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   2vllm.model_executor.layers.rotary_embedding.commonr   -vllm.model_executor.model_loader.weight_utilsr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   r    vllm.multimodal.parser!   r"   r#   vllm.multimodal.processingr$   r%   r&   r'   r(   vllm.sequencer)   vllm.utils.tensor_schemar*   r+   #vllm.v1.attention.backends.registryr,   ernie45_vl_moer.   
interfacesr/   r0   r1   r2   r3   r4   r5   r6   visionr7   r   r  r   r[   r   r]   r   r   r   r   r   r<  r  rE  r  r   rM  rQ  rT  rg  ri  r  r  r;  register_processorr\  r<   rD   rB   <module>r     sq  0 I H      A A A A A A A A A A A A       * * * * * * * * * *                           % % % % % % " " " " " " F F F F F F F F + + + + + + 0 0 0 0 0 0 # # # # # # ; ; ; ; ; ;      9 8 8 8 8 8         
 G F F F F F      P O O O O O / / / / / /            W V V V V V V V V V              . - - - - - > > > > > > > > D D D D D D 5 5 5 5 5 5              B A A A A A A A A A ( ( ( ( ( (	X		
S 3    &o o o o ory o o od       >1 1 1 1 129 1 1 1h    	   2    RY   \ \ \ \ \ \ \ \DB B B B B, B B B 5 B B B B B, B B B 5 
+C%K + + + + + +/3; / / / / / /0C%K 0 0 0 0 0 0 !%% %%% % 	%
 % % % %Pu u u u ury u u up`
 `
 `
 `
 `
 2 `
 `
 `
Fx
 x
 x
 x
 x
$;<U$V x
 x
 x
vc c c c c$:;T$U c c cL ('!	".  
~K ~K ~K ~K ~KI!<]~K ~K 
~K ~K ~KrD   