
    .`i<                     @   d dl Z d dlmZmZmZ d dlmZ d dlmZm	Z	 d dl
Zd dlZd dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlm Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z.m/Z/ d dl0m1Z1 d dl2m3Z3m4Z4m5Z5m6Z6 d dl7m8Z8m9Z9m:Z: d dl;m<Z<m=Z=m>Z>m?Z?m@Z@ d dlAmBZB d dlCmDZDmEZE d dlFmGZG ddlHmIZI ddlJmKZKmLZLmMZM ddlNmOZO ddl mPZPmQZQmRZRmSZSmTZT dd lUmVZV 	 	 	 dId$eWd%eWd&eWd'eWd(eWf
d)ZX G d* d+e>          ZY G d, d-e<eY                   ZZ G d. d/e=eY                   Z[ G d0 d1ej\                  Z] G d2 d3eD          Z^ G d4 d5ej\                  Z_d6ej`        d7eWd8eWfd9Za G d: d;ej\                  Zb G d< d=ej\                  Zc G d> d?ej\                  Zd G d@ dAej\                  Ze G dB dCej\                  Zf G dD dEej\                  Zg e1jh        e[eYeZF           G dG dHej\        eMeL                      ZidS )J    N)IterableMappingSequence)partial)	AnnotatedLiteral	rearrange)BatchFeaturePretrainedConfig)GELUActivation)BaseModelOutputWithPooling)	torch_int)
VllmConfig)BaseDummyOptions)parallel_state)utils)MMEncoderAttention)Conv2dLayer)QKVParallelLinearRowParallelLinear)QuantizationConfig)ApplyRotaryEmb)default_weight_loadermaybe_remap_kv_scale_name)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFeatureSpecMultiModalFieldConfigMultiModalKwargsItems)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)TensorSchemaTensorShape)AttentionBackendEnum   )Ernie4_5ForCausalLM)MultiModalEmbeddingsSupportsMRoPESupportsMultiModal)	SiglipMLP)AutoWeightsLoaderPPMissingLayerWeightsMapperis_pp_missing_parametermaybe_prefix)get_vit_attn_backend      P heightwidthfactor
min_pixels
max_pixelsc                    | |k     rt          ||z  | z            }|} ||k     rt          | |z  |z            } |}t          | |          t          | |          z  dk    r1t          dt          | |          t          | |          z             t          | |z            |z  }t          ||z            |z  }||z  |k    rUt	          j        | |z  |z            }t	          j        | |z  |z            |z  }t	          j        ||z  |z            |z  }n]||z  |k     rTt	          j        || |z  z            }t	          j        | |z  |z            |z  }t	          j        ||z  |z            |z  }||fS )a)  Rescales the image so that the following conditions are met:

    1. Both dimensions (height and width) are divisible by 'factor'.

    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].

    3. The aspect ratio of the image is maintained as closely as possible.

       z4absolute aspect ratio must be smaller than 200, got )roundmaxmin
ValueErrormathsqrtfloorceil)r<   r=   r>   r?   r@   h_barw_barbetas           {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/paddleocr_vl.pysmart_resizerO   Y   s   " uv~/00v~~5011
65C...44=vu%%FE(:(::= =
 
 	
 &6/""V+E%&.!!F*Eu}z!!y&5.J677
6D=6122V;
54<&011F:		#	#yv~677	&4-&011F:	%$,/0069%<    c                   R    e Zd Zd ZdefdZdefdZd Zdededefd	Z	de
fd
ZdS )PaddleOCRVLProcessingInfoc                 4    | j                                         S N)ctxget_hf_configselfs    rN   rV   z'PaddleOCRVLProcessingInfo.get_hf_config   s    x%%'''rP   kwargsc                 &     | j         j        di |S N )rU   get_hf_processorrX   rY   s     rN   r]   z*PaddleOCRVLProcessingInfo.get_hf_processor   s    (tx(226222rP   c                 &     | j         di |j        S r[   )r]   image_processorr^   s     rN   get_image_processorz-PaddleOCRVLProcessingInfo.get_image_processor   s    $t$..v..>>rP   c                 
    dd iS )Nimager\   rW   s    rN   get_supported_mm_limitsz1PaddleOCRVLProcessingInfo.get_supported_mm_limits   s    rP   image_widthimage_heightreturnc                :   ||                                  }|                                 }|j        }|j        }|j        }t          ||||z  |j        |j                  \  }}	t          |	|          }
d}|
j	        |z  }|
j
        |z  }||z  |z  }||dz  z  }|S )N)r<   r=   r>   r?   r@   )r=   r<   r-      )ra   rV   vision_config
patch_sizespatial_merge_sizerO   r?   r@   r"   r<   r=   )rX   re   rf   r`   	hf_configrj   rk   
merge_sizeresized_heightresized_widthpreprocessed_sizegrid_tgrid_hgrid_wnum_patchesnum_image_tokenss                   rN   get_num_image_tokensz.PaddleOCRVLProcessingInfo.get_num_image_tokens   s     ""6688O&&((	!/"-
"5
(4
*&1&1)
 )
 )
% &M.QQQ")Z7"(J6vo.&:q=9rP   c                 ,   |                                  }|j        j        }|j        j        }||z  }|                                 j        |dz  z  }t          t          j        |                    }|||z  z  }||z  }t          ||z  ||z            S )Nri   )r<   r=   )
rV   rj   rl   rk   ra   r@   intrG   rH   r"   )rX   rm   rn   rk   r>   max_num_tokens	h_patches	w_patchess           rN   !get_image_size_with_most_featuresz;PaddleOCRVLProcessingInfo.get_image_size_with_most_features   s    &&((	 ,?
,7
j(1133>619M 	.1122	.944"i/		F 2)f:LMMMMrP   N)__name__
__module____qualname__rV   objectr]   ra   rd   ry   rw   r"   r}   r\   rP   rN   rR   rR      s        ( ( (3 3 3 3 3?F ? ? ? ?      	  
       @N9 N N N N N NrP   rR   c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	PaddleOCRVLDummyInputsBuilder	mm_countsrg   c                 x    |                     dd          }| j                                        }|j        }||z  S )Nrc   r   )getinfor]   image_token)rX   r   
num_images	processorr   s        rN   get_dummy_textz,PaddleOCRVLDummyInputsBuilder.get_dummy_text   s;    ]]7A..
I..00	+Z''rP   Nseq_len
mm_optionsc                     |                     dd          }| j                                        }|r|                     d          nd }d|                     |j        |j        ||          iS )Nrc   r   )r=   r<   r   	overrides)r   r   r}   _get_dummy_imagesr=   r<   )rX   r   r   r   r   max_image_sizeimage_overridess          rN   get_dummy_mm_dataz/PaddleOCRVLDummyInputsBuilder.get_dummy_mm_data   s}     ]]7A..
DDFF5?I*..111T T++$*%,%)	 ,  
 	
rP   rT   )
r~   r   r   r   strry   r   r   r   r   r\   rP   rN   r   r      s        (S(9 (c ( ( ( ( =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
rP   r   c            
           e Zd Zdedeeef         deeef         deeef         def
dZdedeeef         deeef         fd	Z	d
e
deeef         dedee         fdZdS )PaddleOCRVLMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrg   c           	         |r| j         j                             | j         j        di |t	          dd|i|t	          di ||          }|d                             d          }|d                             |                                          |d<   n'| j                                         } ||dd          }|S )	Ntextimage_grid_thwpixel_valuesTpt)add_special_tokensreturn_tensorsr\   )	r   rU   call_hf_processorr]   dictprodsplittolistget_tokenizer)rX   r   r   r   r   processed_outputsnum_patches_per_image	tokenizers           rN   _call_hf_processorz1PaddleOCRVLMultiModalProcessor._call_hf_processor   s      	 $	 ? ?*	*77Y77,,&,G,,//y/J//! !
 %66F$G$L$LR$P$P!0A.0Q0W0W%,,..1 1n-- 	//11I )	4! ! ! ! rP   	hf_inputshf_processor_mm_kwargsc                 l    t          t          j        d          t          j        d                    S )Nrc   )r   r   )r   r   batched)rX   r   r   s      rN   _get_mm_fields_configz4PaddleOCRVLMultiModalProcessor._get_mm_fields_config   s7    
 .6w??08AA
 
 
 	
rP   mm_itemsout_mm_kwargsc                        j         j        di |} j                                         }|j        dt          f fd}t          dgt          ||                    gS )Nitem_idxc                                          dt                    }|                    |           }j                            |j        |j        |          }g|z  S )Nrc   )re   rf   r`   )	get_itemsr!   get_image_sizer   rw   r=   r<   )r   r`   images
image_sizerv   image_token_idr   rX   s        rN   get_replacementzKPaddleOCRVLMultiModalProcessor._get_prompt_updates.<locals>.get_replacement  si    ''1DEEF..x88J#y==&,'. /  >     ##&666rP   rc   )r`   )modalitytargetreplacementr\   )r   ra   rV   r   ry   r'   r   )rX   r   r   r   r`   rm   r   r   s   ``     @rN   _get_prompt_updatesz2PaddleOCRVLMultiModalProcessor._get_prompt_updates  s     8$)7QQ:PQQI++--	"1
	7c 
	7 
	7 
	7 
	7 
	7 
	7 
	7 
	7  &'#O_UUU  
 	
rP   N)r~   r   r   r   r   r   r   r   r   r   r#   r    r   r(   r   r\   rP   rN   r   r      s        !! f%! 3;'	!
 CK(! 
! ! ! !0

 !(V 4
 
++	,	
 
 
 

%
 !(V 4
 -	

 
,	
 
 
 
 
 
rP   r   c                   b     e Zd Z	 d
dededef fdZdej        dej        dej        fd	Z xZ	S )	Projector text_configrj   prefixc                    t                                                       || _        || _        d| _        | j        j        | j        d         z  | j        d         z  | _        t          j                            | j        j        d          | _	        t          j
        | j        | j        d          | _        t                      | _        t          j
        | j        | j        j        d          | _        d S )N)ri   ri   r   r-   gh㈵>epsT)bias)super__init__r   rj   merge_kernel_sizehidden_sizetorchnn	LayerNormpre_normLinearlinear_1r   actlinear_2)rX   r   rj   r   	__class__s       rN   r   zProjector.__init__$  s     	&*!' *$Q'($Q'( 	 **4+=+Iu*UU	$"2D4D4PPP!##	d.:
 
 
rP   image_featuresr   rg   c           
      0   | j         \  }}t          |t          t          f          rt                      }t	          ||          D ]\  }}|                     |          }|\  }}	}
t          |d||	|z  ||
|z  |          }|                     |          }|                     |          }| 	                    |          }|
                    |           |S |j        d d         }|j        d         }|                    t          j        |          |          }|                     |                              d| j                  }|                     |          }|                     |          }| 	                    |          } |j        g |dR  S )Nz$(t h p1 w p2) d -> (t h w) (p1 p2 d))thp1wp2r   )r   
isinstancelisttuplezipr   r
   r   r   r   appendshapeviewnpr   r   )rX   r   r   m1m2processed_featuresimage_feature
image_gridr   r   r   hidden_statesdimsdims                 rN   forwardzProjector.forward<  s   
 'BntUm44 	&!%-0-P-P 9 9)z $m < <$1a )!:2g2g! ! ! !%m < < $ 7 7 $m < <"))-8888%%#CRC("2&',,RWT]]C@@n55::2t?OPPm44//m44!}!,4,,,,,rP   )r   )
r~   r   r   r   r   r   r   Tensorr   __classcell__r   s   @rN   r   r   #  s        
 	
 
%
 (
 	
 
 
 
 
 
0$-$- $- 
	$- $- $- $- $- $- $- $-rP   r   c            
           e Zd ZU ed         ed<   eej         eddddddh          f         ed<   eej         edd          f         ed<   d	S )
PaddleOCRImagePixelInputsr   typebnp   rk   )dynamic_dimsr   N)	r~   r   r   r   __annotations__r   r   r   r+   r\   rP   rN   r   r   c  s         
.
!!!!D#q,C5QQQ	S    D!	     rP   r   c                   
    e Zd Zdef fdZ	 ddej        dedededej        f
d	Z		 ddej        dededefdZ
	 	 	 ddej        dej        dz  deeeeef         eeeeef                  z           dz  dej        fdZ xZS )SiglipVisionEmbeddingsconfigc                    t                                                       || _        |j        | _        |j        | _        |j        | _        t          |j        | j        | j        | j        d          | _	        | j        | j        z  dz  | _
        | j
        | _        t                      | _        t                      | _        t          j        | j        | j                  | _        t          j        d| j                  | _        |                     dt)          j        | j                                      d          d           d S )	Nvalid)in_channelsout_channelskernel_sizestridepaddingri   i   position_ids)r-   r   F
persistent)r   r   r   r   	embed_dimr   rk   r   num_channelspatch_embeddingru   num_positionsr   cache_position_embeddingcache_position_countr   	Embeddingposition_embeddingpacking_position_embeddingregister_bufferr   arangeexpand)rX   r   r   s     rN   r   zSiglipVisionEmbeddings.__init__p  s"   + + +*+? 
  
  
 !Ot>1D!-(,%$(FF!"$,t/A4>"R"R*,,udn*M*M'L+,,33G<< 	 	
 	
 	
 	
 	
rP   F
embeddingsr<   r=   is_after_patchifyrg   c                    | j         j        j        d         }| j         j                            d          }|j        d         }|r|}|}	n|| j        z  }|| j        z  }	t          |dz            }
|                    d|
|
|          }|                    dddd          }t          j	        
                    |||	fdd	          }|                    dddd                              dd|          }|S )
Nr   r   g      ?r-   r   ri   bilinearF)sizemodealign_corners)r  weightr   	unsqueezerk   r   reshapepermuter   
functionalinterpolater   )rX   r  r<   r=   r  r
  patch_pos_embedr   
new_height	new_widthsqrt_num_positionss              rN   interpolate_pos_encodingz/SiglipVisionEmbeddings.interpolate_pos_encoding  s    /6<Q?18BB1EEr" 	1JII4?2J0I&}c'9::)11!#5s
 
 *11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNrP      r   r   	max_cachec                    ||f}|| j         v r"| j        |xx         dz  cc<   | j         |         S t          | j                   |k    rTt          | j        | j        j                  }| j                            |           | j                             |           |                     |||d          }d| j        |<   || j         |<   |S )Nr-   )keyT)r  r  lenrE   r   popr$  )rX   r  r   r   r&  gridmin_hit_gridr  s           rN   "fetch_position_embedding_lfu_cachez9SiglipVisionEmbeddings.fetch_position_embedding_lfu_cache  s     1v4000%d+++q0+++066t,--::)-1  L %)),777)--l;;;!:::q!TRR*+!$'.@%d+!!rP   Nr   r  r   c                    |                                 dk    r|                    d          }|                                 dk    rg|t          d          |j        \  }}}}}	| j        j        j        }
t          |d          }|                     |                    |
                    }|	                    d          
                    d          }|r|d}t                      }|D ]|}|\  }}}|||z  |z  z   }|||d d f         }|                     |||d	          
                    d                              |d
          }||z   }|                    |           |}}t          j        |d                              d          }n||                     |          z   }|S t          d|                                  d          )N   r      z9position_ids cannot be None when pixel_values.dim() is 5.zb l c h w -> (b l) c h wdtyper   Tr-   r   z$Unsupported pixel_values dimension: z. Expected 4 or 5.)r   r  rF   r   r	  r  r2  r
   toflattensqueezer   r$  repeatr   r   concatr  )rX   r   r  r   r$  
batch_sizesquence_lenchannelr<   r=   target_dtypepatch_embedsr  starttmp_embeddingsr   r   r   r   endimage_embeddingsr  s                         rN   r   zSiglipVisionEmbeddings.forward  s    ""'11!44L""# O   "/6<L$\3MNNL//l0S0STTL%--b1199"==J' XN,F!%"0    J(GAq!!a%!)+C'1%)QQQ,'?$556F1dSS 1 '
 (8:L'L$"))*:;;;EE"\.a@@@JJ1MM

'$*I*I,*W*WW
; $$&&; ; ;  rP   )F)r%  )NNF)r~   r   r   r   r   r   r   ry   boolr$  r-  FloatTensorr   r   r   r   r   s   @rN   r   r   o  s^       
/ 
 
 
 
 
 
B #(" "L" " 	"
  " 
" " " "J JL" ","+."36"CF" " " "0 -1!&2 2'2 lT)2 U3S=1DsC}9M4NNO
	2 
2 2 2 2 2 2 2 2rP   r   local_tensorr   tp_sizec                     ddl m}  fdt                    D             }|                    | t	          j                    j                   fd|D             }d t          | D             }t          j	        |d          }|S )	zEAll-gather the input tensor interleavely across model parallel group.r   Nc                 8    g | ]}t          j                  S r\   )r   
zeros_like).0_rE  s     rN   
<listcomp>z)all_gather_interleave.<locals>.<listcomp>  s$    OOO1(66OOOrP   )groupc                 B    g | ]}t          j        |z  d           S )r   )r   r   )rJ  tensorr   rF  s     rN   rL  z)all_gather_interleave.<locals>.<listcomp>  s9       <BFK72B77  rP   c                     g | ]	}|D ]}|
S r\   r\   )rJ  pairrO  s      rN   rL  z)all_gather_interleave.<locals>.<listcomp>  s9       d <B   rP   r   r4  )
torch.distributeddistributedrange
all_gatherr   get_tp_groupdevice_groupr   r   cat)rE  r   rF  distgathered_tensorsgathered_tensors_splitordered_tensorsresult_tensors   ```     rN   all_gather_interleaver^    s    $$$$$$OOOOgOOOOO,n.I.K.K.X        FV    67  O Io2666MrP   c                        e Zd ZdZddddededededz  d	ed
df fdZdej	        d
e
ej	        df         fdZdej	        dej	        dej	        dz  dej	        dz  d
ej	        f
dZ xZS )SiglipAttentionz=SigLIP vision attention adapted from Qwen2.5-VisionAttention.Nr   quant_configr   r  	num_headsprojection_sizerb  r   rg   c          
         t                                                       t          j                    | _        t          j                    | _        t          j        ||          | _	        t          j        || j                  | _
        t          || j	        ||d|| d          | _        t          |||| d          | _        t          | j
        | j	        | j	        dz  | d          | _        t#          dd	          | _        d S )
NTz	.qkv_proj)r   	head_sizetotal_num_headstotal_num_kv_headsr   rb  r   z	.out_proj)
input_sizeoutput_sizerb  r   g      z.attn)rc  rf  scaler   )enforce_enableenable_fp32_compute)r   r   r   $get_tensor_model_parallel_world_sizerF  get_tensor_model_parallel_ranktp_rank
dist_utilsdividehidden_size_per_attention_head!num_attention_heads_per_partitionr   qkv_projr   out_projr   attnr   apply_rotary_emb)rX   r  rc  rd  rb  r   r   s         rN   r   zSiglipAttention.__init__  s9    	%JLL%DFF.8.?Y/
 /
+ 2<1Bt|2
 2
. *!9%(%'''
 
 
 *&!%'''	
 
 
 '<95t;###	
 
 
	 !/ $!
 !
 !
rP   qkv.c                   	 |j         \  }}}| j        dk    r t          || j        j        | j                  }|                    dd          \  }}}| j        dk    rbt          t          j        | j                  } ||          | j	                 } ||          | j	                 } ||          | j	                 }||| j
        | j        f		fd|||fD             \  }}}|||fS )Nr-   r   ri   r4  )num_partitionsc              3   ,   K   | ]} |j          V  d S rT   )r   )rJ  x	new_shapes     rN   	<genexpr>z,SiglipAttention.split_qkv.<locals>.<genexpr>S  s,      99!6169%999999rP   )r   rF  r^  ru  r   chunkr   rq  split_tensor_along_last_dimrp  rt  rs  )
rX   ry  r   bsrK  qkvsplitterr~  s
            @rN   	split_qkvzSiglipAttention.split_qkv>  s   Q<!'T]-FUUC))A1)%%1a<!6t|  H DL)ADL)ADL)A 2/	
	 :9991ay9991a!QwrP   r   
cu_seqlensrotary_pos_emb
max_seqlenc                &   |j         \  }}}t          |d          }|                     |          \  }}|                     |          \  }}	}
d ||	|
fD             \  }}	}
|mt	          j        ||	gd          }|                     ||                                |                                          }t	          j	        |dd          \  }}	| 
                    ||	|
||          }t          |d          }|                     |          \  }}|S )Nzb s d -> s b dc              3   6   K   | ]}t          |d           V  dS )zs b h d -> b s h dNr	   )rJ  r   s     rN   r  z*SiglipAttention.forward.<locals>.<genexpr>c  s-      II!9Q 455IIIIIIrP   r   r4  ri   )queryr(  valuer  r  zb s h d -> b s (h d))r   r
   ru  r  r   rX  rx  cossinr  rw  rv  )rX   r   r  r  r  r:  rK  r}  r  r  r  	qk_concat
qk_rotatedcontext_layeroutputs                  rN   r   zSiglipAttention.forwardV  s1    ).
Aqm%566}}Q1..##1aII1ayIII1a%	1a&a000I..""$$""$$ J
 ;z1!444DAq		!! " 
 
 "-1GHHMM-00	rP   )r~   r   r   __doc__ry   r   r   r   r   r   r   r  r   r   r   s   @rN   r`  r`    s       GG 37,
 ,
 ,
 ,
 	,

 ,
 )4/,
 ,
 
,
 ,
 ,
 ,
 ,
 ,
\U\ eEL#4E.F    0"|" L	"
 t+" L4'" 
" " " " " " " "rP   r`  c                   N     e Zd Zd
dededdf fdZd Zdedej        fd	Z	 xZ
S )SigLIPRotaryEmbedding     @r   thetarg   Nc                     t                                                       || _        || _        |                                  d S rT   )r   r   r   r  	rope_init)rX   r   r  r   s      rN   r   zSigLIPRotaryEmbedding.__init__|  s=    
rP   c                     d| j         t          j        d| j        dt          j                  | j        z  z  z  }|                     d|d           d S )N      ?r   ri   r1  inv_freqFr  )r  r   r  r   floatr  )rX   r  s     rN   r  zSigLIPRotaryEmbedding.rope_init  sW    J5<48QekJJJTXUV
 	ZeDDDDDrP   seqlenc                     t          j        || j        j        | j        j                  }t          j        || j                  }|S )N)devicer2  )r   r  r  r  r2  outer)rX   r  seqfreqss       rN   r   zSigLIPRotaryEmbedding.forward  sE    l='-%
 
 

 C//rP   )r  )r~   r   r   ry   r  r   r  r   r   r   r   r   s   @rN   r  r  {  s         C  D      E E Ec el        rP   r  c            
            e Zd Z	 	 ddededz  def fdZdej        dej        d	ej        dz  d
ej        dz  dej        f
dZ	 xZ
S )SiglipEncoderLayerNr   r   rb  r   c                 ~   t                                                       |j        | _        t	          j        | j        |j                  | _        t          |j        |j	        |j        || d          | _
        t	          j        | j        |j                  | _        t          ||| d          | _        d S )Nr   z
.self_attn)r  rc  rd  rb  r   z.mlpra  )r   r   r   r  r   r   layer_norm_epslayer_norm1r`  num_attention_heads	self_attnlayer_norm2r2   mlprX   r   rb  r   r   s       rN   r   zSiglipEncoderLayer.__init__  s     	+<F<QRRR((0".%(((
 
 
 <F<QRRR%???
 
 
rP   r   r  r  r  rg   c                    |}|                      |          }|                     ||||          }||z   }|}|                     |          }|                     |          }||z   }|S )N)r   r  r  r  )r  r  r  r  )rX   r   r  r  r  residuals         rN   r   zSiglipEncoderLayer.forward  s     !((77'!)!	 ' 
 
 !=0 ((77// =0rP   Nr   )r~   r   r   r   r   r   r   r   r   r   r   r   s   @rN   r  r    s         37	
 
 
 )4/
 	
 
 
 
 
 
.| L	
 t+ L4' 
       rP   r  c                        e Zd Z	 	 ddededz  def fdZed             Z	 	 	 	 dde	j
        dz  d	eeeeef         eeeeef                  z           dz  d
e	j
        dz  de	j
        dz  de	j
        f
dZ xZS )SiglipEncoderNr   r   rb  r   c                    t                                                       | _        j        }j        }||z  }t          |t          j                              | _        | j        t          j
        t          j        t          j        hvrt          d| j         d          t          j        fdt!          j                  D                       | _        t'          |dz            | _        d S )N)rf  r2  zPaddleOCR-VL does not support z backend now.c           	      >    g | ]}t           d |           S )z.layers.ra  )r  )rJ  	layer_idxr   r   rb  s     rN   rL  z*SiglipEncoder.__init__.<locals>.<listcomp>  sQ         #!-$99i99    rP   ri   )r   r   r   r   r  r8   r   get_default_dtypeattn_backendr,   
FLASH_ATTN
TORCH_SDPAROCM_AITER_FARuntimeErrorr   
ModuleListrT  num_hidden_layerslayersr  r  )rX   r   rb  r   r  rc  head_dimr   s    ```   rN   r   zSiglipEncoder.__init__  s!    	&	.		)0)++
 
 
  + + .%
 
 

 Q1BQQQ   m      "'v'?!@!@  	
 	
 4HMBBrP   c                     t                      }| D ]B}t          |t                     r|                    |           -|                    |           C|S rT   )r   r   extendr   )r   tmp_image_grid_thwr   s      rN   flatten_listzSiglipEncoder.flatten_list  sa    !VV( 	6 	6J*d++ 6"))*5555"))*5555!!rP   r  r   height_position_idswidth_position_idsrg   c                    |j         }|}|                     |          }||t                      }	t                      }
|D ]\\  }}}t          j        ||z  |z  |          ||z  z  }||z  }||z  }|	                    |           |
                    |           ]t          j        |
d          }t          j        |	d          }t          j        ||gd          }|                                dz   }| 	                    |          }||         
                    d          }|t          d          t          |t          j                  s"t          j        |t          j        |          }n|                    |          }d }| j        t$          j        t$          j        hv r'|dd          |d d         z
                                  }|}| j        D ]} |||||          }|S )	N)r  r   r4  r   r-   z,cu_seqlens cannot be None for SiglipEncoder.)r2  r  )r  r  r  )r  r  r   r   r  r   r9  stackrD   r  r6  rF   r   r   rO  int32r5  r  r,   r  r  r  )rX   inputs_embedsr  r   r  r  r  r   flatten_image_grid_thw
split_hids
split_widsr   r   r   
image_pidssample_hidssample_widspidsmax_grid_sizerope_emb_max_gridr  r  encoder_layers                          rN   r   zSiglipEncoder.forward  s+    %%!%!2!2>!B!B%)<)DJJ1 / /1a"\!a%!)FCCCq1uM
(Ao(1n!!+...!!+....!&ja!@!@!@"',zq"A"A"A{ "45
 
 
 

Q //>>*4088;;KLLL*el33 	6jFSSSJJ#f55J
 + .!
 
 
 %QRR.:crc?:??AAJ%![ 	 	M)M%-%	  MM rP   r  )NNNN)r~   r   r   r   r   r   r   staticmethodr  r   r   r   r   ry   r   r   r   s   @rN   r  r    s.        37	"C "C "C )4/"C 	"C "C "C "C "C "CH " " \" +/37268 8 L4'8 U3S=1DsC}9M4NNO
	8 #\D08 "L4/8 
8 8 8 8 8 8 8 8rP   r  c                        e Zd Z	 	 ddededz  def fdZ	 	 	 	 	 	 ddej        d	e	dz  d
ej        dz  dej        dz  dej        dz  dej        dz  dej        dz  dej        fdZ
 xZS )SiglipVisionTransformerNr   r   rb  r   c                     t                                                       || _        |j        }t	          |          | _        t          ||| d          | _        t          j	        ||j
                  | _        d S )Nz.encoderra  r   )r   r   r   r   r   r  r  encoderr   r   r  post_layernorm)rX   r   rb  r   r  r   s        rN   r   z SiglipVisionTransformer.__init__2  s     	&	088$%&&&
 
 

 !l9&:OPPPrP   Fr   r$  r  r  r  r  r   rg   c                     |                      ||||          }|                     |||||          }	|                     |	          }	|	S )N)r$  r  r   )r  r  r   r  r  )r  r  r  )
rX   r   r$  r  r  r  r  r   r   last_hidden_states
             rN   r   zSiglipVisionTransformer.forwardD  so     %=%)	 ( 
 
 !LL'!) 31 ) 
 
 !//0ABB  rP   r  )FNNNNN)r~   r   r   r   r   r   r   r   r   rC  r   r   r   s   @rN   r  r  1  s        37	Q Q Q )4/Q 	Q Q Q Q Q Q* 16,03726*..2! !l! #'+! lT)	!
 #\D0! "L4/! L4'! t+! 
! ! ! ! ! ! ! !rP   r  c                   f    e Zd Z	 	 ddedz  def fdZedej        fd            Zedej	        fd            Z	de
j        fd	Z	 	 	 	 ddedej        dz  deeeeef         eeeeef                  z           dz  dej        dz  def
dZdeeeej        f                  dee         fdZ xZS )SiglipVisionModelNr   rb  r   c                     t                                                       t          ||| d          | _        || _        d S )Nz.vision_modelra  )r   r   r  vision_modelrb  r  s       rN   r   zSiglipVisionModel.__init__b  sT     	3%+++
 
 

 )rP   rg   c                 8    | j         j        j        j        j        S rT   )r  r  r	  r  r2  rW   s    rN   r2  zSiglipVisionModel.dtypeq  s     +;BHHrP   c                 8    | j         j        j        j        j        S rT   )r  r  r	  r  r  rW   s    rN   r  zSiglipVisionModel.deviceu  s     +;BIIrP   c                 $    | j         j        j        S rT   )r  r  r	  rW   s    rN   get_input_embeddingsz&SiglipVisionModel.get_input_embeddingsy  s     +;;rP   Fr$  r  r   r  c                 6    |                      |||||          S )N)r   r$  r  r   r  )r  )rX   r   r$  r  r   r  s         rN   r   zSiglipVisionModel.forward|  s0       %%=%)! ! 
 
 	
rP   weightsc                    g d}t          |                     d                    }t                      }|D ]\  }}d|v rd|v sd|v rd|v sd|v r| j        ~| j                            |          x}rb||         }t          |d	t                    }	|                                d
k    r|n|d
         } |	||           |                    |           |D ]i\  }
}}||vr|	                    ||
          }|
                    d          r||vr;t          ||           rL||         }|j        }	 |	|||            nk|
                    d          r||vr)t          ||          }|=t          ||           rO||         }t          |d	t                    }	 |	||           |                    |           |S )N))ru  q_projr  )ru  k_projr  )ru  v_projr  F)remove_duplicatezrotary_emb.inv_freqzhead.attentionzhead.layernormzhead.mlpz
head.probeweight_loaderr   z.bias)r   named_parameterssetrb  get_cache_scalegetattrr   r   addreplaceendswithr6   r  r   )rX   r  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
scale_nameparamr  
param_nameweight_nameshard_ids                rN   load_weightszSiglipVisionModel.load_weights  so   "
 "
 "

 400%0HHII"%%%#* 5	$ 5	$D-$,,4''+;t+C+CT!!\T%9%9 ,"/??EEE
 - $J/ '#)! ! &3%6%6%8%8A%=%=MM=QRCS  e]333!!*---
 (	4 4 d**||K<<==)) d+.E.E*466 #D) % 3e]H=====)) d+.E.E0{CC<*466 #D) '#)! !
 e]333d####rP   r  )FNNN)r~   r   r   r   r   r   propertyr   r2  r  r   Moduler  rC  r   r   r   ry   r   r   r   r  r  r   r   s   @rN   r  r  a  s        37	) ) )4/) 	) ) ) ) ) ) Iu{ I I I XI J J J J XJ<bi < < < < */,0*.
 
 #'
 lT)	

 U3S=1DsC}9M4NNO

 L4'
 
$
 
 
 
">HU33D-E$F >3s8 > > > > > > > >rP   r  )r   dummy_inputsc            
           e Zd Z eddd          Zededededz  fd	            Zd
dde	def fdZ
dej        dej        dz  fdZdee         dee         deej        ef         fdZdededz  fdZ	 	 d#dej        dej        dedz  dej        dz  fdZdej        dej        dej        fdZdedefdZdefd Zd!eeeej        f                  dee         fd"Z xZS )$#PaddleOCRVLForConditionalGenerationzlanguage_model.model.zlanguage_model.lm_head.)zmodel.zlm_head.)orig_to_new_prefixr   irg   Nc                 N    |                     d          rdS t          d          )Nrc   z1<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>z Only image modality is supported)
startswithrF   )clsr   r  s      rN   get_placeholder_strz7PaddleOCRVLForConditionalGeneration.get_placeholder_str  s.    w'' 	GFF;<<<rP   r   )r   vllm_configr   c          	         t                                                       |j        j        }|j        }|| _        |                     |d          5  t          |j        |t          |d                    | _
        t          ||j                  | _        d d d            n# 1 swxY w Y   |                     |          5  t          |t          |d                    | _        | j        j        j        D ](}t%          |t&                    sd|j        j        _        )	 d d d            n# 1 swxY w Y   | j        j        | _        d S )Nrc   visual)r   rb  r   language_model)r  r   T)r   r   model_configrm   rb  r   _mark_tower_modelr  rj   r7   r  r   mlp_AR_mark_language_modelr.   r  modelr  r   r4   r  
rotary_embis_neox_stylemake_empty_intermediate_tensors)rX   r  r   r   rb  layerr   s         rN   r   z,PaddleOCRVLForConditionalGeneration.__init__  s   )3"/##K99 	B 	B++)#FH55  DK
 $FF,@AADK	B 	B 	B 	B 	B 	B 	B 	B 	B 	B 	B 	B 	B 	B 	B &&{33 	D 	D"5'#F,<==# # #D
 ,29 D D!%88 D?CEO.<D	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D ? 	,,,s&   AB##B'*B'AD//D36D3r   c                 6    | j                             |          S rT   )r  compute_logits)rX   r   s     rN   r  z2PaddleOCRVLForConditionalGeneration.compute_logits  s     "11-@@@rP   input_tokensmm_featuresc                 8
   t          j        |h d          }d |                    dg           D             }d |                    dg           D             }|                    dg           }| j        }|j        }|j        }	|j        }
|j        j        }t          |j        dd          }t          j        |          }t          j        ||
k                                  d	          }||d	z            }||k                                    }||	k                                    }g }d
}||}}d\  }}t          ||z             D ]}d}|d
k    r:	 |                    ||          }n4# t"          $ r t%          |          d	z   }Y nw xY wt%          |          d	z   }|d
k    r:	 |                    |	|          }n4# t"          $ r t%          |          d	z   }Y nw xY wt%          |          d	z   }||k     r||         \  }}}|d	z  }|d	z  }|}n$||         \  }}}d}|r||         }|d	z  }|d	z  }|}|||z  ||z  }"}!} ||z
  }#t%          |          d
k    r|d                                         d	z   nd
}$|                    t          j        |#                              d	d                              dd          |$z              t          j        |                               dd	                              d|!|"z            |z  |z                                                                  }%t          j        |!                              d	dd	                              | d|"                                          }&t          j        |"                              d	d	d                              | |!d                                          }'|                    t          j        |%|&|'g          |#z   |$z              || |!z  |"z  z   }|t%          |          k     rt%          |          d
k    r|d                                         d	z   nd
}$t%          |          |z
  }#|                    t          j        |#                              d	d                              dd          |$z              t          j        |d	                              dd          }(|(                                d	z   t%          |          z
                                  })|(|)fS )N>   r   video_grid_thwsecond_per_grid_tsc                 6    g | ]}|                                 S r\   r   rJ  items     rN   rL  zQPaddleOCRVLForConditionalGeneration.get_mrope_input_positions.<locals>.<listcomp>       UUUD$++--UUUrP   r   c                 6    g | ]}|                                 S r\   r   r!  s     rN   rL  zQPaddleOCRVLForConditionalGeneration.get_mrope_input_positions.<locals>.<listcomp>  r#  rP   r  r  tokens_per_secondr  r-   r   )r   r   g        r   r   r4  )r   gather_kwargsr   r   r   video_token_idvision_start_token_idrj   rl   r  r   rO  argwherer7  sumrT  indexrF   r)  rD   r   r  r   r  longr6  r  rX  r  r"  )*rX   r  r  rY   r   r  r  rm   r   r'  r(  rl   r%  input_tokens_tensorvision_start_indicesvision_tokens
image_nums
video_numsllm_pos_ids_liststremain_imagesremain_videosimage_indexvideo_indexrK  video_second_per_grid_ted_imageed_videor   r   r   ed
llm_grid_t
llm_grid_h
llm_grid_wtext_lenst_idxt_indexh_indexw_indexllm_positionsmrope_position_deltas*                                             rN   get_mrope_input_positionsz=PaddleOCRVLForConditionalGeneration.get_mrope_input_positions  s   
 '4FFF
 
 VUFJJ?OQS4T4TUUUUUFJJ?OQS4T4TUUU#ZZ(<bAAK	"1"1 ) ?&4G#I$;=PRUVV#l<88$~#88 
  

'!** 	 ,,@1,DE#~5::<<
#~5::<<
!#'1:}#' [zJ.// E	; E	;A&)#q  5+11."EEHH! 5 5 5"<0014HHH5 |,,q0q  5+11."EEHH! 5 5 5"<0014HHH5 |,,q0(""(51aq "(51a*-'% N.@.M+q " '''' %/
J
 BwH7:;K7L7Lq7P7P%b)--//!33VWF##X&&++Ar2299!R@@6I   L,,T"a[[VB
Z 788-. (	(   Z((aQ
B
33	  Z((aB
J33	  ##Wgw7888CfL   j:-
::BBL!!!!7:;K7L7Lq7P7P%b)--//!33VWF<((2-H##X&&++Ar2299!R@@6I   	"2:::BB1bII - 1 1 3 3a 7#l:K:K KQQSS222s$   E((FF#F::GGrY   c                     |                     dd           }|                     dd           }|d S t          d||          S )Nr   r   )r   r   r   )r*  r   )rX   rY   r   r   s       rN   _parse_and_validate_image_inputzCPaddleOCRVLForConditionalGeneration._parse_and_validate_image_inputy  sV     zz.$77$4d;;4(%)
 
 
 	
rP   	input_ids	positionsintermediate_tensorsr  c                 :    |d }|                      ||||          S rT   )r  )rX   rI  rJ  rK  r  rY   s         rN   r   z+PaddleOCRVLForConditionalGeneration.forward  s1      + M""y"6
 
 	
rP   r   r   c                    |                     | j        j                  }t                      }t                      }dg}t	          |                                          }t          j        |          }|                    |           t          j
        |          t          j        |dd                    z  }|                    |           |                    |d         |z              t          j        |d                              |j                  }t          j        |t          j                                      |j                  }|                     |||d|          }	|	S )Nr   r-   r   r4  r1  T)r   r   r  r$  r  )r   r  r2  r   r   r   r   r   r   r   r  r9  r5  r  rO  r  )
rX   r   r   siglip_position_idsimage_grid_hwsr  	thw_tuplenumelimage_position_idsvision_outputss
             rN   encode_imagez0PaddleOCRVLForConditionalGeneration.encode_image  sO    $(():;;"ffS
.//1122		""i((("\%00279QRR=3I3II""#5666*R.50111#l+>AFFFII
 
 \*EK@@@CCLDWXX
%),%)! % 
 
 rP   image_inputc                      |j         }|j        }t           fdt          ||          D                       }                     ||          }|S )Nc              3   p   K   | ]0\  }}                     ||                              d           V  1dS )r   N)rT  r7  )rJ  pixelr+  rX   s      rN   r  zKPaddleOCRVLForConditionalGeneration._process_image_input.<locals>.<genexpr>  sY       
 
t eT**22155
 
 
 
 
 
rP   )r   r   r   r   r  )rX   rU  r   r   rS  image_embedss   `     rN   _process_image_inputz8PaddleOCRVLForConditionalGeneration._process_image_input  so     #/$3 
 
 
 
"<@@
 
 
 
 
 {{>>BBrP   c                 z     | j         di |}|dS d}|                     |          }|t          |          z  }|S r[   )rH  rZ  r   )rX   rY   rU  multimodal_embeddingsrY  s        rN   embed_multimodalz4PaddleOCRVLForConditionalGeneration.embed_multimodal  sW    :d:DDVDD2:<00==|!4!44$$rP   r  c                 \    t          |           }|                    || j                  }|S )N)mapper)r3   r  hf_to_vllm_mapper)rX   r  loaderautoloaded_weightss       rN   r  z0PaddleOCRVLForConditionalGeneration.load_weights  s1    "4((#00AW0XX!!rP   )NN)r~   r   r   r5   r`  classmethodr   ry   r
  r   r   r   r   r  r   r   r   rF  r   r   rH  r)   r   rT  r/   rZ  r]  r   r  r  r   r   s   @rN   r  r    sk        &-1
 
   =3 =3 =3: = = = [= BD 
 
 
z 
3 
 
 
 
 
 
:A|A 
	A A A Ar33ir3 /0r3 
u|S 	!	r3 r3 r3 r3h

	"T	)
 
 
 
& <@-1
 
<
 <
 2D8	

 |d*
 
 
 
!L:?,	   :
4
	
 
 
 
	%,@ 	% 	% 	% 	%"HU33D-E$F "3s8 " " " " " " " "rP   r  )r9   r:   r;   )jrG   collections.abcr   r   r   	functoolsr   typingr   r   numpyr   r   torch.nnr   einopsr
   transformersr   r   transformers.activationsr   transformers.modeling_outputsr   transformers.utilsr   vllm.configr   vllm.config.multimodalr   vllm.distributedr   r   rq  9vllm.model_executor.layers.attention.mm_encoder_attentionr   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   'vllm.model_executor.layers.quantizationr   2vllm.model_executor.layers.rotary_embedding.commonr   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.multimodalr   vllm.multimodal.inputsr   r   r   r    vllm.multimodal.parser!   r"   r#   vllm.multimodal.processingr$   r%   r&   r'   r(   vllm.sequencer)   vllm.utils.tensor_schemar*   r+   #vllm.v1.attention.backends.registryr,   ernie45r.   
interfacesr/   r0   r1   siglipr2   r3   r4   r5   r6   r7   visionr8   ry   rO   rR   r   r   r  r   r   r   r   r^  r`  r  r  r  r  r  register_processorr  r\   rP   rN   <module>r     s  "  7 7 7 7 7 7 7 7 7 7       % % % % % % % %                  7 7 7 7 7 7 7 7 3 3 3 3 3 3      ) ( ( ( ( ( " " " " " " 3 3 3 3 3 3 + + + + + + 0 0 0 0 0 0      8 7 7 7 7 7        G F F F F F             0 / / / / /                    
              . - - - - - > > > > > > > > D D D D D D ( ( ( ( ( ( O O O O O O O O O O                    ) ( ( ( ( ( #$( ((( ( 	(
 ( ( ( (V:N :N :N :N :N 2 :N :N :Nz
 
 
 
 
$:;T$U 
 
 
<A
 A
 A
 A
 A
56A
 A
 A
H=- =- =- =- =-	 =- =- =-@	 	 	 	 	 	 	 	H H H H HRY H H HV 3 QT    &k k k k kbi k k k\    BI   .2 2 2 2 2 2 2 2jg g g g gBI g g gT-! -! -! -! -!bi -! -! -!`j j j j j	 j j jZ ('"	".  
{" {" {" {" {"")5G {" {" 
{" {" {"rP   