
    .`iJ                     2   d Z ddlmZ ddlmZ ddlZddlmZmZ ddlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z" ddl#m$Z$  ee	j%        d          Z&d Z'd)dZ( G d de	j)                  Z* G d de	j)                  Z+ G d de	j)                  Z, G d d e	j)                  Z- G d! d"e	j)                  Z. G d# d$e	j)                  Z/ G d% d&e	j)                  Z0 G d' d(e          Z1dS )*zIThis is basically a copy from perception_models/core/vision_encoder/pe.py    )Callable)partialN)	rearrangerepeat)nn)
functional)
VllmConfig)$get_tensor_model_parallel_world_size)
get_act_fn)MMEncoderAttention)Conv2dLayer)ColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig   )Step3VLForConditionalGeneration)WeightsMapperinit_vllm_registered_modelmaybe_prefix)run_dp_sharded_vision_modelh㈵>)epsc                     t          | dd          } |                     d          \  }}t          j        | |fd          } t          | d          S )Nz... (d r) -> ... d r   rdimz... d r -> ... (d r))r   unbindtorchstack)xx1x2s      v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/step_vl.pyrotate_halfr(       sY    !+q111AXX"XFBbS"I2&&&AQ.///          ?c                    |j         }|j        dk    r|j        |         }| | d          } | j        d         }||z   }||j        d         k    s)J d                    |j        d         |                      |dd |f         |d||f         |d|d f         }
}}	||                                 z  |z  t          |          |                                 z  |z  z   }t          j        |	||
fd          }|	                    |          S )N   r   zPfeature dimension {} is not of sufficient size to rotate in all the positions {}.r   )
dtypendimshapeformatcosr(   sinr"   cattype)freqststart_indexscaleseq_dimr.   seq_lenrot_dim	end_indext_leftt_rightouts               r'   apply_rotary_embrA   '   s   GEv{{''"whii k"oGg%Iagbk!!!	vagbk733 "!! 	
#||
	#{9$
$%	#yzz/ AF
 
UYY[[5	 [^^eiikk%AE%IJA
)VQ(b
1
1
1C88E??r)   c            	            e Zd Z	 	 	 	 	 ddededed	ef fd
Zdeez  dedej        fdZ	dej        dej        fdZ
dej        fdZdej        dej        deeef         fdZ xZS )PerceptionEncoderRope2DF'  
   r   r*   r    max_grid_heightmax_grid_widthuse_cls_tokenc	                    t                                                       || _        || _        || _        || _        ||||dz
  z  z  z  | _        || _        || _        | 	                                }	| 
                    d|	d           d S )Nr   freqs_cacheF)
persistent)super__init__r    rF   rG   rH   thetamax_freq	num_freqs_compute_2d_freqsregister_buffer)selfr    rF   rG   rH   rN   rO   rP   theta_rescale_factorcache	__class__s             r'   rM   z PerceptionEncoderRope2D.__init__B   s     	.,*1cS1WoFF
 "&&((]EeDDDDDr)   basereturnc                 ~    d|t          j        d|d          d |dz                                           |z  z  z  }|S )Nr*   r   r   )r"   arangefloat)rS   rW   r    r6   s       r'   _compute_inv_freqz)PerceptionEncoderRope2D._compute_inv_freqX   sC    tQQ 7 73!8 E K K M MPS STUr)   r7   inv_freqc                     t          j        d|                    |j                  |          }t	          |dd          }|S )Nz..., f -> ... fz... n -> ... (n r)r   r   )r"   einsumr5   r.   r   )rS   r7   r]   r6   s       r'   _compute_freqsz&PerceptionEncoderRope2D._compute_freqs\   s>    .x~0F0FQQu2a888r)   c                    t          j        | j        t           j                  }t          j        | j        t           j                  }| j        r
|dz  }|dz  }|                     | j        | j        dz            }| 	                    ||          d d d f         
                    | j        | j        d          }| 	                    ||          d d d f         
                    | j        | j        d          }t          j        ||gd                              | j        | j        z  d          }| j        r6t          j        t          j        d|j        d                   |gd          }|d         }|S )N)r.   r   r   r   r   r   )NN.)r"   rZ   rF   r[   rG   rH   r\   rN   r    r`   expandr4   reshapezerosr0   )rS   grid_h_rangegrid_w_ranger]   freqs_hfreqs_wr6   s          r'   rQ   z)PerceptionEncoderRope2D._compute_2d_freqsa   ss   |D$8LLL|D$7u{KKK 	ALAL))$*dh!mDD%%lH==aaagFMM $"5r
 
 %%lH==dAAAgFMM $"5r
 
 	7G,"555== 4#66
 
  	OIu{1ek"o>>FANNNEo&r)   qkgrid_hwc                    |d         | j         k    s|d         | j        k    rt          j        |d         |j                                      dd          }t          j        |d         |j                                      dd          }|| j        z  |z                       d                              t          j                  }| j	        rSt          j
        t          j        d|j                  |dz   gd          }|                    t          j                  }| j                            d|          }n| j        }t          ||          }t          ||          }||fS )Nr   r   )devicer   r   r   )rF   rG   r"   rZ   rm   viewrc   tolongrH   r4   rd   rJ   index_selectrA   )rS   ri   rj   rk   rowscols	positionsr6   s           r'   forwardzPerceptionEncoderRope2D.forwardv   sG   1:---t?R1R1R<
18<<<AA"aHHD<
18<<<AA!RHHD 33d:CCBGGJJ5:VVI! 5!I[18444i!mD!  	 &LL44	$11!Y??EE$EUA&&UA&&!tr)   )FrD   rE   r   r*   )__name__
__module____qualname__intboolrM   r[   r"   Tensorr\   r`   rQ   tupleru   __classcell__rV   s   @r'   rC   rC   A   s+        $ E EE E 	E
 E E E E E E,cEk           
5<    * %, sCx        r)   rC   c                   &     e Zd Zd fd	Zd Z xZS )PerceptionEncoderLayerScaler   Fc                     t                                                       || _        t          j        |t          j        |          z            | _        d S N)rL   rM   inplacer   	Parameterr"   onesgamma)rS   r    init_valuesr   rV   s       r'   rM   z$PerceptionEncoderLayerScale.__init__   sB    \+
3"?@@


r)   c                 X    | j         r|                    | j                  n	|| j        z  S r   )r   mul_r   )rS   r$   s     r'   ru   z#PerceptionEncoderLayerScale.forward   s(    %)\Eqvvdj!!!q4:~Er)   )r   F)rv   rw   rx   rM   ru   r}   r~   s   @r'   r   r      sY        A A A A A A
F F F F F F Fr)   r   c                        e Zd Z	 	 	 ddededeg ej        f         dedz  ded	e	f fd
Z
dej        dej        fdZ xZS )PerceptionEncoderMLPN F	input_dim
hidden_dim	act_layerquant_configprefixuse_data_parallelc                     t                                                       t          ||d|| d|          | _        || _        t          ||d|| d|          | _        d S )NTz.fc1biasr   r   
disable_tpz.fc2)rL   rM   r   fc1
activationr   fc2)rS   r   r   r   r   r   r   rV   s          r'   rM   zPerceptionEncoderMLP.__init__   s     	'%???(
 
 
 $$%???(
 
 
r)   r$   rX   c                     |                      |          \  }}|                     |          }|                     |          \  }}|S r   )r   r   r   )rS   r$   _s      r'   ru   zPerceptionEncoderMLP.forward   s>    xx{{1OOAxx{{1r)   )Nr   F)rv   rw   rx   ry   r   r   Moduler   strrz   rM   r"   r{   ru   r}   r~   s   @r'   r   r      s         37"'
 

 
 B	M*	

 )4/
 
  
 
 
 
 
 
8 %,        r)   r   c                        e Zd Z	 	 	 	 ddededededed	edz  d
edef fdZdej	        de
eef         dej	        fdZ xZS ) PerceptionEncoderVisionAttentionFNr   	embed_dim	num_headsrF   rG   rH   r   r   r   c	           	         t                                                       || _        || _        ||z  | _        | j        dz  | _        |rdnt                      }	| j        |	z  dk    s
J d            | j        |	z  | _        t          || j        | j        d|| d|          | _	        t          ||d|| d|          | _        t          | j        | j        | j                  | _        t          | j        |||	          | _        d S )
N      r   r   z(embed_dim must be divisible by num_headsTz	.qkv_projr   z	.out_proj)r    rF   rG   rH   )rL   rM   r   total_num_headshead_dimr9   r
   r   r   qkv_projr   out_projr   attnrC   rope)rS   r   r   rF   rG   rH   r   r   r   tp_sizerV   s             r'   rM   z)PerceptionEncoderVisionAttention.__init__   s=    	"(!Y.]D(
(T!!.R.T.T#g-2226 322 -8)M %'''(
 
 
 *%'''(
 
 
 't~t}djQQ	++)'	
 
 
			r)   r$   rk   rX   c                    |j         \  }}}|                     |          \  }}|                    dd          \  }}}	|                    ||| j        | j                                      dddd          }|                    ||| j        | j                                      dddd          }|                     |||          \  }}|                    dddd                              ||| j        | j        z            }|                    dddd                              ||| j        | j        z            }| 	                    |||	          }
| 
                    |
          \  }
}|
S )Nr-   r   )chunksr    r   r   r   rk   )r0   r   chunkrn   r   r   permuter   rc   r   r   )rS   r$   rk   bszr;   r   qkvri   rj   vattn_outputs              r'   ru   z(PerceptionEncoderVisionAttention.forward   sN   'Waq!!Q))1")--1aFF3??GG1aQRSSFF3??GG1aQRSSyyAwy//1IIaAq!!))#w8VWWIIaAq!!))#w8VWWii1a(({33Qr)   )FNr   F)rv   rw   rx   ry   rz   r   r   rM   r"   r{   r|   ru   r}   r~   s   @r'   r   r      s         $26"'.
 .
.
 .
 	.

 .
 .
 )4/.
 .
  .
 .
 .
 .
 .
 .
` c3h EL        r)   r   c                        e Zd Zddej        ej        ddddfdedededed	ed
ededede	de
dz  dede	f fdZdej        deeef         fdZ xZS )PerceptionEncoderVisionBlock      @NFr   d_modeln_headrF   rG   	mlp_ratiols_init_valuer   
norm_layerrH   r   r   r   c           
         t                                                       t          |||||	|
| d|          | _        |t	          ||          nt          j                    | _        |t	          ||          nt          j                    | _         ||          | _	         ||          | _
        t          ||z            }t          ||||
| d|          | _        d S )Nz.attn)rF   rG   rH   r   r   r   z.mlpr   r   r   )rL   rM   r   r   r   r   Identityls_1ls_2ln_1ln_2ry   r   mlp)rS   r   r   rF   rG   r   r   r   r   rH   r   r   r   r   rV   s                 r'   rM   z%PerceptionEncoderVisionBlock.__init__   s    	4+)'%###/	
 	
 	
	 ( (??? 		 ( (??? 		
 Jw''	Jw''	9,--
'%???/
 
 
r)   r$   rk   c                    ||                      |                     |                     |          |                    z   }||                     |                     |                     |                              z   }|S Nr   )r   r   r   r   r   r   )rS   r$   rk   s      r'   ru   z$PerceptionEncoderVisionBlock.forward(  sc    		$))DIIaLL')BBCCC		$((499Q<<00111r)   rv   rw   rx   r   GELU	LayerNormry   r[   r   rz   r   r   rM   r"   r{   r|   ru   r}   r~   s   @r'   r   r      s        # g!|#26"'.
 .
.
 .
 	.

 .
 .
 .
 .
 .
 .
 )4/.
 .
  .
 .
 .
 .
 .
 .
` c3h        r)   r   c                        e Zd Zddej        ej        ddddfdedededed	ed
edededede	de
dz  dede	f fdZdej        deeef         fdZ xZS )"PerceptionEncoderVisionTransformerr   NFr   widthlayersheadsrF   rG   r   r   r   r   rH   r   r   r   c                    	
 t                                                       | _        || _        t	          j        	
fdt          |          D                       | _        d S )Nc                 P    g | ]"}t          
	 d |           #S )z.resblocks.)r   r   rF   rG   r   r   r   r   rH   r   r   r   )r   ).0ir   r   r   rF   rG   r   r   r   r   rH   r   r   s     r'   
<listcomp>z?PerceptionEncoderVisionTransformer.__init__.<locals>.<listcomp>C  sl         -! $3#1'"/')"/!-$4444&7    r)   )rL   rM   r   r   r   
ModuleListrange	resblocks)rS   r   r   r   rF   rG   r   r   r   r   rH   r   r   r   rV   s    ` ```````````r'   rM   z+PerceptionEncoderVisionTransformer.__init__/  s      	
               v  
 
r)   r$   rk   c                 4    | j         D ]} |||          }|S r   )r   )rS   r$   rk   blocks       r'   ru   z*PerceptionEncoderVisionTransformer.forwardV  s.    ^ 	* 	*Ea)))AAr)   r   r~   s   @r'   r   r   .  s        # g!|#26"'%
 %
%
 %
 	%

 %
 %
 %
 %
 %
 %
 %
 )4/%
 %
  %
 %
 %
 %
 %
 %
N c3h        r)   r   c                        e Zd Zedddfdedededz  dedef
 fd	Zd
e	de	fdZ
dej        fdZdej        fdZ xZS )PerceptionEncoderNr   Fr   r   r   r   r   c                 ,   t                                                       |j        | _        |j        p|j        | _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        |j	        | _	        | j	        st          d          |j        | _        t          d|j        |j        |j        d          | _        |j        r ||j                  nt          j                    | _        |j        r || j                  nt          j                    | _        t)          |j        |j        |j        | j        | j        z  | j        | j        z  |j        |j        ||| j        || d|          | _        t          |j        |j        dz  ddd	          | _        t          |j        dz  |j        d
z  ddd	          | _        | j        r;t          j        | j        dz  t7          j        | j                  z            | _        | j        ro| j        | j        z  | _        t          j        | j        dz  t7          j        t?          | j                  | j        dz  z   | j                  z            | _         d S d S )Nzuse_rope2d must be Truer-   F)in_channelsout_channelskernel_sizestrider   z.transformer)
rF   rG   r   r   r   r   rH   r   r   r   r   r   )r   r   padding   r   )!rL   rM   
patch_size
output_dimr   r   r   use_abs_posembrH   
use_rope2d
ValueError
image_sizer   conv1
use_ln_prer   r   ln_preuse_ln_postln_postr   r   r   transformervit_downsampler1vit_downsampler2r   r"   randnclass_embeddingposemb_grid_sizery   positional_embedding)rS   configr   r   r   r   r   rV   s          r'   rM   zPerceptionEncoder.__init__]  s    	 + +;v|\
\
m$3#1 + 	86777 + )$
 
 

 392CVjj...171CVzz$*---=LML Ot>?do=& .!,%***/
 
 
  !,L&,*!Q!
 !
 !
 !,L1flQ.AaQR!
 !
 !
  	#%<T!U[%<%<<$ $D   	$(Ot$FD!(*T!+*++d.CQ.FFJ ) )D%%%	 	r)   grid_hgrid_wc                    | j         |k    r| j         |k    r| j        d         S | j        }| j        r|d d         |dd          }}|                    d| j         | j         d                              dddd                                          }t          j        |||fdd	          }|                    dddd                              d| j                  }| j        rt          j
        ||gd
          }|d         S )N)N.r   r   r   r-   r   bilinearF)sizemodealign_cornersr   )r   r   rH   rc   r   
contiguousFinterpolater   r"   r4   )rS   r   r   	pos_embedcls_token_embeds        r'   sample_abs_posembz#PerceptionEncoder.sample_abs_posemb  s    F**t/D/N/N,Y77-	 	F)22A2	!""YO a!68MrRRWQ1a  Z\\ 	
 MVV,:U
 
 
	 %%aAq1199"djII	 	G	?I">AFFFI##r)   r$   c                 f   |j         \  }}}}|| j        z  || j        z  }}|                     |          }|                    dddd                              |d| j                  }| j        rGt          j        | j	        
                    ddd                              |dd          |gd          }| j        r||                     ||          z   }|                     |          }|                     |||f          }|                     |          }| j        r|d d dd d d f         }|S )Nr   r   r-   r   r   r   r   )r0   r   r   r   rc   r   rH   r"   r4   r   rn   rb   r   r   r   r   r   )rS   r$   batchr   hwr   r   s           r'   forward_featuresz"PerceptionEncoder.forward_features  sB   q!Qdo-qDO/CJJqMMIIaAq!!))%TZ@@ 		%**1a44;;E2rJJANTU  A  	;D**66:::AKKNNQ(899LLOO 	!!!QRR(Ar)   c                    |                      |          }|j        \  }}}t          |dz            }|                    dd                                          }|                    ||||          }|                     |          }|                     |          }|j        \  }}}}|                    |d||z                                dd          S )Ng      ?r   r   r   )r  r0   ry   	transposer   rn   r   r   )rS   r$   BPCTs         r'   ru   zPerceptionEncoder.forward  s    !!!$$'1a3KKKK1((**FF1aA!!!$$!!!$$W
1avvaQU##--a333r)   )rv   rw   rx   _DEFAULT_NORM_LAYERr   r   r   rz   rM   ry   r   r"   r{   r  ru   r}   r~   s   @r'   r   r   \  s        
  326"'G G G 	G
 )4/G G  G G G G G GR$ $S $ $ $ $.%,    04 4 4 4 4 4 4 4 4r)   r   c                        e Zd Z edddddddd	          Zd
ddededdf fdZdej	        dz  dej	        dz  fdZ
dej	        dej	        fdZ xZS )StepVLForConditionalGenerationzlanguage_model.model.zlanguage_model.lm_head.)zmodel.zlm_head.z.attn.qkv_proj.weightz.attn.qkv_proj.biasz.mlp.fc1z.mlp.fc2)z.attn.in_proj_weightz.attn.in_proj_biasz	.mlp.c_fcz.mlp.c_proj)orig_to_new_prefixorig_to_new_substrr   )r   vllm_configr   rX   Nc                    t          t          |                                            |j        j        }|j        j        }|j        }|| _        || _        |j        dk    | _	        | 
                    |d          5  t          |j        t          |j        j                  |t          |d          | j	                  | _        t#          |j        j        dz  |j        j        |j        d|t          |d          | j	                  | _        d d d            n# 1 swxY w Y   |                     |          5  t1          ||j        t          |d	          
          | _        d d d            n# 1 swxY w Y   | j        j        | _        d S )Ndataimagevision_modelr   r   Tvit_large_projector)r   gather_outputr   r   r   language_model)r  	hf_configr   )rL   r   rM   model_configr  multimodal_configr   r   mm_encoder_tp_moder   _mark_tower_modelr   vision_configr   
hidden_actr   r  r   r   text_confighidden_sizeprojector_biasr  _mark_language_modelr   r  make_empty_intermediate_tensors)rS   r  r   r   r  r   rV   s         r'   rM   z'StepVLForConditionalGeneration.__init__  s   -t44==???)3'4F"/!2!2!E!O##K99 	 	 1$6/:;;)#FN;;"&"8! ! !D (<$*Q.".*")#F,ABB1( ( (D$	 	 	 	 	 	 	 	 	 	 	 	 	 	 	$ &&{33 	 	"<' ,#F,<==# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ? 	,,,s%   <BDD"D;+E22E69E6input_tensorc                 l    |d S | j         rt          || j                  S |                     |          S r   )r   r   r  )rS   r%  s     r'   _get_vision_model_outputz7StepVLForConditionalGeneration._get_vision_model_output  sB     4! 	P.|T=NOOO  ...r)   image_featuresc                 6    |                      |          \  }}|S r   )r  )rS   r(  r   s      r'   _process_image_featuresz6StepVLForConditionalGeneration._process_image_features"  s      44^DDr)   )rv   rw   rx   r   hf_to_vllm_mapperr	   r   rM   r"   r{   r'  r*  r}   r~   s   @r'   r  r    s        %-1
 

 %<"7#%	
 
   BD &
 &
 &
z &
3 &
 &
 &
 &
 &
 &
 &
P/!L4//		/ / / /el u|        r)   r  )r   r*   r+   )2__doc__collections.abcr   	functoolsr   r"   einopsr   r   r   torch.nnr   r   vllm.configr	   vllm.distributedr
   %vllm.model_executor.layers.activationr   9vllm.model_executor.layers.attention.mm_encoder_attentionr   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   step3_vlr   utilsr   r   r   visionr   r   r  r(   rA   r   rC   r   r   r   r   r   r   r   r)   r'   <module>r<     s%   P O $ $ $ $ $ $        $ $ $ $ $ $ $ $       $ $ $ $ $ $ " " " " " " A A A A A A < < < < < < X X X X X X 7 7 7 7 7 7         
 G F F F F F 5 5 5 5 5 5 J J J J J J J J J J / / / / / /gbl555 0 0 0   4D D D D Dbi D D DNF F F F F") F F F! ! ! ! !29 ! ! !H> > > > >ry > > >B4 4 4 4 429 4 4 4n+ + + + + + + +\D4 D4 D4 D4 D4	 D4 D4 D4NA A A A A%D A A A A Ar)   