
    .`i[                        d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$  G d dej%                  Z& G d dej%                  Z'dej(        dej(        dej(        dej(        de)dede*ej(        ej(        f         fdZ+ G d d ej%                  Z, G d! d"ej%                  Z- G d# d$ej%                  Z. G d% d&ej%                  Z/ G d' d(ej%                  Z0 G d) d*ej        j%                  Z1dS )+z\Implementation of SiglipVisionModel intended to be only used
within a vision language model.    )IterableN)nn)
functional)Siglip2VisionConfig)PretrainedConfig)divide$get_tensor_model_parallel_world_size)
get_act_fn)MMEncoderAttention)Conv2dLayer)ColumnParallelLinear
LinearBaseQKVParallelLinearReplicatedLinearRowParallelLinear)QuantizationConfig)ApplyRotaryEmb)default_weight_loader)current_platform   )is_vit_use_data_parallelc                   H     e Zd Zd	dededdf fdZdedej        fdZ xZ	S )
VisionRotaryEmbedding     @dimthetareturnNc                     t                                                       d|t          j        d|dt          j                  |z  z  z  }|                     d|d           d S )Ng      ?r      dtypeinv_freqF)
persistent)super__init__torcharangefloatregister_buffer)selfr   r   r"   	__class__s       {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/siglip2navit.pyr%   zVisionRotaryEmbedding.__init__$   sd    %ELC%+$N$N$NQT$TUVZeDDDDD    seqlenc                     t          j        || j        j        | j        j                  }t          j        || j                  }|S )Ndevicer!   )r&   r'   r"   r1   r!   outer)r*   r.   seqfreqss       r,   forwardzVisionRotaryEmbedding.forward)   sC    l4=/t}7J
 
 
 C//r-   )r   )
__name__
__module____qualname__intr(   r%   r&   Tensorr5   __classcell__r+   s   @r,   r   r   #   s        E EC E ED E E E E E E
c el        r-   r   c                   `     e Zd Zdef fdZ	 ddej        dej        dz  dej        fdZ	 xZ
S )	Siglip2VisionEmbeddingsconfigc                    t                                                       || _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        |j	        | _	        | j        dk    r|t          |j        | j        z  | j        z  | j        d          | _        | j        rBt          | j        dz            | _        t          j        | j        | j                  | _        d S d S t%          |j        | j        | j        | j        d          | _        | j        rQ| j        | j        z  dz  | _        | j        | j        z  | _        t          j        | j        | j                  | _        d S d S )Nr   F)
input_sizeoutput_sizereturn_biasg      ?valid)in_channelsout_channelskernel_sizestridepaddingr   )r$   r%   r?   hidden_size	embed_dim
patch_size
image_sizenum_patchespreserve_original_pehidden_strider   num_channelspatch_embeddingr9   position_embedding_sizer   	Embeddingposition_embeddingr   )r*   r?   r+   s     r,   r%   z Siglip2VisionEmbeddings.__init__2   s   + + +!-$*$?!#1 a#3!.@4?R N!$ $ $D 
 ( Y/243CS3H/I/I,*,,t7G*X*X'''Y Y
 $/"/!^ O$ $ $D  ( Y$(Ot$F1#L /3$//Q,*,,t7G*X*X'''Y Yr-   Npixel_values	grid_thwsr   c           	         | j         j        j        }t          | j         t                    r*|                      |                    |                    }nt          | j         t                    r}|                    d| j        j	        | j        j
        z  | j        | j                  }|                      |                    |                    }|                    d| j                  }| j        r[|J t          j        |          }| j        j                            | j        | j        d                              d                              dddd          }d}|D ]\  }}	}
||	z  |
z  }t+          j        ||	|
fdd	
          }|                    dddd                              d|	|
z  d          }|d                             |d          }|                    ||	| j        z  | j        |
| j        z  | j        d          }|                    dddddd                              |d          }|||||z   <   ||z  }||z   }|S )aL  
        Args:
            pixel_values (`torch.FloatTensor`):
                Pixel values of shape (
                    num_patches,
                    num_channels * temporal_patch_size * patch_size * patch_size
                )
            grid_thws: (`torch.LongTensor`):
                grid shape (num_patches, 3)
        r    Nr      r   r   bicubicF)sizemodealign_corners      )rR   weightr!   
isinstancer   tor   viewr?   rQ   temporal_patch_sizerL   reshaperK   rO   r&   
zeros_likerU   rS   	unsqueezepermuteFinterpolaterepeatrP   )r*   rV   rW   target_dtypepatch_embedspos_embed_newpositional_embeddingscntthwvolumepes                r,   r5   zSiglip2VisionEmbeddings.forwardT   sm   " +28d*J77 
	D//l0S0STTLL,k:: 	D',,(4;+JJ	 L  //l0S0STTL'//DNCCL$  	8(((!,\::M'.660$2NPR  1Aq!$$ " C$  1aQ])Q""'	   ZZ1a++33Aq1ubAAU\\!Q''ZZ++&++&  ZZ1aAq1199&"EE46cC&L01v'-7Lr-   N)r6   r7   r8   r   r%   r&   FloatTensor
LongTensorr:   r5   r;   r<   s   @r,   r>   r>   1   s         Y/  Y  Y  Y  Y  Y  YJ .2@ @'@ #d*@ 
	@ @ @ @ @ @ @ @r-   r>   qkcossinis_flash_attn_backendapply_rotary_embr   c                 |   |                     dd          d                                         }|                     dd          d                                         }|rt          j                    r|j        }n$|rt          j                    r|j        }n|j        } || ||          } ||||          }||fS )Nr   rY   r   r   )chunk
contiguousr   is_cudaforward_cudais_rocmforward_hipforward_native)	rz   r{   r|   r}   r~   r   apply_rotary_emb_funcq_embedk_embeds	            r,   apply_rotary_pos_embr      s     ))A2)

q
!
,
,
.
.C
))A2)

q
!
,
,
.
.C @!1!9!;!; @ 0 =	 @#3#;#=#= @ 0 < 0 ?##AsC00G##AsC00GGr-   c                        e Zd ZdZ	 	 ddededz  def fdZ	 ddej	        d	ej	        d
e
ej	        ej	        f         dz  de
ej	        ej	        dz  f         fdZ xZS )Siglip2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperN r?   quant_configprefixc                    t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        |j
        | _        t                      }t          | j        | j        | j        || d|          | _        t          | j        | j        || d|          | _        |rd	nt#                      | _        t'          | j        | j                  | _        |j        | _        t-          | j        | j        | j	        | d
          | _        t1          dd          | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      z	.qkv_proj)rJ   	head_sizetotal_num_headsr   r   
disable_tpz	.out_proj)rA   rB   r   r   r   r   z.attn)	num_headsr   scaler   T)enforce_enableenable_fp32_compute)r$   r%   r?   rJ   rK   num_attention_headsr   head_dim
ValueErrorr   attention_dropoutdropoutr   r   qkv_projr   out_projr	   tp_sizer   num_heads_per_partitionuse_roper   attnr   r   r*   r?   r   r   use_data_parallelr+   s        r,   r%   zSiglip2Attention.__init__   s    	+3$.8=4>)T^;;3%)^3 3 $3 3 3  
 ]D(
/466)m N%'''(
 
 
 *~%'''(
 
 
 #NAA(L(N(N 	 (.dndl'K'K$&2m*###	
 
 
	 !/ $!
 !
 !
r-   hidden_states
cu_seqlensposition_embeddingsr   c                    |j         \  }}|                     |          \  }}|                    dd          \  }}	}
|                    || j        | j                  }|	                    || j        | j                  }	|
                    || j        | j                  }
| j        r{|\  }}t          |                    d          |	                    d          ||| j	        j
        | j                  \  }}	|                    d          }|	                    d          }	|dd         |dd         z
                                  }| 	                    |                    d          |	                    d          |
                    d          ||          }|                    || j        | j        z            }|                     |          \  }}|S )z#Input shape: Batch x Time x ChannelrZ   rY   r   r   r   N)querykeyvaluer   
max_seqlen)shaper   r   rd   r   r   r   r   rh   r   r~   r   squeezemaxrf   r   )r*   r   r   r   
seq_lengthrK   
qkv_states_querieskeysvaluesr|   r}   r   attn_outputs                  r,   r5   zSiglip2Attention.forward   s    !. 3
Im44
A * 0 0 0 ; ;v,,z4+GWWyyT%A4=QQZ)Et}UU= 	#*HC0!!!$$q!!	/% MGT ooa((G<<??D nz#2#6;;==
ii##A&&q!!""1%%!!   
 
 "))4t}D
 
 {33Qr-   Nr   rw   )r6   r7   r8   __doc__r   r   strr%   r&   r:   tupler5   r;   r<   s   @r,   r   r      s        GG
 37	5
 5
#5
 )4/5
 	5
 5
 5
 5
 5
 5
v IM	+ +|+ L+ #5<#=>E	+
 
u|U\D00	1+ + + + + + + +r-   r   c                   \     e Zd Z	 	 d
dededz  def fdZdej        dej        fd	Z	 xZ
S )
Siglip2MLPNr   r?   r   r   c                 <   t                                                       || _        t                      }t	          |j                  | _        t          |j        |j	        || d|          | _
        t          |j	        |j        || d|          | _        d S )Nz.fc1)r   r   r   z.fc2)r$   r%   r?   r   r
   
hidden_actactivation_fnr   rJ   intermediate_sizefc1r   fc2r   s        r,   r%   zSiglip2MLP.__init__  s     	466'(9::'$%???(
 
 
 %$%???(
 
 
r-   r   r   c                     |                      |          \  }}|                     |          }|                     |          \  }}|S rw   )r   r   r   )r*   r   r   s      r,   r5   zSiglip2MLP.forward1  sG    88M22q**=9988M22qr-   r   )r6   r7   r8   r   r   r   r%   r&   r:   r5   r;   r<   s   @r,   r   r     s         37	
 
#
 )4/
 	
 
 
 
 
 
2U\ el        r-   r   c            	            e Zd Z	 	 ddededz  def fdZdej        dej        d	ej        d
e	ej
                 fdZ xZS )Siglip2EncoderLayerNr   r?   r   r   c                 \   t                                                       |j        | _        t	          j        | j        |j                  | _        t          ||| d          | _	        t	          j        | j        |j                  | _
        t          ||| d          | _        d S )Nepsz
.self_attnr   r   z.mlp)r$   r%   rJ   rK   r   	LayerNormlayer_norm_epslayer_norm1r   	self_attnlayer_norm2r   mlpr*   r?   r   r   r+   s       r,   r%   zSiglip2EncoderLayer.__init__9  s     	+<F<QRRR)%(((
 
 

 <F<QRRR%???
 
 
r-   r   r   r   r   c                     |}|                      |          }|                     |||          }||z   }|}|                     |          }|                     |          }||z   }|S )z
        Args:
            hidden_states: Input tensor of shape (batch, seq_len, embed_dim).
            cu_seqlens: Cumulative sequence lengths tensor.
            position_embeddings: Position embeddings tensor.
        )r   r   r   )r   r   r   r   )r*   r   r   r   residuals        r,   r5   zSiglip2EncoderLayer.forwardN  s     !((77'! 3 ' 
 

 !=0 ((77// =0r-   r   )r6   r7   r8   r   r   r   r%   r&   r:   r   rx   r5   r;   r<   s   @r,   r   r   8  s         37	
 
#
 )4/
 	
 
 
 
 
 
*| L #\	
 
u 	!       r-   r   c                   z     e Zd ZdZ	 	 ddededz  def fdZd Zd	 Z	d
e
j        de
j        de
j        fdZ xZS )Siglip2Encoderz
    Transformer encoder consisting of `config.num_hidden_layers`
    self attention layers. Each layer is a [`Siglip2EncoderLayer`].

    Args:
        config: PretrainedConfig
    Nr   r?   r   r   c                    t                                                       | _        t          j        fdt          j                  D                       | _        t          j	        j
        z  dz            | _        j        | _        j        | _        j        | _        j        j        z  | _        j        	d | _        d S d j                            d          D             | _        d S )Nc           	      >    g | ]}t           d |           S )z.layers.r   )r   ).0idxr?   r   r   s     r,   
<listcomp>z+Siglip2Encoder.__init__.<locals>.<listcomp>}  sQ         $!-$33c33    r-   r   c                 ,    g | ]}t          |          S  )r9   )r   is     r,   r   z+Siglip2Encoder.__init__.<locals>.<listcomp>  s+     * * *A* * *r-   |)r$   r%   r?   r   
ModuleListrangenum_hidden_layerslayersr   rJ   r   rotary_pos_embrL   rP   window_sizespatial_merge_unitfullatt_block_indexessplitr   s    ```r,   r%   zSiglip2Encoder.__init__t  s     	m      !!9::  	
 	
 4&"<<A
 
 !+#1!-"("69M"M'/)-D&&&* * & < B B3 G G* * *D&&&r-   c                    g }|D ]x\  }}}t          j        |                              d                              d|          }|                    || j        z  | j        || j        z  | j                  }|                    dddd          }|                                }t          j        |                              d                              |d          }|                    || j        z  | j        || j        z  | j                  }|                    dddd          }|                                }|                    t          j	        ||gd          
                    |d                     zt          j        |d          }|d d dd f                                         }|                     |          }	|	|                             d          }
|
S )Nr   rY   r   r   rZ   r   )r&   r'   rh   expandrf   rP   ri   flattenappendstackrl   catr   r   )r*   grid_thwpos_idsrr   rs   rt   hpos_idswpos_idsmax_grid_sizerotary_pos_emb_fullr   s              r,   rot_pos_embzSiglip2Encoder.rot_pos_emb  s    	S 	SGAq!|A0033::2qAAH''T''"T''"	 H  ''1a33H''))H|A0033::1bAAH''T''"T''"	 H  ''1a33H''))HNN5;(';DDDKKAqQQRRRR)G+++ ABB++--"11-@@,W5==a@@r-   c                    g }dg}d}| j         | j        z  | j        z  }|D ]\  }}}|| j        z  || j        z  }
}	t          j        ||	z  |
z                                ||	|
          }||	|z  z
  }||
|z  z
  }|	|z   |z  }|
|z   |z  }t          j        |d|d|fdd          }|                    |||||          }|                    ddddd                              |||z  ||          }|dk    	                    ddg                              d          }|                    d          }||dk             }|
                    ||z              |                    d          | j        z  |d         z   }|                    |                                           |||	z  |
z                                  z  }t          j        |d	          }||fS )
Nr   constantir   rZ   r   r_   rY   r   )r   rP   rL   r&   r'   rf   rj   padri   sumr   cumsumr   extendtolistitemr   )r*   r   window_indexcu_window_seqlenswindow_index_idvit_merger_window_sizegrid_tgrid_hgrid_w
llm_grid_h
llm_grid_windexpad_hpad_wnum_windows_hnum_windows_windex_paddedseqlens	index_newcu_seqlens_tmps                       r,   get_window_indexzSiglip2Encoder.get_window_index  sN   #$#  22doE 	 '/ "	I "	I"FFF$,,$,, #J L*!4z!ABBJJ
J E +Z:P-PPE*Z:P-PPE'%/4JJM'%/4JJM5E1e(<j$OOL'//&& L (//1aA>>FF-&&	 L $t+00!Q88@@DDG'//33L$\T%9:I	O ;<<<q!!D$;;>OPR>SS  $$^%:%:%<%<=== 3j @FFHHHOOy1555...r-   inputs_embedsrW   r   c                 b   |                      |          }|                     |          \  }}t          j        ||j        t          j                                        r|j        nt          j                  }t          j	        |          }|
                                \  }}|                    || j        z  | j        d          }||ddddf         }|                    |d          }|                    || j        z  | j        d          }||ddddf         }|                    |d          }t          j        ||fd          }|                                |                                f}	t          j        |dddf         |dddf         z  |dddf                                       dt          j                                        r|j        nt          j                  }
t          j        |
                    d          |
g          }
t          j        |          }|}t)          | j                  D ]'\  }}| j        r	|| j        v r|
}n|} ||||	          }(|                    || j        z  | j        d          }||ddf                             |d          }|S )	a  
        Args:
            inputs_embeds: Input tensor of shape
                (batch_size, sequence_length, hidden_size).
                Embedded representation of the input tokens.
            grid_thws: Grid tensor of shape (num_patches, 3)
                containing grid dimensions.
                Whether or not to return a [`~utils.ModelOutput`] instead of
                a plain tuple.
        r0   rY   Nr   r   r   r   )r   r!   )r   r  r&   tensorr1   jit
is_tracingr!   int32unique_consecutiver\   rf   r   r   r|   r}   repeat_interleaver   	new_zerosargsort	enumerater   r   )r*   r  rW   r   r   r   seq_lenr   embr   r   reverse_indicesr   r  blockr  s                   r,   r5   zSiglip2Encoder.forward  s    )))44*.*?*?	*J*J''!L '%*Y%9%9%;%;L)//
 
 

 "45FGG"''))
%--t..0G
 
 &lAAAqqq&89%--gr::'//t..0G
 
 (aaa(:;'//<<i8bAAA"wwyy#''))4,aaadOi1o-yA
 

& &+Y%9%9%;%;L)//  	
 	
 	 Y
 4 4Q 7 7DEE
-55%%dk22 	V 	VLE5- 3$:T1T1T!+!2!E-ATUUMM%--t..0G
 
 &oqqq&89AA'2NNr-   r   )r6   r7   r8   r   r   r   r   r%   r   r  r&   r:   r5   r;   r<   s   @r,   r   r   k  s          37	 # )4/ 	     D  :./ ./ ./`C|C <C 
	C C C C C C C Cr-   r   c                   j     e Zd Z	 	 ddededz  def fdZdej        dej	        d	ej
        fd
Z xZS )Siglip2VisionTransformerNr   r?   r   r   c                     t                                                       || _        |j        }t	          |          | _        t          ||| d          | _        t          j	        ||j
                  | _        d S )Nz.encoderr   r   )r$   r%   r?   rJ   r>   
embeddingsr   encoderr   r   r   post_layernorm)r*   r?   r   r   rK   r+   s        r,   r%   z!Siglip2VisionTransformer.__init__*  s     	&	1&99%%&&&
 
 

 !l9&:OPPPr-   rV   rW   r   c                     |                      ||          }|                     ||          }|                     |          }|S )z
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width)
            of the input images.
        )r#  r$  r%  )r*   rV   rW   r   last_hidden_states        r,   r5   z Siglip2VisionTransformer.forward<  sF     i@@ LL	BB //0ABB  r-   r   )r6   r7   r8   r   r   r   r%   r&   rx   ry   r:   r5   r;   r<   s   @r,   r!  r!  )  s         37	Q Q#Q )4/Q 	Q Q Q Q Q Q$!'! #! 
	! ! ! ! ! ! ! !r-   r!  c                        e Zd Z	 	 ddededz  def fdZdej        dej	        d	ej
        fd
Zdeeeej
        f                  d	ee         fdZ xZS )Siglip2NavitModelNr   r?   r   r   c                 |    t                                                       t          ||| d          | _        d S )Nz.vision_modelr   )r$   r%   r!  vision_modelr   s       r,   r%   zSiglip2NavitModel.__init__O  sJ     	4%+++
 
 
r-   rV   rW   r   c                 0    |                      ||          S )N)rV   rW   )r+  )r*   rV   rW   s      r,   r5   zSiglip2NavitModel.forward]  s'    
   % ! 
 
 	
r-   weightsc                 |   g d}t          |                                           }t                      }|D ]\  }}|D ]>\  }}}	||vr|                    ||          }||         }
|
j        } ||
||	            n*||         }
t          |
dt                    } ||
|           |                    |           |S )N))r   q_projrz   )r   k_projr{   )r   v_projvweight_loader)dictnamed_parameterssetreplacer3  getattrr   add)r*   r-  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr3  s               r,   load_weightszSiglip2NavitModel.load_weightsg  s    "
 "
 "
 4002233"%%%#* 	$ 	$D-5K 4 41
Kd**||K<<#D) % 3e]H===#D) '@U V Ve]333d####r-   r   )r6   r7   r8   r   r   r   r%   r&   rx   ry   r:   r5   r   r   r6  rC  r;   r<   s   @r,   r)  r)  N  s         37	
 
#
 )4/
 	
 
 
 
 
 

'
 #
 
	
 
 
 
HU33D-E$F 3s8        r-   r)  )2r   collections.abcr   r&   r   torch.nnr   rj   transformersr    transformers.configuration_utilsr   vllm.distributedr   r	   %vllm.model_executor.layers.activationr
   9vllm.model_executor.layers.attention.mm_encoder_attentionr   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   r   r   r   'vllm.model_executor.layers.quantizationr   2vllm.model_executor.layers.rotary_embedding.commonr   -vllm.model_executor.model_loader.weight_utilsr   vllm.platformsr   visionr   Moduler   r>   r:   boolr   r   r   r   r   r   r!  r)  r   r-   r,   <module>rT     sj  # # % $ $ $ $ $        $ $ $ $ $ $ , , , , , , = = = = = = I I I I I I I I < < < < < < X X X X X X 7 7 7 7 7 7              G F F F F F      P O O O O O + + + + + + , , , , , ,    BI   c c c c cbi c c cL|| 
 
	
   % 5<%&   0e e e e ery e e eP       B0 0 0 0 0") 0 0 0f{ { { { {RY { { {|"! "! "! "! "!ry "! "! "!J2 2 2 2 2 2 2 2 2 2r-   