
    .`i>@                        d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZmZ  G d dej                  Z G d dej                  Z G d dej                  Z  eddgdde           G d dej                              Z! G d dej                  Z" G d dej                  Z# G d dej        j                  Z$dS ) z]Implementation of Siglip2VisionModel intended to be only used
within a vision language model.    )IterableN)nn)
functional)Siglip2VisionConfig)support_torch_compile)$get_tensor_model_parallel_world_size)
get_act_fn)MMEncoderAttention)ColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)default_weight_loader   )is_vit_use_data_parallelshould_torch_compile_mm_vitc            	            e Zd Zdef fdZdej        dej        dej        fdZ	e
dej        dej        dee         dej        fd	            Z xZS )
Siglip2VisionEmbeddingsconfigc                    t                                                       || _        |j        | _        |j        | _        t          j        |j        | j        z  | j        z  | j                  | _	        |j
        | _
        t          | j
        dz            | _        t          j        | j
        | j                  | _        d S )N)in_featuresout_featuresg      ?)super__init__r   hidden_size	embed_dim
patch_sizer   Linearnum_channelspatch_embeddingnum_patchesintposition_embedding_size	Embeddingposition_embedding)selfr   	__class__s     {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/lfm2_siglip2.pyr   z Siglip2VisionEmbeddings.__init__   s    + +!y+do=O 
  
  
 "-'*4+;S+@'A'A$"$,t/?"P"P    pixel_values_packedspatial_shapesreturnc                    |j         j        dk    s
J d            |                                dk    r|j        d         dk    sJ |d         }n|}|dddf         |dddf         z                      t
          j                  }|                                }t          t          |                    }||j        d         k    r!t          d|j        d          d	| d
          | j        j        j        }|                     |                    |                    }| j        j                            | j        | j        d          }	|                     |	||          }
||
z   }|                    d          S )al  Embed patchified pixel values in packed (unpadded) form.

        Args:
            pixel_values_packed: (1, total_tokens, patch_dim) or
                (total_tokens, patch_dim), packed in tile order.
            spatial_shapes: (num_tiles, 2) on CPU (height, width) per tile.

        Returns:
            (1, total_tokens, embed_dim) packed embeddings.
        cpuzYExpected `spatial_shapes` on CPU to avoid device-to-host sync in variable-length packing.   r   r   N)dtypez?Packed pixel_values token count does not match spatial_shapes: z vs .)lengths_list)devicetypedimshapetotorchint64tolistr"   sum
ValueErrorr    weightr0   r%   reshaper#   #resize_positional_embeddings_packed	unsqueeze)r&   r*   r+   pixel_values_flatlengthsr3   total_tokenstarget_dtypepatch_embedspositional_embeddingspacked_pos_embeds
embeddingss               r(   forwardzSiglip2VisionEmbeddings.forward*   s    $)U222' 322
 ""$$))&,Q/14444 3A 6 3!!!!Q$'.A*>>BBBUU~~''3|,,--,21555C$*1-C C3?C C C  
 +28++,=,@,@|,@,T,TUU $ 7 > F F($*F!
 !
 !DD!% E 
 
 "$55
##A&&&r)   rG   r3   c                    |j         j        dk    sJ | j        d         }| j        }t	          t          |                    }t          j        ||f| j         |          }|                     ddd          	                    d          }|j         j        dk    r|
                    t          j                  }d}t          |          D ]\  }	}
|
dk    r||	                                         \  }}t          j        |||fddd	
          }|                    |||z                                dd          }|
                    |          }|||||
z   <   ||
z  }|S )a  Resize positional embeddings per image and return a packed tensor.

        Args:
            positional_embeddings: (height, width, embed_dim) base grid.
            spatial_shapes: (batch_size, 2) on CPU, (height, width) per image.
            lengths_list: flattened token length per image (height * width).

        Returns:
            (total_tokens, embed_dim) packed positional embeddings, concatenated
            in the same order as `lengths_list`.
        r.   r2   )r4   r0      r   r   bilinearFT)sizemodealign_corners	antialias)r4   r5   r7   r0   r"   r<   r9   emptypermuterA   r8   float32	enumerater;   Finterpolater?   	transpose)rG   r+   r3   r   source_dtyperD   rH   pos_4doffsetilengthheightwidthresizeds                 r(   r@   z;Siglip2VisionEmbeddings.resize_positional_embeddings_packed\   s   " $)U2222)/3	,23|,,--!K9%(/
 
 
 '..q!Q77AA!DD =&&YYu}--F"<00 	 	IAv{{*1-4466MFEme_#  G ooi%@@JJ1aPPGjj..G:Afv67fFF  r)   )__name__
__module____qualname__r   r   r9   FloatTensor
LongTensorTensorrJ   staticmethodlistr"   r@   __classcell__r'   s   @r(   r   r      s        Q2 Q Q Q Q Q Q0'".0' (0' 
	0' 0' 0' 0'd 5!$|5!(5! 3i5! 
	5! 5! 5! \5! 5! 5! 5! 5!r)   r   c                        e Zd ZdZ	 	 ddededz  def fdZdej	        d	ej	        d
e
ej	        z  dej	        fdZ xZS )Siglip2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperN r   quant_configprefixc                    t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        |j
        | _        t                      }|rdnt                      }| j        |z  dk    sJ | j        |z  | _        t          | j        | j        | j        || d|          | _        t#          | j        | j        || d	|
          | _        t'          | j        | j        | j	        | d          | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      r   r   z	.qkv_proj)r   	head_sizetotal_num_headsrn   ro   
disable_tpz	.out_proj)
input_sizeoutput_sizern   ro   rs   z.attn)	num_headsrq   scalero   )r   r   r   r   r   num_attention_headsrv   head_dimr=   rw   attention_dropoutdropoutr   r   num_heads_per_partitionr   qkv_projr   out_projr
   attn)r&   r   rn   ro   use_data_paralleltp_sizer'   s         r(   r   zSiglip2Attention.__init__   s    	+3$.8=4>)T^;;'%)^' 'N' ' '  
 ]D(
/466(T!!.R.T.T~'1,,,,'+~'@$)m N%'''(
 
 
 *~%'''(
 
 
 '2m*###	
 
 
			r)   hidden_states
cu_seqlens
max_seqlenr,   c                    |                      |          \  }}|j        \  }}}|                    dd          \  }}	}
|                    ||| j        | j                  }|	                    ||| j        | j                  }	|
                    ||| j        | j                  }
|                     ||	|
||          }|                    ||d          }|                     |          \  }}|S )Nr/   r2   )r6   )querykeyvaluer   r   )	r}   r7   chunkviewr|   ry   r   r?   r~   )r&   r   r   r   qkv_bszq_lenquery_states
key_statesvalue_statesoutattn_outputs                r(   rJ   zSiglip2Attention.forward   s    
 
Q 	UA141"1E1E.j,#((4dm
 
  __4dm
 

 $((4dm
 

 ii!!  
 
 kk#ub))s++Qr)   Nrm   ra   rb   rc   __doc__r   r   strr   r9   rf   r"   rJ   ri   rj   s   @r(   rl   rl      s        GG
 37	-
 -
#-
 )4/-
 	-
 -
 -
 -
 -
 -
^| L %,&	
 
       r)   rl   c                   \     e Zd Z	 	 d
dededz  def fdZdej        dej        fd	Z	 xZ
S )
Siglip2MLPNrm   r   rn   ro   c                 <   t                                                       || _        t          |j                  | _        t                      }t          |j        |j	        || d|          | _
        t          |j	        |j        || d|          | _        d S )Nz.fc1)rn   ro   rs   z.fc2)r   r   r   r	   
hidden_actactivation_fnr   r   r   intermediate_sizefc1r   fc2)r&   r   rn   ro   r   r'   s        r(   r   zSiglip2MLP.__init__   s     	'(9::466'$%???(
 
 
 %$%???(
 
 
r)   r   r,   c                     |                      |          \  }}|                     |          }|                     |          \  }}|S N)r   r   r   )r&   r   r   s      r(   rJ   zSiglip2MLP.forward  sG    88M22q**=9988M22qr)   r   )ra   rb   rc   r   r   r   r   r9   rf   rJ   ri   rj   s   @r(   r   r      s         37	
 
#
 )4/
 	
 
 
 
 
 
2U\ el        r)   r   )r   r   )dynamic_arg_dims	enable_ifc                   ~     e Zd Z	 	 ddededz  def fdZdej        dej        d	e	ej        z  d
ej        fdZ
 xZS )Siglip2EncoderLayerNrm   r   rn   ro   c                 \   t                                                       |j        | _        t	          j        | j        |j                  | _        t          ||| d          | _	        t	          j        | j        |j                  | _
        t          ||| d          | _        d S )Nepsz
.self_attnrn   ro   z.mlp)r   r   r   r   r   	LayerNormlayer_norm_epslayer_norm1rl   	self_attnlayer_norm2r   mlpr&   r   rn   ro   r'   s       r(   r   zSiglip2EncoderLayer.__init__  s     	+<F<QRRR)%(((
 
 

 <F<QRRR%???
 
 
r)   r   r   r   r,   c                     |}|                      |          }|                     |||          }||z   }|}|                     |          }|                     |          }||z   }|S )z
        Args:
            hidden_states: Input tensor of shape (batch, seq_len, embed_dim).
            cu_seqlens: Cumulative sequence lengths tensor.
            max_seqlen: Maximum sequence length.
        )r   r   r   )r   r   r   r   )r&   r   r   r   residuals        r(   rJ   zSiglip2EncoderLayer.forward$  s     !((77'!! ' 
 

 !=0 ((77// =0r)   r   )ra   rb   rc   r   r   r   r   r9   rf   r"   rJ   ri   rj   s   @r(   r   r   
  s         37	
 
#
 )4/
 	
 
 
 
 
 
*| L %,&	
 
       r)   r   c                        e Zd ZdZ	 	 ddededz  def fdZdej	        d	ej	        d
e
ej	        z  dej	        fdZ xZS )Siglip2Encoderz
    Transformer encoder consisting of `config.num_hidden_layers`
    self attention layers. Each layer is a [`Siglip2EncoderLayer`].

    Args:
        config: PretrainedConfig
    Nrm   r   rn   ro   c                     t                                                       | _        t          j        fdt          j                  D                       | _        d S )Nc           	      >    g | ]}t           d |           S )z.layers.)r   rn   ro   )r   ).0idxr   ro   rn   s     r(   
<listcomp>z+Siglip2Encoder.__init__.<locals>.<listcomp>S  sQ         $!!-$33c33    r)   )r   r   r   r   
ModuleListrangenum_hidden_layerslayersr   s    ```r(   r   zSiglip2Encoder.__init__J  sy     	m      !!9::  	
 	
r)   inputs_embedsr   r   r,   c                 >    |}| j         D ]} ||||          }|}|S )N)r   r   )r   )r&   r   r   r   r   encoder_layerlayer_outputss          r(   rJ   zSiglip2Encoder.forward]  sH     &![ 	* 	*M)M%%  M
 *MMr)   r   r   rj   s   @r(   r   r   A  s          37	
 
#
 )4/
 	
 
 
 
 
 
&| L %,&	
 
       r)   r   c            
            e Zd Z	 	 ddededz  def fdZd Zdej	        d	ej
        d
ej        dej        dej        f
dZ xZS )Siglip2VisionTransformerNrm   r   rn   ro   c                    t                                                       |j        }|| _        t	          |          | _        ddlm}  |dd          5  t          ||| d          | _	        d d d            n# 1 swxY w Y   |j
        }t          | j	        j                  |j
        k    r-t          d| d	t          | j	        j                   d
          t          j        ||j                  | _        d S )Nr   )set_model_tagr   T)
is_encoderz.encoderr   zThe original encoder only has z layers, but you requested z layers.r   )r   r   r   r   r   rI   vllm.compilation.backendsr   r   encoderr   lenr   r=   r   r   r   post_layernorm)r&   r   rn   ro   r   r   r   r'   s          r(   r   z!Siglip2VisionTransformer.__init__o  so    	&	1&99;;;;;;]+=== 	 	)) ***  DL	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 #4t|"##f&>>>P1B P P-01D-E-EP P P  
 !l9&:OPPPs   A>>BBc                     | j         S r   )rI   )r&   s    r(   get_input_embeddingsz-Siglip2VisionTransformer.get_input_embeddings  s
    r)   r*   r+   r   r   r,   c                     |                      ||          }|                     |||          }|                     |          S )z
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width)
        of the input images.
        )r   r   r   )rI   r   r   )r&   r*   r+   r   r   r   encoder_outputss          r(   rJ   z Siglip2VisionTransformer.forward  sO     (;^LL,,'!! ' 
 

 ""?333r)   r   )ra   rb   rc   r   r   r   r   r   r9   rd   re   rf   rJ   ri   rj   s   @r(   r   r   n  s         37	Q Q#Q )4/Q 	Q Q Q Q Q Q8  4".4 (4 L	4
 L4 
4 4 4 4 4 4 4 4r)   r   c            
            e Zd Z	 	 ddededz  def fdZdej        dej	        d	ej
        d
ej
        dej
        f
dZdeeeej
        f                  dee         fdZ xZS )Siglip2ModelNrm   r   rn   ro   c                 |    t                                                       t          ||| d          | _        d S )Nz.vision_modelr   )r   r   r   vision_modelr   s       r(   r   zSiglip2Model.__init__  sJ     	4%+++
 
 
r)   r*   r+   r   r   r,   c                 4    |                      ||||          S )N)r*   r+   r   r   )r   )r&   r*   r+   r   r   s        r(   rJ   zSiglip2Model.forward  s-        3)!!	 ! 
 
 	
r)   weightsc                 |   g d}t          |                                           }t                      }|D ]\  }}|D ]>\  }}}	||vr|                    ||          }||         }
|
j        } ||
||	            n*||         }
t          |
dt                    } ||
|           |                    |           |S )N))r}   q_projq)r}   k_projk)r}   v_projvweight_loader)dictnamed_parameterssetreplacer   getattrr   add)r&   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr   s               r(   load_weightszSiglip2Model.load_weights  s    "
 "
 "
 4002233"%%%#* 	$ 	$D-5K 4 41
Kd**||K<<#D) % 3e]H===#D) '@U V Ve]333d####r)   r   )ra   rb   rc   r   r   r   r   r9   rd   re   rf   rJ   r   tupler   r   ri   rj   s   @r(   r   r     s         37	
 
#
 )4/
 	
 
 
 
 
 

".
 (
 L	

 L
 

 
 
 
HU33D-E$F 3s8        r)   r   )%r   collections.abcr   r9   r   torch.nnr   rV   transformersr   vllm.compilation.decoratorsr   vllm.distributedr   %vllm.model_executor.layers.activationr	   9vllm.model_executor.layers.attention.mm_encoder_attentionr
   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   -vllm.model_executor.model_loader.weight_utilsr   visionr   r   Moduler   rl   r   r   r   r   r    r)   r(   <module>r      s  # # % $ $ $ $ $        $ $ $ $ $ $ , , , , , , = = = = = = A A A A A A < < < < < < X X X X X X         
 G F F F F F O O O O O O I I I I I I I Iv! v! v! v! v!bi v! v! v!rQ Q Q Q Qry Q Q Qh       B ()1vQ??)  0 0 0 0 0") 0 0	 0f* * * * *RY * * *Z24 24 24 24 24ry 24 24 24j6 6 6 6 658? 6 6 6 6 6r)   