
    .`i]                        d dl Z d dlmZ d dlmZ d dlZd dlmZ d dlmc m	Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d	d
lmZmZ  G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Zdej        dede ej        e eef         f         fdZ!dej        dede eef         de eef         dej        f
dZ"dededej        dej        fd Z#d!ej        d"ej        d#ej        de eef         de eef         dej        fd$Z$ G d% d&ej                  Z%d' Z&d( Z' G d) d*e          Z( G d+ d,ej                  Z)dS )-    N)Iterable)partial)CLIPVisionConfig)MMEncoderAttention)Conv2dLayer)QuantizationConfig)default_weight_loader   )CLIPEncoderCLIPVisionEmbeddingsc            	       x     e Zd Zej        fdededeej                 ddf fdZde	j
        de	j
        fdZ xZS )	MLPBlockembedding_dimmlp_dimactreturnNc                     t                                                       t          j        ||          | _        t          j        ||          | _         |            | _        d S N)super__init__nnLinearlin1lin2r   )selfr   r   r   	__class__s       z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/deepencoder.pyr   zMLPBlock.__init__   sR     	ImW55	Ig}55	355    xc                 x    |                      |                     |                     |                              S r   )r   r   r   r   r   s     r   forwardzMLPBlock.forward)   s*    yy$))A,,//000r   )__name__
__module____qualname__r   GELUinttypeModuler   torchTensorr"   __classcell__r   s   @r   r   r      s        
  "w		 		 	 ")_		
 
	 	 	 	 	 	1 1%, 1 1 1 1 1 1 1 1r   r   c                   R     e Zd Zd	dededdf fdZdej        dej        fdZ xZ	S )
LayerNorm2dư>num_channelsepsr   Nc                    t                                                       t          j        t	          j        |                    | _        t          j        t	          j        |                    | _        || _	        d S r   )
r   r   r   	Parameterr*   onesweightzerosbiasr2   )r   r1   r2   r   s      r   r   zLayerNorm2d.__init__0   sZ    l5:l#;#;<<L\!:!:;;	r   r   c                 "   |                     dd          }||z
                      d                               dd          }||z
  t          j        || j        z             z  }| j        d d d d f         |z  | j        d d d d f         z   }|S )Nr
   T)keepdim   )meanpowr*   sqrtr2   r6   r8   )r   r   uss       r   r"   zLayerNorm2d.forward6   s    FF1dF##UKKNN400UejTX...K4&*TYqqq$}-EEr   )r0   )
r#   r$   r%   r'   floatr   r*   r+   r"   r,   r-   s   @r   r/   r/   /   s}         S u        %,        r   r/   c            #           e Zd Zdddddddddej        ej        dd	dd
dfdededededededededede	ej
                 de	ej
                 dededededeedf         ddf" fdZd ej        d!efd"Zd#ej        dej        fd$Z xZS )%ImageEncoderViT                     @   TFr    img_size
patch_sizein_chans	embed_dimdepth	num_heads	mlp_ratio	out_chansqkv_bias
norm_layer	act_layeruse_abs_posuse_rel_posrel_pos_zero_initwindow_sizeglobal_attn_indexes.r   Nc                    t                                                       || _        t          ||f||f||          | _        d| _        |r4t          j        t          j	        d||z  ||z  |                    | _        t          j
                    | _        t          |          D ]C}t          ||||	|
|||||vr|nd||z  ||z  f
  
        }| j                            |           Dt          j        t!          ||dd          t#          |          t!          ||ddd	          t#          |                    | _        t!          d
ddddd          | _        t!          dddddd          | _        dS )a  
        Args:
            img_size (int): Input image size.
            patch_size (int): Patch size.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
            depth (int): Depth of ViT.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            norm_layer (nn.Module): Normalization layer.
            act_layer (nn.Module): Activation layer.
            use_abs_pos (bool): If True, use absolute positional embeddings.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks.
            global_attn_indexes (list): Indexes for blocks using global attention.
        )kernel_sizestriderN   rO   Nr
   r   )
dimrQ   rR   rT   rU   rV   rX   rY   rZ   
input_sizeF)r]   r8   rF   )r]   paddingr8   rJ   i   r;   )r]   r^   ra   r8   rD   )r   r   rL   
PatchEmbedpatch_embed	pos_embedr   r4   r*   r7   
ModuleListblocksrangeBlockappend
Sequentialr   r/   necknet_2net_3)r   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   iblockr   s                      r   r   zImageEncoderViT.__init__@   s   J 	 %#Z0
+	
 
 
 /3 	\x:-x:/Ey  DN moou 	& 	&A##!%#'"3+,4G+G+GKKQ$
2H
4JK  E Ku%%%%M	   	""   	""
 
	$ !!Aqu
 
 

 !1Q
 
 



r   abs_postgt_sizec                 H   |j         }|                    d          }||k    r|                    dddd          }|                    t          j                  }t          j        |||fddd                              |          }|                    dddd          }|S |S )	Nr
   r   rF   r;   bicubicTFsizemode	antialiasalign_corners)dtyperu   permutetor*   float32Finterpolate)r   rp   rq   ry   src_sizeold_pos_embednew_pos_embeds          r   get_abs_poszImageEncoderViT.get_abs_pos   s    <<??x#OOAq!Q77M),,U];;MM)#   bii  *11!Q1==M  Nr   r   c                 t   |                      |          }| j        1||                     | j        |                    d                    z   }| j        D ]} ||          }|                     |                    dddd                    }|                     |          }|                     |          }|S )Nr
   r   rF   r;   )	rc   rd   r   ru   rf   rk   rz   rl   rm   )r   r   blkneck_outputconv2_outputconv3_outputs         r   r"   zImageEncoderViT.forward   s    Q>%D$$T^QVVAYY???A; 	 	CAAAii		!Q1 5 566zz+..zz,//r   )r#   r$   r%   r   	LayerNormr&   r'   rA   boolr(   r)   tupler   r*   r+   r   r"   r,   r-   s   @r   rC   rC   ?   s        &(l%'W !"&/1#_
 _
_
 _
 	_

 _
 _
 _
 _
 _
 _
 O_
 	?_
 _
 _
  _
  !_
" #38_#_
$ 
%_
 _
 _
 _
 _
 _
B5< 3    ( %,        r   rC   c                        e Zd ZdZddej        ej        ddddfdeded	ed
e	de
ej                 de
ej                 de	de	dedeeef         dz  ddf fdZdej        dej        fdZ xZS )rh   zWTransformer blocks with support of window attention and residual propagation
    blocksrI   TFr   Nr_   rQ   rR   rT   rU   rV   rX   rY   rZ   r`   r   c           	      ,   t                                                        ||          | _        t          ||||||	dk    r|
n|	|	f          | _         ||          | _        t          |t          ||z            |          | _        |	| _	        dS )ai  
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            norm_layer (nn.Module): Normalization layer.
            act_layer (nn.Module): Activation layer.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks. If it equals 0, then
                use global attention.
            input_size (tuple(int, int) or None): Input resolution for calculating the relative
                positional parameter size.
        r   )rQ   rT   rX   rY   r`   )r   r   r   N)
r   r   norm1RelPosAttentionattnnorm2r   r'   mlprZ   )r   r_   rQ   rR   rT   rU   rV   rX   rY   rZ   r`   r   s              r   r   zBlock.__init__   s    8 	Z__
##/%0A%5%5zzK;U
 
 
	  Z__
s3?';';
 
 
 'r   r   c                    |}|                      |          }| j        dk    r2|j        d         |j        d         }}t          || j                  \  }}|                     |          }| j        dk    rt          || j        |||f          }||z   }||                     |                     |                    z   }|S )Nr   r
   r;   )r   rZ   shapewindow_partitionr   window_unpartitionr   r   )r   r   shortcutHWpad_hws         r   r"   zBlock.forward   s    JJqMMa71:qwqzqA(D,<==IAvIIaLLa"1d&6AGGAqLA'''r   )r#   r$   r%   __doc__r   r   r&   r'   rA   r   r(   r)   r   r   r*   r+   r"   r,   r-   s   @r   rh   rh      s         &(l%'W!"&-1,' ,',' ,' 	,'
 ,' O,' 	?,' ,'  ,' ,' #s(Od*,' 
,' ,' ,' ,' ,' ,'\ %,        r   rh   c                        e Zd ZdZ	 	 	 	 	 ddededed	ed
edeeef         dz  ddf fdZdej	        dej	        fdZ
 xZS )r   z=Multi-head Attention block with relative position embeddings.   TFNr_   rQ   rT   rX   rY   r`   r   c                    t                                                       || _        ||z  }|dz  | _        t	          j        ||dz  |          | _        t	          j        ||          | _        || _        | j        r~|
J d            t	          j	        t          j        d|d         z  dz
  |                    | _        t	          j	        t          j        d|d         z  dz
  |                    | _        dS dS )	a  
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads.
            qkv_bias (bool):  If True, add a learnable bias to query, key, value.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            input_size (tuple(int, int) or None): Input resolution for calculating the relative
                positional parameter size.
        g      rF   )r8   NzBInput size must be provided if using relative positional encoding.r;   r   r
   )r   r   rQ   scaler   r   qkvprojrX   r4   r*   r7   	rel_pos_h	rel_pos_w)	r   r_   rQ   rT   rX   rY   r`   head_dimr   s	           r   r   zRelPosAttention.__init__  s    $ 	")#t^
9S#'999Ic3''	& 	X))T *))  \%+a*Q-6G!6KX*V*VWWDN\%+a*Q-6G!6KX*V*VWWDNNN	X 	Xr   r   c           	         |j         \  }}}}|                     |                              |||z  d| j        d                              ddddd          }|                    d|| j        z  ||z  d                              d          \  }}}	d\  }
}| j        r$t          || j        | j	        ||f||f          \  }
}|
                    || j        ||z  d          }|
                    || j        ||z  d          }|	
                    || j        ||z  d          }	| j        r3|

                    || j        |
                    d          |
                    d          |
                    d                    }
|
                    || j        |                    d          |                    d          |                    d                    }|
|z   
                    || j        |
                    d          |
                    d          |                    d          z            }t          j        j                            |||	|          }n&t          j        j                            |||	          }|
                    || j        ||d                              ddddd                              |||d          }|                     |          }|S )	NrF   r;   r   r
      )NN)	attn_mask)r   r   reshaperQ   rz   unbindrX   add_decomposed_rel_posr   r   viewru   r*   r   
functionalscaled_dot_product_attentionr   )r   r   Br   r   _r   qkvrel_hrel_w	attn_biass                r   r"   zRelPosAttention.forward/  s   W
1a HHQKK1q5!T^R@@HHAqRSUVWW 	 ++aT^!3QUB??FFqII1a!u 	14>4>Aq6Aq6 LE5 FF1dna!eR00FF1dna!eR00FF1dna!eR00 	JJJ4>5::a==%**Q--A E JJ4>5::a==%**Q--A E ,,4>5::a==%**Q--%**Q--2O I #@@1a9 A  AA #@@AqIIA FF1dnaB//WQ1a##WQ1b!! 	
 IIaLLr   )r   TFTN)r#   r$   r%   r   r'   r   r   r   r*   r+   r"   r,   r-   s   @r   r   r   	  s        GG
 !"&-1!X !X!X !X 	!X
 !X  !X #s(Od*!X 
!X !X !X !X !X !XF+ +%, + + + + + + + +r   r   r   rZ   r   c           	      x   | j         \  }}}}|||z  z
  |z  }|||z  z
  |z  }|dk    s|dk    rt          j        | ddd|d|f          } ||z   ||z   }	}|                     |||z  ||	|z  ||          } |                     dddddd                                                              d|||          }
|
||	ffS )aU  
    Partition into non-overlapping windows with padding if needed.
    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.

    Returns:
        windows: windows after partition with [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition
    r   r
   rF   r;   r      r   )r   r}   padr   rz   
contiguous)r   rZ   r   r   r   Cpad_hpad_wHpWpwindowss              r   r   r   ]  s     JAq!Q1{?*k9E1{?*k9EqyyEAIIE!aAua/00YE	B	q"#["2C[RSTTA			!Q1a##..0055b+{TUVV  RHr   r   r   hwc                 t   |\  }}|\  }}| j         d         ||z  |z  |z  z  }|                     |||z  ||z  ||d          }	|	                    dddddd                                                              |||d          }	||k    s||k    r&|	ddd|d|ddf                                         }	|	S )	a  
    Window unpartition into original sequences and removing padding.
    Args:
        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
        window_size (int): window size.
        pad_hw (Tuple): padded height and width (Hp, Wp).
        hw (Tuple): original height and width (H, W) before padding.

    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    r   r   r
   rF   r;   r   r   N)r   r   rz   r   )
r   rZ   r   r   r   r   r   r   r   r   s
             r   r   r   y  s    " FBDAqaR"W3{BCA	2bK/k2	 	A 	
		!Q1a##..0055aRDDA	Avvaaaa!RaRlO&&((Hr   q_sizek_sizerel_posc                 $   t          dt          | |          z  dz
            }|j        d         |k    r|j        }|                    t
          j                  }t          j        |	                    d|j        d         d          
                    ddd          |d                              |          }|	                    d|          
                    dd          }n|}t          j        | |j                  dddf         t          || z  d	          z  }t          j        ||j                  dddf         t          | |z  d	          z  }||z
  |dz
  t          | |z  d	          z  z   }||                                         S )
a\  
    Get relative positional embeddings according to the relative positions of
        query and key sizes.
    Args:
        q_size (int): size of query q.
        k_size (int): size of key k.
        rel_pos (Tensor): relative position embeddings (L, C).

    Returns:
        Extracted positional embeddings according to relative positions.
    r;   r
   r   r   linear)ru   rv   )deviceNg      ?)r'   maxr   ry   r{   r*   r|   r}   r~   r   rz   aranger   long)	r   r   r   max_rel_distry   rel_pos_resizedq_coordsk_coordsrelative_coordss	            r   get_rel_posr     s    q3vv...233L}Q<''**U]++-OOAw}Q/44<<Q1EE
 
 
 "U))	 	
 *11"lCCKKAqQQ! |F7>:::111d7CcG G H |F7>:::47CcG G H  (*vzS&RU=V=V.VVO?//1122r   r   r   r   c                    |\  }}|\  }}t          |||          }	t          |||          }
| j        \  }}}|                     ||||          }t          j        d||	          }t          j        d||
          }|                    d          }|                    d          }|                    |||z  |d          }|                    |||z  d|          }||fS )a  
    Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py
    Args:
        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).

    Returns:
        attn (Tensor): attention map with added relative positional embeddings.
    zbhwc,hkc->bhwkzbhwc,wkc->bhwkr   r
   )r   r   r   r*   einsum	unsqueeze)r   r   r   r   r   q_hq_wk_hk_wRhRwr   r   r_   r_qr   r   s                    r   r   r     s    ( HCHC	S#y	)	)B	S#y	)	)BIAq#
))AsC
%
%CL)333EL)333EOOBEOOBEMM!S3YQ//EMM!S3Y3//E%<r   c                        e Zd ZdZ	 	 	 	 	 ddeeef         deeef         deeef         d	ed
eddf fdZdej        dej        fdZ	 xZ
S )rb   z#
    Image to Patch Embedding.
    rE   rE   r   r   rF   rG   r]   r^   ra   rN   rO   r   Nc                 z    t                                                       t          |||||          | _        dS )aP  
        Args:
            kernel_size (Tuple): kernel size of the projection layer.
            stride (Tuple): stride of the projection layer.
            padding (Tuple): padding size of the projection layer.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
        )r]   r^   ra   N)r   r   r   r   )r   r]   r^   ra   rN   rO   r   s         r   r   zPatchEmbed.__init__  sA      	i[QX
 
 
			r   r   c                 `    |                      |          }|                    dddd          }|S )Nr   r;   rF   r
   )r   rz   r!   s     r   r"   zPatchEmbed.forward  s-    IIaLLIIaAq!!r   )r   r   r   rF   rG   )r#   r$   r%   r   r   r'   r   r*   r+   r"   r,   r-   s   @r   rb   rb     s          (0"*#)
 
38_
 c3h
 sCx	

 
 
 

 
 
 
 
 
, %,        r   rb   c                  ,    t          dddg d          S )NrG   rH   )r;   r   r      )encoder_embed_dimencoder_depthencoder_num_headsencoder_global_attn_indexes)
_build_samrK   r   r   build_sam_vit_br     s)    $1MM	   r   c                     d}d}d}t          || |dt          t          j        j        d          ||dd|d|	          }|S )
NrJ   rD   rE   r   r0   r2   T   )rP   rO   rL   rR   rU   rQ   rM   rT   rX   r[   rZ   rS   )rC   r   r*   r   r   )r   r   r   r   prompt_embed_dim
image_sizevit_patch_sizeimage_encoders           r   r   r     sf     JN##58-4888#!7"  M r   c                   d    e Zd Zdej        defdZ	 d	dej        dej        dz  dej        fdZdS )
DeepCLIPVisionEmbeddingsrp   rq   c                 &   |                     d          }|                    d          }|d d         |dd          }}t          t          j        |j        d         dz
                      }t          t          j        |                    }|j        }||k    r|                    d|||                              dddd          	                                }|
                    t          j                  }t          j        |||fddd	          
                    |          }	|	                    dddd          }	|	                    ||z  |          }	t          j        ||	gd
          }
|
                    d||z  dz   |          }
|
S |S )Nr   r   r
   rF   r;   rs   TFrt   r_   )ru   squeezer'   mathr>   r   ry   r   rz   r   r{   r*   r|   r}   r~   cat)r   rp   rq   r_   abs_pos_new	cls_tokenr   r   ry   r   vision_pos_embeds              r   r   z$DeepCLIPVisionEmbeddings.get_abs_pos*  s    ll2ooa((#.rr?KO=	ty!21!5!9::;;ty**++x""1h#>>Aq!$$ 
 *,,U];;MM)#   bii  *11!Q1==M)..x(/BCHHM$y)])CKKK/44Q88Ka8OQTUU##Nr   Npixel_valuespatch_embedsr   c                    |j         d         }||}n|                     |          }|                    d                              dd          }| j                            |dd          }t          j        ||gd          }||                     | 	                    | j
                  |                    d                    z   }|S )Nr   r;   r
   r   r   )r   patch_embeddingflatten	transposeclass_embeddingexpandr*   r   r   position_embeddingposition_idsru   )r   r   r   
batch_sizeclass_embeds
embeddingss         r   r"   z DeepCLIPVisionEmbeddings.forwardL  s     "'*
#'LL//==L#++A..88A>>+22:q"EEYl;CCC
$"2"2##D$566
8J8J#
 #
 

 r   r   )r#   r$   r%   r*   r+   r'   r   r"   rK   r   r   r   r   )  s|         5<  3        F OS !L8=t8K	     r   r   c                       e Zd Z	 dddddededz  dedz  deddf
 fd	Zed
             Z	ed             Z
	 ddddej        dej        dz  dee         dz  dej        fdZdeeeej        f                  dee         fdZ xZS )DeepCLIPVisionTransformerN )num_hidden_layers_overrideprefixconfigquant_configr
  r  r   c                   t                                                       || _        |j        }t	          |          | _        t          j        ||j                  | _	        t          |||| dt                    | _        |j        }t          | j        j                  |j        k    r-t!          d| dt          | j        j                   d          d S )Nr   z.encoder)r  r  r
  r  attn_clszThe original encoder only has z layers, but you requested z layers.)r   r   r  hidden_sizer   r  r   r   layer_norm_epspre_layrnormr   r   transformernum_hidden_layerslenlayers
ValueError)r   r  r  r
  r  rO   r  r   s          r   r   z"DeepCLIPVisionTransformer.__init___  s     	&	26:: L8MNNN&%'A&&&'
 
 
 #4t&''&*BBBT1B T T-01A1H-I-IT T T   CBr   c                 N    t          |                                           j        S r   )next
parametersry   r   s    r   ry   zDeepCLIPVisionTransformer.dtype  s    DOO%%&&,,r   c                 N    t          |                                           j        S r   )r  r  r   r  s    r   r   z DeepCLIPVisionTransformer.device  s    DOO%%&&--r   )select_layersr   r   r  c                    |                      ||          }|                     |          }|                     ||d u          }|S )N)inputs_embedsreturn_all_hidden_states)r  r  r  )r   r   r   r  hidden_statesencoder_outputss         r   r"   z!DeepCLIPVisionTransformer.forward  sY     lCC))-88 **'%2$%> + 
 
 r   weightsc                     t          |                                           }t                      }|D ]D\  }}||         }t          |dt                    } |||           |                    |           E|S )Nweight_loader)dictnamed_parameterssetgetattrr	   add)r   r#  params_dictloaded_paramsnameloaded_weightparamr%  s           r   load_weightsz&DeepCLIPVisionTransformer.load_weights  s    4002233"%%%#* 	$ 	$D-%E#E?<QRRMM%///d####r   r   )r#   r$   r%   r   r   r'   strr   propertyry   r   r*   r+   listr"   r   r   r(  r0  r,   r-   s   @r   r  r  ^  sg        37 
 26        )4/ 
 %($J    
           D - - X- . . X. -1
 +/  l lT)
 Cy4' 
   $	HU33D-E$F 	3s8 	 	 	 	 	 	 	 	r   r  )*r   collections.abcr   	functoolsr   r*   torch.nnr   torch.nn.functionalr   r}   transformersr   9vllm.model_executor.layers.attention.mm_encoder_attentionr   vllm.model_executor.layers.convr   'vllm.model_executor.layers.quantizationr   -vllm.model_executor.model_loader.weight_utilsr	   clipr   r   r)   r   r/   rC   rh   r   r+   r'   r   r   r   r   r   rb   r   r   r   r  rK   r   r   <module>r>     s    $ $ $ $ $ $                       ) ) ) ) ) ) X X X X X X 7 7 7 7 7 7 F F F F F F O O O O O O 3 3 3 3 3 3 3 31 1 1 1 1ry 1 1 1$    ")    B B B B Bbi B B BJB B B B BBI B B BJQ Q Q Q Qbi Q Q Qh|"%
5<sCx()   8\ #s(O 	c3h	
 \   <$3 $3S $35< $3EL $3 $3 $3 $3N"|"|" |" #s(O	"
 #s(O" \" " " "J       F    42 2 2 2 23 2 2 2jF F F F F	 F F F F Fr   