
    .`i&                        d Z ddlZddlmZ ddlmZ ddlZddlZddl	m
c mZ ddlm
Z
 ddlmZ ddlmZ  ee
j        d	          Zd
ej        dej        ez  dej        fdZ	 ddedej        deeef         dej        fdZ	 ddedej        deeef         dej        fdZ	 	 ddedeeeef         z  dedeeef         dej        f
dZ G d de
j                  Z G d de          ZdS )z
Shared resampler perceiver network used in multimodal models and
related helpers for sincos positional embeddings.

Example models: Qwen (Qwen-VL), MiniCPM-V 2.0
    N)Callable)partial)nn)ReplicatedLinear)QuantizationConfiggư>)epsabs_postgt_sizereturnc                 "   t          t          j        |                     d                              }| j        }t          |t                     r||f}||d         k    r||d         k    r| S t          j        |                                 	                    d||d          
                    dddd          |d         |d         fdd          
                    dddd                              dd                              |	          S )
Nr            bicubicF)sizemodealign_cornersdtype)intmathsqrtr   r   
isinstanceFinterpolatefloatreshapepermuteflattento)r	   r
   src_sizer   s       x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/layers/resampler.pyget_abs_posr$   3   s     49W\\!__--..HME(C   (h'8A;8x{#:#:	MMOO##Ax2>>FFq!QPQRR1+x{+		
 	
 	
 
Aq!			A	%
    r   r   	embed_dimposversionc                    | dz  dk    sJ t          j        | dz  t           j                  }|| dz  z  }dd|z  z  }|dk    rl|                    d          }t          j        d	||          }t          j        |          }t          j        |          }t          j        ||gd
          }nVt          j        d||          }t          j        |          }t          j        |          }t          j        ||gd          }|S )z
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,) / (H, W)
    out: (M, D) / (H, W, D)
    r   r   r   g       @g      ?i'  r&   r   zm,d->mdr   axisz	hw,d->hwd)nparangefloat32r   einsumsincosconcatenate)r'   r(   r)   omegaoutemb_sinemb_cosembs           r#   !get_1d_sincos_pos_embed_from_gridr9   L   s     q=AIi1nBJ777E	Y_E%,E&kk"ooi	3..&++&++ngw/a888iS%00&++&++ngw/b999Jr%   gridc                     | dz  dk    sJ t          | dz  |d         |          }t          | dz  |d         |          }|dk    rt          j        ||gd          }nt          j        ||gd          }|S )Nr   r   r   r&   r+   r   )r9   r-   r3   )r'   r:   r)   emb_hemb_wr8   s         r#   !get_2d_sincos_pos_embed_from_gridr>   g   s     q=A .QQ E .QQ E &neU^!444neU^"555Jr%   F	grid_size	cls_tokenc                 d   t          |t                    r||}}n|d         |d         }}t          j        |t          j                  }t          j        |t          j                  }t          j        ||          }t          j        |d          }t          |t          j                  r|j        d||fk    sJ |dk    rY|	                    dd||g          }t          | ||          }	|r,t          j        t          j        d| g          |	gd          }	nt          | ||          }	|	S )z
    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or
                [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    r   r   r   r+   r   r&   )r   r   r-   r.   r/   meshgridstackndarrayshaper   r>   r3   zeros)
r'   r?   r@   r)   grid_h_sizegrid_w_sizegrid_hgrid_wr:   	pos_embeds
             r#   get_2d_sincos_pos_embedrL   {   s-    )S!! >#,i[#,Q<1[Y{"*555FY{"*555F;vv&&D8Dq!!!DdBJ''WDJ1k;:W,W,W,WW&||Q;<==5iwOO	 	V!Y(@(@)'LSTUUUI5iwOO	r%   c                        e Zd ZdZdedddfdededededz  d	eegej        f         d
e	de
dz  deddf fdZdefdZ xZS )BaseResamplerz
    A 2D perceiver-resampler network with one cross attention layers by
        (grid_size**2) learnable queries and 2d sincos pos_emb.
    Outputs:
        A tensor with the shape of (grid_size**2, embed_dim)
    NT num_queriesr'   	num_headskv_dim
norm_layerdo_post_projectionquant_configprefixr   c	                 L   t                                                       || _        || _        || _        t          j        t          j        | j        |                    | _	        |#||k    rt          ||d|| d          | _        nd | _        t          j        ||          | _         ||          | _         ||          | _        || _        | j        rG ||          | _        |dz  t          j        ||          z  }	t          j        |	          | _        d S d S )NFz.kv_proj)biasrU   rV   c                  8     t          j                    | i |d fS )N)r   Identity)argskwargss     r#   <lambda>z(BaseResampler.__init__.<locals>.<lambda>   s$    t.v..4 r%   g      )data)super__init__rP   r'   rQ   r   	Parametertorchemptyqueryr   kv_projMultiheadAttentionattnln_qln_kvrT   ln_postproj)selfrP   r'   rQ   rR   rS   rT   rU   rV   r^   	__class__s             r#   r`   zBaseResampler.__init__   s7    	&""\%+d.>	"J"JKK
&I"5"5+) ***  DLL DL ))Y??	Jy))	Z	**
"4" 	0%:i00DLtOu{9i'H'HHD$///DIII	0 	0r%   Nc                 V    |                     d                              d|d          S )Nr   )	unsqueezerepeat)rl   rd   rn   s      r#   _repeatzBaseResampler._repeat   s&    q!!((Aq111r%   )__name__
__module____qualname____doc__
DEFAULT_LNr   r   r   	LayerNormboolr   strr`   rr   __classcell__rm   s   @r#   rN   rN      s          "4>#'26(0 (0(0 (0 	(0
 d
(0 cUBL01(0 !(0 )4/(0 (0 
(0 (0 (0 (0 (0 (0T2 2 2 2 2 2 2 2 2r%   rN   c                        e Zd ZdZdeddddfdededed	edz  d
eegej        f         de	de	de
dz  deddf fdZ	 	 ddej        dej        dz  dej        dz  dej        fdZ xZS )
Resampler2aX  Resampler-perceiver network to be used for a variety of model types,
    e.g., Qwen-vl / Minicpmv 2.0. The main difference is the addition of the
    do_post_projection arg, which indicates whether or not there should be
    a post layer normalization and projector after the attention. This is
    present in minicpmv2.0, but not qwen-vl.
    NFTrO   r?   r'   rQ   rR   rS   adaptiverT   rU   rV   r   c
           
         t                                          |dz  |||||||	           || _        t          ||d          }
t	          j        t          j        |
                              d                    | _	        d S )Nr   )rT   rU   rV   r&   r)   F)
r_   r`   r   rL   r   ra   rb   
from_numpyrequires_grad_rK   )rl   r?   r'   rQ   rR   rS   r   rT   rU   rV   pos_embed_arrrm   s              r#   r`   zResampler2.__init__   s     	qL1% 	 		
 		
 		
 !/	9fUUU]++::5AA
 
r%   x	tgt_sizes	attn_maskc                    |4t          t          j        |                    d                              }| j        rKt          | j        |d          }t          j        |          	                    |j
        |j                  }n4t          | j        |          	                    |j
        |j                  }|                     |          \  }}|                     |                              ddd          }|j        d         }|                     | j                  }|                     |                     ||          | j                            d          z   ||                    d          z   ||          d         }	|	                    ddd          }| j        r|                     |          }|| j        z  }|S )Nr   r&   r   )devicer   r   r   )r   )r   r   r   r   r   rL   r'   rb   r   r!   r   r   r$   rK   re   ri   r   rE   rh   rd   rg   rr   rp   rT   rj   rk   )
rl   r   r   r   r   rK   _rn   qr5   s
             r#   forwardzResampler2.forward   s    DIaffQii0011I= 
	3	6  M (77::xqw ;  II $DNI>>AAxqw B  I ||A1JJqMM!!!Q**GAJIIdj!!iiLLA!9!9!!<!<<	##A&&&	  
 

  KK1a  " 	QADIAr%   )NN)rs   rt   ru   rv   rw   r   r   r   rx   ry   r   rz   r`   rb   Tensorr   r{   r|   s   @r#   r~   r~      s3         "4>#'26
 

 
 	

 d

 cUBL01
 
 !
 )4/
 
 

 
 
 
 
 
B *.)-	# #<# <$&# <$&	#
 
# # # # # # # #r%   r~   )r&   )Fr&   )rv   r   collections.abcr   	functoolsr   numpyr-   rb   torch.nn.functionalr   
functionalr   !vllm.model_executor.layers.linearr   'vllm.model_executor.layers.quantizationr   rx   rw   r   r   r$   rD   tupler9   r>   ry   rL   ModulerN   r~    r%   r#   <module>r      sX  :   $ $ $ $ $ $                           > > > > > > F F F F F FWR\t,,,
 1C     4 AG .3CHo
\   8 BH */4S#X
\   . %	 U38_$  38_	
 \   B32 32 32 32 32BI 32 32 32lI I I I I I I I I Ir%   