
    .`iT              
          d dl mZ d dlmZ d dlmZ d dlZd dlmZ d dl	mc m
Z d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZmZmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z" d Z#dej$        dej$        dej$        de%ej$        ej$        f         fdZ& G d dej'                  Z( G d dej'                  Z) G d dej'                  Z* G d dej'                  Z+ G d dej'                  Z, G d d ej'                  Z-	 d(d"ej$        d#ej$        d$e.e/e/f         de.ej$                 fd%Z0 G d& d'e          Z1dS ))    )Sequence)deepcopy)cached_propertyN)ACT2FN)PreTrainedModel)divide$get_tensor_model_parallel_world_size)MMEncoderAttention)Conv2dLayer)ColumnParallelLinearQKVParallelLinearRowParallelLinear)maybe_prefix)is_vit_use_data_parallel)current_platform)MoonViTConfigc                 v   | j         |j         dz   k    sJ | j        |j        f            | j        d d         |j        d d         k    sJ | j        |j        f            | j        d         d|j        d         z  k    sJ | j        |j        f            |j        t          j        k    sJ |j                    d S )N      )ndimshapedtypetorch	complex64)x	freqs_ciss     v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/moonvit.py_apply_rope_input_validationr    E   s    6Y^a''''!'9?)C'''73B3<9?3B3////!'9?1K///72;!iob11111AGY_3M111?eo---y-----    xqxkr   returnc                 p   t          | |           t          ||           |                    d          }t          j         |                                 j        g | j        dd         ddR            }t          j         |                                j        g | j        dd         ddR            }t          j        ||z                                d          }t          j        ||z                                d          }|	                    |           |	                    |          fS )a  
    Args: (The leading dimensions of all inputs should be the same)
        xq: query, tensor of shape (..., num_heads, head_dim)
        xk: key, tensor of shape (..., num_heads, head_dim)
        freqs_cis: tensor of shape (..., head_dim/2), dtype=torch.complex64. It contains the precomputed cis(freqs) for each position in the 2D grid.
    Returns:
        xq_out, xk_out: tensors of shape (..., num_heads, head_dim)
    r   Nr   r   )
r    	unsqueezer   view_as_complexfloatviewr   view_as_realflattentype_as)r"   r#   r   xq_xk_xq_outxk_outs          r   
apply_roper1   L   s    !Y/// Y///##B''I



 F#2# F FA F F F
G
GC



 F#2# F FA F F F
G
GCi0088<<Fi0088<<F>>"v~~b1111r!   c                   p     e Zd Z	 ddededededdf
 fdZd	 Zd
ej        dej        dej        fdZ	 xZ
S )Learnable2DInterpPosEmbbicubicheightwidthdiminterpolation_moder$   Nc                     t                                                       || _        || _        || _        t          j        t          j        |||                    | _	        | 
                                 d S N)super__init__r5   r6   r8   nn	Parameterr   emptyweightreset_parameters)selfr5   r6   r7   r8   	__class__s        r   r<   z Learnable2DInterpPosEmb.__init__d   sh     	
"4l5;vuc#B#BCCr!   c                 N    t           j                            | j                   d S r:   )r=   initnormal_r@   rB   s    r   rA   z(Learnable2DInterpPosEmb.reset_parametersn   s    
$$$$$r!   r   grid_hwsc                    g }|                                 D ]}|| j        j        d d         k    r/|                    | j                            d                     I|                    t          j        | j                            d                              d          || j	                  
                    d                              d                              d                     |t          j        |          z   }|S )Nr   r   )end_dim)r   r   r   r   )sizemode)r   r   r   )tolistr@   r   appendr+   Finterpolatepermuter&   r8   squeezer   cat)rB   r   rH   pos_embsr   outs         r   forwardzLearnable2DInterpPosEmb.forwardq   s    __&& 	 	E)#2#... 3 3A 3 > >????M++I66@@CC"!4  
 WQZZWY''WQW''	 	 	 	 %)H%%%
r!   )r4   )__name__
__module____qualname__intstrr<   rA   r   TensorrV   __classcell__rC   s   @r   r3   r3   c   s        KT    "% ,/ EH 	           % % %  %,        r!   r3   c                        e Zd Z	 	 	 	 ddededeeeef         z  dedef
 fd	Zd
ej        dej        dej        fdZ xZ	S )MoonVisionPatchEmbed      rc   rc   out_dimin_dim
patch_sizepos_emb_heightpos_emb_widthc                    t                                                       t          |t          t          f          sJ dt          |                       t          |t                    r||f}t          |          dk    sJ d|             || _        t          ||||          | _	        t          |||          | _        d S )NzInvalid patch_size type: r   z,Expected patch_size to be a tuple of 2, got )kernel_sizestride)r5   r6   r7   )r;   r<   
isinstancerZ   r   typelenrf   r   projr3   pos_emb)rB   rd   re   rf   rg   rh   rC   s         r   r<   zMoonVisionPatchEmbed.__init__   s     	*sHo66 	
 	
:Z(8(8::	
 	
6 j#&& 	2$j1J:!###G:GG $## %GJ
 
 
	 /!G
 
 
r!   r   grid_hwr$   c                     |                      |                              |                    d          d          }|                     ||          }|S )z
        Args:
            x (L, Channels): input tensor
            grid_hw (N, 2): grid height and width

        Returns:
            (L, Cout) tensor
        r   r   )ro   r)   rK   rp   )rB   r   rq   s      r   rV   zMoonVisionPatchEmbed.forward   sE     IIaLLaffQii,,LLG$$r!   )ra   rb   rc   rc   )
rW   rX   rY   rZ   tupler<   r   r\   rV   r]   r^   s   @r   r`   r`      s         ,4 
 

 
 %S/)	

 
 
 
 
 
 
 
6          r!   r`   c                        e Zd ZdZdej        fdededef fdZd Ze	de
j        fd	            Zd
e
j        de
j        fdZde
j        de
j        de
j        fdZ xZS )Rope2DPosEmbaR  2D rotary position embedding with multi-resolution support.

    This class is intended to be used in the following way:
    1. Before training, create an instance of Rope2DPosEmb. This instance will hold the precomputed cis.
    2. Before each forward pass, call `get_freqs_cis_by_*` to get the `freqs_cis` tensor for this iteration.
    3. During the forward pass, pass the `freqs_cis` tensor to each attention layer, and call `apply` just before each attention operation.
        The rope is shared across all attention layers and all heads.

    Refs:
    - RoFormer: https://arxiv.org/abs/2104.09864
    - VisionLLaMA: https://arxiv.org/abs/2403.00522
    - https://github.com/Meituan-AutoML/VisionLLaMA/blob/main/dit/models.py

    Args:
        dim (int): usually the multi-head attention dimension, should be divisible by 4 (TODO: relax this constraint if needed)
        max_height (int): the maximum height of the 2D grid
        max_width (int): the maximum width of the 2D grid
        theta_base (float): the base of the theta
        device (str): the device to store the precomputed cis
    i'  r7   
max_height	max_widthc                     t                                                       || _        | j        dz  dk    s
J d            || _        || _        || _        || _        d S )N   r   zdim must be divisible by 4)r;   r<   r7   rv   rw   
theta_basedevice)rB   r7   rv   rw   rz   r{   rC   s         r   r<   zRope2DPosEmb.__init__   sb     	x!|q   ">   $"$r!   c                 F    d| j          d| j         d| j         d| j         S )Nzdim=z, max_height=z, max_width=z, theta_base=)r7   rv   rw   rz   rG   s    r   
extra_reprzRope2DPosEmb.extra_repr   s2    xdhxxT_xx$.xxgkgvxxxr!   r$   c                    | j         | j        z  }t          j        d|                                                              | j                  }|| j        z  }|| j        z  }t          j        d| j        d          d| j        dz                                                               | j                  }d| j        || j        z  z  z  }t          j	        ||                                          }t          j	        ||                                          }t          j
        t          j        |          |          }	t          j
        t          j        |          |          }
t          j        |	                    d          |
                    d          gd          }|                    | j         | j        d          }|S )a  Calculate the cis(freqs) for each position in the 2D grid.

        Return: complex tensor of shape (max_height, max_width, dim//2) and value:
            height axis: ret[h, w, 2*i] = cis(h * theta_base**(-4*i/dim))
            weight axis: ret[h, w, 2*i+1] = cis(w * theta_base**(-4*i/dim))   with (i in [0, dim//4))
            note: `cis` is a mathematical notation defined by cis x = cos x + i sin x,
        r   ry   Ng      ?r   r7   )rv   rw   r   aranger(   tor{   r7   rz   outerpolar	ones_likerS   r&   reshape)rB   Nflat_posx_posy_pos	dim_rangefreqsx_freqsy_freqsx_cisy_cisr   s               r   precomputed_freqs_cisz"Rope2DPosEmb.precomputed_freqs_cis   s    Odn,<1%%++--00==4>)DN*LDHa(():DHM):;AACCFFt{SS 	 t9tx+?@A+eU++1133+eU++1133EOG44g>>EOG44g>>I___$$eoo"o&=&=>B
 
 
	 %%dot~rJJ	r!   rH   c                      |                                 }t           fd|D                       sJ | j         j        f            t	          j         fd|D             d          }|S )z
        Args:
            grid_hws (torch.Tensor): containing list of (height, width) or (t, height, width) tuples.
        Returns:
            freqs_cis: tensor of shape (sum(t * height * width), dim//2)
        c              3   r   K   | ]1\  }}d |cxk    o
j         k    nc od |cxk    o
j        k    nc V  2dS )r   N)rv   rw   .0hwrB   s      r   	<genexpr>z8Rope2DPosEmb.get_freqs_cis_by_seqlens.<locals>.<genexpr>   sy       
 
GKq!A%%%%do%%%%B!q*B*B*B*BDN*B*B*B*B
 
 
 
 
 
r!   c                 t    g | ]4\  }}j         d |d |f                             dj        dz            5S )Nr   r   )r   r   r7   r   s      r   
<listcomp>z9Rope2DPosEmb.get_freqs_cis_by_seqlens.<locals>.<listcomp>  sW       Aq *2A2rr62::2tx1}MM  r!   r   r   )rM   allrv   rw   r   rS   )rB   rH   shapesr   s   `   r   get_freqs_cis_by_seqlensz%Rope2DPosEmb.get_freqs_cis_by_seqlens   s     "" 
 
 
 
OU
 
 
 
 
 	
 	
 ON
	
 	
 
 I   "   
 
 
	 r!   pos_idxpos_idx_maskc                    |j         dd         |j         k    r$|j         d         dk    r|j        |j        dz   k    sJ |j         |j         f            |j        t          j        k    sJ |j                    |j         | j        dz  fz   }t          j        |t          j        | j                  }| j	        |d         |         |d         |         f         ||<   |S )a  
        Args:
            pos_idx: tensor of shape (..., 2), It contains the (h, w) position indices of each 2D token.
            pos_idx_mask: a mask of shape (...), the leading dimensions should be the same as pos_idx.
                Rope will only be applied to the tokens with True mask. `freqs_cis` for the tokens with False mask with be ones.
        Return:
            freqs_cis: tensor of shape (..., dim//2)
        Nr   r   r   )r   r{   ).r   ).r   )
r   r   r   r   boolr7   onesr   r{   r   )rB   r   r   shpr   s        r   get_freqs_cis_by_idxz!Rope2DPosEmb.get_freqs_cis_by_idx  s     M#2#,"444b!Q&& 1A 5555M<-. 656 !UZ///1C/// DHM#33Jut{
 
 
	 #'"<FOL)76?<+HH#
	, r!   )rW   rX   rY   __doc__r   device_typerZ   r<   r}   r   r   r\   r   r   r   r]   r^   s   @r   ru   ru      s        4 +   	      y y y u|    _8 %,    0|38<	       r!   ru   c                   f     e Zd ZdZ	 	 ddee         dedef fdZde	j
        d	e	j
        fd
Z xZS )MLP2zn
    Args:
        dims: [in_dim, hidden_dim, out_dim]
        bias: whether to use bias in linear layer.
    T dimsbiasprefixc           	         t                                                       t          |          dk    sJ t                      | _        t          |d         |d         |t          |d          | j                  | _        t          |d         |d         |t          |d          | j                  | _	        || _
        d S )Nra   r   r   fc0r   r   
disable_tpr   fc1)r;   r<   rn   r   use_data_parallelr   r   r   r   r   
activation)rB   r   r   r   r   rC   s        r   r<   zMLP2.__init__1  s     	4yyA~~~~!9!;!;'GG..-
 
 
 %GG..-
 
 
 %r!   r   r$   c                     |                      |          \  }}|                     |          }|                     |          \  }}|S r:   )r   r   r   )rB   r   _s      r   rV   zMLP2.forwardK  s>    xx{{1OOAxx{{1r!   )Tr   )rW   rX   rY   r   listrZ   r   r[   r<   r   r\   rV   r]   r^   s   @r   r   r   *  s          % %3i% 	%
 % % % % % %4 %,        r!   r   c                        e Zd Z	 dej        dddededededef
 fd	Z	 dde	j
        de	j
        de	j
        d
z  fdZ	 dde	j
        de	j
        de	j
        d
z  de	j
        fdZ xZS )MoonVitEncoderLayerr   F)r   	attn_bias	num_heads
hidden_dimmlp_dimr   r   c          	         t                                                       t                      | _        || _        || _        | j        | j        z  | _        | j        rdnt                      | _        t          || j                  | _
        t          j        |          | _        t          j        |          | _        t          |||g|| d          | _        t#          || j        |||| d| j                  | _        t'          |||| d| j                  | _        t+          | j
        | j        | j        dz  | d	
          | _        d S )Nr   z.mlp)r   z.wqkv)hidden_size	head_sizetotal_num_headstotal_num_kv_headsr   r   r   z.wor   g      z.attn)r   r   scaler   )r;   r<   r   r   r   r   hidden_size_per_attention_headr	   tp_sizer   !num_attention_heads_per_partitionr=   	LayerNormnorm0norm1r   mlpr   wqkvr   wor
   attn)rB   r   r   r   r   r   r   rC   s          r   r<   zMoonVitEncoderLayer.__init__S  st    	!9!;!;"$.2o.O+'SAA-Q-S-S 	 28	4<1P1P.\*--
\*--
*-???
 
 

 &"9%(###-
 
 
	 $>>>-
 
 
 '<95t;###	
 
 
			r!   Nr   
cu_seqlensrope_freqs_cisc                    |                     d          }|                     |          \  }}|                                 dd         d| j        | j        fz   } |j        | }t          j        |d          \  }}	}
t          ||	|          \  }}	|dd         |dd         z
                                  }| 	                    |
                    d          |	
                    d          |

                    d          ||          }|                    || j        | j        z            }|                     |          \  }}|S )	zq
        Args:
            x (torch.Tensor): (seqlen, hidden_dim)
            cu_seqlens (torch.Tensor):
        r   Nr   ra   r   r   )r   
max_seqlen)rK   r   r   r   r)   r   unbindr1   maxr   r&   r   r   )rB   r   r   r   
seq_lengthxqkvr   	qkv_shaper"   r#   xvr   attn_outs                r   attention_qkvpackedz'MoonVitEncoderLayer.attention_qkvpacked  sF    VVAYY
))A,,aIIKK$2/(
 
	 ty)$\$B///
BBN33B nz#2#6;;==
99LLOOLLOOLLOO!!  
 
 ##212
 

 ggh''!r!   hidden_statesr$   c                     |}|                      |          }|                     |||          }||z   }|}|                     |                     |                    }||z   }|S )a  
        Args:
            hidden_states: non-packed (B, N, D) or packed (L, D). if non-packed, seqlens should be None, if packed, seqlens should be set

        Returns:
            output: same shape of input, non-packed (B, N, D) for non-packed input, (L, D) for packed input
        r   )r   r   r   r   )rB   r   r   r   residualr   s         r   rV   zMoonVitEncoderLayer.forward  sy     !

=11++:n , 
 
 !8+ M!:!:;; =0r!   r   r:   )rW   rX   rY   rO   gelurZ   r[   r   r<   r   r\   r   rV   r]   r^   s   @r   r   r   R  s#        1
 61
 1
 1
1
 1
 	1

 1
 1
 1
 1
 1
 1
 1
n /3	' '<' L' t+	' ' ' 'Z /3	 | L t+	
 
       r!   r   c                   j     e Zd Z	 ddededededdf
 fdZd	ej        d
ej        dej        fdZ	 xZ
S )MoonVitEncoderr   r   
num_layers	block_cfgr   r$   Nc                 .   t                                                       t          d         d         z  dd          | _        t	          j        fdt          |          D                       | _        t	          j        |          | _	        d S )Nr   r   i   c           	      6    g | ]}t          dd  d| iS )r   z.blocks. )r   )r   	layer_idxr   r   s     r   r   z+MoonVitEncoder.__init__.<locals>.<listcomp>  sX       
 	 $  $99i99   r!   )
r;   r<   ru   rope_2dr=   
ModuleListrangeblocksr   final_layernorm)rB   r   r   r   r   rC   s      ``r   r<   zMoonVitEncoder.__init__  s     	#l#y'==sC
 
 m    
 "'z!2!2  
 
  "|J77r!   r   rq   c                    | j                             |          }t          j        t          j        d|j        |j                  |d d df         |d d df         z                      |j                  f          }|                    dt          j	                  }t          | j                  D ]\  }} ||||          }|                     |          }|S )N)rH   r   )r{   r   r   )r7   r   r   )r   r   r   rS   zerosr{   r   r   cumsumint32	enumerater   r   )rB   r   rq   r   lengthsr   r   blocks           r   rV   zMoonVitEncoder.forward  s     >>>PP)Am&:'-PPPAA.22=3GHH
 
 ^^^==
!$+.. 	 	HAu!Ez.  MM ,,];;r!   r   )rW   rX   rY   rZ   dictr[   r<   r   r\   rV   r]   r^   s   @r   r   r     s         8 88 8 	8
 8 
8 8 8 8 8 8."\49L	       r!   r   r   r   r   rq   merge_kernel_sizec                    |                      d          }g }d}|                                D ]}|d         |d         }}| ||||z  z            }	|\  }
}||
z  ||z  }}|	                    ||
|||          }|                    ddddd                                          }|                    ||z  |
|z  d          }|                    |           |||z  z  }|S )Nr   r   r   r   ra   ry   )rK   rM   r)   rQ   
contiguousrN   )r   rq   r   d_modeloutputspre_sumx_shaper5   r6   seqkernel_heightkernel_width
new_height	new_widthreshaped_seq
padded_seqs                   r   patch_mergerr    s   
 ffRjjGGG>>## " "
GAJ'FUN223&7#| &- 7,9NI
xxy,
 
 $++Aq!Q::EEGG!&&"ML$@"
 

 	z"""6E>!Nr!   c                   t     e Zd ZeZdZdgZdZdZ	 ddede	f fdZ
dej        d	ej        d
ej        fdZ xZS )MoonVitPretrainedModelmoonvitPackingTransformerTr   configr   c           
          t                      j        |g|R i | t          |          }|j        | _        |j        | _        |j        | _        d| _        t          |j        |j        |j        |j	                  | _
        t          |j        |j        |j        |j        |j        t          d         dd| d          | _        d S )Nr   )rd   rf   rg   rh   gelu_pytorch_tanhT)r   r   r   r   r   z.encoder)r   r   r   r   )r;   r<   r   r   r   rf   vit_processing_typer`   init_pos_emb_heightinit_pos_emb_widthpatch_embedr   num_hidden_layersnum_attention_headsintermediate_sizer   encoder)rB   r  r   inputskwargsrC   s        r   r<   zMoonVitPretrainedModel.__init__  s     	3&333F333&!!!'!9!- +#, /&(!5 3	
 
 
 &)/#7$0!3$%89!  &&&
 
 
r!   pixel_valuesrq   r$   c                     |                      ||          }|                     ||          }t          ||| j                  }|S )z
        Args:
            pixel_values (torch.Tensor): The input pixel values.
            grid_hw (torch.Tensor): The grid height and width.

        Returns:
            torch.Tensor: The output tokens.
        )r   )r  r  r  r   )rB   r  rq   r   s       r   rV   zMoonVitPretrainedModel.forward=  sR     ((w??]G<<$7d6L
 
 
 r!   r   )rW   rX   rY   r   config_class
model_type_no_split_modules_supports_flash_attn_2_supports_sdpar[   r<   r   r\   rV   r]   r^   s   @r   r  r    s         LJ-.!N
 
 

 
 
 
 
 
 
B!L38<	       r!   r  )r   )2collections.abcr   copyr   	functoolsr   r   torch.nnr=   torch.nn.functional
functionalrO   transformers.activationsr   transformers.modeling_utilsr   vllm.distributedr   r	   9vllm.model_executor.layers.attention.mm_encoder_attentionr
   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   r    vllm.model_executor.models.utilsr   !vllm.model_executor.models.visionr   vllm.platformsr   'vllm.transformers_utils.configs.moonvitr   r    r\   rs   r1   Moduler3   r`   ru   r   r   r   r   rZ   r  r  r   r!   r   <module>r-     sN  Z % $ $ $ $ $       % % % % % %                 + + + + + + 7 7 7 7 7 7 I I I I I I I I X X X X X X 7 7 7 7 7 7         
 : 9 9 9 9 9 F F F F F F + + + + + + A A A A A A? ? ?22,238<2
5<%&2 2 2 2.    bi   D( ( ( ( (29 ( ( (Vw w w w w29 w w wt% % % % %29 % % %Pt t t t t") t t tn, , , , ,RY , , ,d )/ |\ CH~ 
%,	   :8 8 8 8 8_ 8 8 8 8 8r!   