
    .`i*                     "   d dl Z d dlZdedededefdZdej        dej        eeeef         z  d	ededej        f
d
Z	 	 ddej        d	edededej        f
dZ	dej        de
ej                 dej        dededededeej        ef         fdZdS )    Ntokens_per_frame
num_framesqreturnc                 Z    | |z  }t          |d|z
  z            }| }t          ||          S )as  
    Compute the number of retained tokens for a given video.
    Method ensures that we retain all the tokens from the first frame
    regardless of the pruning rate.

    Args:
        tokens_per_frame: The number of tokens per frame.
        num_frames: The total number of frames.
        q: The pruning rate.

    Returns:
        The number of retained tokens.
       )intmax)r   r   r   total_tokensevs_num_tokensmin_num_tokenss         g/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/multimodal/evs.pycompute_retained_tokens_countr      s9      $j0LQ/00N%N~~...    video_embedsvideo_size_thwspatial_merge_sizec           
         t          t          |          \  }}}|                     |||z  ||z  |                     d                    } ||z  ||z  z  }t          j        j                            | dddf         | dddf         d          }d|z
  }	t	          j        dt	          j	        | dddddddf                   z  |	gd          }	|	
                    d          }
t	          j        |
ddd	          }t          |||
          }|d|         }t	          j        |
t          j                  }d||<   |                    |	                                          }|
                    d          }|S )aP  
    Computes the retention mask for input video embeddings.

    Args:
        video_embeds (`torch.Tensor`): The input video embeddings
            of shape `(T * H * W // spatial_merge_size ^ 2, hidden_size)`
        video_size_thw (`torch.LongTensor` of shape `(3)`):
            The temporal, height and width of video.
        spatial_merge_size: Size reduction for rows & cols dimensions.
        q: (`float`): Pruning rate factor [0,1)

    Returns:
        `torch.Tensor`: The retention mask for the video embeddings of
            `(T * H * W // spatial_merge_size ^ 2)` shape.
    r   N.dim   r   T)r   
descendingstable)r   r   r   )dtype)mapr	   reshapesizetorchnn
functionalcosine_similaritycat	ones_likeviewargsortr   
zeros_likebool)r   r   r   r   THWr   
similaritydissimilaritydissimilarity_flatorderretain_num_tokenstopk_indicesretention_maskmasks                   r   compute_retention_maskr4   &   s   * #~&&GAq!  ''			"	 L //A9K4KL$66QRRW|CRCH52 7  J 
NM I	u|BQB111aK899	9=Iq  M '++B//M,"dSSSE5)a1   +++,L%&8
KKKN#'N< #++M,>,>,@,@AANr""DKr         ?tokens_per_secondvideo_second_per_gridc                 J   | d         }| d         |z  }| d         |z  }t          j        |                              dd                              d||z                                ||z                                                                            }t          j        |                              ddd                              |d|                                          }t          j        |                              ddd                              ||d                                          }	t          j        |g                              ddd                              |||                                          }t          j        |||	|gd          }
|
S )a  
    Computes the mrope for video embeddings based on the grid dimensions.
    Computed mrope positions match original qwen 2.5 implementation,
    but positions are built for media being the first element in sequence.

    Args:
        video_size_thw: Media size (num frames, rows, cols)
        spatial_merge_size: Size reduction for rows & cols dimensions.
        tokens_per_second: Number of tokens per second.
        video_second_per_grid: Number of seconds per video.

    Returns:
        Tensor of shape `(T * H * W, 4)` where last dimension
        represents mrope positions [0:3), while the last channel
        contains value of llm_grid_w repeated for all positions.
    r   r      r   r   )	r   aranger%   expandmullongflattentensorstack)r   r   r6   r7   
llm_grid_t
llm_grid_h
llm_grid_wt_indexh_indexw_index	positionss              r   compute_mrope_for_mediarH   _   sg   ,  "J"&88J"&88J L$$T"a[[VB
Z/00S"%::;;		  	Z  	aQ	
B
	+	+		  	Z  	aB	
J	+	+		  	j\""	aA	
J
	3	3		  Wgw
CKKKIr   	input_idsmultimodal_positionsmrope_positionsnum_computed_tokensvision_start_token_idimage_token_idvideo_token_idc                    t          j        t          j        |                                          }|                                 }|                     |          }	|                     |          }
|	|
z  }| }t          |          dk    rT|                                r9t          |	                                
                                dz   |z
            n| }||fS t          j        |          }t          j        |d|                   }||k    rT|                                r9t          |	                                
                                dz   |z
            n| }||fS | |k                        d          d         }|D ]}|||k              }t          |          rK|d         }t          j        |d|                   }||k    }|r||z
  }|}n-|||k             d         }d}|}n|||k             d         }d}|}|d|f         dz   }|dz   |z   }||j        d         z   }|dd         |z   |dd||f<   |d         |z   }t          j        ||d                                         d	          }||z   dz
  |dd||f<   ||j        d         z  }|	                                dz   |z
  
                                }||fS )
au  
    Update part of input mrope positions.
    Original mrope_positions are computed incorrectly, so once we prune media
    tokens we should reflect this in the mrope positions for the LLM.

    This method supports chunked prefill approach where
    multimodal_embeddings are passed to LLM in chunks, so input
    multimodal_embeddings may contain zero, some or even some part of all
    multimodal_embeddings for a given prompt.

    Each multimodal_positions has 4 extra channels
    (First 3 channels corresponds to original 3 mrope positions, last channel
    is the maximum width of the media repeated). Provided multimodal_positions
    do not reflect location of media position in sequence - they are computed
    like the media is in the 0-th position in the sequence.

    Method works as follows: it recomputes mrope_positions starting from the
    `num_computed_tokens` for `total_len_of_multimodal_embeddings` and then
    shifts all text tokens that goes after total_len_of_multimodal_embeddings.

    It also handles case when multimodal_embeddings is partial
    (e.g. one media is split into two prefill stages)

    Args:
        input_ids: (N,) All input tokens of the prompt (entire sequence).
        multimodal_positions: List of mrope positions for each media.
        mrope_positions: Existing mrope positions (4, N) for entire sequence.
        num_computed_tokens: A number of computed tokens so far.
        vision_start_token_id: Token indicating start of vision media.
        image_token_id: Image token id
        video_token_id: Video token id

    Returns:
        Tuple of (mrope_positions, mrope_position_delta).
    r   r   NT)as_tupler      )rR   r   r   )typingcastr   
LongTensorclonenumeleqlenr	   r
   itemcount_nonzerononzeroshapecumsumr=   )rI   rJ   rK   rL   rM   rN   rO   rG   N
image_mask
video_mask
media_mask	text_maskdeltatotal_mm_tokensseen_mm_tokensvision_start_indicesmm_posseen_vision_start_indiceslast_vision_start_token'seem_mm_tokens_before_last_vision_startin_the_middle_of_mediamm_embeddings_seenglobal_mm_startnext_vision_start_tokenbaselocal_start	local_endoffsettext_pos_summrope_positions_deltas                                  r   recompute_mrope_positionsrv      sl   \ #)+///11# #I 	An--Jn--Jj(JI   A%%9B9J9JRY]]__))++a/14555QRPR%)*55O(4H5H4H)IJJN ((9B9J9JRY]]__))++a/14555QRPR%%)>>GGQUGVV	 ' ?/ ?/ %9 #66%
! ()) #	6
 '@&C#6;6I33347 73 !HH # & :"%LL # #: +?(,??++' &'""9
 ';$(;;''# "#5O _,-1%),>>&,q/1	.4QqSkD.@	!!![**+ $|Iijj$9$>$>$@$@aHHH$06$9A$=	!!!Yq[.! 	v|A.&]]__q014::<<+++r   )r5   r5   )rS   r   r	   floatr   TensorrU   tupler4   rH   listrv    r   r   <module>r|      s    //'*//4// / / /,6,6$uS#s]';;6 6 	6
 \6 6 6 6x  ##&	8 8$88 8 !	8
 \8 8 8 8vL,L,u|,L, %L, 	L,
 L, L, L, 5S !L, L, L, L, L, L,r   