
    .`ikS                        U d dl Z d dlZd dlmZmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZ d dlZd dlmZ d dlmZmZmZ d dlmZmZmZ d dlmZ d d	lmZ d d
lmZ  ee          Z  ede          Z! G d dee!                   Z" G d dee	e!                   Z# G d de          Z$de$de#fdZ%ddde&dej'        dedz  defdZ(de&dej'        defdZ)d Z*dede+fdZ,e
d         Z-e-eej.        gej.        f         z  Z/ee0d <   d!e/e1z  deej.        gej.        f         fd"Z2d#e&d!e/e1z  de&fd$Z3ddddd%d&ej.        e4ej.                 z  d'ej5        j6        dz  d(e4e&         dz  d)e&dz  d*eej.        gej.        f         dz  d+e/dz  dej.        fd,Z7d-ej.        d.ej5        j8        dej.        fd/Z9	 dAd1e4e&         d2e&de:e4e&         e4e&         e4e&         f         fd3Z;d.ej5        j8        d4ej.        d5e4e4e&                  d6e
d7         de:ej.        d8f         f
d9Z<d:e&d;e&d<e&d=e4e&         d>ej.        d?ej.        dej.        fd@Z=dS )B    N)ABCabstractmethod)Callable)FinalGenericLiteralProtocol	TypeAliasTypeVar)PretrainedConfig)MultiModalConfig
VllmConfigget_current_vllm_config)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_size tensor_model_parallel_all_gather)init_logger)current_platform)AttentionBackendEnum_C)boundc                       e Zd ZU eed<   dS )_RootConfigvision_configN)__name__
__module____qualname__r   __annotations__     u/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/vision.pyr   r      s         r    r   c                        e Zd Zdee         ddf fdZedededefd            Zedefd            Z	edefd	            Z
edefd
            Z xZS )VisionEncoderInfo	hf_configreturnNc                 n    t                                                       || _        |j        | _        d S N)super__init__r$   r   )selfr$   	__class__s     r!   r)   zVisionEncoderInfo.__init__!   s1    "&4r    image_widthimage_heightc                    t           r'   NotImplementedError)r*   r,   r-   s      r!   get_num_image_tokensz&VisionEncoderInfo.get_num_image_tokens'   s
     "!r    c                     t           r'   r/   r*   s    r!   get_image_sizez VisionEncoderInfo.get_image_size0       !!r    c                     t           r'   r/   r3   s    r!   get_patch_sizez VisionEncoderInfo.get_patch_size4   r5   r    c                     t           r'   r/   r3   s    r!   get_patch_grid_lengthz'VisionEncoderInfo.get_patch_grid_length8   r5   r    )r   r   r   r   r   r)   r   intr1   r4   r7   r9   __classcell__)r+   s   @r!   r#   r#       s       5+b/ 5d 5 5 5 5 5 5 " " 	"
 
" " " ^" " " " " ^" " " " " ^" "s " " " ^" " " " "r    r#   c                   &    e Zd ZU ee         ed<   dS )VisionLanguageConfigr   N)r   r   r   r   r   r   r   r    r!   r=   r=   =   s$         )******r    r=   r$   r%   c                 >   ddl m}m} ddlm}m} ddlm}m} t          | j
        |          r ||           S t          | j
        |          r ||           S t          | j
        |          r ||           S dt          | j
                   }t          |          )N   )CLIPEncoderInfoCLIPVisionConfig)PixtralHFEncoderInfoPixtralVisionConfig)SiglipEncoderInfoSiglipVisionConfigzUnsupported vision config: )clipr@   rA   pixtralrB   rC   sigliprD   rE   
isinstancer   typer0   )r$   r@   rA   rB   rC   rD   rE   msgs           r!   get_vision_encoder_inforL   A   s    77777777BBBBBBBB========))+;<< *y)))))+>?? /##I...))+=>> ,  +++
GY-D(E(E
G
GC
c
"
""r    attn_backend_override	head_sizedtyperN   c                0    t          j        | ||          S )zE
    Get the available attention backend for Vision Transformer.
    )backend)r   get_vit_attn_backend)rO   rP   rN   s      r!   _get_vit_attn_backendrT   R   s'     0%   r    c                     	 t                      }|j        }||j        nd}n# t          $ r d}Y nw xY w||j        nd}t          | ||          }|S )z;
    Get the attention backend for Vision Transformer.
    NrM   )r   model_configmultimodal_configAssertionErrormm_encoder_attn_backendrT   )rO   rP   vllm_configrV   rW   rN   attn_backends          r!   rS   rS   b   s    !"9";";"/.:.FL**D 	  ! ! ! !
 ( 	11 
 )3  L
     # 22c                      	 t                      } | j        }||j        nd}n# t          $ r d}Y nw xY w||j        nd}|dk    S )z>
    Get the tensor parallel type for Vision Transformer.
    Ndata)r   rV   rW   rX   mm_encoder_tp_mode)rZ   rV   rW   r_   s       r!   is_vit_use_data_parallelr`      s    !"9";";"/.:.FL**D 	  ! ! ! ! 1B0M,,SW  ''r\   rZ   c                     | j         j        S )zICallable to be passed to `@support_torch_compile`'s `enable_if` argument.)compilation_configcompile_mm_encoder)rZ   s    r!   should_torch_compile_mm_vitrd      s    )<<r    )classdefaultfullVisionFeatureSelectStrategystrategyc                 ~    t          |           r| S | dk    rd S | dk    rd S | dk    rd S t          d|           )Nre   c                 $    | d d d dd d f         S Nr?   r   featss    r!   <lambda>z._get_vision_feature_selector.<locals>.<lambda>   s    U111bqb!!!8_ r    rf   c                 $    | d d dd d d f         S rl   r   rm   s    r!   ro   z._get_vision_feature_selector.<locals>.<lambda>   s    U111abb!!!8_ r    rg   c                     | S r'   r   rm   s    r!   ro   z._get_vision_feature_selector.<locals>.<lambda>   s    U r    $Unexpected feature select strategy: )callable
ValueError)ri   s    r!   _get_vision_feature_selectorru      sp       7,,, 9,,,6"""
HHHH
I
IIr    num_vision_tokensc                     t          |          r.t          j        d| d          } ||          }|j        d         S |dk    rdS |dk    r| dz
  S |dk    r| S t	          d|          )Nr?   @   re   rf   rg   rr   )rs   torchemptyshapert   )rv   ri   dummy_featuresdummy_selected_featuress       r!   get_num_selected_vision_tokensr~      s      0Q(92>>"*(>":":&,Q//7q9 1$$6  
HHHH
I
IIr    )select_layersmax_possible_layerslast_hs_procfeature_select_strategyencoder_outputspost_layer_normr   r   r   r   c                $   	
 |at           t          j                  st          d          | |            |t	          |          
 
            | |           S  S |t          d          t                     dz
  }||z
  	 	fd|D             }|d         |dz
  dfv }||r ||d                   |d<   |t	          |          

fd|D             }||r ||d                   |d<   t          j        |d          S )	a3  Given the outputs a visual encoder module that may correspond to the
    output of the last layer, or a list of hidden states to be stacked,
    handle post normalization and resolve it into a single output tensor.

    Args:
        encoder_outputs: Output of encoder's last layer or all hidden states.
        post_layer_norm: Post norm to apply to the output of the encoder.
        select_layers: Optional layer indices to grab from the encoder
            outputs; if provided, encoder outputs must be a list.
        max_possible_layers: Total layers in the fully loaded visual encoder.
        last_hs_proc: Optional callable to be applied to the last layer if it
            is used, e.g., pooling head for Siglip. This is done prior to
            feature selection and layer normalization. If select_layers are
            provided, the output of last_hs_proc must be able to be
            concatenated with the other select_layers along the last dimension.
        feature_select_strategy: Defines how to select the hidden states
            from each layer.
    NzJExpected only a single encoder output when `select_layers` is not providedz@`max_possible_layers` must be provided alongside `select_layers`r?   c                 B    g | ]}|d k    r|         n
|z            S r   r   ).0	layer_idxr   offsets     r!   
<listcomp>z2resolve_visual_encoder_outputs.<locals>.<listcomp>  sK         >> 		""Y/0  r    c                 &    g | ]} |          S r   r   )r   hsselect_featuress     r!   r   z2resolve_visual_encoder_outputs.<locals>.<listcomp>  s#    9992??2&&999r    dim)rI   ry   Tensorrt   ru   lencat)r   r   r   r   r   r   num_loaded_layershs_pooluses_last_layerr   r   s   `        @@r!   resolve_visual_encoder_outputsr      s   6 /5<88 	2   #*l?;;O".:;RSSO-oo>>O&"??333"N
 
 	
 O,,q0 #44F     '	  G $B',?!,CR+HHOO"l72;//*67NOO9999999 ""%ogbk229W"%%%%r    image_inputvision_modelc                    | j         d         }t                      }||z   dz
  |z  }||z  |z
  }dd|                                 dz
  z  z  d|fz   }t          j        j                            | |          }t                      }|||z  |dz   |z  df         }	 ||	          }
|
                                }
t          |
d          }
|
d|df         }
|
S )aW  Run a vision model with data parallelism (DP) sharding. The function
    will shard the input image tensor on the first dimension and run the vision
    model

    Args:
        image_input (torch.Tensor): Image input tensor.
        vision_model (torch.nn.Module): Vision model.
    Returns:
        torch.Tensor: Output image embeddings
    r   r?   r      .r   N)
r{   r   r   ry   nn
functionalpadr   
contiguousr   )r   r   
num_chunksmp_world_sizenum_chunks_per_ranknum_padded_chunksr   image_input_paddedrankimage_input_per_rankvision_embeddingss              r!   run_dp_sharded_vision_modelr     s    "1%J8::M%59mK+m;jH
!{((1,-
.!5F1G
GC,00cBB)++D-""dQh2E%EEsJ %%9::)446689JPQRRR)+:+s*:;r    r   sizesnum_gpusc                    
 t                     }|dk    rg dg|z  dg|z  fS d t          |          D             }dg|z  
t          t          |           fdd          }|D ]T}t          t          |          
fd          }||                             |           
|xx          |         z  cc<   Ut          t                               }t          t                               }t          |          D ]E}	|                    ||	                    |                    t          ||	                              F||
fS )aq  
    Generate load balancing assignment and metadata
    for distributing data across GPUs.
    The load is determined by the total image sizes,
    not the number of images.

    Args:
        sizes: The size of each image
        num_gpus: Number of GPUs to balance across

    Returns:
        shuffle_indices:
            Indices to reorder data for balanced loading
        gpu_sample_counts:
            Number of samples assigned to each GPU
        grouped_sizes_per_gpu:
            Total size assigned to each GPU

    Example:
        ```
        sizes = [1000, 100, 200, 50]
        num_gpus = 2
        ```

    r   c                 @    g | ]}t          t                               S r   )listr:   )r   _s     r!   r   z/get_load_balance_assignment.<locals>.<listcomp>`  s     <<<qtCy{{<<<r    c                     |          S r'   r   )ir   s    r!   ro   z-get_load_balance_assignment.<locals>.<lambda>g  s    a r    T)keyreversec                     |          S r'   r   )r   	gpu_loadss    r!   ro   z-get_load_balance_assignment.<locals>.<lambda>l  s    Yq\ r    )r   )r   rangesortedminappendr   r:   extend)r   r   	n_samplesgpu_assignmentslarge_to_small_indicesidxmin_gpushuffle_indicesgpu_sample_countsgpu_idr   s   `         @r!   get_load_balance_assignmentr   ;  s{   < E

I A~~A3>A3>11 =<E(OO<<<OhI
 $i0000$   & ) )eHoo+A+A+A+ABBB '',,,'eCj( 3ikkOS	// ? ? 	v6777 	  _V%<!=!=>>>>.	::r    pixel_valuesgrid_thw_list	rope_type)rope_3drope_2d.c                  $% t                      }t                      }d D             }dgt          j        |          $t	          ||          \  }}}	dgt          j        |          }
||
|         |
|dz                     }t          |          dk    r"t          j        $fd|D                       }n.t          j        dj	        d         fj
        j                  }|dk    r| j        d         | j        d         z  %n| j        | j        z  %t          |	          %z  }fd|D             }|dk    r|j	        d         dk    rJ | |t          j        |                    }t!          |t"                    rt          j        |d          }nt%          | j        d	d
          }t          j        d%|fj
        j                  }nF|j	        d         dk    r | ||          }n(t          j        d| j        fj
        j                  }|j	        d         }||k     r||z
  }|dk    r;t          j        ||j	        d         |j	        d         f|j        |j
                  }n.t          j        ||j	        d         f|j        |j
                  }t          j        ||gd          }n|}t+          |d          }t#          t          j                             }t/          |          D ]2}||z  }||	|         %z  z   }|                    |||                    3%fd|D             }d
gt                    z  }d}t/          |          D ]N}||         }|dk    r>||||z            }||         }d} |D ]}!||!         }"|| | |"z            ||!<   | |"z  }  ||z  }Ot3          d |D                       }#t          |#          t          |          k    s
J d            |#S )a  Run a vision model with data parallelism (DP) sharding.
    The function will shard the input image tensor on the
    first dimension and run the vision model.
    This function is used to run the vision model with mrope.

    Args:
        vision_model (torch.nn.Module): Vision model.
        pixel_values (torch.Tensor): Image/Video input tensor.
        grid_thw_list: List of grid dimensions for each image
        rope_type: Type of rope used in the vision model.
                   Different rope types have different dimension to do ViT.
                   "rope_3d" for 3D rope (e.g., Qwen2.5-VL)
                   "rope_2d" for 2D rope (e.g., Kimi-VL)
    Returns:
        torch.Tensor: Output image embeddings

    Example:
        ```
        vision_model.out_hidden_size = 64
        vision_model.spatial_merge_size = 2
        pixel_values.shape = (1350, channel)
        grid_thw_list = [[1, 10, 100], [1, 10, 10], [1, 10, 20], [1, 50]]
        tp_size = 2
        ```

    c                 6    g | ]}t          j        |          S r   )mathprod)r   grid_thws     r!   r   z5run_dp_sharded_mrope_vision_model.<locals>.<listcomp>  s"    KKK8,,KKKr    r   r?   c                 B    g | ]}|         |d z                     S )r?   r   )r   r   cum_patches_per_imager   s     r!   r   z5run_dp_sharded_mrope_vision_model.<locals>.<listcomp>  sD        2158MaRSe8TTU  r    )devicerP   r   c                      g | ]
}|         S r   r   )r   r   r   s     r!   r   z5run_dp_sharded_mrope_vision_model.<locals>.<listcomp>  s    FFF=+FFFr    r   hidden_sizeNr   )rP   r   c                     g | ]}|z  S r   r   )r   
patch_sizeembed_dim_reduction_factors     r!   r   z5run_dp_sharded_mrope_vision_model.<locals>.<listcomp>  s,          7A1	1     r    c              3      K   | ]}||V  	d S r'   r   )r   embeds     r!   	<genexpr>z4run_dp_sharded_mrope_vision_model.<locals>.<genexpr>5  s0        8I8I8I8I8I r    zFound unassigned embeddings)r   r   	itertools
accumulater   r   ry   r   rz   r{   r   rP   merge_kernel_sizespatial_merge_sizemaxtensorrI   r   getattrconfigout_hidden_sizer   r   r   r   tuple)&r   r   r   r   tp_sizetp_rank_localpatches_per_imageimage_to_tp_rankr   grouped_pixel_values_lencum_gpu_sample_countsimage_idxs_localpixel_values_localmax_len_per_ranklocal_grid_thw_listimage_embeds_localout_dimcurrent_lenpadding_sizepaddingimage_embeds_local_paddedgathered_embedsrank_embeddingsr   	start_idxend_idxpatches_per_output_imageoriginal_order_embeddingscurrent_idxcountrank_images
rank_embedembed_startimg_idximg_patchesout_embeddingsr   r   s&    ``                                 @@r!   !run_dp_sharded_mrope_vision_modelr    s]   B 344G 344M LK]KKKI!56G!H!HI 	$$5w?? D(*B
 I!56G!H!HI (m,/D]UVEV/WW
 q  "Y    )  
 
 #["1%&&$
 
 
 I*1-0Nq0QQ 	#"
 +l.MM 	# 3448RRFFFF5EFFF I#A&**!-"EL1D$E$E" " ,d33 J%*Y/Aq%I%I%I"l1=$GGG!&.8#*"(" " " #A&**!-.@BU!V!V "'L01#*"(" " " %*1-K%%%'+5	!!k &,Q/&,Q/
 ).)0  GG k17:;(.)0  G
 %*I/A7.KQR$S$S$S!!$6! 77PVWXXXO 5<(**Og C C++	$T*.HH
 	y/@ABBBB       EV     
 "&]);); ;Kg ! !!$'199 +;u9L+LMK(.JK& + +6w?5?+";;6)'2 {*5 K  4    N ~#&?"@"@@@@% A@@ r    r   
vision_idxr   t_indexgrid_hsgrid_wsc                 *   g }||         |z  }||         |z  }t          j        |                              ddd                              t	          |          d|                                          }	t          j        |                              ddd                              t	          |          |d                                          }
t          j        |                              |j                                      dd                              d||z            	                                                                }t          j
        ||	|
g          }|                    || z              t          j        |d          }|S )Nr?   r   r   )ry   arangeviewexpandr   flattenr   tor   longstackr   r   )r   r  r   r  r  r  llm_pos_ids_list
llm_grid_h
llm_grid_wh_indexw_indext_index_tensor_llm_pos_idsllm_pos_idss                 r!   get_llm_pos_ids_for_visionr  >  sQ    $(::J$(::JZ  	aQ	Gb*	-	-		  	Z  	aB	Gj"	-	-		  	W	J			b!	J+	,	,		  ;ABBLL94555),!444Kr    )r   )>r   r   abcr   r   collections.abcr   typingr   r   r   r	   r
   r   ry   transformersr   vllm.configr   r   r   vllm.distributedr   r   r   vllm.loggerr   vllm.platformsr   #vllm.v1.attention.backends.registryr   r   loggerr   r   r#   r=   rL   r:   rP   rT   rS   r`   boolrd   VisionFeatureSelectStrategyStrr   rh   r   strru   r~   r   r   	LayerNormr   Moduler   r   r   r  r  r   r    r!   <module>r&     sw         # # # # # # # # $ $ $ $ $ $ H H H H H H H H H H H H H H H H  ) ) ) ) ) ) M M M M M M M M M M         
 $ # # # # # + + + + + + D D D D D D	X		WT)***    (2,   " " " " "WR[ " " ":+ + + + +8 + + +#'; #@Q # # # #* :>	  ; 0$6	
     ;    :( ( (&=Z =D = = = =
 "))C!D  #Xu|nel.J%KK Y   
J)C/Ju|nel*+J J J J(JJ)C/J 	J J J J2 '+&*BFBFP& P& P&\D$66P&X'$.P& 9t#	P&
 tP& EL>5<784?P& 94?P& \P& P& P& P&f-2X_
\   F B; B;9B;B; 49d3ic*+B; B; B; B;J{(/{,{ S	?{
 +,{ 5<{ { { {|""" " #Y	"
 \" \" \" " " " " "r    