
    .`iwr                     (   U d dl mZ d dlmZmZmZ d dlmZmZm	Z	m
Z
mZmZ d dlZd dlmZ d dlmZmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lm Z  d dl!m"Z"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+m,Z,m-Z-m.Z.m/Z/ d dl0m1Z1m2Z2m3Z3m4Z4 d dl5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z< d dl=m>Z> d dl?m@Z@mAZA ddlBmCZC ddlDmEZEmFZFmGZGmHZHmIZI ddlJmKZK ddlLmMZMmNZN ddlOmPZP ddlQmRZRmSZSmTZTmUZUmVZV ddlWmXZXmYZY  G d de@          ZZ G d de@          Z[ G d  d!e@          Z\eZe[z  e\z  Z]ee^d"<    G d# d$ej_                  Z` G d% d&e
          Za G d' d(e
          Zb G d) d*e8          Zc ed+ec,          Zd G d- d.e6ed                   Ze G d/ d0ec          Zf G d1 d2e7ed                   Zg G d3 d4egef                   Zh G d5 d6ec          Zi G d7 d8e7ei                   Zjd9e9d:ecfd;Zkdd<d=edd>e6ed         d?e)dz  d:e7fd@ZldAead:emfdBZnddCdDdAeadEe%dz  dFeodz  dGepd:eCePz  eNz  f
dHZq e'jr        elekeeI           G dJ dKej_        eGeHeIeF                      Zs G dL dMef          Zt G dN dOeh          Zu e'jr        eueteeI           G dP dQes                      ZvdS )R    )abstractmethod)IterableMappingSequence)	AnnotatedFinalLiteralProtocol	TypeAliasTypeVarN)BatchFeatureCLIPVisionConfigLlavaConfigPixtralVisionConfigPretrainedConfigSiglipVisionConfig)LlavaProcessor)PixtralProcessor)
VllmConfig)BaseDummyOptions)
get_act_fn)ColumnParallelLinearRowParallelLinear)QuantizationConfig)MULTIMODAL_REGISTRY)BaseMultiModalProcessorCache)MultiModalDataDictMultiModalFieldConfigMultiModalInputsMultiModalKwargsItemsMultiModalUUIDDict)ImageEmbeddingItemsImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoInputProcessingContextPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )CLIPVisionModel)MultiModalEmbeddingsSupportsEagle3SupportsLoRASupportsMultiModal
SupportsPP)MultiModelKeys)PixtralHFEncoderInfoPixtralHFVisionModel)SiglipVisionModel)AutoWeightsLoaderWeightsMapperget_layer_indexinit_vllm_registered_modelmaybe_prefix)get_num_selected_vision_tokensget_vision_encoder_infoc                   j    e Zd ZU dZdZed         ed<   eej	         e
dddd          f         ed<   dS )	LlavaImagePixelInputsa!  
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height
        - w: Width

    Note that `height` or `width` may be different per batch and image,
    in which case the data is passed as a list instead of a batched tensor.
    pixel_valuestypebn   hwN__name__
__module____qualname____doc__rE   r	   __annotations__r   torchTensorr/        t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/llava.pyrC   rC   J   s[         	 	 %3D'.
!222EL++dAsC*H*HHIIIIIIrS   rC   c            
           e Zd ZU dZdZed         ed<   eej	        e
ej	                 z   eddddddh          f         ed	<   d
S )PixtralHFImagePixelInputsa  
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels
        - h: Height
        - w: Width

    Note that `height` or `width` may be different per batch and image,
    in which case the data is passed as a list instead of a batched tensor.
    pixel_values_pixtralrE   rF   crH   rI   )dynamic_dimsrD   N)rK   rL   rM   rN   rE   r	   rO   r   rP   rQ   listr/   rR   rS   rT   rV   rV   Z   s         	 	 -CD'(
)BBBtEL))D#sCsCjAAA	C     rS   rV   c                   h    e Zd ZU dZdZed         ed<   eej	         e
ddd          f         ed<   dS )	LlavaImageEmbeddingInputsz
    Dimensions:
        - bn: Batch size * number of images
        - ifs: Image feature size
        - hs: Hidden size (must match language model backbone)
    image_embedsrE   rF   ifshsdataNrJ   rR   rS   rT   r\   r\   m   sY           %3D'.
!222
EL++dE4"@"@@
AAAAAArS   r\   LlavaImageInputsc                   h     e Zd Z	 	 ddedededededz  def fd	Zd
ej	        dej	        fdZ
 xZS )LlavaMultiModalProjectorN vision_hidden_sizetext_hidden_sizeprojector_hidden_actmultimodal_projector_biasquant_configprefixc                     t                                                       t          ||||| d          | _        t	          |          | _        t          ||||| d          | _        d S )Nz	.linear_1)biasri   rj   z	.linear_2)super__init__r   linear_1r   actr   linear_2)selfre   rf   rg   rh   ri   rj   	__class__s          rT   rn   z!LlavaMultiModalProjector.__init__   s     	,*%'''
 
 
 233)*%'''
 
 
rS   image_featuresreturnc                     |                      |          \  }}|                     |          }|                     |          \  }}|S N)ro   rp   rq   )rr   rt   hidden_states_s       rT   forwardz LlavaMultiModalProjector.forward   sE    ==88q//==77qrS   )Nrd   )rK   rL   rM   intstrboolr   rn   rP   rQ   rz   __classcell__rs   s   @rT   rc   rc   ~   s         37
 

 
 "	

 $(
 )4/
 
 
 
 
 
 
6el u|        rS   rc   c                   z    e Zd ZU ee         ed<   ee         ed<   ee         ed<   eeee         z           ed<   dS )LlavaLikeConfigvision_configimage_token_indexvision_feature_select_strategyvision_feature_layerN)	rK   rL   rM   r   r   rO   r{   r|   rZ   rR   rS   rT   r   r      s]         )****Sz!!!$)#J...d3i000000rS   r   c                   &    e Zd ZU ee         ed<   dS )LlavaLikeProcessorimage_tokenN)rK   rL   rM   r   r|   rO   rR   rS   rT   r   r      s"         srS   r   c                       e Zd ZdefdZd Zededefd            Z	de
eedz  f         fdZded	edefd
ZdefdZdefdZdS )BaseLlavaProcessingInforu   c                 @    | j                             t                    S rw   )ctxget_hf_configr   rr   s    rT   r   z%BaseLlavaProcessingInfo.get_hf_config   s    x%%k222rS   c                 D    t          |                                           S rw   )rA   r   r   s    rT   rA   z/BaseLlavaProcessingInfo.get_vision_encoder_info   s    &t'9'9';';<<<rS   kwargsc                     t           rw   NotImplementedErrorrr   r   s     rT   get_hf_processorz(BaseLlavaProcessingInfo.get_hf_processor   s    !!rS   Nc                 
    dd iS )NimagerR   r   s    rT   get_supported_mm_limitsz/BaseLlavaProcessingInfo.get_supported_mm_limits   s    rS   image_widthimage_heightc                    |                                  }|                                 }t          |                    ||          |j                  S Nr   r   )r   rA   r@   get_num_image_tokensr   )rr   r   r   	hf_configvision_encoder_infos        rT   r   z,BaseLlavaProcessingInfo.get_num_image_tokens   s_     &&((	"::<<-44') 5   4
 
 	
rS   c                 x    |                                  }|                                x}}t          ||          S )N)widthheight)rA   get_image_sizer$   )rr   r   r   r   s       rT   !get_image_size_with_most_featuresz9BaseLlavaProcessingInfo.get_image_size_with_most_features   s=    "::<<,;;===uV4444rS   c                 ^    |                                  \  }}|                     ||          S r   )r   r   )rr   target_widthtarget_heights      rT   get_max_image_tokensz,BaseLlavaProcessingInfo.get_max_image_tokens   s;    &*&L&L&N&N#m(($& ) 
 
 	
rS   )rK   rL   rM   r   r   rA   r   objectr   r   r   r|   r{   r   r   r$   r   r   rR   rS   rT   r   r      s        3 3 3 3 3= = = " "4F " " " ^"cDj)A    
 
 	

 

 
 
 
"59 5 5 5 5

c 
 
 
 
 
 
rS   r   _I)boundc            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	LlavaDummyInputsBuilder	mm_countsru   c                 x    |                     dd          }| j                                        }|j        }||z  S )Nr   r   )getinfor   r   )rr   r   
num_images	processorr   s        rT   get_dummy_textz&LlavaDummyInputsBuilder.get_dummy_text   s;    ]]7A..
I..00	+Z''rS   Nseq_len
mm_optionsc                     |                     dd          }| j                                        \  }}|r|                     d          nd }d|                     ||||          iS )Nr   r   )r   r   r   	overrides)r   r   r   _get_dummy_images)rr   r   r   r   r   r   r   image_overridess           rT   get_dummy_mm_dataz)LlavaDummyInputsBuilder.get_dummy_mm_data   s|     ]]7A..
&*i&Q&Q&S&S#m5?I*..111T T++"$%)	 ,  
 	
rS   rw   )
rK   rL   rM   r   r|   r{   r   r   r   r   rR   rS   rT   r   r      s        (S(9 (c ( ( ( ( =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
rS   r   c                       e Zd ZdefdZdS )LlavaProcessingInfor   c                      | j         j        t          fi |}|j        -|                                                                 }||_        |S rw   )r   r   r   
patch_sizerA   get_patch_size)rr   r   hf_processorr   s       rT   r   z$LlavaProcessingInfo.get_hf_processor   sT    0tx0JJ6JJ "*5577FFHHJ&0L#rS   NrK   rL   rM   r   r   rR   rS   rT   r   r      s/              rS   r   c            	           e Zd Zededeeef         deeef         fd            Z	de
deeef         dedee         fdZdS )	BaseLlavaMultiModalProcessor	hf_inputshf_processor_mm_kwargsru   c                     t           rw   r   rr   r   r   s      rT   _get_mm_fields_configz2BaseLlavaMultiModalProcessor._get_mm_fields_config  s
     "!rS   mm_itemsout_mm_kwargsc                       j                                         }|j        dt          f fd}t	          dg|          gS )Nitem_idxc                                          dt          t          f          }t          |t                    r|                    |           }n;|                    |           }j                            |j        |j	                  }g|z  S )Nr   r   )
	get_itemsr"   r#   
isinstanceget_feature_sizer   r   r   r   r   )r   imagesnum_image_tokens
image_sizeimage_token_idr   rr   s       rT   get_replacementzIBaseLlavaMultiModalProcessor._get_prompt_updates.<locals>.get_replacement  s    ''-/BC F &"566 #)#:#:8#D#D  #228<<
#'9#A#A * 0!+!2 $B $ $ 
 ##&666rS   r   modalitytargetreplacement)r   r   r   r{   r*   )rr   r   r   r   r   r   r   s   ``    @rT   _get_prompt_updatesz0BaseLlavaMultiModalProcessor._get_prompt_updates  s{     I++--	"4	7c 	7 	7 	7 	7 	7 	7 	7 	7"  &'+  
 	
rS   N)rK   rL   rM   r   r   r   r|   r   r   r   r%   r    r   r+   r   rR   rS   rT   r   r     s        "" !(V 4" 
++	,	" " " ^"
%
 !(V 4
 -	

 
,	
 
 
 
 
 
rS   r   c                   B    e Zd Zdedeeef         deeef         fdZdS )LlavaMultiModalProcessorr   r   ru   c                 l    t          t          j        d          t          j        d                    S Nr   )rD   r]   dictr   batchedr   s      rT   r   z.LlavaMultiModalProcessor._get_mm_fields_config3  7    
 .6w??.6w??
 
 
 	
rS   N)	rK   rL   rM   r   r   r|   r   r   r   rR   rS   rT   r   r   2  sX        

 !(V 4
 
++	,	
 
 
 
 
 
rS   r   c                       e Zd ZdefdZdS )PixtralHFProcessingInfor   c                 2     | j         j        t          fi |S rw   )r   r   r   r   s     rT   r   z(PixtralHFProcessingInfo.get_hf_processor?  s     (tx()9DDVDDDrS   Nr   rR   rS   rT   r   r   >  s6        E E E E E E ErS   r   c            
            e Zd Zdedeeef         deeef         deeef         def
 fdZdedeeef         deeef         fd	Z	d
e
deeef         dedee         fdZ xZS )PixtralHFMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsru   c                    t                                          ||||          }|                    d          }|G|d         }t          |          t          |          k    sJ d t	          ||          D             |d<   |S )N)r   r   r   r   rD   image_sizesc                 <    g | ]\  }\  }}|d d d |d |f         S rw   rR   ).0prH   rI   s       rT   
<listcomp>zCPixtralHFMultiModalProcessor._call_hf_processor.<locals>.<listcomp>Y  sC     1 1 1!*FQ!!!RaR!)1 1 1rS   )rm   _call_hf_processorr   lenzip)	rr   r   r   r   r   processed_outputsrD   r   rs   s	           rT   r   z/PixtralHFMultiModalProcessor._call_hf_processorD  s     "GG66!	 7 
 
 ),,^<<# ,M:K|$$K(8(888881 1.1,.L.L1 1 1n- ! rS   r   r   c                 l    t          t          j        d          t          j        d                    S r   r   r   s      rT   r   z2PixtralHFMultiModalProcessor._get_mm_fields_config_  r   rS   r   r   c                   	
  | j         j        di |}| j                                         }| j                                         }|                                }||j                 
|j        ||j                 t          |j	        t                    sJ t          |          	dt          f	
fd}t          dg|          gS )Nr   c                     
                     dt                    }|                    |           }                    |j        |j                  \  }}	g|z  gz   |z  }|d<   t          j        |	          S )Nr   r   )r   r#   r   get_patch_grid_sizer   r   r,   select_token_id)r   r   r   ncolsnrowstokensencoder_infoimage_break_idimage_end_idr   r   s         rT   r   zIPixtralHFMultiModalProcessor._get_prompt_updates.<locals>.get_replacement{  s    ''1DEEF..x88J';;&,'. <  LE5
 &&..1AAUJF%F2J&6v~NNNrS   r   r   rR   )r   r   r   get_tokenizer	get_vocabimage_break_tokenr   image_end_tokenr   r   r   r8   r{   r*   )rr   r   r   r   r   r   	tokenizervocabr   r  r  r  r   s    `       @@@@rT   r   z0PixtralHFMultiModalProcessor._get_prompt_updatesi  s    /DI.HH1GHH	I++--	I++--	##%%y:;"4Y67)13FGGGGG+I66	Oc 	O 	O 	O 	O 	O 	O 	O 	O 	O 	O  &'+  
 	
rS   )rK   rL   rM   r|   r   r   r   r   r   r   r%   r    r   r+   r   r~   r   s   @rT   r   r   C  s       !! f%! 3;'	!
 CK(! 
! ! ! ! ! !6

 !(V 4
 
++	,	
 
 
 
&
%&
 !(V 4&
 -	&

 
,	&
 &
 &
 &
 &
 &
 &
 &
rS   r   r   ru   c                     |                      t                    }t          |j        t                    rt          |           S t          |           S rw   )r   r   r   r   r   r   r   )r   r   s     rT   _build_llava_or_pixtral_hf_infor    sI     !!+..I))+>?? ,&s+++s###rS   cacher   dummy_inputsr  c                    t          | t                    rt          | ||          S t          | t                    rt	          | ||          S t          t          |                     )Nr  )r   r   r   r   r   r   rE   )r   r  r  s      rT   $_build_llava_or_pixtral_hf_processorr    s     $/00 
+
 
 
 	
 $+,, 
'
 
 
 	
 d4jj
)
))rS   r   c                 "   | j         }| j        j        t          |t                    rt          |          S t          |t          t          f          rt          fd|D                       S t          dt          |           d          )zDetermine the number of hidden layers to initialize up to in the
    visual encoder.

    Args:
        hf_config: Model config with vision feature layer(s).
    c              3   8   K   | ]}t          |          V  d S rw   )r=   )r   idxnum_hidden_layerss     rT   	<genexpr>z)_get_num_hidden_layers.<locals>.<genexpr>  s.      UUs?3(9::UUUUUUrS   zvision_layer_feature type: z is not supported)r   r   r  r   r{   r=   rZ   tuplemax	TypeErrorrE   )r   feature_layersr  s     @rT   _get_num_hidden_layersr    s     3N!/A.#&& V~/@AAA	NT5M	2	2 VUUUUnUUUUUU
Md>&:&:MMM  rS   rd   )require_post_normrj   ri   r  rj   c                f   | j         }t          |           }t          |t                    rt	          |||||          S t          |t
                    rt          |||||          S t          |t                    rt          |||||          S dt          |           }t          |          )N)ri   num_hidden_layers_overrider  rj   zUnsupported vision config: )r   r  r   r   r1   r   r:   r   r9   rE   r   )r   ri   r  rj   r   r  msgs          rT   init_vision_tower_for_llavar    s     +M /y99-!122 
%'8/
 
 
 	
 
M#5	6	6 
 %'8/
 
 
 	
 
M#6	7	7 
#%'8/
 
 
 	
 >](;(;
=
=C
c
"
""rS   )r   r  c                       e Zd Zg dddgdZ eddddd	
          Zededededz  fd            Z	de
edf         ddfdZde
edf         fdZdddededdf fdZdededz  fdZdeez  ez  dej        eej                 z  dej        e
ej        df         z  fdZdeez  dej        e
ej        df         z  fdZd edej        e
ej        df         z  fd!Zdedefd"Z	 	 d1d#ej        d$ej        d%edz  d&ej        dz  dedej        ez  fd'Z d(ej        dej        dz  fd)Z!d*e"e
eej        f                  de#e         fd+Z$de%fd,Z&d-edefd.Z'd/edefd0Z( xZ)S )2LlavaForConditionalGeneration)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projzlanguage_model.model.zvision_tower.zmulti_modal_projector.zlanguage_model.lm_head.)zmodel.language_model.zmodel.vision_tower.zmodel.multi_modal_projector.zlm_head.)orig_to_new_prefixr   iru   Nc                 N    |                     d          rdS t          d          )Nr   <image>z Only image modality is supported)
startswith
ValueError)clsr   r*  s      rT   get_placeholder_strz1LlavaForConditionalGeneration.get_placeholder_str  s,    w'' 	9;<<<rS   layers.c                 B    ||                                  j        _        d S rw   )get_language_modelmodelaux_hidden_state_layers)rr   r1  s     rT   set_aux_hidden_state_layersz9LlavaForConditionalGeneration.set_aux_hidden_state_layers  s    BH!!'???rS   c                 n    t          |                                 j        j                  }d|dz  |dz
  fS )N   rG   )r   r3  r4  r1  )rr   
num_layerss     rT   "get_eagle3_aux_hidden_state_layersz@LlavaForConditionalGeneration.get_eagle3_aux_hidden_state_layers  s6    00228?@@
:?JN33rS   rd   )rj   vllm_configrj   c                4   t                                                       |j        j        }|j        }|j        j        }|| _        || _        |j        j        |j        j	        dk    rdg|j        _        |j
        |j        j        dk    rd|_
        |                     |d          5  t          ||dt          |d                    | _        t#          |j        j        |j        j        |j
        |j        |t          |d          	          | _        d d d            n# 1 swxY w Y   |                     |          5  t-          ||j        t          |d
                    | _        d d d            n# 1 swxY w Y   | j        j        | _        d S )NmistralMistralForCausalLMgelur   Fvision_tower)ri   r  rj   multi_modal_projector)re   rf   rg   rh   ri   rj   language_model)r;  r   rj   )rm   rn   model_configr   ri   multimodal_configconfigtext_configarchitectures
model_typerg   r   
hidden_act_mark_tower_modelr  r?   r@  rc   hidden_sizerh   rA  _mark_language_modelr>   rB  make_empty_intermediate_tensors)rr   r;  rj   rE  ri   rD  rs   s         rT   rn   z&LlavaForConditionalGeneration.__init__  s)   )3"/'4F!2
 ,4"-::0D/EF,'/$/699*0F'##K99 	 	 ;)"'#FN;;	! ! !D *B#)#7#C!'!3!?%+%@*0*J)#F,CDD* * *D&	 	 	 	 	 	 	 	 	 	 	 	 	 	 	  &&{33 	 	"<' ,#F,<==# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ? 	,,,s%   ,A-D%%D),D)+E<<F F r   c                    |                     dd           }|                     dd           }||d S |N| j        j        j        dk    rt	          d|          S | j        j        j        x}}t          d|||d          S |5| j        j        j        dk    rt          d          t          d|	          S t          d
          )NrD   r]   pixtralrW   )rE   rD   )rH   rI   )rE   rD   resolve_bindingsz)Pixtral-HF does not support image_embeds.)rE   r`   z This line should be unreachable.)
poprE  r   rH  rV   r   rC   r.  r\   AssertionError)rr   r   rD   r]   
expected_h
expected_ws         rT   _parse_and_validate_image_inputz=LlavaForConditionalGeneration._parse_and_validate_image_inputH  s    zz.$77zz.$77L$84#{(3y@@0/!-   
 '+k&?&JJJ(#)'1
!C!C    #{(3y@@ !LMMM,#!   
 ?@@@rS   r@  rD   c                 0     ||| j         j                  S )N)feature_select_strategy)rE  r   )rr   r@  rD   s      rT   _image_pixels_to_featuresz7LlavaForConditionalGeneration._image_pixels_to_featuresj  s(     |$(K$N
 
 
 	
rS   inputsc                 H    |d         }|                      | j        |          S )NrD   )rX  r@  )rr   rY  rD   s      rT   _process_image_pixelsz3LlavaForConditionalGeneration._process_image_pixelsv  s'     n---d.?NNNrS   image_inputc                 F   |d         dk    r|d         S |                      |          }t          |t          j                  r|                     |          S d |D             }|                     t          j        |                    }t          j        ||          }|S )NrE   r]   r`   c                 (    g | ]}|j         d          S )r   )shape)r   image_features     rT   r   zFLlavaForConditionalGeneration._process_image_input.<locals>.<listcomp>  s     TTTM,Q/TTTrS   )r[  r   rP   rQ   rA  catsplit)rr   r\  rt   feature_sizesr]   s        rT   _process_image_inputz2LlavaForConditionalGeneration._process_image_input~  s     v.00v&&33K@@nel33 	>--n===TT^TTT11%)N2K2KLL{<??rS   c                 N     | j         di |}|g S |                     |          S )NrR   )rU  rd  )rr   r   r\  s      rT   embed_multimodalz.LlavaForConditionalGeneration.embed_multimodal  s9    :d:DDVDDI((555rS   	input_ids	positionsintermediate_tensorsinputs_embedsc                 J    |d}| j                             ||||          }|S )a  Run forward pass for LLaVA-1.5.

        One key thing to understand is the `input_ids` already accounts for the
        positions of the to-be-inserted image embeddings.

        Concretely, consider a text prompt:
        `"USER: <image>\nWhat's the content of the image?\nASSISTANT:"`.

        Tokenizer outputs:
        `[1, 3148, 1001, 29901, 29871, 32000, 29871, 13, 5618, 29915, 29879,
        278, 2793, 310, 278, 1967, 29973, 13, 22933, 9047, 13566, 29901]`.

        To reserve space in KV cache, we have to insert placeholder tokens
        before they are inputted to the model, so the input processor prepends
        additional image tokens (denoted as `32000`), resulting in:
        `[1, 3148, 1001, 29901, 29871, 32000, ..., 32000, 29871, 13, 5618,
        29915, 29879, 278, 2793, 310, 278, 1967, 29973, 13, 22933, 9047, 13566,
        29901]`.

        We insert 575 tokens so that including the original image token in the
        input, there are a total of 576 (24 * 24) image tokens, which
        corresponds to the number of image tokens inputted to the language
        model, i.e. the number of image tokens outputted by the visual encoder.

        This way, the `positions` and `attn_metadata` are consistent
        with the `input_ids`.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
            positions: Position indices for the input tokens.
            intermediate_tensors: Intermediate tensors from prior forward pass.
            inputs_embeds: Optional tensor of input embeddings.

        Info:
            [`LlavaImageInputs`][vllm.model_executor.models.llava.LlavaImageInputs]
        N)rj  )rB  r4  )rr   rg  rh  ri  rj  r   rx   s          rT   rz   z%LlavaForConditionalGeneration.forward  s?    Z  + M+11y"6m 2 
 
 rS   rx   c                 6    | j                             |          S rw   )rB  compute_logits)rr   rx   s     rT   rm  z,LlavaForConditionalGeneration.compute_logits  s     "11-@@@rS   weightsc                 X    t          |           }|                    || j                  S )N)mapper)r;   load_weightshf_to_vllm_mapper)rr   rn  loaders      rT   rq  z*LlavaForConditionalGeneration.load_weights  s+    "4((""743I"JJJrS   c                 0    t          j        ddd          S )z<
        Get the module prefix in multimodal models
        rB  rA  r@  )rB  	connectortower_model)r7   from_string_fieldr   s    rT   get_mm_mappingz,LlavaForConditionalGeneration.get_mm_mapping  s'     /+-&
 
 
 	
rS   r   c                     |S rw   rR   )rr   r   s     rT   get_num_mm_encoder_tokensz7LlavaForConditionalGeneration.get_num_mm_encoder_tokens  s
      rS   num_vision_tokensc                     |S rw   rR   )rr   r{  s     rT   get_num_mm_connector_tokensz9LlavaForConditionalGeneration.get_num_mm_connector_tokens  s
     ! rS   NN)*rK   rL   rM   packed_modules_mappingr<   rr  classmethodr|   r{   r0  r  r6  r:  r   rn   r   ra   rU  r1   r:   r9   rP   rQ   rZ   rX  rC   rV   r[  rd  r2   rf  r-   rz   rm  r   setrq  r7   rx  rz  r}  r~   r   s   @rT   r!  r!    s        322$i0 
 & &=#2,D1
 
   =3 =3 =3: = = = [=I%S/ Id I I I I4E#s(O 4 4 4 4 BD 0
 0
 0
z 0
3 0
 0
 0
 0
 0
 0
 0
d A A	D	  A  A  A  AD

%(99<PP

 lT%,%77

 
elC/0	0	

 

 

 

O%(AAO 
elC/0	0O O O O% 
elC/0	0   $6 64H 6 6 6 6 <@-14 4<4 <4 2D8	4
 |d*4 4 
+	+4 4 4 4lA|A 
	A A A AKHU33D-E$F K3s8 K K K K
 
 
 
 
   
       !! 
! ! ! ! ! ! ! !rS   r!  c                       e Zd ZdefdZdS )MantisProcessingInfor   c                    |                                  }|                                 }|                    d|                                           |                    d|j                    | j        j        t          fi |S )Nr   r   )r   rA   
setdefaultr   r   r   r   r   )rr   r   r   vision_infos       rT   r   z%MantisProcessingInfo.get_hf_processor  s    &&((	2244,(B(B(D(DEEE,4	
 	
 	

 )tx(BB6BBBrS   Nr   rR   rS   rT   r  r    s6        
C 
C 
C 
C 
C 
C 
CrS   r  c                   |     e Zd Z	 	 d	deee         z  dedeeef         deeef         dz  de	dz  de
f fdZ xZS )
MantisMultiModalProcessorNr   r   r   tokenization_kwargsmm_uuidsru   c                    | j                                         }|j        }| j                             dd          t	                                          |||||          }|                     |          }	|	                                }
|d         }|d         }dt          ffd}| 	                    t          d|gz  |	          g|
          }|                     |d
         |          \  }}|                     |	||          }|                     ||          }|                     ||
           d |                                D             }t!          d||||          S )Nr   r   )r  r   	mm_hashesr   c                 H    d                     d| dz    ddz  dg          S )Nrd   z(image r0   z	: <Image>r,  z	</Image>))join)r   r   s    rT   get_replacement_mantisz?MantisMultiModalProcessor.apply.<locals>.get_replacement_mantis!  s<    775hl555 00  rS   r   r   prompt_token_idsc                 .    i | ]\  }}|d  |D             S )c                 6    g | ]}|                                 S rR   )to_range)r   items     rT   r   z>MantisMultiModalProcessor.apply.<locals>.<dictcomp>.<listcomp>C  s     @@@4t}}@@@rS   rR   )r   r   placeholderss      rT   
<dictcomp>z3MantisMultiModalProcessor.apply.<locals>.<dictcomp>B  s=     !
 !
 !
&, @@<@@@!
 !
 !
rS   
multimodal)rE   r  r   r  mm_placeholders)r   r   r   r   rm   apply_to_mm_itemsget_all_countsr{   _bind_and_group_updatesr*   _apply_prompt_updates_get_mm_prompt_updates_find_mm_placeholders_validate_mm_placeholdersitemsr   )rr   r   r   r   r  r  r   r   resultr   mm_item_countsr   r  r  mantis_mm_repls
prompt_idsry   
orig_replsr  mm_placeholder_rangesr   rs   s                       @rT   r  zMantisMultiModalProcessor.apply  s    I++--	"4  999 : 
 

 "  
 
 $$W--!0022;'	;'		S 	 	 	 	 	 	 66!$*+.>> 6   	
 	
 22%&
 

A
 00"
 


 44ZLL&&GGG!
 !
*9*?*?*A*A!
 !
 !

  '1
 
 
 	
rS   r~  )rK   rL   rM   r|   rZ   r{   r   r   r   r!   r   r  r~   r   s   @rT   r  r     s         <@.2L
 L
d3iL
 $L
 !(V 4	L

 %S&[1D8L
 %t+L
 
L
 L
 L
 L
 L
 L
 L
 L
 L
 L
rS   r  c                       e Zd ZdS )MantisForConditionalGenerationN)rK   rL   rM   rR   rS   rT   r  r  R  s         	DrS   r  )wabcr   collections.abcr   r   r   typingr   r   r	   r
   r   r   rP   torch.nnnntransformersr   r   r   r   r   r   transformers.models.llavar   transformers.models.pixtralr   vllm.configr   vllm.config.multimodalr   %vllm.model_executor.layers.activationr   !vllm.model_executor.layers.linearr   r   'vllm.model_executor.layers.quantizationr   vllm.multimodalr   vllm.multimodal.cacher   vllm.multimodal.inputsr   r   r   r    r!   vllm.multimodal.parser"   r#   r$   r%   vllm.multimodal.processingr&   r'   r(   r)   r*   r+   r,   vllm.sequencer-   vllm.utils.tensor_schemar.   r/   clipr1   
interfacesr2   r3   r4   r5   r6   module_mappingr7   rO  r8   r9   siglipr:   utilsr;   r<   r=   r>   r?   visionr@   rA   rC   rV   r\   ra   rO   Modulerc   r   r   r   r   r   r   r   r   r   r   r  r  r{   r  r}   r|   r  register_processorr!  r  r  r  rR   rS   rT   <module>r     sW          7 7 7 7 7 7 7 7 7 7 J J J J J J J J J J J J J J J J                       5 4 4 4 4 4 8 8 8 8 8 8 " " " " " " 3 3 3 3 3 3 < < < < < < U U U U U U U U F F F F F F / / / / / / > > > > > >                                          . - - - - - > > > > > > > > ! ! ! ! ! !              + * * * * * ? ? ? ? ? ? ? ? % % % % % %              L K K K K K K KJ J J J JL J J J        &	B 	B 	B 	B 	B 	B 	B 	B 558QQ )   
         ry      F1 1 1 1 1h 1 1 1       *
 *
 *
 *
 *
0 *
 *
 *
Z WT0111
 
 
 
 
4R8 
 
 
>    1   )
 )
 )
 )
 )
#:2#> )
 )
 )
X	
 	
 	
 	
 	
;<OP 	
 	
 	
E E E E E5 E E E
L
 L
 L
 L
 L
#:;R#S L
 L
 L
^$	$$ $ $ $ 26	* * *
*(,* ($.	*
 * * * *.o #    0 &*&# &# &#&#$t+&# d{	&#
 &# ((+??&# &# &# &#R ('(	((  
y! y! y! y! y!I|/^y! y! 
y!xC C C C C. C C CM
 M
 M
 M
 M
 8 M
 M
 M
d ('	(  
	 	 	 	 	%B 	 	 
	 	 	rS   