
    .`iV                        U d dl mZ d dlmZmZ d dlmZmZmZm	Z	m
Z
mZ d dlZd dlmZ d dlmZmZmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dl m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/ ddl0m1Z1 ddl2m3Z3m4Z4m5Z5m6Z6 ddl7m8Z8  G d de!          Z9 G d de!          Z:e9e:z  Z;e
e<d<    G d de-e	          Z= G d de+          Z> ede>          Z? G d d e*e?                   Z@ G d! d"e@e>                   ZA ejB        eAe>e,#           G d$ d%ejC        e'e(                      ZDdS )&    )abstractmethod)IterableMapping)	AnnotatedFinalLiteralProtocol	TypeAliasTypeVarN)BatchFeatureLlavaNextConfigLlavaNextProcessor)get_anyres_image_grid_shapeunpad_image)
VllmConfig)MULTIMODAL_REGISTRY)MultiModalFieldConfig)	ImageSize)IntermediateTensors)TensorSchemaTensorShape   )CLIPVisionModel)MultiModalEmbeddingsSupportsMultiModal
SupportsPP)BaseLlavaMultiModalProcessorBaseLlavaProcessingInfoLlavaDummyInputsBuilderLlavaLikeConfigLlavaMultiModalProjectorinit_vision_tower_for_llava)SiglipVisionModel)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix)get_num_selected_vision_tokensc            
           e Zd ZU dZdZed         ed<   eej	        e
ej	                 z   eddddddh	          f         ed<   eej	        d
z   edd          f         ed<   d
S )LlavaNextImagePixelInputsa?  
    Dimensions:
        - bn: Batch size * number of images
        - np: Number of patches + 1
        - c: Number of channels (3)
        - h: Height
        - w: Width

    Note that `num_patches` may be different per batch and image,
    in which case the data is passed as a list instead of a batched tensor.
    pixel_valuestypebnnp   hw)dynamic_dimsN   image_sizes)__name__
__module____qualname____doc__r,   r   __annotations__r   torchTensorlistr        y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/llava_next.pyr*   r*   +   s         
 
 %3D'.
!222tEL))D$34&AAA	C   
 5<$.D!0D0DDEEEEEEr>   r*   c                   h    e Zd ZU dZdZed         ed<   eej	         e
ddd          f         ed<   dS )	LlavaNextImageEmbeddingInputsz
    Dimensions:
        - bn: Batch size * number of images
        - ifs: Image feature size
        - hs: Hidden size (must match language model backbone)
    image_embedsr,   r-   ifshsdataN)r5   r6   r7   r8   r,   r   r9   r   r:   r;   r   r=   r>   r?   rA   rA   B   sY           %3D'.
!222
EL++dE4"@"@@
AAAAAAr>   rA   LlavaNextImageInputsc                   >    e Zd ZU eeee                           ed<   dS )LlavaNextLikeConfigimage_grid_pinpointsN)r5   r6   r7   r   r<   intr9   r=   r>   r?   rH   rH   S   s+         T#Y000000r>   rH   c                   v    e Zd ZdefdZdefdZdededefdZded	ed
ededede	eef         fdZ
defdZdS )LlavaNextProcessingInforeturnc                 @    | j                             t                    S N)ctxget_hf_configr   )selfs    r?   rQ   z%LlavaNextProcessingInfo.get_hf_configX   s    x%%o666r>   kwargsc                      | j         j        t          fi |}|j        -|                                                                 }||_        |S rO   )rP   get_hf_processorr   
patch_sizeget_vision_encoder_infoget_patch_size)rR   rS   hf_processorrV   s       r?   rU   z(LlavaNextProcessingInfo.get_hf_processor[   sU    0tx01CNNvNN "*5577FFHHJ&0L#r>   image_widthimage_heightc                p   |                                  }|                                 }t          |                    ||          |j                  }t          ||f|j        |                                          \  }}|                     |||	                                ||          \  }}	||	z   |z   S )NrZ   r[   )
image_sizegrid_pinpointsrV   )original_heightoriginal_widthnpatchesnum_patch_heightnum_patch_width)
rQ   rW   r(   get_num_image_tokensvision_feature_select_strategyr   rI   get_image_size_get_num_unpadded_featuresget_patch_grid_length)
rR   rZ   r[   	hf_configvision_encoder_infobase_feature_sizerc   rd   unpadded_feature_sizenewline_feature_sizes
             r?   re   z,LlavaNextProcessingInfo.get_num_image_tokensg   s     &&((	"::<<:44') 5   4
 
 -H$k2$9*99;;-
 -
 -
)/ ++(&(>>@@-+ , 
 
	
!  %';;>OOOr>   r`   ra   rb   rc   rd   c                   ||z  }||z  }||z  }||z  }	||	k    r4t          t          |||z  z  d                    }
||
z
  dz  }|d|z  z
  }n3t          t          |||z  z  d                    }||z
  dz  }|d|z  z
  }||z  }|}||fS )N   r3   )rJ   round)rR   r`   ra   rb   rc   rd   current_heightcurrent_widthaspect_ratiocurrent_aspect_ratio
new_heightpadding	new_widthunpadded_featuresnewline_featuress                  r?   rh   z2LlavaNextProcessingInfo._get_num_unpadded_features   s     "$44 ?2%7,~=...o)GH!LL J &
2q8G+q7{;NNn(HI1MM I %y0Q6G)Q[9M*]:)!#344r>   c                     |                                  }d\  }}|j        D ]5\  }}|                     ||          }||k    r|}t          ||          }6|dk    s|t	          d          |S )N)r   Nr]   )widthheightr   z(Cannot have a largest feature size of 0!)rQ   rI   re   r   
ValueError)rR   rj   largest_feature_sizelargest_feature_pinpointr}   r|   	feat_sizes          r?   !get_image_size_with_most_featuresz9LlavaNextProcessingInfo.get_image_size_with_most_features   s    &&((	9@66&; 	Q 	QMFE11! 2  I ///'0$+45+P+P+P(1$$(@(HGHHH''r>   N)r5   r6   r7   rH   rQ   objectrU   rJ   re   tuplerh   r   r   r=   r>   r?   rL   rL   W   s        72 7 7 7 7	 	 	 	 	"P "P 	"P
 
"P "P "P "PJ5 5 	5
 5 5 5 
sCx5 5 5 5B(9 ( ( ( ( ( (r>   rL   _I)boundc            	       R    e Zd Zededeeef         deeef         fd            Z	dS ) BaseLlavaNextMultiModalProcessor	hf_inputshf_processor_mm_kwargsrM   c                     t           rO   )NotImplementedErrorrR   r   r   s      r?   _get_mm_fields_configz6BaseLlavaNextMultiModalProcessor._get_mm_fields_config   s
     "!r>   N)
r5   r6   r7   r   r   r   strr   r   r   r=   r>   r?   r   r      sb        "" !(V 4" 
++	,	" " " ^" " "r>   r   c                   B    e Zd Zdedeeef         deeef         fdZdS )LlavaNextMultiModalProcessorr   r   rM   c                     t          t          j        d          t          j        d          t          j        d                    S )Nimage)r+   r4   rB   )dictr   batchedr   s      r?   r   z2LlavaNextMultiModalProcessor._get_mm_fields_config   sE    
 .6w??-5g>>.6w??
 
 
 	
r>   N)	r5   r6   r7   r   r   r   r   r   r   r=   r>   r?   r   r      sX        	
	
 !(V 4	
 
++	,		
 	
 	
 	
 	
 	
r>   r   )infodummy_inputsc                       e Zd Z edddddd          Zeded	ed
edz  fd            Zddde	ded
df fdZ
ded
edz  fdZdeez  dej        d
ej        fdZdej        dej        ded
ej        fdZded
ej        eej        df         z  fdZded
ej        eej                 z  fdZded
efd Z	 d0dd!d"d#ej        d$edz  d%ej        dz  d&ed
ej        f
 fd'Z	 	 d1d#ej        d(ej        d)edz  d*ej        dz  ded
ej        ez  fd+Zd,ej        d
ej        dz  fd-Zd.e eeej        f                  d
e!e         fd/Z" xZ#S )2!LlavaNextForConditionalGenerationzlanguage_model.model.zvision_tower.zmulti_modal_projector.image_newlinezlanguage_model.lm_head.)zmodel.language_model.zmodel.vision_tower.zmodel.multi_modal_projector.zmodel.image_newlinezlm_head.)orig_to_new_prefixmodalityirM   Nc                 N    |                     d          rdS t          d          )Nr   z<image>z Only image modality is supported)
startswithr~   )clsr   r   s      r?   get_placeholder_strz5LlavaNextForConditionalGeneration.get_placeholder_str   s,    w'' 	9;<<<r>    )prefixvllm_configr   c          
         t                                                       |j        j        }|j        }|j        j        }|j        }t          |t                    r|j	        j
        }d | _        n`t          |t          t          f          r$|j	        j
        t          |          z  }|| _        n t          dt!          |           d          || _        || _        |                     |d          5  t'          ||dt)          |d                    | _        t-          j        t1          j        |j        j
                            | _        t9          ||j        j
        |j        |j                  | _        d d d            n# 1 swxY w Y   |                      |          5  tC          ||j        t)          |d          	          | _"        d d d            n# 1 swxY w Y   | j"        j#        | _#        d S )
Nzvision_layer_feature type: z is not supportedr   Fvision_tower)quant_configrequire_post_normr   )vision_hidden_sizetext_hidden_sizeprojector_hidden_actmultimodal_projector_biaslanguage_model)r   rj   r   )$super__init__model_configrj   r   multimodal_configvision_feature_layer
isinstancerJ   vision_confighidden_sizeselect_layersr<   r   len	TypeErrorr,   config_mark_tower_modelr"   r'   r   nn	Parameterr:   emptytext_configr   r!   r   r   multi_modal_projector_mark_language_modelr&   r   make_empty_intermediate_tensors)	rR   r   r   r   r   r   r   r   	__class__s	           r?   r   z*LlavaNextForConditionalGeneration.__init__   s   )3"/'4F%:*C00 	!'!5!A!%D,tUm<< 		!'!5!AC$E E " "6D$d3G.H.H $ $ $  
 !2##K99 	 	 ;)"'#FN;;	! ! !D "$F.:;;" "D *B#5!'!3!?%+%@*0*J	* * *D&	 	 	 	 	 	 	 	 	 	 	 	 	 	 	" &&{33 	 	"<' ,#F,<==# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ? 	,,,s%   5BF		FF)+G  G$'G$rS   c                 ,   |                     dd           }|                     dd           }|                     dd           }||d S |)| j        j        j        x}}t	          d||||d          S |t          d|          S t          d          )Nr+   r4   rB   )r0   r1   )r,   r+   r4   resolve_bindings)r,   rE   z This line should be unreachable.)popr   r   r^   r*   rA   AssertionError)rR   rS   r+   r4   rB   
expected_h
expected_ws          r?   _parse_and_validate_image_inputzALlavaNextForConditionalGeneration._parse_and_validate_image_input+  s     zz.$77jj55zz.$77L$84#&*k&?&JJJ,#)'##" "	    #0#!   
 ?@@@r>   r   r+   c                 <     ||| j         | j        j                  S )N)r   feature_select_strategy)r   r   rf   )rR   r   r+   s      r?   _image_pixels_to_featuresz;LlavaNextForConditionalGeneration._image_pixels_to_featuresI  s.     |,$(K$N
 
 
 	
r>   r^   patch_embeddingsstrategyc          	      N   |dk    r|                     dd          S |                    d          rb| j        j        j        | j        j        j        z  x}}|d         }||z  |j        d         k    rt          d          |j        d         dk    r|dd          }|                                \  }}	t          ||	f| j        j
        | j        j        j                  \  }
}|
|z  }|d |                             |
|||d          }d|v r|                    ddd	dd
                                                               dd	                               d	d
          }t          |||	f          }t          j        | | j        d d d d f         j        g |j        d d         dR                      |j                  fd          }|                     dd	                              dd          }n?|                    dd	dd
d                                                               dd
          }t          j        ||fd          }nBd|v r<t          j        || j        d                              |j                  fd          }n|}|S t          d|           )Nflatr   r   spatialz<The number of patches is not consistent with the image size.unpad   r3   r/   )dimz!Unexpected patch merge strategy: )flattenr   r   r   r^   rV   shaper~   tolistr   rI   viewpermute
contiguousr   r:   catr   expandtodevice	transpose)rR   r^   r   r   r}   r|   base_patch_embedsother_patch_embedsorig_height
orig_widthrc   rd   num_patchesmerged_patch_embeddingss                 r?   _merge_image_patch_embeddingsz?LlavaNextForConditionalGeneration._merge_image_patch_embeddingsW  s2    v#++Aq111y)) K	+)4;,78FU
 !1 3~!2!8!;;; R    %a(1,,%5abb%9" +5*;*;*=*='Z 5P *-K4K-85 51 /
 /@ &8%E%J%J$ovub& &" h&&*221aAqAA# A A	 ' *5*[*,E* *& */.$D.qqq$}=#G%7%=crc%BGDEG G GR 2 9::	 * * *& *<)C)CAq)I)I)S)S1* *&&
 +221aAqAA# A ' +0)&(:;+ + +'' h&&.3i- .t4778I8PQQ / / /++ /@+**GXGGHHHr>   inputs.c                    |d         }t          |t          j                  rt|j        \  }}}}}|                    ||z  |||          }|                     | j        |          }	|                     |	          }
 |
j        ||g|
j        dd          R  S d |D             }t          j        |          }|                     | j        |          }	t          j	        |                     |	          |          S )Nr+   r   c                 (    g | ]}|j         d          S )r   )r   ).0vs     r?   
<listcomp>zKLlavaNextForConditionalGeneration._process_image_pixels.<locals>.<listcomp>  s     B B B B B Br>   )
r   r:   r;   r   r   r   r   r   r   split)rR   r   r+   br   cr0   r1   stacked_pixel_valuesstacked_image_featuresstacked_patch_embeddingsnum_patches_per_batchs               r?   _process_image_pixelsz7LlavaNextForConditionalGeneration._process_image_pixels  s6    n-lEL11 	&2&8#A{Aq!#/#4#4Q_aA#N#N %)%C%C!#7& &" (,'A'A&( ($ 1+0;!9!?!C    !C B\ B B B$y66!%!?!?3"
 "
 {&&'=>>@U
 
 	
r>   image_inputc                 r    |d         dk    r|d         S                       |          }|                    d          Xt          |d                   } j        j        }|j        xt          j        fdt          |          D                        fdt          |          D             S )Nr,   rB   rE   r4   c                     g | ]}gS r=   r=   )r   _default_heightdefault_widths     r?   r   zJLlavaNextForConditionalGeneration._process_image_input.<locals>.<listcomp>  s    LLLQ.-0LLLr>   c                 R    g | ]#\  }}                     |         |d           $S )spatial_unpad)r   )r   )r   r   patch_features_batchr4   rR   s      r?   r   zJLlavaNextForConditionalGeneration._process_image_input.<locals>.<listcomp>  sQ     
 
 
 (' ..A 4 /  
 
 
r>   )
r   getr   r   r   r^   r:   	as_tensorrange	enumerate)rR   r   r   
batch_sizer   r   r   r4   s   `    @@@r?   _process_image_inputz6LlavaNextForConditionalGeneration._process_image_input  s     v.00v&&55kBB!oom44[011J K5M-:-EEN]/LLLLL%
:K:KLLL K
 
 
 
 
 ,55E+F+F	
 
 
 	
r>   c                 R     | j         di |}|g S |                     |          }|S )Nr=   )r   r  )rR   rS   r   vision_embeddingss       r?   embed_multimodalz2LlavaNextForConditionalGeneration.embed_multimodal  s?    :d:DDVDDI 55kBB  r>   T)is_multimodalhandle_oov_mm_token	input_idsmultimodal_embeddingsr  r	  c                    ||!t                                          |          S t                                          ||||          S )N)r  r  r	  )r   embed_input_ids)rR   r
  r  r  r	  r   s        r?   r  z1LlavaNextForConditionalGeneration.embed_input_ids  sU     !(M,A77**9555ww&&"7' 3	 ' 
 
 	
r>   	positionsintermediate_tensorsinputs_embedsc                 J    |d}| j                             ||||          }|S )a$	  Run forward pass for LlaVA-NeXT.

        One key thing to understand is the `input_ids` already accounts for the
        positions of the to-be-inserted image embeddings.

        Concretely, consider a text prompt:
        `"A chat between a curious human and an artificial intelligence
        assistant. The assistant gives helpful, detailed, and polite answers to
        the human's questions.
        USER: <image>\nWhat is shown in this image? ASSISTANT:"`.

        Tokenizer outputs:
        `[1, 319, 13563, 1546, 263, 12758, 5199, 322, 385, 23116, 21082, 20255,
        29889, 450, 20255, 4076, 8444, 29892, 13173, 29892, 322, 1248, 568,
        6089, 304, 278, 5199, 29915, 29879, 5155, 29889, 3148, 1001, 29901,
        29871, 32000, 13, 5618, 338, 4318, 297, 445, 1967, 29973, 319, 1799,
        9047, 13566, 29901]`.

        To reserve space in KV cache, we have to insert placeholder tokens
        before they are inputted to the model, so the input processor prepends
        additional image tokens (denoted as `32000`), resulting in:
        `[1, 319, 13563, 1546, 263, 12758, 5199, 322, 385, 23116, 21082, 20255,
        29889, 450, 20255, 4076, 8444, 29892, 13173, 29892, 322, 1248, 568,
        6089, 304, 278, 5199, 29915, 29879, 5155, 29889, 3148, 1001, 29901,
        29871, 32000, ..., 32000, 13, 5618, 338, 4318, 297, 445, 1967, 29973,
        319, 1799, 9047, 13566, 29901]`.

        Unlike in LLaVA-1.5, the number of image tokens inputted to the language
        model depends on the original size of the input image. Including the
        original image token in the input, the required number of image tokens
        is given by [`LlavaNextProcessingInfo.get_num_image_tokens`][vllm.model_executor.models.llava_next.LlavaNextProcessingInfo.get_num_image_tokens].

        This way, the `positions` and `attn_metadata` are consistent
        with the `input_ids`.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
            positions: Position indices for the input tokens.
            intermediate_tensors: Intermediate tensors from prior forward pass.
            inputs_embeds: Optional tensor of input embeddings.

        Info:
            [`LlavaNextImageInputs`][vllm.model_executor.models.llava_next.LlavaNextImageInputs]
        N)r  )r   model)rR   r
  r  r  r  rS   hidden_statess          r?   forwardz)LlavaNextForConditionalGeneration.forward  s?    l  + M+11y"6m 2 
 
 r>   r  c                 6    | j                             |          S rO   )r   compute_logits)rR   r  s     r?   r  z0LlavaNextForConditionalGeneration.compute_logits<  s     "11-@@@r>   weightsc                 X    t          |           }|                    || j                  S )N)mapper)r$   load_weightshf_to_vllm_mapper)rR   r  loaders      r?   r  z.LlavaNextForConditionalGeneration.load_weightsB  s+    "4((""743I"JJJr>   rO   )NN)$r5   r6   r7   r%   r  classmethodr   rJ   r   r   r   r   rF   r   r   r#   r:   r;   r   r   r*   r   r   r<   r  r   r  boolr  r   r  r  r   setr  __classcell__)r   s   @r?   r   r      s{        & &=#2,D#21
 
	 	 	 =3 =3 =3: = = = [= BD 5
 5
 5
z 5
3 5
 5
 5
 5
 5
 5
 5
nAA		$A A A A<
%(99
 l
 
	
 
 
 
SI,SI:?,SIUXSI	SI SI SI SIj
)
 
elC/0	0
 
 
 
<
)
 
U\*	*
 
 
 
2! !4H ! ! ! ! >B

 .2$(
 
 
<
  4d:

 |d*
 "
 

 
 
 
 
 
0 <@-1< <<< << 2D8	<
 |d*< < 
+	+< < < <|A|A 
	A A A AKHU33D-E$F K3s8 K K K K K K K Kr>   r   )Eabcr   collections.abcr   r   typingr   r   r   r	   r
   r   r:   torch.nnr   transformersr   r   r   2transformers.models.llava_next.modeling_llava_nextr   r   vllm.configr   vllm.multimodalr   vllm.multimodal.inputsr   vllm.multimodal.parser   vllm.sequencer   vllm.utils.tensor_schemar   r   clipr   
interfacesr   r   r   llavar   r   r   r    r!   r"   siglipr#   utilsr$   r%   r&   r'   visionr(   r*   rA   rF   r9   rH   rL   r   r   r   register_processorModuler   r=   r>   r?   <module>r5     s          - - - - - - - - J J J J J J J J J J J J J J J J        J J J J J J J J J J       
 # " " " " " / / / / / / 8 8 8 8 8 8 + + + + + + - - - - - - > > > > > > > > ! ! ! ! ! ! L L L L L L L L L L                & % % % % %            3 2 2 2 2 2F F F F F F F F.	B 	B 	B 	B 	BL 	B 	B 	B  == i   
1 1 1 1 1/8 1 1 1e( e( e( e( e(5 e( e( e(P WT0111" " " " "'CB'G " " "
 
 
 
 
$%<=
 
 
 (' 	 (  
cK cK cK cK cK	3Ez cK cK 
cK cK cKr>   