
    .`ioO                        d Z ddlZddlmZmZ ddlmZmZ ddlZddl	m
Z
 ddlmZ ddlmZmZmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z:m;Z;m<Z< dZ=g dZ>ddddZ?ddd dZ@d!ej        d"eAfd#ZB G d$ d%ej
        jC                  ZD G d& d'e7          ZE G d( d)ej
        jF                  ZG G d* d+e0          ZH G d, d-e.eH                   ZI G d. d/e/eH                   ZJ e%jK        eJeHeI0           G d1 d2e
jC        e;e<                      ZLdS )3zPyTorch Ovis model.    N)IterableMapping)	AnnotatedLiteral)Tensor)gumbel_softmaxpadsoftmax)BatchFeaturePretrainedConfig)
VllmConfig)BaseDummyOptions)ReplicatedLinear)QuantizationConfig)
AIMv2Model)SiglipVisionModel)AutoWeightsLoader
flatten_bninit_vllm_registered_modelmaybe_prefix)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacement)IntermediateTensors)OvisProcessor)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModal
SupportsPPz<image>)iiiiiz	<unused0>z<|reserved_special_token_0|>z<|image_pad|>)gemma2llamaqwen2   i igP y_softdimc                     |                      |d          }t          j        | t          j                                      ||d          S )NT)keepdim)memory_formatg      ?)argmaxtorch
zeros_likelegacy_contiguous_formatscatter_)r-   r.   indexs      s/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/ovis.py	st_argmaxr9   N   sM    MM#tM,,E4   hsE3     c            	           e Zd Z	 	 ddededz  def fdZ	 	 ddededz  dedej        fdZ	e
dej        fd	            Ze
dej        fd
            Zdej        dej        fdZdej        dej        fdZdej        dej        fdZ xZS )VisualTokenizerN configquant_configprefixc                    t                                                       || _        |                     ||| d          | _        |j        t          t                    z
  }t          j	        
                    t          |j        j        |j        z  |j        z  |dd          t          j	                            |                    | _        d S )Nz	.backboner>   r?   r@   F)biasreturn_bias)super__init__r>   _init_backbonebackbone
vocab_sizelenIMAGE_INDICATOR_IDSr3   nn
Sequentialr   backbone_confighidden_sizehidden_stride	LayerNormhead)selfr>   r?   r@   head_dim	__class__s        r8   rF   zVisualTokenizer.__init__W   s     	++%''' , 
 
 $s+>'?'??H''&2&'&' !   Hx((

 

			r:   returnc                     |j         j        }|dk    rt          |j         |d|          S |dk    rt          |j         ||          S t	          d|           )Naimv2F)r>   r?   require_post_normr@   siglip_vision_modelrB   z)Unsupported visual tokenizer model_type: )rN   
model_typer   r   
ValueError)rS   r>   r?   r@   r[   s        r8   rG   zVisualTokenizer._init_backboner   s     +6
  -)"'	    000$-)   
 QZQQRRRr:   c                 X    t          | j                                                  j        S N)nextrR   
parametersdtyperS   s    r8   ra   zVisualTokenizer.dtype   s!    DI((**++11r:   c                 X    t          | j                                                  j        S r^   )r_   rR   r`   devicerb   s    r8   rd   zVisualTokenizer.device   s!    DI((**++22r:   logitsc                     | j         j        dk    rt          |d          }nk| j         j        dk    rt          || j         j        d          }n>| j         j        dk    rt          |d          }nt          d| j         j                   |S )	Nr
   r.   gumbel_argmaxT)tauhardr9   zLInvalid `max_type`, expected softmax or gumbel_argmax or st_argmax, but got )r>   tokenize_functionr
   r   rj   r9   r\   )rS   re   tokenss      r8   tokenizezVisualTokenizer.tokenize   s    ;(I55V,,,FF[*o==#FdKKKFF[*k99v2...FFI)-)FI I   r:   pixel_valuesc           	         |                      |          }| j        j        r|d d dd d d f         }| j        j        dk    r6|j        \  }}}t          |dz            }|dz  |k    s
J d            |                    ||||          }| j        j        || j        j        z  z
  | j        j        z  }t          |ddd|d|fdd          }||z  }|                    ||| j        j        z  | j        j        || j        j        z  | j        j        |          }|                    dddddd	          }|	                    d          }|                    |d
| j        j        | j        j        z  |z            }|S )Nr%   g      ?   z5The token sequence length should be a perfect square.r   constant         rg   )
rH   r>   drop_cls_tokenrP   shapeintreshaper	   permuteflatten)rS   ro   featuresnLdsqrt_lpls           r8   encodezVisualTokenizer.encode   s   ==..;% 	*122qqq)H ;$q((nGAq!C[[F19>>>G ">>  ''661==H)Vdk6O-OP)*B 8aAr1b%9:qIIHbLF''$+33)$+33) H  ''1aAq99H''**H''2t{04;3LLqP H r:   c                     |                      |          }|                     |          }|                     |          }t          j        j                            |dt          t                    fdd          }|S )z8[BatchSize, ImageShape] -> [BatchSize, Token, VocabSize]r   rr   )modevalue)	r   rR   rn   r3   rL   
functionalr	   rJ   rK   )rS   ro   r|   re   rm   s        r8   forwardzVisualTokenizer.forward   sw    ;;|,,8$$v&& $(('(()	 ) 
 
 r:   )Nr=   )__name__
__module____qualname__r   r   strrF   rL   ModulerG   propertyr3   ra   rd   r   rn   r   r   __classcell__rU   s   @r8   r<   r<   V   s        37	
 
 
 )4/
 	
 
 
 
 
 
< 37	S S S )4/S 	S
 
S S S S. 2u{ 2 2 2 X2 3 3 3 3 X3u|     '5< 'EL ' ' ' 'REL U\        r:   r<   c                       e Zd ZU dZed         ed<   eej         e	dddd          f         ed<   eej         e	d	          f         ed
<   ee
e          e	d          f         ed<   dS )OvisImagePatchInputsa  
    Dimensions:
        - bnp: Batch size * number of images * number of patches
        - h: Height of each patch
        - w: Width of each patch
        - patch_indicators: Batch size * (number of patches + 1)
        - bn: Batch size * number of images
    image_patchestypebnprs   hw	flat_datapatch_indicatorsindicator_tokensbnpatches_per_imageN)r   r   r   __doc__r   __annotations__r   r3   r   r$   listrx    r:   r8   r   r      s           /
""""{{5!S#'F'FFGGGGkk:L.M.M MNNNN cKK,=,=!=>>>>>>r:   r   c                   ^     e Zd Z fdZdedef fdZed             Zed             Z xZ	S )VisualEmbeddingc                 :     t                      j        |i | d S r^   )rE   rF   )rS   argskwargsrU   s      r8   rF   zVisualEmbedding.__init__   s%    $)&)))))r:   visual_tokensrV   c                     |j         t          j        t          j        t          j        t          j        t          j        fv r!t                                          |          S t          j	        || j
                  S r^   )ra   r3   int8int16int32int64longrE   r   matmulweight)rS   r   rU   s     r8   r   zVisualEmbedding.forward   s]    JKKKJ#
 
 
 77??=111|M4;777r:   c                     | j         j        S r^   )r   rd   rb   s    r8   rd   zVisualEmbedding.device   s    {!!r:   c                     | j         j        S r^   )r   ra   rb   s    r8   ra   zVisualEmbedding.dtype   s    {  r:   )
r   r   r   rF   r   r   r   rd   ra   r   r   s   @r8   r   r      s        * * * * *	8V 	8 	8 	8 	8 	8 	8 	8 " " X" ! ! X! ! ! ! !r:   r   c                   `    e Zd ZdefdZdefdZdefdZde	eedz  f         fdZ
defdZdS )	OvisProcessingInfor   c                      | j         j        t          f|                                 |                                 d|S )N)image_pad_tokenimage_segment_len)ctxget_hf_processorr"   get_image_pad_tokenget_image_segment_len)rS   r   s     r8   r   z#OvisProcessingInfo.get_hf_processor  sP    (tx(
 4466"88::
 
 	
 
 	
r:   rV   c                     |                                  j        }|j        j        }|j        j        }|j        }t          j        ||z            }||z  dk    sJ d| d|             ||z  dz  dz
  S )Nr   zpatch_grid_length z# is not divisible by hidden_stride rq   r%   )get_hf_configvisual_tokenizer_configrN   
image_size
patch_sizerP   mathceil)rS   r   r   r   rP   patch_grid_lengths         r8   r   z(OvisProcessingInfo.get_image_segment_len  s    "&"4"4"6"6"N,<G
,<G
/= Ij:&=>> =0A555-!2 - -*- - 655
 "]2q81<<r:   c                     |                                                                  }|j        }t                              |          S r^   )r   get_text_configr[   IMAGE_PAD_TOKEN_MAPget)rS   hf_text_configtext_model_types      r8   r   z&OvisProcessingInfo.get_image_pad_token  s;    ++--==??(3"&&777r:   Nc                 
    dd iS )Nimager   rb   s    r8   get_supported_mm_limitsz*OvisProcessingInfo.get_supported_mm_limits  s    r:   c                     |                                                                  \  }}|                                 j        j        }t          ||z  dz  ||z  dz            S )N	   )widthheight)r   get_image_sizer   r   rP   r   )rS   r   r   hss       r8   !get_image_size_with_most_featuresz4OvisProcessingInfo.get_image_size_with_most_features   s_    --//>>@@!!9G urzA~frkAoFFFFr:   )r   r   r   objectr   rx   r   r   r   r   r   r   r   r   r:   r8   r   r     s        
 
 
 
 
=s = = = =8S 8 8 8 8
cDj)A    G9 G G G G G Gr:   r   c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	OvisDummyInputsBuilder	mm_countsrV   c                 B    |                     dd          }t          |z  S )Nr   r   )r   IMAGE_TOKEN)rS   r   
num_imagess      r8   get_dummy_textz%OvisDummyInputsBuilder.get_dummy_text)  s     ]]7A..
Z''r:   Nseq_len
mm_optionsc                     |                     dd          }| j                                        \  }}|r|                     d          nd }d|                     ||||          i}|S )Nr   r   )r   r   r   	overrides)r   infor   _get_dummy_images)	rS   r   r   r   r   target_widthtarget_heightimage_overridesmm_datas	            r8   get_dummy_mm_dataz(OvisDummyInputsBuilder.get_dummy_mm_data-  s     ]]7A..
&*i&Q&Q&S&S#m5?I*..111T T++"$%)	 ,  
 r:   r^   )
r   r   r   r   r   rx   r   r   r   r   r   r:   r8   r   r   (  s        (S(9 (c ( ( ( ( =A	  38$ C!112T9	
 
     r:   r   c            
           e Zd Zdee         dee         fdZdedeeef         deeef         deeef         de	f
 fdZ
d	ee         dee         fd
Zde	deeef         deeef         fdZdedeeef         dedee         fdZ xZS )OvisMultiModalProcessorimage_indicatorsrV   c                 j    | j                                         }|j        j        fd|D             S )a  
        Filter image indicators placeholders and convert them to corresponding
        tokens in visual tokenizer.
        For example, [-301, -300, -302, -300, -303, -300, -304, -300, -305]
        should return [vocab_size-1, vocab_size-2, ..., vocab_size-5]
        c                 ,    g | ]}|d k     |z   dz   S )ii,  r   ).0xvte_vocab_sizes     r8   
<listcomp>zMOvisMultiModalProcessor.image_indicators_to_visual_tokens.<locals>.<listcomp>R  s(    OOOQa$hh"S(hhhr:   )r   r   r   rI   )rS   r   	hf_configr   s      @r8   !image_indicators_to_visual_tokensz9OvisMultiModalProcessor.image_indicators_to_visual_tokensE  s>     I++--	":EOOOO2BOOOOr:   promptr   	mm_kwargs
tok_kwargsc                    
 |sP j                                         }|                    |d          }t          t	          |g          d          S t                                          ||||          } j                                         

fd|d         D             } fd	|D             }	t          j	        |	          |d
<   |S )NF)add_special_tokens)	input_idspt)tensor_type)r   r   r   r   c                 :    g | ]}                     |          S r   )construct_image_indicators)r   gridhf_processors     r8   r   z>OvisMultiModalProcessor._call_hf_processor.<locals>.<listcomp>i  s7     
 
 
 33D99
 
 
r:   gridsc                 :    g | ]}                     |          S r   )r   )r   	indicatorrS   s     r8   r   z>OvisMultiModalProcessor._call_hf_processor.<locals>.<listcomp>m  s7     
 
 
 229==
 
 
r:   r   )
r   get_tokenizerr   r   dictrE   _call_hf_processorr   r3   tensor)rS   r   r   r   r   	tokenizer
prompt_idsprocessed_outputsr   r   r   rU   s   `         @r8   r   z*OvisMultiModalProcessor._call_hf_processorT  s     	P	//11I"))&U)KKJ
| < < <$OOOO!GG66!	 7 
 
 y1133
 
 
 
)'2
 
 

 
 
 
-
 
 
 16=M0N0N,-  r:   prompt_tokensc                     |S r^   r   )rS   r  s     r8   _apply_hf_processor_tokens_onlyz7OvisMultiModalProcessor._apply_hf_processor_tokens_onlyt  s
     r:   	hf_inputshf_processor_mm_kwargsc                     t          t          j        d          t          j        d          t          j        d                    S )Nr   )ro   r   r   )r   r   batched)rS   r  r	  s      r8   _get_mm_fields_configz-OvisMultiModalProcessor._get_mm_fields_configz  sE    
 .6w??'/882:7CC
 
 
 	
r:   mm_itemsout_mm_kwargsc                 R     dt           f fd}t          dt          |          gS )Nitem_idxc                     d         |          }|d         j         }j                                        }|                    |          S )Nr   r   )datar   r   construct_image_placeholders)r  out_itemr   r   r  rS   s       r8   get_replacement_oviszIOvisMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_ovis  sF    $W-h7HG$)D95577L<<TBBBr:   r   )modalitytargetreplacement)rx   r    r   )rS   r  r	  r  r  s   `  ` r8   _get_prompt_updatesz+OvisMultiModalProcessor._get_prompt_updates  s]    	C3 	C 	C 	C 	C 	C 	C 	C  "0  
 	
r:   )r   r   r   r   rx   r   r   r   r   r   r   r  r   r  r   r   r    r  r   r   s   @r8   r   r   D  sx       Ps)P 
cP P P P!! f%! 3;'	!
 CK(! 
! ! ! ! ! !@Cy 
c   	
	
 !(V 4	
 
++	,		
 	
 	
 	

%
 !(V 4
 -	

 
	 
 
 
 
 
 
 
 
r:   r   )r   dummy_inputsc                   Z    e Zd Zededededz  fd            Zddded	ef fd
Zde	de
dz  fdZde
defdZde	defdZ	 	 ddej        dej        dedz  dej        dz  de	dej        ez  fdZdej        dej        dz  fdZdeeeej        f                  dee         fdZ xZS )Ovisr  irV   Nc                 X    |                     d          rt          S t          d          )Nr   z Only image modality is supported)
startswithr   r\   )clsr  r  s      r8   get_placeholder_strzOvis.get_placeholder_str  s-    w'' 	;<<<r:   r=   )r@   vllm_configr@   c                   t                                                       |j        j        }|j        }|| _        |                     |          5  t          |                    |	                                          t          |d                    | _        d d d            n# 1 swxY w Y   |                     |d          5  t          |j        || d          | _        t!          | j        j        j        | j        j                  | _        d d d            n# 1 swxY w Y   | j        	                                j        }t*          |         | _        |                                 j        | _        d S )Nllm)r"  r@   r   z.visual_tokenizerrB   )rE   rF   model_configr   r?   r>   _mark_language_modelr   with_hf_configr   r   r$  _mark_tower_modelr<   r   visual_tokenizerr   rI   rO   vter[   IMAGE_PAD_TOKEN_ID_MAPimage_pad_token_idget_language_modelmake_empty_intermediate_tensors)rS   r"  r@   r>   r?   r   rU   s         r8   rF   zOvis.__init__  s   )3"/(.&&{33 	 	1'66v7M7M7O7OPP#FE22  DH	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ##K99 	 	$35) 333% % %D!
 '3>@W DH	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 +5577B"8"I ##%%E 	,,,s&   A
B''B+.B+AD""D&)D&r   c           	         |                     dd           }|                     dd           }||d S ||t          |t          j        t          f          st          dt          |                     t          |t          j        t          f          st          dt          |                     t          dt          |d          d |D             t          |d          	          S t          d
          )Nro   r   z*Incorrect type of pixel values. Got type: z.Incorrect type of indicator_tokens. Got type: r   T)concatc                 (    g | ]}|j         d          S )r   )rw   )r   r   s     r8   r   z8Ovis._parse_and_validate_image_input.<locals>.<listcomp>  s    "D"D"D!171:"D"D"Dr:   )r   r   r   r   z This line should be unreachable.)
pop
isinstancer3   r   r   r\   r   r   r   AssertionError)rS   r   ro   r   s       r8   _parse_and_validate_image_inputz$Ovis._parse_and_validate_image_input  s.    zz.$77!::&8$??$4$<4#(8(DlU\4,@AA  UlASASUU   .t0DEE  6!%l!3!36 6  
 ($$\$???"D"D|"D"D"D!+,<T!J!J!J	    ?@@@r:   image_inputc           
         |d         }|d         }|d         }t          t          d |                    }| j        j        }|                     |                    |                    }|                     |          }|                     |          }	|	                    |          }
|                    |d          }g }t          |
|          D ]\  }}g }t          |j	        d                   D ]>}|
                    t          j        |||dz            ||         gd                     ?|
                    ||dz   d                     |
                    t          j        |d                     t          |          S )Nr   r   r   c                 "    | dk    r| dz   n| dz   S )Nr%   rq   r   )r   s    r8   <lambda>z+Ovis._process_image_input.<locals>.<lambda>  s    1q55!a%%a!e r:   r   rh   r%   )r   mapr)  ra   tor*  splitziprangerw   appendr3   cattuple)rS   r6  image_patches_flatr   r   indicator_per_imagetarget_dtyper   visual_embedsindicator_embedsindicator_embeds_per_imagevisual_embeds_per_imagevision_embeddingsr   visualvision_embeddings_per_imager  s                    r8   _process_image_inputzOvis._process_image_input  s    )5'(;<&'9:"335FGG
 
 ,2--.@.C.CL.Q.QRR//88$455%5%;%;<O%P%P""/"5"56GQ"5"O"O!$&(?"
 "
 		T 		TIv +-'6<?++  +22IyQU3VAY?QGGG    (..yQ/ABBB$$UY/JPQ%R%R%RSSSS&'''r:   c                 R     | j         di |}|g S |                     |          }|S )Nr   )r5  rL  )rS   r   r6  image_featuress       r8   embed_multimodalzOvis.embed_multimodal  s>    :d:DDVDDI22;??r:   r   	positionsintermediate_tensorsinputs_embedsc                 @    |d }|                      ||||          }|S )N)r   rP  rQ  rR  )r$  )rS   r   rP  rQ  rR  r   hidden_statess          r8   r   zOvis.forward  s=      + M !5'	 ! 
 
 r:   rT  c                 6    | j                             |          S r^   )r$  compute_logits)rS   rT  s     r8   rV  zOvis.compute_logits#  s     x&&}555r:   weightsc                 J    t          |           }|                    |          S r^   )r   load_weights)rS   rW  loaders      r8   rY  zOvis.load_weights)  s#    "4((""7+++r:   )NN)r   r   r   classmethodr   rx   r!  r   rF   r   r   r5  r&   rL  rO  r3   r   r!   r   rV  r   rA  setrY  r   r   s   @r8   r  r    s        =3 =3 =3: = = = [= BD 
 
 
z 
3 
 
 
 
 
 
<AA		$A A A A<(/(	( ( ( (B 4H     <@-1 < < 2D8	
 |d*  
+	+   *6|6 
	6 6 6 6,HU33D-E$F ,3s8 , , , , , , , ,r:   r  )Mr   r   collections.abcr   r   typingr   r   r3   torch.nnrL   r   torch.nn.functionalr   r	   r
   transformersr   r   vllm.configr   vllm.config.multimodalr   !vllm.model_executor.layers.linearr   'vllm.model_executor.layers.quantizationr    vllm.model_executor.models.aimv2r   !vllm.model_executor.models.siglipr    vllm.model_executor.models.utilsr   r   r   r   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   vllm.multimodal.processingr   r   r   r    vllm.sequencer!   'vllm.transformers_utils.processors.ovisr"   vllm.utils.tensor_schemar#   r$   
interfacesr&   r'   r(   r   rK   r   r+  rx   r9   r   r<   r   	Embeddingr   r   r   r   register_processorr  r   r:   r8   <module>rs     sR  &    - - - - - - - - % % % % % % % %              < < < < < < < < < < 7 7 7 7 7 7 7 7 " " " " " " 3 3 3 3 3 3 > > > > > > F F F F F F 7 7 7 7 7 7 ? ? ? ? ? ?            0 / / / / /         
 A @ @ @ @ @ @ @            . - - - - - A A A A A A > > > > > > > > L L L L L L L L L L 444  +      el          @ @ @ @ @eho @ @ @F? ? ? ? ?< ? ? ?"! ! ! ! !eh( ! ! !0#G #G #G #G #G+ #G #G #GL    34FG   8T
 T
 T
 T
 T
56HI T
 T
 T
n ('	'  
K, K, K, K, K,29(* K, K, 
K, K, K,r:   