
    .`i;                        d dl Z d dlmZmZmZ d dlmZ d dlmZm	Z	m
Z
 d dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZ d dlmZ d dlmZm Z m!Z!m"Z" d dl#m$Z$m%Z%m&Z& d dl'm(Z(m)Z)m*Z*m+Z+m,Z, d dl-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3m4Z4 ddl5m6Z6m7Z7m8Z8 ddl9m:Z: e G d d                      Z; G d dej<                  Z= G d de3          Z>e>Z? G d de*          Z@ G d d e(e@                   ZA G d! d"e)e@                   ZB ejC        eBe@eA#           G d$ d%ej<        ee                      ZDdS )&    N)IterableMappingSequence)	dataclass)	AnnotatedAnyLiteral)nn)BatchFeature)GELUActivation)
VllmConfig)BaseDummyOptions)ReplicatedLinear)SupportsMultiModal
SupportsPP)MoonVitPretrainedModel)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItemsNestedTensors)ImageEmbeddingItemsImageProcessorItemsMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)KimiVLConfigMoonViTConfig)TensorSchemaTensorShape   )AutoWeightsLoaderinit_vllm_registered_modelmaybe_prefix)!run_dp_sharded_mrope_vision_modelc                   ,    e Zd ZU dZeed<   dZeed<   dS )MaxImageTokenMetai   widthheightN)__name__
__module____qualname__r,   int__annotations__r-        v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/kimi_vl.pyr+   r+   X   s2         E3FCr4   r+   c                   T     e Zd Z	 d
dededef fdZdej        dej        fd	Z	 xZ
S )KimiVLMultiModalProjectorF configuse_data_parallelprefixc           	         t                                                       || _        |j        j        |j        j        d         z  |j        j        d         z  | _        t          j                            |j        j        d          | _	        t          | j        | j        dt          |d                    | _        t          | j        |j        j        dt          |d                    | _        t                      | _        d S )	Nr   r%   gh㈵>)epsTlinear_1)biasr;   linear_2)super__init__r:   vision_confighidden_sizemerge_kernel_sizetorchr
   	LayerNormpre_normr   r(   r>   text_configr@   r   act)selfr9   r:   r;   	__class__s       r5   rB   z"KimiVLMultiModalProjector.__init___   s     	!2  ,"4Q78"4Q78 	 **6+?+KQU*VV(
33	
 
 
 )*
33	
 
 
 "##r4   image_featuresreturnc                     |                      |                              d| j                  }|                     |          \  }}|                     |          }|                     |          \  }}|S )N)rH   viewrD   r>   rJ   r@   )rK   rM   hidden_states_s       r5   forwardz!KimiVLMultiModalProjector.forwardz   si    n55::2t?OPP==77q//==77qr4   )Fr8   )r.   r/   r0   r!   boolstrrB   rF   TensorrT   __classcell__rL   s   @r5   r7   r7   ^   s        SU$ $"$7;$MP$ $ $ $ $ $6el u|        r4   r7   c                       e Zd ZU dZdZed         ed<   eej	        e
ej	                 z   edddd          f         ed<   eej	         edd          f         ed	<   d
S )KimiVLImagePixelInputsz
    Dimensions:
        - nc: Number of channels
        - np: Number of patches
        - ps: Patch size
        - ni: Number of images
    pixel_valuestypenp   psni   image_grid_hwsN)r.   r/   r0   __doc__r]   r	   r2   r   rF   rW   listr$   r3   r4   r5   r[   r[      s           %3D'.
!222tEL))D!T4((	*   
 elKKa,@,@@AAAAAAr4   r[   c                   f    e Zd Zd Zdeeedz  f         fdZdededefdZe	defd            Z
dS )	KimiVLProcessingInfoc                 @    | j                             t                    S N)ctxget_hf_configr!   rK   s    r5   rk   z"KimiVLProcessingInfo.get_hf_config   s    x%%l333r4   rN   Nc                 
    dd iS )Nimager3   rl   s    r5   get_supported_mm_limitsz,KimiVLProcessingInfo.get_supported_mm_limits   s    r4   image_widthimage_heightc                   |                                  }|j        j        }|j        j        }|j        j        }|}|}t          |t                    sJ d|             t          |t                    sJ d|             |
J d            ||z  ||z  z  |k    rHt          j        |||z  ||z  z  z            }	t          ||	z            t          ||	z            }}
|
|}}|\  }}||z  |||z  z  z
  ||z  z  }||z  |||z  z  z
  ||z  z  }||z   |d         |z  z  }||z   |d         |z  z  }t          ||z            S )Nz#height must be int, current height z!width must be int, current width zkernel_size must be specifiedr   r%   )	get_hf_processorimage_processor
patch_sizerE   in_token_limit
isinstancer1   mathsqrt)rK   rp   rq   hf_processorru   kernel_sizerv   r-   r,   scalenew_wnew_hkernel_heightkernel_width
pad_height	pad_widthtoken_heighttoken_widths                     r5   get_num_image_tokensz)KimiVLProcessingInfo.get_num_image_tokens   s    ,,..!1<
"2D%5D&#&&VV(Vf(V(VVV&%%%RR'R5'R'RRR%&&(G&&&ZFj$89NJJI5J#66Z;O"PQ E uu}--s6E>/B/B5E!56E&1#| J&=:3M)NNZ')
 :%
1J(KKJ&(	
 +Q*1LMy(k!nz.IJ<+-...r4   c                 4    |                                  j        S ri   )rk   media_placeholder_token_idrl   s    r5   image_token_idz#KimiVLProcessingInfo.image_token_id   s    !!##>>r4   )r.   r/   r0   rk   r   rV   r1   ro   r   propertyr   r3   r4   r5   rg   rg      s        4 4 4cDj)A    #/ #/ 	#/
 
#/ #/ #/ #/J ? ? ? ? X? ? ?r4   rg   c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	KimiVLDummyInputsBuilder	mm_countsrN   c                 x    |                     dd          }| j                                        }|j        }||z  S )Nrn   r   )getinfors   image_token)rK   r   
num_images	processorr   s        r5   get_dummy_textz'KimiVLDummyInputsBuilder.get_dummy_text   s;    ]]7A..
I..00	+Z''r4   Nseq_len
mm_optionsc                     |                     dd          }|r|                     d          nd }d|                     t          j        t          j        ||          iS )Nrn   r   )r,   r-   r   	overrides)r   _get_dummy_imagesr+   r,   r-   )rK   r   r   r   r   image_overridess         r5   get_dummy_mm_dataz*KimiVLDummyInputsBuilder.get_dummy_mm_data   sj     ]]7A..
5?I*..111T T++'-(/%)	 ,  
 	
r4   ri   )
r.   r/   r0   r   rV   r1   r   r   r   r   r3   r4   r5   r   r      s        (S(9 (c ( ( ( ( =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
r4   r   c            	       v    e Zd Zdedeeef         deeef         fdZde	deee
f         dedee         fdZdS )	KimiVLMultiModalProcessor	hf_inputshf_processor_mm_kwargsrN   c                     |                     dt          j        d                    }|                    d          }t	          t          j        d|          t          j        d                    S )Nrc   )r   rb   rP   rn   )r\   rc   )r   rF   emptyproddictr   flat_from_sizesbatched)rK   r   r   rc   image_grid_sizess        r5   _get_mm_fields_configz/KimiVLMultiModalProcessor._get_mm_fields_config   su    
 #'7V9L9LMM)..r22 .>)  18AA	
 
 
 	
r4   mm_itemsout_mm_kwargsc                 f      j         j        dt          f fd}t          dg|          gS )Nitem_idxc                                          dt          t          f          }t          |t                    r|                    |           }n;|                    |           }j                            |j        |j	                  }g|z  S )Nrn   )rp   rq   )
	get_itemsr   r   rw   get_feature_sizeget_image_sizer   r   r,   r-   )r   imagesnum_image_tokens
image_sizer   r   rK   s       r5   get_replacementzFKimiVLMultiModalProcessor._get_prompt_updates.<locals>.get_replacement  s    ''-/BC F &"566 #)#:#:8#D#D  #228<<
#'9#A#A * 0!+!2 $B $ $ 
 ##&666r4   rn   )modalitytargetreplacement)r   r   r1   r   )rK   r   r   r   r   r   s   ``   @r5   _get_prompt_updatesz-KimiVLMultiModalProcessor._get_prompt_updates   sj     1	7c 	7 	7 	7 	7 	7 	7 	7 	7"  &'+  
 	
r4   N)r.   r/   r0   r   r   rV   objectr   r   r   r   r   r   r   r   r3   r4   r5   r   r      s        

 !(V 4
 
++	,	
 
 
 
"
%
 !(S 1
 -	

 
,	
 
 
 
 
 
r4   r   )r   dummy_inputsc                       e Zd ZdZededededz  fd            Z	 dded	eddf fd
Z	de
dedz  fdZ ej                    dedej        fd            Zdedej        fdZde
dedz  fdZ	 	 ddej        dej        dedz  dej        dz  de
defdZdej        dej        fdZdeeeej        f                  fdZ xZS )KimiVLForConditionalGenerationTr   irN   Nc                 N    |                     d          rdS t          d          )Nrn   z?<|media_start|>image<|media_content|><|media_pad|><|media_end|>z Only image modality is supported)
startswith
ValueError)clsr   r   s      r5   get_placeholder_strz2KimiVLForConditionalGeneration.get_placeholder_str#  s.    w'' 	UTT;<<<r4   r8   vllm_configr;   c           	      
   t                                                       |j        }|j        }|j        }|| _        || _        t          |j        t                    sJ |j	        j
        dk    | _        |j        j        | _        |                     |d          5  t          |j        t!          |d                    | _        t%          || j        t!          |d                    | _        d d d            n# 1 swxY w Y   |                     |          5  t+          ||j        t!          |d          dg	          | _        d d d            n# 1 swxY w Y   | j        j        | _        | j        j        | _        d S )
Ndatarn   vision_tower)r;   multi_modal_projector)r9   r:   r;   language_modelDeepseekV2ForCausalLM)r   	hf_configr;   architectures)rA   rB   model_configr   quant_configr9   rw   rC   r"   multimodal_configmm_encoder_tp_moder:   rI   rD   _mark_tower_modelr   r(   r   r7   r   _mark_language_modelr'   r   make_empty_intermediate_tensorsr   media_placeholder)rK   r   r;   r   r9   r   rL   s         r5   rB   z'KimiVLForConditionalGeneration.__init__*  s   
 	"/+5"/(&.>>>>>*=G 	 "-9##K99 		 		 6$#FN;;! ! !D *C"&"8#F,CDD* * *D&		 		 		 		 		 		 		 		 		 		 		 		 		 		 		 &&{33 	 	"<' ,#F,<==67	# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ? 	, '+k&Ls%   AC==DD-EEEkwargsc                     |                     dd           }|                     dd           }|d S t          d||          S )Nr\   rc   )r]   r\   rc   )popr[   )rK   r   r\   rc   s       r5   _parse_and_validate_image_inputz>KimiVLForConditionalGeneration._parse_and_validate_image_inputV  sV     zz.$77$4d;;4%%)
 
 
 	
r4   inputsc                     |d         }|d         }| j         r*t          | j        ||                                d          S |                     ||          S )Nr\   rc   rope_2d)	rope_type)r:   r)   r   tolist)rK   r   r\   rc   s       r5   _process_image_pixelsz4KimiVLForConditionalGeneration._process_image_pixelsg  sm    n- 01! 	C4!%%''#	    $$\>BBBr4   image_inputc                    |d         dk    sJ |                      |          }t          |t          t          f          sJ d |D             }|                     t          j        |                                        |          S )Nr]   r\   c                 (    g | ]}|j         d          S )r   )shape).0xs     r5   
<listcomp>zGKimiVLForConditionalGeneration._process_image_input.<locals>.<listcomp>y  s    666!171:666r4   )r   rw   re   tupler   rF   catsplit)rK   r   rM   lengthss       r5   _process_image_inputz3KimiVLForConditionalGeneration._process_image_inputu  s    6"n444433K@@.4-8888866~666))%)N*C*CDDJJ7SSSr4   c                 R     | j         di |}|d S |                     |          }|S )Nr3   )r   r   )rK   r   r   vision_embeddingss       r5   embed_multimodalz/KimiVLForConditionalGeneration.embed_multimodal|  sA    :d:DDVDD4 !55kBB  r4   	input_ids	positionsintermediate_tensorsinputs_embedsc                 @    |d }|                      ||||          }|S )N)r   r   r   r   )r   )rK   r   r   r   r   r   rR   s          r5   rT   z&KimiVLForConditionalGeneration.forward  s=      + M++!5'	 , 
 
 r4   rR   c                 6    | j                             |          S ri   )r   compute_logits)rK   rR   r   s      r5   r   z-KimiVLForConditionalGeneration.compute_logits  s    "11-@@@r4   weightsc                 J    t          |           }|                    |          S ri   )r&   load_weights)rK   r   loaders      r5   r   z+KimiVLForConditionalGeneration.load_weights  s#    "4((""7+++r4   )r8   )NN)r.   r/   r0   supports_encoder_tp_dataclassmethodrV   r1   r   r   rB   r   KimiVLImageInputsr   rF   inference_moder[   rW   r   r   r   r   r    rT   r   r   r   r   rX   rY   s   @r5   r   r     s/         $=3 =3 =3: = = = [= *M *M*M *M 
	*M *M *M *M *M *MX

	T	!
 
 
 
" UC,B Cu| C C C CT0A Tel T T T T! !MD4H ! ! ! ! <@-1 < < 2D8	
 |d*  
   (AEL Au| A A A A,HU33D-E$F , , , , , , , ,r4   r   )Erx   collections.abcr   r   r   dataclassesr   typingr   r   r	   rF   r
   transformersr   transformers.activationsr   vllm.configr   vllm.config.multimodalr   !vllm.model_executor.layers.linearr   %vllm.model_executor.models.interfacesr   r   "vllm.model_executor.models.moonvitr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   r   vllm.multimodal.parser   r   r   vllm.multimodal.processingr   r   r   r   r   vllm.sequencer    vllm.transformers_utils.configsr!   r"   vllm.utils.tensor_schemar#   r$   utilsr&   r'   r(   visionr)   r+   Moduler7   r[   r   rg   r   r   register_processorr   r3   r4   r5   <module>r     s  Z  7 7 7 7 7 7 7 7 7 7 ! ! ! ! ! ! * * * * * * * * * *        % % % % % % 3 3 3 3 3 3 " " " " " " 3 3 3 3 3 3 > > > > > > P P P P P P P P E E E E E E / / / / / /                    
              . - - - - - G G G G G G G G > > > > > > > > N N N N N N N N N N 5 5 5 5 5 5        
! ! ! ! !	 ! ! !HB B B B B\ B B B* + .? .? .? .? .?- .? .? .?b
 
 
 
 
56JK 
 
 
:0
 0
 0
 0
 0
 78L M 0
 0
 0
f ('	)  
, , , , ,RY0BJ , , 
, , ,r4   