
    .`iS                        d Z ddlZddlmZmZmZ ddlmZ ddlm	Z	m
Z
mZ ddlZddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z? ddl@mAZA ddlBmCZC ddlDmEZE ddlFmGZGmHZH ddlImJZJmKZKmLZL  eeM          ZNe G d d                      ZO G d  d!eG          ZP G d" d#e          ZQ G d$ d%e<          ZR G d& d'e:eR                   ZS G d( d)e;eR                   ZT e-jU        eTeReS*           G d+ d,ejV        e&e'                      ZWd-eCd.eXd/eYdz  fd0ZZdS )1z
Kimi-K2.5 Model Implementation for vLLM.

Kimi-K2.5 extends Kimi-K2 with vision support

This module defines:
- KimiK25ProcessingInfo/KimiK25MultiModalProcessor: Processing logic
- KimiK25ForConditionalGeneration: Main model class
    N)IterableMappingSequence)	dataclass)	AnnotatedAnyLiteral)nn)BatchFeature)ProcessorMixin)
VllmConfig)BaseDummyOptions)get_pp_group)init_logger)SharedFusedMoE)LogitsProcessor)ParallelLMHead)default_weight_loadermaybe_remap_kv_scale_name)DeepseekV2Model)SupportsMultiModal
SupportsPP)KimiK25MultiModalProjectorMoonViT3dPretrainedModelvision_tower_forward)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItemsNestedTensorsVisionChunkVisionChunkImageVisionChunkVideo)MultiModalDataItemsVisionChunkProcessorItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoInputProcessingContextPromptReplacementPromptUpdate)IntermediateTensors)KimiK25Config)cached_get_image_processor)TensorSchemaTensorShape   )PPMissingLayeris_pp_missing_parametermaybe_prefixc                   ,    e Zd ZU dZeed<   dZeed<   dS )MaxImageTokenMetai  widthheightN)__name__
__module____qualname__r7   int__annotations__r8        w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/kimi_k25.pyr6   r6   H   s2         E3FCr?   r6   c                       e Zd ZU dZdZed         ed<   eej	        e
ej	                 z   edddd          f         ed<   eej	         edd          f         ed<   d	S )
KimiK25MediaPixelInputsz
    Media input schema for K2-VL model.

    Dimensions:
        - np: Number of patches (flattened from all media items)
        - ps: Patch size
        - nm: Number of media items
    pixel_valuestypenp   psnm	grid_thwsN)r9   r:   r;   __doc__rD   r	   r=   r   torchTensorlistr0   r>   r?   r@   rB   rB   N   s           %3D'.
!222tEL))D!T4((	*   
 {{4';';;<<<<<<r?   rB   c                   `     e Zd ZdgZdZd	 fd	Z	 d
dee         dz  dee         de	fdZ
 xZS )MoonshotKimiVAutoProcessor	tokenizerAutoTokenizerNc                 X    t                                          |           || _        d S N)super__init__media_processor)selfrV   rP   	__class__s      r@   rU   z#MoonshotKimiVAutoProcessor.__init__f   s)    ###.r?   vision_chunkstextreturnc                    i }|1t          |t                    sJ | j                            |          }t	          dt          j        |g          i|          S )a  
        Args:
            vision_chunks: List of VisionChunk items to be processed.
                For image: VisionChunkImage with type='image', image=PIL.Image
                For video_chunk: VisionChunkVideo with type='video_chunk', video_chunk=list[PIL.Image]
            text: The token ids to be fed to a model (required).
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- list of token ids to be fed to a model.
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `vision_chunks` is not `None`.
            - **grid_thws** -- list of image 3D grid in LLM. Returned when `vision_chunks` is not `None`.
        N	input_ids)data)
isinstancerM   rV   
preprocessr   rK   tensor)rW   rY   rZ   kwargs	mm_inputss        r@   __call__z#MoonshotKimiVAutoProcessor.__call__k   sp    ( 	$mT22222,77FFIU\4&11
 
 
 	
r?   NNrS   )r9   r:   r;   
attributestokenizer_classrU   rM   r!   r<   r   rd   __classcell__rX   s   @r@   rO   rO   b   s        J%O/ / / / / / 37
 
K(4/
 3i	
 

 
 
 
 
 
 
 
r?   rO   c                   Z     e Zd ZdZdeddf fdZd Zd Zdee	e
dz  f         fdZ xZS )	KimiK25ProcessingInfozProcessing information for Kimi-K2.5 model.

    Provides configuration and utilities for processing both
    images and video-chunks.
    ctxr[   Nc                 h   t                                          |           |                                 | _        | j        j        | _        t          | j        j        j	        d          }|| _
        t          | j
        |                                           | _        | j
        j        | _        d S )NT)trust_remote_code)rV   rP   )rT   rU   get_hf_config	hf_configmedia_placeholder_token_idmedia_token_idr.   rl   model_configmodelrV   rO   get_tokenizerhf_processormedia_tokens_calculator)rW   rl   rV   rX   s      r@   rU   zKimiK25ProcessingInfo.__init__   s    ++--"nG4H!'4
 
 
  /6 0((**
 
 
 (,';'S$$$r?   c                     | j         S rS   )rv   rW   s    r@   get_hf_processorz&KimiK25ProcessingInfo.get_hf_processor   s      r?   c                 @    | j                             t                    S rS   )rl   ro   r-   ry   s    r@   ro   z#KimiK25ProcessingInfo.get_hf_config   s    x%%m444r?   c                 
    dd iS Nvision_chunkr>   ry   s    r@   get_supported_mm_limitsz-KimiK25ProcessingInfo.get_supported_mm_limits   s    %%r?   )r9   r:   r;   rJ   r)   rU   rz   ro   r   strr<   r   rh   ri   s   @r@   rk   rk      s         T2 Tt T T T T T T! ! !5 5 5&cDj)A & & & & & & & &r?   rk   c            	            e Zd ZdZdeddf fdZdeeef         de	e         fdZ
d Z	 dd	edeeef         d
eeef         dz  defdZ xZS )KimiK25DummyInputsBuilderz2Builds dummy inputs for Kimi-K2.5 model profiling.infor[   Nc                     t                                          |           | j        j        | _        | j        j        j        | _        d S rS   )rT   rU   r   rr   rV   num_frames_per_chunkframe_per_chunk)rW   r   rX   s     r@   rU   z"KimiK25DummyInputsBuilder.__init__   s>    "i6#y8Mr?   	mm_countsc                 D    |                     dd          }| j        g|z  S )Nr~   r   )getrr   )rW   r   	num_medias      r@   get_dummy_textz(KimiK25DummyInputsBuilder.get_dummy_text   s&    MM.!44	#$y00r?   c                    |                      t          j        t          j        | j                  }t          d|          }| j                            |          }t          d|                      t          j        t          j        d          d                   }| j                            |          }||k    r|gS |gS )N)r8   r7   
num_imagesvideo_chunk)rD   r   imager1   r   )rD   r   )	_get_dummy_imagesr6   r8   r7   r   r#   r   rw   r"   )rW   dummy_videosvideo_chunk_dummy_itemvideo_chunk_num_tokensimage_dummy_itemimage_num_tokenss         r@   get_dummy_mm_itemsz,KimiK25DummyInputsBuilder.get_dummy_mm_items   s    --$+#)+ . 
 
 "2L"
 "
 "
 "&!B!B""
 "
 ,(((/'- )   	
 
 
  9<<=MNN!%555*++$%%r?   seq_len
mm_optionsc                 2    |                                  }d|iS r}   )r   )rW   r   r   r   dummy_itemss        r@   get_dummy_mm_dataz+KimiK25DummyInputsBuilder.get_dummy_mm_data   s      --//,,r?   rS   )r9   r:   r;   rJ   rk   rU   r   r   r<   rM   r   r   r   r   r   rh   ri   s   @r@   r   r      s        <<N2 Nt N N N N N N
1S(9 1d3i 1 1 1 1& & &B =A	- -- 38$- C!112T9	-
 
- - - - - - - -r?   r   c            	           e Zd ZdZdedeeef         deeef         fdZ	de
deeef         dedee         fdZd	 Zd
S )KimiK25MultiModalProcessorz]Multi-modal processor for Kimi-K2.5.

    Handles both image and video-chunk modalities.
    	hf_inputshf_processor_mm_kwargsr[   c                     |                     dt          j        d                    }|                    d          }t	          t          j        d|          t          j        d                    S )a  Indicates how to slice media input into multiple items.

        pixel_values: [N, 3, patch_size, patch_size], all patches collected from B medias
        grid_thws: [B,3], each item: [N_t, N_h ,N_w], indicates the grid size in time/height/width direction
                    for current item.

        by multiplying [N_t, N_h ,N_w], we get the number of patches for each media item, thus we can slice
        pixel_values by pixel_values[start:start + N_t*N_h*N_w] to get patches of one item.

        rI   )r   rF   r~   )rC   rI   )r   rK   emptyproddictr   flat_from_sizesbatched)rW   r   r   rI   
grid_sizess        r@   _get_mm_fields_configz0KimiK25MultiModalProcessor._get_mm_fields_config   sn     MM+u{6/B/BCC	^^B''
.>
  ,3NCC	
 
 
 	
r?   mm_itemsout_mm_kwargsc                       j                                         }|j        dt          f fd}t	          dg|          gS )Nitem_idxc                                          dt          f          }j                            ||                    }g|z  S r}   )	get_itemsr%   r   rw   )r   medianum_media_tokenrr   r   rW   s      r@   get_replacementzGKimiK25MultiModalProcessor._get_prompt_updates.<locals>.get_replacement  sC    &&~8Q7STTE"i??hPPO"#o55r?   r~   )modalitytargetreplacement)r   ro   rq   r<   r*   )rW   r   r   r   rp   r   rr   s   ``    @r@   _get_prompt_updatesz.KimiK25MultiModalProcessor._get_prompt_updates   s{     I++--	"=	6c 	6 	6 	6 	6 	6 	6 	6 	6 '&'+  
 	
r?   c                 @    | j         j                            |          S rS   )r   rV   split_video_chunks)rW   videos     r@   r   z-KimiK25MultiModalProcessor.split_video_chunks  s    y(;;EBBBr?   N)r9   r:   r;   rJ   r   r   r   objectr   r   r$   r   r   r   r+   r   r   r>   r?   r@   r   r      s         


 !(V 4
 
++	,	
 
 
 
2
%
 !(S 1
 -	

 
,	
 
 
 
,C C C C Cr?   r   )r   dummy_inputsc                       e Zd ZdZdZededededz  fd            Z	 dd	e	d
eddf fdZ
dededz  fdZdedeej                 fdZdededz  fdZdej        j        fdZ	 	 ddej        dej        dedz  dej        dz  dedefdZdej        dej        fdZdeeeeeef                  fdZdeeeej        f                  fdZ xZS )KimiK25ForConditionalGenerationzKimi-K2.5 model for conditional generation.

    Supports both image and video-chunk modalities.
    Video-chunks are temporal segments (typically 4 frames) that are
    processed with temporal pooling.
    Tr   ir[   Nc                 F    |dk    rdS |dk    rdS t          d|           )Nr   z?<|media_begin|>image<|media_content|><|media_pad|><|media_end|>r   z<|kimi_k25_video_placeholder|>zUnsupported modality: )
ValueError)clsr   r   s      r@   get_placeholder_strz3KimiK25ForConditionalGeneration.get_placeholder_str(  s=     wTT  33<(<<===r?    vllm_configprefixc                 t   t                                                       |j        }|j        }|| _        |j        }|j        j        dk    | _        |j	        j
        | _
        t          j                                        | _        t          |j        t#          |d                    | _        | j                            | j        |j                  | _        t+          |j        | j        t#          |d                    | _        | j                            | j        |j                  | _        || _        t/          j        |          }|j        j        j	        |j        _        t3          |t#          |d                    | _        t7                      j        r5t;          |j        |j	        j
        t#          |d	                    | _        ntA                      | _        | j        j!        | _!        tE          |d
d          }tG          |j        |          | _$        | j        j%        | _&        d S )Nr^   vision_tower)r   )devicedtypemm_projector)configuse_data_parallelr   language_model)r   r   lm_headlogit_scaleg      ?)scale)'rT   rU   rs   rp   r   quant_configmultimodal_configmm_encoder_tp_moder   text_confighidden_sizerK   cudacurrent_devicer   r   vision_configr4   r   tor   r   r   copydeepcopyr   r   r   is_last_rankr   
vocab_sizer   r2   make_empty_intermediate_tensorsgetattrr   logits_processorrq   media_placeholder)	rW   r   r   rs   r   r   sub_vllm_configr   rX   s	           r@   rU   z(KimiK25ForConditionalGeneration.__init__3  s(   
 	"/ , 6"/ *=G 	 "-9j//114 77
 
 
 !-00;l&8 1 
 
 7'"477
 
 

 !-00;l&8 1 
 
 )-44(2> 	$. .'(899
 
 
 >>& 	,)!".#FI66  DLL *++DL? 	, fmS99 /0A U U U&*k&Lr?   rb   c                 :   |                     dd           }|                     dd           }|d S t          |t                    rt          j        |d          }t          |j                  dk    st          |j                  dk    r4 |j        |j        d         |j        d         z  g|j        dd          R  }t          | j	        
                                          j        }|                    |          }t          |t          j                  sJ d	t          |                       |                    d
|j        d
                   }|j        dk    r|                    d          dk    sJ d|j                     t#          d||          S )NrC   rI   r   )dim   rF   r1      z%expect grid_thws to be a tensor, get r   z unexpected shape for grid_thws: )rD   rC   rI   )popr_   rM   rK   catlenshapereshapenextr   
parametersr   r   rL   rD   ndimsizerB   )rW   rb   rC   rI   target_dtypes        r@   _parse_and_validate_media_inputz?KimiK25ForConditionalGeneration._parse_and_validate_media_inputn  s    zz.$77JJ{D11	4lD)) 	: 9\q999L|!""a''3|/A+B+Ba+G+G/</"1%(:1(==@L@RSTSUSU@V  L
 D-88::;;A#|44)U\22 	
 	
EDOOEE	
 	
2 %%b)/"*=>>	~""y~~a'8'8A'='='=@y@@ (>'== '%
 
 
 	
r?   media_inputc                 d    t          | j        |d         |d         | j        | j                  }|S )NrC   rI   )r   r   )r   r   r   r   )rW   r   media_featuress      r@   _process_media_inputz4KimiK25ForConditionalGeneration._process_media_input  sB     .'$*"4
 
 
 r?   c                 R     | j         di |}|d S |                     |          }|S )Nr>   )r   r   )rW   rb   r   vision_embeddingss       r@   embed_multimodalz0KimiK25ForConditionalGeneration.embed_multimodal  sA    :d:DDVDD4 !55kBB  r?   c                     | j         S rS   r   ry   s    r@   get_language_modelz2KimiK25ForConditionalGeneration.get_language_model  s    ""r?   r]   	positionsintermediate_tensorsinputs_embedsc                 @    |d }|                      ||||          }|S )N)r]   r   r   r   r   )rW   r]   r   r   r   rb   hidden_statess          r@   forwardz'KimiK25ForConditionalGeneration.forward  s=      + M++!5'	 , 
 
 r?   r   c                 .     | j         | j        |fi |}|S rS   )r   r   )rW   r   rb   logitss       r@   compute_logitsz.KimiK25ForConditionalGeneration.compute_logits  s%    &&t|]MMfMMr?   c                 ~    | j         j        }t          |dd           sg S t          j        | ddd|j        d          S )Nn_routed_experts	gate_proj	down_projup_projr   )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namenum_expertsnum_redundant_experts)r   r   r   r   make_expert_params_mappingr  )rW   r   s     r@   get_expert_mappingz2KimiK25ForConditionalGeneration.get_expert_mapping  sW     (v1488 	I8 + +'/"#
 
 
 	
r?   weightsc                    | j         j        }ddddd}ddg}t          |dd           rt          |d	d           r|d
dgz  }|                                 }t	          |                                           }|D ]}|d d         \  }}	t          |          dk    r|d         ni }
d|v r2t          ||          }|Ed|v sd|v rN|                                D ]\  }}||v r|	                    ||          } d}d|v r
| j
        d}n|D ]p\  }}}||vrd|v r||vr|	                    ||          }|                    d          r||vrDt          ||           rU||         }|j        } |||	|fi |
  nft          |          D ]T\  }\  }}}}||vr|	                    ||          }t          ||           r6||         }|j        } |||	|f||d|
  nd}|ri|                    d          r||vrt          ||          }|t          ||           r||         }t          |dt                     } |||	fi |
 d S )Nr   r   zmm_projector.linear_1zmm_projector.linear_2)zlanguage_model.lm_headzlanguage_model.modelzmm_projector.proj.0zmm_projector.proj.2).gate_up_projz
.gate_projr   )r  z.up_projr1   kv_lora_rankq_lora_rank).fused_qkv_a_projz	.q_a_projr   )r  z.kv_a_proj_with_mqar1   r   zrotary_emb.inv_freqzrotary_emb.cos_cachedzrotary_emb.sin_cachedFvisionTzmlp.experts.z.bias)	expert_idshard_idweight_loader)r   r   r   r  r   named_parametersr   #get_spec_layer_idx_from_weight_nameitemsreplacer   endswithr3   r  	enumerater   r   )rW   r  r   _KEYS_TO_MODIFY_MAPPINGstacked_params_mappingexpert_params_mappingparams_dictargsnameloaded_weightrb   
spec_layerkey_to_modifynew_keyuse_default_weight_loading
param_nameweight_namer  paramr  _r  s                         r@   load_weightsz,KimiK25ForConditionalGeneration.load_weights  s   (&/$4 $;#:#
 #
 /,"
 6>400 	WM46
 6
 	 #5?' " !% 7 7 9 94002233 N	> N	>D"&rr(D- #D		AT!WW2F$,,<VTJJJ%&$..2IT2Q2Q*A*G*G*I*I @ @&w D((<<w??D).&4$015.9O *: *:5JX"$.. &$..D4K4K <<Z@@D}}W-- !$k2I2I .tT:: ! '-E$)$7M!M%KKFKKKE ##899: :  "#! &d22$#||KDD24>> %$ +D 1(-(;%!)  '0%-  %   592) >==)) d+.E.E0{CC<*466 #D) '@U V Ve]==f===]N	> N	>r?   )r   re   )r9   r:   r;   rJ   supports_encoder_tp_dataclassmethodr   r<   r   r   rU   r   rB   r   rM   rK   rL   r   r    r   r
   Moduler   r,   r   r  tupler  r   r/  rh   ri   s   @r@   r   r     sG          $>3 >3 >3: > > > [> 9M 9M9M 9M 
	9M 9M 9M 9M 9M 9Mv 
 
	 4	' 
  
  
  
D2	el	   ! !MD4H ! ! ! !#EHO # # # # <@-1 < < 2D8	
 |d*  
   &EL u|    
DsCc/A)B$C 
 
 
 
g>HU33D-E$F g> g> g> g> g> g> g> g>r?   r   r   r,  r[   c                     t          | d          r;| j        dk    r0| j        }t          | j                  D ]}d||z    d|v r||z   c S d S )Nnum_nextn_predict_layersr   zmodel.layers..)hasattrr5  num_hidden_layersrange)r   r,  	layer_idxr   s       r@   r  r  :  s{     v122 %'!++,	v677 	% 	%A/y1}///;>> 1}$$$ ?4r?   )[rJ   r   collections.abcr   r   r   dataclassesr   typingr   r   r	   rK   r
   transformersr   transformers.processing_utilsr   vllm.configr   vllm.config.multimodalr   vllm.distributedr   vllm.loggerr   $vllm.model_executor.layers.fused_moer   +vllm.model_executor.layers.logits_processorr   3vllm.model_executor.layers.vocab_parallel_embeddingr   -vllm.model_executor.model_loader.weight_utilsr   r   &vllm.model_executor.models.deepseek_v2r   %vllm.model_executor.models.interfacesr   r   'vllm.model_executor.models.kimi_k25_vitr   r   r   vllm.multimodalr   vllm.multimodal.inputsr   r   r   r    r!   r"   r#   vllm.multimodal.parser$   r%   vllm.multimodal.processingr&   r'   r(   r)   r*   r+   vllm.sequencer,   vllm.transformers_utils.configsr-   !vllm.transformers_utils.processorr.   vllm.utils.tensor_schemar/   r0   utilsr2   r3   r4   r9   loggerr6   rB   rO   rk   r   r   register_processorr2  r   r   r<   r  r>   r?   r@   <module>rV     s     7 7 7 7 7 7 7 7 7 7 ! ! ! ! ! ! * * * * * * * * * *        % % % % % % 8 8 8 8 8 8 " " " " " " 3 3 3 3 3 3 ) ) ) ) ) ) # # # # # # ? ? ? ? ? ? G G G G G G N N N N N N        C B B B B B P P P P P P P P         
 0 / / / / /                  Q P P P P P P P                . - - - - - 9 9 9 9 9 9 H H H H H H > > > > > > > > H H H H H H H H H H	X		        
= = = = =l = = =('
 '
 '
 '
 '
 '
 '
 '
T& & & & &. & & &@1- 1- 1- 1- 1- 67L M 1- 1- 1-h6C 6C 6C 6C 6C!89N!O 6C 6C 6Cr ('	*  
Y> Y> Y> Y> Y>bi1CZ Y> Y> 
Y>x(+4Z     r?   