
    .`iY                        U d Z ddlZddlmZmZmZ ddlmZmZm	Z	 ddl
Z
ddlmZ ddlmc mZ ddlmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8m9Z9m:Z: ddl;m<Z< ddl=m>Z>m?Z? ddl@mAZA ddlBmCZCmDZDmEZE ddlFmGZGmHZHmIZImJZJ dZK G d de>          ZL G d de>          ZMeLeMz  ZNe	eOd<    G d  d!ejP                  ZQ G d" d#e/          ZR G d$ d%e,eR                   ZS G d& d'e.eR                   ZT e jU        eTeReS(           G d) d*ejP        eDeE                      ZVdS )+zFInference-only Deepseek-VL2 model compatible with HuggingFace weights.    N)IterableMappingSequence)	AnnotatedLiteral	TypeAlias)	rearrangerepeat)BatchFeature)
VllmConfig)BaseDummyOptions)$get_tensor_model_parallel_world_size)QuantizationConfig)replace_linear_class)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItemsMultiModalUUIDDict)ImageEmbeddingItemsImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilder)BaseMultiModalProcessorBaseProcessingInfoMultiModalProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)cached_tokenizer_from_config)DeepseekVLV2ConfigMlpProjectorConfigVisionEncoderConfig)DeepseekVLV2Processor)TensorSchemaTensorShape)set_default_torch_dtype   )MultiModalEmbeddingsSupportsMultiModal
SupportsPP)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix<image>c            	           e Zd ZU dZed         ed<   eej         e	dddddh          f         ed	<   eej         e	d
d          f         ed<   dS )DeepseekVL2ImagePixelInputsz
    Dimensions:
        - bnp: Batch size * number of images * number of patches
        - p: Number of patches
        - c: Number of channels (3)
        - h: Height of each image
        - w: Width of each image
    pixel_valuestypebnp   hw)dynamic_dimsdatabn   images_spatial_cropN)
__name__
__module____qualname____doc__r   __annotations__r   torchTensorr'        {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/deepseek_vl2.pyr3   r3   B   s           .
!!!!
EL++eQSPUw"W"W"WW
XXXX"5<T11E1E#EFFFFFFrG   r3   c                       e Zd ZU dZed         ed<   eej        e	ej                 z   e
ddd          f         ed<   dS )	 DeepseekVL2VImageEmbeddingInputsz
    Dimensions:
        - bn: Batch size * number of images
        - f: Image feature size
        - h: Hidden size (must match language model backbone)
    image_embedsr5   r<   fr8   r;   N)r?   r@   rA   rB   r   rC   r   rD   rE   listr'   rF   rG   rH   rJ   rJ   Q   sa           .
!!!!
EL4#55{{4c7R7RR
SSSSSSrG   rJ   DeepseekVL2ImageInputsc                   *     e Zd Zdef fdZd Z xZS )MlpProjectorcfgc                 x   t                                                       || _        |j        | _        |j        r
J d            | j        dk    r|j        }|j        }t          j        |j	        |j
        z  |j
        z  |j        |z            g}t          d|dz
            D ]`}|                    t          j                               |                    t          j        |j        |z  |j        |z                       a|                    t          j                               |                    t          j        |j        |z  |j                             t          j        | }nB| j        dk    r t          j        |j	        |j                  }nt!          d|j                   || _        d S )Nz)Token pooling is not supported currently.downsample_mlp_gelur)   linearzUnsupported projector type: )super__init__rQ   projector_typetoken_poolingdepth	mlp_rationnLinear	input_dimdownsample_ration_embedrangeappendGELU
SequentialNotImplementedErrorlayers)selfrQ   	mlp_depthrZ   modules_	__class__s         rH   rV   zMlpProjector.__init__c   s   !0$QQ&QQQ$"777	II	MC$883;OOK)+ G 1i!m,,  rwyy)))IckI5s{Y7NOO    NN2799%%%NN29S[9%<ckJJKKKmW-GG H,,is{;;GG%Cs/ACC   rG   c           	         |j         \  }}}| j        dk    rt          |dz            x}}	 || j        j        z  r| j        j        || j        j        z  z
  }nd}|                    ||||          }|dk    rt          j        |ddd|d|fdd          }	 |                    dddd          }t          j	        || j        j        | j        j        d          }|                    ddd          }| 
                    |          S )	NrS         ?r   constantr7   r)   r=   )kernel_sizestridepadding)shaperW   intrQ   r^   reshapeFpadpermuteunfoldre   )rf   xbshwr]   r8   r9   ru   s           rH   forwardzMlpProjector.forward   s   GB	"777$$$A!48,, h/!dh6O2OO		"aI..AQwwE!aAsAs3ZCC		!Q1%%A H5x0	  A 		!Q""A{{1~~rG   )r?   r@   rA   r#   rV   r{   __classcell__rj   s   @rH   rP   rP   b   sU        .      B      rG   rP   c            	       l    e Zd Zd ZdefdZdeeedz  f         fdZ	ddd	ed
ede
defdZdefdZdS )DeepseekVL2ProcessingInfoc                 @    | j                             t                    S N)ctxget_hf_configr"   rf   s    rH   r   z'DeepseekVL2ProcessingInfo.get_hf_config   s    x%%&8999rG   kwargsc                 2     | j         j        t          fi |S r   )r   get_hf_processorr%   )rf   r   s     rH   r   z*DeepseekVL2ProcessingInfo.get_hf_processor   s     (tx()>II&IIIrG   returnNc                 
    dd iS )NimagerF   r   s    rH   get_supported_mm_limitsz1DeepseekVL2ProcessingInfo.get_supported_mm_limits   s    rG   T)croppingimage_widthimage_heightr   c                   |                                  }|j        }|j        }|j        }|r%|                    ||f          \  }}	||z  |	|z  }}
ndx}
}t          j        ||z  |z            x}}||dz   z  }||z  |
|z  dz   z  }||z   dz   S )Nr)   )r   
image_size
patch_sizer^   select_best_resolutionmathceil)rf   r   r   r   hf_processorr   r   r^   
best_widthbest_heightnum_width_tilesnum_height_tilesr8   r9   global_views_tokenslocal_views_tokenss                   rH   get_num_image_tokensz.DeepseekVL2ProcessingInfo.get_num_image_tokens   s     ,,..!,
!,
'8 		3&2&I&Il+' '#J j(z) .OO
 232O.	:37GGHHHA1q5k.27JQ7NO"%77!;;rG   c                                                        }|j        }t          | fd          \  }}t          ||          S )Nc                 J                         | d         | d                   S )Nr)   r   )r   r   )r   )rx   rf   s    rH   <lambda>zMDeepseekVL2ProcessingInfo.get_image_size_with_most_features.<locals>.<lambda>   s)    $33aDqt 4   rG   )key)widthheight)r   candidate_resolutionsmaxr   )rf   	hf_configr   r   r   s   `    rH   !get_image_size_with_most_featuresz;DeepseekVL2ProcessingInfo.get_image_size_with_most_features   sa    &&((	 ) ?!   
 
 
 uV4444rG   )r?   r@   rA   r   objectr   r   strrr   r   boolr   r   r   rF   rG   rH   r   r      s        : : :J J J J JcDj)A     HL< < <!<14<@D<	< < < <2	59 	5 	5 	5 	5 	5 	5rG   r   c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	DeepseekVL2DummyInputsBuilder	mm_countsr   c                 x    |                     dd          }| j                                        }|j        }||z  S )Nr   r   )getinfor   image_token)rf   r   
num_images	processorr   s        rH   get_dummy_textz,DeepseekVL2DummyInputsBuilder.get_dummy_text   s;    ]]7A..
I..00	+Z''rG   Nseq_len
mm_optionsc                     |                     dd          }| j                                        }|r|                     d          nd }d|                     |j        |j        ||          iS )Nr   r   )r   r   r   	overrides)r   r   r   _get_dummy_imagesr   r   )rf   r   r   r   r   max_image_sizeimage_overridess          rH   get_dummy_mm_dataz/DeepseekVL2DummyInputsBuilder.get_dummy_mm_data   s}     ]]7A..
DDFF5?I*..111T T++$*%,%)	 ,  
 	
rG   r   )
r?   r@   rA   r   r   rr   r   r   r   r   rF   rG   rH   r   r      s        (S(9 (c ( ( ( ( =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
rG   r   c                   J    e Zd Zdedeeef         deeef         deeef         def
 fdZdedeeef         deeef         fd	Z	d
e
deeef         dedee         fdZ	 ddeee         z  de
deeef         deeef         dedz  deee         eef         f fdZ xZS )DeepseekVL2MultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsr   c                     |s'| j                                         } ||dd          S t                                          ||||          }|d                             d          dz   |d<   |S )	NTpt)add_special_tokensreturn_tensors)r   r   r   r   r>   r)   num_patches)r   get_tokenizerrU   _call_hf_processorprod)rf   r   r   r   r   	tokenizerprocessed_outputsrj   s          rH   r   z1DeepseekVL2MultiModalProcessor._call_hf_processor   s      	S	//11I9VTRRRR!GG66!	 7 
 
 3499"==A 	-( ! rG   	hf_inputshf_processor_mm_kwargsc                     |                     dt          j        d                    }t          t	          j        d|          t	          j        d          t	          j        d                    S )Nr   r   r   )r4   r>   rK   )r   rD   emptydictr   flat_from_sizesbatched)rf   r   r   r   s       rH   _get_mm_fields_configz4DeepseekVL2MultiModalProcessor._get_mm_fields_config  sb    
  mmM5;q>>BB.>wTT 5 =g F F.6w??
 
 
 	
rG   mm_itemsout_mm_kwargsc                        j         j        di |}|j        t          t                    sJ dt          f fd}t          dg|          gS )Nitem_idxc                 D                        dt          t          f          }t          |t                    r|                    |           }nM|                    |           }j                            |j        |j	        t          |          dk              }g|z  S )Nr   r=   )r   r   r   )	get_itemsr   r   
isinstanceget_feature_sizeget_image_sizer   r   r   r   len)r   imagesnum_image_tokensr   image_token_idr   rf   s       rH   get_replacement_deepseek_vl2zXDeepseekVL2MultiModalProcessor._get_prompt_updates.<locals>.get_replacement_deepseek_vl2  s    ''-/BC F &"566 	#)#:#:8#D#D  #228<<
#'9#A#A * 0!+!2 [[A- $B $ $ 
 ##&666rG   r   )modalitytargetreplacementrF   )r   r   r   r   rr   r   )rf   r   r   r   r   r   r   s   ``    @rH   _get_prompt_updatesz2DeepseekVL2MultiModalProcessor._get_prompt_updates  s     2ty1KK4JKK%4.#.....	73 	7 	7 	7 	7 	7 	7 	7 	7$  &'8  
 	
rG   Nmm_data_itemstokenization_kwargsmm_uuidsc                     |                     dd          dk    r|                     |||||          S t                                          |||||          S )Nr   F)strictr=   )r   r   r   r   r   )	get_count_apply_hf_processorrU   _cached_apply_hf_processor)rf   r   r   r   r   r   rj   s         rH   r   z9DeepseekVL2MultiModalProcessor._cached_apply_hf_processor7  s     ""75"99A==+++'=$7! ,    ww11'#9 3 2 
 
 	
rG   r   )r?   r@   rA   r   r   r   r   r   r   r   r   r   r   r   r   rM   rr   r   tupler   r   r   r|   r}   s   @rH   r   r      s       !! f%! 3;'	!
 CK(! 
! ! ! ! ! !0

 !(V 4
 
++	,	
 
 
 
"
%"
 !(V 4"
 -	"

 
,	"
 "
 "
 "
T /3
 
d3i
 +
 !(V 4	

 %S&[1
 %t+
 
tCy2D8	9
 
 
 
 
 
 
 
 
 
rG   r   )r   dummy_inputsc                   :    e Zd Z eddi          Zededededz  fd            Zd	d
de	def fdZ
dej        j        defdZdej        j        defdZ	 d'dededz  dedej        fdZdededz  fdZdej        dej        deej                 fdZdedej        eej                 z  fdZdedefdZ	 	 d(dej        dej        d edz  d!ej        dz  def
d"Zd#ej        dej        dz  fd$Zd%ee eej        f                  de!e         fd&Z" xZ#S ))DeepseekVLV2ForCausalLMz	language.zlanguage_model.)orig_to_new_prefixr   ir   Nc                 N    |                     d          rdS t          d          )Nr   r1   z Only image modality is supported)
startswith
ValueError)clsr   r   s      rH   get_placeholder_strz+DeepseekVLV2ForCausalLM.get_placeholder_stra  s,    w'' 	9;<<<rG    prefixvllm_configr   c          	         t                                                       |j        j        }|j        }|j        j        }|| _        || _        |j        | _        |j        | _        |j	        | _	        |j        }t          |          }|j        t                   | _        |                     |d          5  |                     | j        |t!          |d                    | _        t%          | j                  | _        |j        | _        |j        | _        dt-          j        t-          j        | j        j        t,          j                            z  }| j        dk    rqt7          j        t-          j        | j        j                  |z            | _        t7          j        t-          j        | j        j                  |z            | _        ntA          d| j                   	 d d d            n# 1 swxY w Y   | !                    |          5  tE          || j	        t!          |d                    | _#        d d d            n# 1 swxY w Y   | j#        j$        | _$        d S )	Nr   visionr)   dtype2Dz.Only 2D tile_tag is supported currently, got: language)r   r   r   )%rU   rV   model_configr   quant_configmultimodal_configconfigvision_configprojector_configtext_configr!   vocab_IMAGE_TOKENr   _mark_tower_model_init_vision_moduler0   r   rP   	projectortile_tagglobal_view_posrD   sqrttensorr_   float32r[   	Parameterrandnimage_newlineview_seperatorr   _mark_language_modelr/   language_modelmake_empty_intermediate_tensors)
rf   r   r   r  r  r  r  r   	embed_stdrj   s
            rH   rV   z DeepseekVLV2ForCausalLM.__init__h  s   %0%=%G"/'4F!2#1 & 7!-"/0>>	#,?<#@##K99 	 	22"L,vx2P2P DK *$*?@@DN"ODM#)#9D  EJT2:%-PPP  I }$$%'\K 5 =>>J& &" ')lK 5 =>>J' '## !TT]TT  	 $'	 	 	 	 	 	 	 	 	 	 	 	 	 	 	6 &&{33 	 	"<'*#FJ77# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ? 	,,,s%   6D3G66G:=G:+IIIrootdotted_namec                 ~    |                     d          }|}|dd         D ]}t          ||          }||d         fS )zAReturn (parent_module, final_attr_name) for a dotted module path..Nr   )splitgetattr)rf   r  r  namesparentns         rH   _get_parent_and_attrz,DeepseekVLV2ForCausalLM._get_parent_and_attr  sQ    !!#&&ss 	( 	(AVQ''FFuRy  rG   vitr  c                    	 dd l }n"# t          $ r}t          d          |d }~ww xY w|                                D ]\  }}t          |t          j                  r|                     ||          \  }}t          ||j        j                  r+|dk    r%t          |d||          }	t          |||	           }t          ||j        j                  r*|dk    r$t          |d||          }	t          |||	           |S )Nr   Please install timmfc1colwiser   fc2rowwise)timmImportErrornamed_modulesr   r[   r\   r&  re   Mlpr   setattr)
rf   r'  r  r.  enamemoduler$  	attr_name
new_linears
             rH   patch_vit_for_tpz(DeepseekVLV2ForCausalLM.patch_vit_for_tp  s<   	<KKKK 	< 	< 	<344!;	<  --// 	; 	;LD&&"),, ;$($=$=c4$H$H!	fdko66 	;9;M;M!5	<" " "J FIz::::88 ;Y%=O=O!5	<" " "J FIz:::
s    
&!&r  c                    	 dd l }n"# t          $ r}t          d          |d }~ww xY wt          t          j                  5  |                    ddddd          }d d d            n# 1 swxY w Y   t                      dk    r|                     ||          }|                    t          j	                              }|S )	Nr   r)  z#vit_so400m_patch14_siglip_384.webliFT)
pretrainednum_classesdynamic_img_sizedynamic_img_padr)   r   )
r.  r/  r(   rD   float16create_modelr   r8  toget_default_dtype)rf   r  r  r   r.  r3  models          rH   r  z+DeepseekVLV2ForCausalLM._init_vision_module  s   	<KKKK 	< 	< 	<344!;	< %U]33 	 	%%5 !% $ &  E	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 011A55))%>>Eu68899s"    
&!&A))A-0A-r   c                 "   |                     dd           }|                     dd           }|                     dd           }||d S |$| j        j        x}}t          d||||d          S |t	          d|          S t          d          )Nr4   r>   rK   )r8   r9   )r5   r;   r>   resolve_bindings)r5   r;   z This line should be unreachable.)popr  r   r3   rJ   AssertionError)rf   r   r4   r>   rK   
expected_h
expected_ws          rH   _parse_and_validate_image_inputz7DeepseekVLV2ForCausalLM._parse_and_validate_image_input  s     zz.$77$jj)>EEzz.$77L$84#&*&8&CCJ.#!$7##" "	    #3#!   
 ?@@@rG   r4   r>   c           	         | j                             |          }|                     |          }|j        \  }}}t	          |dz            x}}	d}
g }t          |                    d                    D ]i}||         \  }}|dk    s|dk    r nN||z  }||
         }||
dz   |
dz   |z            }|
|dz   z  }
|                    ||	|          }t          | j	        d|          }t          j        ||gd          }|                    d|          }t          |d||||		          }t          | j	        d
||          }t          j        ||gd          }|                    d|          }| j        dk    r't          j        || j        d d d f         |g          }n&t          j        || j        d d d f         |g          }|                    |           k|S )Nrl   r   r)   z
d -> h 1 d)r8   )dimr   z"(th tw) (h w) d -> (th h) (tw w) d)thtwr8   r9   zd -> (th h) 1 d)rL  r8   head)r   forward_featuresr  rq   rr   r`   sizeviewr
   r  rD   catr	   r  r  ra   )rf   r4   r>   images_featureimages_embedsri   rz   n_dimr8   r9   
tile_indexvision_embeddingsjdxr   r   num_tiles_in_imageglobal_featureslocal_featuresnew_lines_in_globalnew_lines_in_localglobal_local_featuress                        rH   _pixel_values_to_embeddingz2DeepseekVLV2ForCausalLM._pixel_values_to_embedding  sw    55lCC ~66$*2uBGA 
,11!4455 H	< H	<C0CC0H-O-!##'71'<'<!03C!C ,J7O +Qa2D!DDN ,q00J
 .221a??O #));\Q"O"O"O $i:M(NTUVVVO .222u==O
 '4#"  N "("$5:Ja" " "
 #Y8J'KQRSSSN ,00U;;N #v--(-	'+D!!!G4&) )%% ).	&+D!!!G4') )% $$%:;;;;  rG   image_inputc                 x    |d         dk    r|d         S |d         }|d         }|                      ||          S )Nr5   rK   r;   r>   )r4   r>   )r_  )rf   r`  r4   r>   s       rH   _process_image_inputz,DeepseekVLV2ForCausalLM._process_image_inputS  sX     v.00v&&"6*)*?@..%;N / 
 
 	
rG   c                 R     | j         di |}|g S |                     |          }|S )NrF   )rI  rb  )rf   r   r`  rW  s       rH   embed_multimodalz(DeepseekVLV2ForCausalLM.embed_multimodal`  s?    :d:DDVDDI 55kBB  rG   	input_ids	positionsintermediate_tensorsinputs_embedsc                 @    |d }|                      ||||          }|S )N)rh  )r  )rf   re  rf  rg  rh  r   hidden_statess          rH   r{   zDeepseekVLV2ForCausalLM.forwardg  s;      + M++y"6m , 
 
 rG   rj  c                 6    | j                             |          S r   )r  compute_logits)rf   rj  s     rH   rl  z&DeepseekVLV2ForCausalLM.compute_logitsx  s     "11-@@@rG   weightsc                 \    t          |           }|                    || j                  }|S )N)mapper)r-   load_weightshf_to_vllm_mapper)rf   rm  loaderautoloaded_weightss       rH   rp  z$DeepseekVLV2ForCausalLM.load_weights~  s1    "4((#00AW0XX!!rG   )r   )NN)$r?   r@   rA   r.   rq  classmethodr   rr   r   r   rV   rD   r[   Moduler&  r   r8  r$   r  r   rN   rI  rE   rM   r_  rb  r*   rd  r    r{   rl  r   r   setrp  r|   r}   s   @rH   r   r   U  s        &*
   =3 =3 =3: = = = [= BD 5
 5
 5
z 5
3 5
 5
 5
 5
 5
 5
n! !s ! ! ! !EHO CU    4 	 * )4/ 	
 
   6AA	$	&A A A A<Z!lZ! #\Z! 
el		Z! Z! Z! Z!x
1
	U\*	*
 
 
 
! !4H ! ! ! ! <@-1 < < 2D8	
 |d*    "A|A 
	A A A A"HU33D-E$F "3s8 " " " " " " " "rG   r   )WrB   r   collections.abcr   r   r   typingr   r   r   rD   torch.nnr[   torch.nn.functional
functionalrt   einopsr	   r
   transformersr   vllm.configr   vllm.config.multimodalr   vllm.distributedr   'vllm.model_executor.layers.quantizationr   -vllm.model_executor.models.transformers.utilsr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   r   vllm.multimodal.parser   r   r   r   vllm.multimodal.processingr   $vllm.multimodal.processing.processorr   r   r   r   r   vllm.sequencer    vllm.tokenizersr!   ,vllm.transformers_utils.configs.deepseek_vl2r"   r#   r$   /vllm.transformers_utils.processors.deepseek_vl2r%   vllm.utils.tensor_schemar&   r'   vllm.utils.torch_utilsr(   
interfacesr*   r+   r,   utilsr-   r.   r/   r0   r  r3   rJ   rN   rC   ru  rP   r   r   r   register_processorr   rF   rG   rH   <module>r     s  
 M L L  7 7 7 7 7 7 7 7 7 7 0 0 0 0 0 0 0 0 0 0                 $ $ $ $ $ $ $ $ % % % % % % " " " " " " 3 3 3 3 3 3 A A A A A A F F F F F F N N N N N N / / / / / /                       > = = = = =              . - - - - - 8 8 8 8 8 8         
 R Q Q Q Q Q > > > > > > > > : : : : : : L L L L L L L L L L            G G G G G, G G G	T 	T 	T 	T 	T| 	T 	T 	T  "BB 	   
8 8 8 8 829 8 8 8v,5 ,5 ,5 ,5 ,5 2 ,5 ,5 ,5^
 
 
 
 
$:;T$U 
 
 
>g
 g
 g
 g
 g
56g
 g
 g
T ('"	".  
g" g" g" g" g"bi);Z g" g" 
g" g" g"rG   