
    .`i-Q              	          d Z ddlZddlmZmZmZ ddlmZmZ ddl	Z	ddl
mZ ddlmZmZ ddlmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZmZmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'm(Z(m)Z)m*Z* ddl+m,Z,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z=m>Z> ddl?m@Z@mAZA ddlBmCZCmDZD ddlEmFZFmGZG ddlHmIZI dZJ G d de@          ZK G d d          ZL G d deC          ZM G d  d!e.          ZN G d" d#e,eN                   ZO G d$ d%e-eN                   ZP e jQ        ePeNeO&           G d' d(ejR        eee                      ZSdS ))zFInference-only Deepseek-OCR model compatible with HuggingFace weights.    N)IterableMappingSequence)	AnnotatedLiteral)BatchFeatureCLIPVisionConfig)
VllmConfig)BaseDummyOptions)MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)MultiModelKeys)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItemsNestedTensors)ImageEmbeddingItemsImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)SamplingParams)IntermediateTensors)cached_tokenizer_from_config)DeepseekVLV2Config)	BASE_SIZE	CROP_MODE
IMAGE_SIZEDeepseekOCRProcessorcount_tiles)TensorSchemaTensorShape)AdapterLogitsProcessorRequestLogitsProcessor   )DeepCLIPVisionTransformerbuild_sam_vit_b)MlpProjector<image>c            	           e Zd ZU dZed         ed<   eej         e	dddddh          f         ed	<   eej         e	ddd
d
dh          f         ed<   eej         e	dd          f         ed<   dS )DeepseekOCRImagePixelInputsz
    Dimensions:
        - b: Batch size
        - n: Number of images
        - p: Number of patches
        - base_size: Base size of the processor
        - image_size: Image size of the processor
    pixel_valuestypebn   	base_sizebnp)dynamic_dimsdata
image_sizeimages_crop   images_spatial_cropN)
__name__
__module____qualname____doc__r   __annotations__r   torchTensorr-        {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/deepseek_ocr.pyr6   r6   H   s           .
!!!!
D![+UGLLL	N    E1lLwOOO	Q    #5<T11E1E#EFFFFFFrK   r6   c                   l    e Zd Z	 d
dededee         dz  fdZdee         dej        dej        fd	Z	dS )NoRepeatNGramLogitsProcessorN
ngram_sizewindow_sizewhitelist_token_idsc                 L    || _         || _        |pt                      | _        d S N)rO   rP   setrQ   )selfrO   rP   rQ   s       rL   __init__z%NoRepeatNGramLogitsProcessor.__init___   s*     %&#6#?#%%   rK   
output_idslogitsreturnc                 "   t          |          | j        k     r|S t          || j        dz
   d                    }t          dt          |          | j        z
            }t          |          | j        z
  dz   }t                      }t          ||          D ]J}t          |||| j        z                      }|d d         |k    r|                    |d                    K|| j        z
  }|r t          d           |t          |          <   |S )Nr0   r   inf)lenrO   tuplemaxrP   rT   rangeaddrQ   floatlist)	rU   rW   rX   current_prefixsearch_start
search_endbanned_tokensingrams	            rL   __call__z%NoRepeatNGramLogitsProcessor.__call__i   s   
 z??T_,,MzDOa,?*@*B*BCDD1c*oo0@@AA__t6:
|Z00 	- 	-A*QT_)<%<=>>ESbSz^++!!%),,,%(@@ 	8+0<<-F4&&'rK   rS   )
rC   rD   rE   intrT   rV   rc   rH   rI   rj   rJ   rK   rL   rN   rN   ^   s        
 04	@ @@ @ !X_	@ @ @ @I  
	     rK   rN   c                   P    e Zd ZdZedefd            ZdefdZdede	dz  fdZ
dS )NGramPerReqLogitsProcessorzgExample of overriding the wrapper class `__init__()` in order to utilize
    info about the device typeparamsc                    |j         o|j                             d          }|j         o|j                             dd          }|j         o|j                             dd           }|d S t          |t                    r|dk    rt	          d| d          t          |t                    r|dk    rt	          d| d          |(t          |t
                    st	          d	| d          d S d S )
NrO   rP   d   rQ   r   z8`ngram_size` has to be a strictly positive integer, got .z9`window_size` has to be a strictly positive integer, got z<`whitelist_token_ids` has to be a sequence of integers, got )
extra_argsget
isinstancerk   
ValueErrorr   )clsrn   rO   rP   rQ   s        rL   validate_paramsz*NGramPerReqLogitsProcessor.validate_params   sL   &N6+<+@+@+N+N
'UF,=,A,A-QT,U,U$/ 
F4E4I4I!45
 5

 4*c** 	jAooX:XXX   +s++ 	{a/?/?&"& & &   *:4
 4
* .*. . .   +***rK   rY   c                     dS )NFrJ   rU   s    rL   is_argmax_invariantz.NGramPerReqLogitsProcessor.is_argmax_invariant   s    urK   Nc                    |j         o|j                             d          }|j         o|j                             dd          }|j         o|j                             dd           }|d S |rt          |          nd }t          |||          S )NrO   rP   rp   rQ   )rO   rP   rQ   )rr   rs   rT   rN   )rU   rn   rO   rP   rQ   s        rL   new_req_logits_processorz3NGramPerReqLogitsProcessor.new_req_logits_processor   s     &N6+<+@+@+N+N
'UF,=,A,A-QT,U,U$/ 
F4E4I4I!45
 5
 4:MWc"5666SW+!# 3
 
 
 	
rK   )rC   rD   rE   rF   classmethodr#   rw   boolrz   r/   r|   rJ   rK   rL   rm   rm      s        " " ^    [8T    

 
 $	&
 
 
 
 
 
rK   rm   c            	       l    e Zd Zd ZdefdZdeeedz  f         fdZ	ddd	ed
ede
defdZdefdZdS )DeepseekOCRProcessingInfoc                 @    | j                             t                    S rS   )ctxget_hf_configr&   ry   s    rL   r   z'DeepseekOCRProcessingInfo.get_hf_config   s    x%%&8999rK   kwargsc                 2     | j         j        t          fi |S rS   )r   get_hf_processorr*   )rU   r   s     rL   r   z*DeepseekOCRProcessingInfo.get_hf_processor   s     (tx()=HHHHHrK   rY   Nc                 
    dd iS )NimagerJ   ry   s    rL   get_supported_mm_limitsz1DeepseekOCRProcessingInfo.get_supported_mm_limits   s    rK   T)croppingimage_widthimage_heightr   c                b   t           }t          }d}d}t          r.|dk    r|dk    rddg}nt          ||t                     }|\  }	}
ndx}	}
t	          j        ||z  |z            x}}t	          j        ||z  |z            x}}||dz   z  }|	dk    s|
dk    r|
|z  |	|z  dz   z  }nd}||z   dz   S )N      i  r0   )r?   r   )r)   r'   r(   r+   mathceil)rU   r   r   r   r?   r;   
patch_sizedownsample_ratio
crop_rationum_width_tilesnum_height_tileshwh2w2global_views_tokenslocal_views_tokenss                    rL   get_num_image_tokensz.DeepseekOCRProcessingInfo.get_num_image_tokens   s     
	
 	3c!!lc&9&9V

 )*  
 1;-O--122O.	9
26FFGGGA)Z:59IIJJJR1q5kQ"2Q"6"6"2R"7Ob<PST<T!U!""%77!;;rK   c                 r    t           dk    rt          dk    rt          dd          S t          dd          S )N   i   i   )widthheight)r)   r'   r   ry   s    rL   !get_image_size_with_most_featuresz;DeepseekOCRProcessingInfo.get_image_size_with_most_features   s=    )t"3"38H====ww7777rK   )rC   rD   rE   r   objectr   r   strrk   r   r~   r   r   r   rJ   rK   rL   r   r      s        : : :I I I I IcDj)A     HL< < <!<14<@D<	< < < <B89 8 8 8 8 8 8rK   r   c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	DeepseekOCRDummyInputsBuilder	mm_countsrY   c                 x    |                     dd          }| j                                        }|j        }||z  S )Nr   r   )rs   infor   image_token)rU   r   
num_images	processorr   s        rL   get_dummy_textz,DeepseekOCRDummyInputsBuilder.get_dummy_text   s;    ]]7A..
I..00	+Z''rK   Nseq_len
mm_optionsc                     |                     dd          }| j                                        }d|                     |j        |j        |          iS )Nr   r   )r   r   r   )rs   r   r   _get_dummy_imagesr   r   )rU   r   r   r   r   max_image_sizes         rL   get_dummy_mm_dataz/DeepseekOCRDummyInputsBuilder.get_dummy_mm_data   s_     ]]7A..
DDFF T++$*%,% ,  
 	
rK   rS   )
rC   rD   rE   r   r   rk   r   r   r   r   rJ   rK   rL   r   r      s        (S(9 (c ( ( ( ( =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
rK   r   c            
           e Zd Zdedeeef         deeef         deeef         def
dZdedeeef         deeef         fd	Z	d
e
deeef         dedee         fdZdS )DeepseekOCRMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrY   c                     |r>| j         j                             | j         j        di |t	          dd|i||          }n'| j                                         } ||dd          }|S )Nr   Tpt)add_special_tokensreturn_tensorsrJ   )r   r   call_hf_processorr   dictget_tokenizer)rU   r   r   r   r   processed_outputs	tokenizers          rL   _call_hf_processorz1DeepseekOCRMultiModalProcessor._call_hf_processor  s      	 $	 ? ?*	*77Y77..F.g..! ! 	//11I )	4! ! ! ! rK   	hf_inputshf_processor_mm_kwargsc                 z   |                     dt          j        d                    }|d d df         dk    |d d df         dk    z  }t          j        ||                    d          d          }t          t          j        d          t          j        d          t          j        d|                    S )	NrB   )r   rA   r   r0   r[   dimr   )r7   rB   r@   )	rs   rH   emptywhereprodr   r   batchedflat_from_sizes)rU   r   r   rB   is_tiledpatches_per_images         rL   _get_mm_fields_configz4DeepseekOCRMultiModalProcessor._get_mm_fields_config"  s    
 (mm,A5;vCVCVWW'1-16I!!!Q$6ORS6ST!K2E2J2Jr2J2R2RTUVV.6w?? 5 =g F F-=* 
 
 
 	
rK   mm_itemsout_mm_kwargsc                        j         j        di |}|j        t          t                    sJ dt          f fd}t          dg|          gS )Nitem_idxc                 ,                        dt          t          f          }t          |t                    r|                    |           }nA|                    |           }j                            |j        |j	        t                    }g|z  S )Nr   )r   r   r   )	get_itemsr   r   rt   get_feature_sizeget_image_sizer   r   r   r   r(   )r   imagesnum_image_tokenssizeimage_token_idr   rU   s       rL   get_replacement_deepseek_vl2zXDeepseekOCRMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_deepseek_vl2=  s    ''-/BC F &"566 	#)#:#:8#D#D  ,,X66#'9#A#A $
!%& $B $ $ 
 ##&666rK   r   )modalitytargetreplacementrJ   )r   r   r   rt   rk   r!   )rU   r   r   r   hf_processorr   r   s   ``    @rL   _get_prompt_updatesz2DeepseekOCRMultiModalProcessor._get_prompt_updates2  s     2ty1KK4JKK%4.#.....	73 	7 	7 	7 	7 	7 	7 	7 	7$  &'8  
 	
rK   N)rC   rD   rE   r   r   r   r   r   r   r   r   r   r   r"   r   rJ   rK   rL   r   r   	  s        !! f%! 3;'	!
 CK(! 
! ! ! !,

 !(V 4
 
++	,	
 
 
 
 "
%"
 !(V 4"
 -	"

 
,	"
 "
 "
 "
 "
 "
rK   r   )r   dummy_inputsc                       e Zd Z edddddd          Zeded	ed
edz  fd            Zddde	def fdZ
ded
edz  fdZdej        d
ej        fdZdej        dej        d
ej        dz  fdZdej        dej        dej        d
efdZded
ej        fdZded
edz  fdZ	 	 d)dej        d ej        d!edz  d"ej        dz  def
d#Zd$ej        d
ej        dz  fd%Zd&eeeej        f                  d
ee         fd'Zd
efd(Z xZ S )*DeepseekOCRForCausalLMz"language_model.model.embed_tokens.zlanguage_model.model.layers.zlanguage_model.model.norm.zlanguage_model.lm_head. )zmodel.embed_tokens.zmodel.layers.zmodel.norm.zlm_head.zmodel.)orig_to_new_prefixr   rh   rY   Nc                 N    |                     d          rdS t          d          )Nr   r4   z Only image modality is supported)
startswithru   )rv   r   rh   s      rL   get_placeholder_strz*DeepseekOCRForCausalLM.get_placeholder_stri  s,    w'' 	9;<<<rK   )prefixvllm_configr   c                   t                                                       |j        j        }|j        }|j        j        }|| _        || _        |j        | _        |j        | _        |j	        | _	        |j        }t          |          }|j        t                   | _        |                     |d          5  t                      | _        t#          dddddddd	
          }t%          ||t'          |d                    | _        t+          | j                  | _        |j        | _        |j        | _        | j        j        }	dt5          j        t5          j        |	t4          j                            z  }
| j        dk    r]t=          j        t5          j         |	          |
z            | _!        t=          j        t5          j         |	          |
z            | _"        ntG          d| j                   	 d d d            n# 1 swxY w Y   | $                    |          5  tK          || j	        t'          |d                    | _&        d d d            n# 1 swxY w Y   | j&        j'        | _'        d S )Nr   r   i   r            i   gh㈵>)hidden_sizeintermediate_sizenum_attention_headsnum_hidden_layersr?   r   projection_dimlayer_norm_epsvision_model)configquant_configr   r0   dtype2Dz.Only 2D tile_tag is supported currently, got: language_model)r   	hf_configr   )(superrV   model_configr   r   multimodal_configr   vision_configprojector_configtext_configr%   vocab_IMAGE_TOKENr   _mark_tower_modelr2   	sam_modelr	   r1   r   r   r3   	projectortile_tagglobal_view_posn_embedrH   sqrttensorfloat32nn	Parameterrandnimage_newlineview_seperatorru   _mark_language_modelr   r   make_empty_intermediate_tensors)rU   r   r   r   r   r  r   r   clip_vision_configr  	embed_std	__class__s              rL   rV   zDeepseekOCRForCausalLM.__init__p  s   %0%=%G"/'4F!2#1 & 7!-"/0>>	'ol;##K99 !	 !	,..DN!1 "&$&"$"#	" 	" 	" !:))#FN;;! ! !D *$*?@@DN"ODM#)#9D  +3GEJu|G5='Q'Q'QRRRI}$$%'\%+g2F2F2R%S%S"&(l5;w3G3G)3S&T&T## TT]TT   $;!	 !	 !	 !	 !	 !	 !	 !	 !	 !	 !	 !	 !	 !	 !	F &&{33 	 	"<'*#F,<==# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ? 	,,,s%   6EHHH$+II"Ir   c                 &   |                     dd           }|                     dd           }|                     dd           }|*t          j        |                                          dk    rd S | j        j        }t          d|||d|i          S )Nr7   rB   r@   r   r;   )r8   r>   r@   rB   resolve_bindings)poprH   sumitemr  r?   r6   )rU   r   r7   rB   r@   r;   s         rL   _parse_and_validate_image_inputz6DeepseekOCRForCausalLM._parse_and_validate_image_input  s     zz.$77$jj)>EEjj5559\#:#:#?#?#A#AQ#F#F4&1	*# 3Y
 
 
 	
rK   image_tensorc                 ,   |                      |          }|                     ||          }t          j        |d d dd f         |                    d                              ddd          fd          }|                     |          }|j        \  }}}t          |dz            }|	                    |||          }| j
        d d d d f                             |d|          }	t          j        ||	gd          }|	                    d|          S )Nr0   rA   r   r[   r         ?)r  r   rH   catflattenpermuter	  shaperk   viewr  expand)
rU   r   global_features_1global_features_2features_hwr   sidenewlines
             rL   _encode_global_featuresz.DeepseekOCRForCausalLM._encode_global_features  s    NN<88 --l<MNN9!!!!QRR%(!))!,,44Q1== 
 
 
 >>(++^
2s2s7||==tS11$T4]3::4CHH9h0a888}}R%%%rK   patches
crop_shapec                    t          j        |                                          dk    rd S |                     |          }|                     ||          }t          j        |d d dd f         |                    d                              ddd          fd          }|                     |          }|j	        \  }}}t          |dz            }	t          |d                                                   }
t          |d                                                   }|                    ||
|	|	|                              ddddd                              ||	z  |
|	z  |          }| j        d d d d f                             ||	z  d|          }t          j        ||gd          }|                    d|          S )	Nr   r0   rA   r[   r   r"  r:   r   )rH   r  r  r  r   r#  r$  r%  r	  r&  rk   r'  reshaper  r(  )rU   r1  r2  local_features_1local_features_2r+  r,  r-  r   
patch_sidewidth_tilesheight_tilesr/  s                rL   _encode_local_featuresz-DeepseekOCRForCausalLM._encode_local_features  s    9W""$$))4>>'22,,W6FGG9 ABB' ((++33Aq!<< 
 
 
 >>(++^
2sS\\
*Q-,,..//:a=--//00 MM,ZSQQWQ1a##W\J.j0H#NN 	
 $T4]3:::%q#
 
 9h0a888}}R%%%rK   r7   r@   rB   c           	         g }|d d df         dk    |d d df         dk    z  }t          j        ||                    d          d          }|                    |                                          }t          |                    d                    D ]}||         }||g         }	||         }
|                     |	          }|                     ||
          }|)t          j	        ||| j
        d d d f         gd          }n't          j	        || j
        d d d f         gd          }|                    |           |S )Nr   r0   r[   r   )rH   r   r   splittolistr`   r   r0  r:  r#  r  append)rU   r7   r@   rB   images_in_this_batchr   r   jdxr1  	image_orir2  global_featureslocal_featurescombineds                 rL   _pixel_values_to_embeddingz1DeepseekOCRForCausalLM._pixel_values_to_embedding  s~     "'1-16I!!!Q$6ORS6ST!K2E2J2Jr2J2R2RTUVV!''(9(@(@(B(BCC,11!4455 	2 	2C!#&G$cU+I,S1J"::9EEO!88*MMN) 9#_d6I$PQPQPQ'6RS  
 !9$d&9$'&BC   !''1111##rK   image_inputc                     |j         }|j        }|j                            t          j                  }|                     |||          }|S )Nr   )r7   r@   rB   )r>   r@   rB   torH   longrE  )rU   rF  r7   r@   rB   vision_featuress         rL   _process_image_inputz+DeepseekOCRForCausalLM._process_image_input  s[     #'!-)=@@uz@RR99%# 3 : 
 
 rK   c                 R     | j         di |}|d S |                     |          }|S )NrJ   )r  rK  )rU   r   rF  vision_embeddingss       rL   embed_multimodalz'DeepseekOCRForCausalLM.embed_multimodal,  s?    :d:DDVDD4 55kBB  rK   	input_ids	positionsintermediate_tensorsinputs_embedsc                 @    |d }|                      ||||          }|S )N)rR  )r   )rU   rO  rP  rQ  rR  r   hidden_statess          rL   forwardzDeepseekOCRForCausalLM.forward3  s;      + M++y"6m , 
 
 rK   rT  c                 6    | j                             |          S rS   )r   compute_logits)rU   rT  s     rL   rW  z%DeepseekOCRForCausalLM.compute_logitsD  s     "11-@@@rK   weightsc                 \    t          |           }|                    || j                  }|S )N)mapper)r   load_weightshf_to_vllm_mapper)rU   rX  loaderautoloaded_weightss       rL   r[  z#DeepseekOCRForCausalLM.load_weightsJ  s1    "4((#00AW0XX!!rK   c                 4    t          j        ddddg          S )z<
        Get the module prefix in multimodal models
        r   r	  r  r   )r   	connectortower_model)r   from_string_fieldry   s    rL   get_mm_mappingz%DeepseekOCRForCausalLM.get_mm_mappingO  s,     /+!$n5
 
 
 	
rK   )NN)!rC   rD   rE   r   r\  r}   r   rk   r   r
   rV   r   r6   r  rH   rI   r0  r:  r   rE  rK  r   rN  r$   rU  rW  r   r^   rT   r[  r   rc  __classcell__)r  s   @rL   r   r   W  s        & $H;71
 

 
 
 =3 =3 =3: = = = [= BD >
 >
 >
z >
3 >
 >
 >
 >
 >
 >
@

	$t	+
 
 
 
*&EL &U\ & & & &(!&|!&16!&		!& !& !& !&F$l$ \$ #\	$
 
$ $ $ $B6	   ! !4H44O ! ! ! ! <@-1 < < 2D8	
 |d*    "A|A 
	A A A A"HU33D-E$F "3s8 " " " "

 
 
 
 
 
 
 
 
rK   r   )TrF   r   collections.abcr   r   r   typingr   r   rH   torch.nnr  transformersr   r	   vllm.configr
   vllm.config.multimodalr   %vllm.model_executor.models.interfacesr   r   r   r   )vllm.model_executor.models.module_mappingr    vllm.model_executor.models.utilsr   r   r   r   vllm.multimodalr   vllm.multimodal.inputsr   r   r   r   vllm.multimodal.parser   r   r   r   vllm.multimodal.processingr   r   r    r!   r"   vllm.sampling_paramsr#   vllm.sequencer$   vllm.tokenizersr%   ,vllm.transformers_utils.configs.deepseek_vl2r&   /vllm.transformers_utils.processors.deepseek_ocrr'   r(   r)   r*   r+   vllm.utils.tensor_schemar,   r-   vllm.v1.sample.logits_processorr.   r/   deepencoderr1   r2   deepseek_vl2r3   r  r6   rN   rm   r   r   r   register_processorModuler   rJ   rK   rL   <module>r}     s   M L  7 7 7 7 7 7 7 7 7 7 % % % % % % % %        7 7 7 7 7 7 7 7 " " " " " " 3 3 3 3 3 3            E D D D D D            0 / / / / /                                    0 / / / / / - - - - - - 8 8 8 8 8 8 K K K K K K              ? > > > > > > >       
 D C C C C C C C & & & & & & G G G G G, G G G,# # # # # # # #L5
 5
 5
 5
 5
!7 5
 5
 5
p.8 .8 .8 .8 .8 2 .8 .8 .8b
 
 
 
 
$:;T$U 
 
 
8K
 K
 K
 K
 K
56K
 K
 K
\ ('"	".  
{
 {
 {
 {
 {
RY(:J {
 {
 
{
 {
 {
rK   