
    .`i)w                     T   U d dl mZmZmZ d dlmZmZmZ d dlZ	d dl
Z
d dlmZ d dlmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$m%Z%m&Z& d dl'm(Z(m)Z)m*Z*m+Z+ d dl,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8m9Z9 ddl:m;Z;m<Z<m=Z=m>Z> ddl?m@Z@mAZAmBZBmCZC  G d dejD                  ZE G d de8          ZF G d de8          ZGeFeGz  ZHeeId<    G d de8          ZJ G d  d!e8          ZKeJeKz  ZLeeId"<   d#eMd$eMd%eNd&eNd'eOeMeMf         f
d(ZPd)eMd*eMd'eQeOeMeMf                  fd+ZR G d, d-e/          ZS G d. d/e-eS                   ZT G d0 d1e.eS                   ZU e"jV        eUeSeT2           G d3 d4ejD        e=e>e<                      ZWdS )5    )IterableMappingSequence)	AnnotatedLiteral	TypeAliasN)BatchFeatureInternVLProcessorPretrainedConfig)ACT2FN)GotOcr2ImageProcessorFast)InternVLVideoProcessor)
VllmConfig)BaseDummyOptions)QuantizationConfig)InternS1VisionModel)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)ImageEmbeddingItemsImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)"cached_video_processor_from_config)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefixc                   $     e Zd Z fdZd Z xZS )InternS1MultiModalProjectorc                    t                                                       t          j        |j        j        t          d|j        z            dz  z            | _        t          j	        |j        j        t          d|j        z            dz  z  |j
        j                  | _        t          |j                 | _        t          j	        |j
        j        |j
        j                  | _        d S )Nr&      )super__init__nn	LayerNormvision_confighidden_sizeintdownsample_ratio
layer_normLineartext_configlinear_1r   projector_hidden_actactlinear_2)selfconfig	__class__s     w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/interns1.pyr4   z$InternS1MultiModalProjector.__init__D   s    , ,s1v7N3N/O/OST/TT
 
 	 ,s1v7N3N/O/OST/TT*
 
 &56	*F,>,J
 
    c                     |                      |          }|                     |          }|                     |          }|                     |          }|S N)r;   r>   r@   rA   )rB   image_featureshidden_statess      rE   forwardz#InternS1MultiModalProjector.forwardR   sL    77m44//m44rF   )__name__
__module____qualname__r4   rK   __classcell__rD   s   @rE   r0   r0   C   sG        
 
 
 
 
      rF   r0   c                       e Zd ZU dZdZed         ed<   eej	         e
dddd          f         ed<   eej	         e
d          f         ed	<   d
S )InternS1ImagePixelInputsz
    Dimensions:
        - bnp: Batch size * number of images * (1 + num_patches)
        - c: Number of channels (3)
        - h: Height
        - w: Width
        - bn: Batch size * number of images
    pixel_valuestypebnp   hwbnnum_patchesNrL   rM   rN   __doc__rT   r   __annotations__r   torchTensorr%    rF   rE   rR   rR   Z   s}           %3D'.
!222EL++eQS*I*IIJJJJ5<T):)::;;;;;;rF   rR   c                       e Zd ZU dZdZed         ed<   eej	        e
ej	                 z   eddd          f         ed<   dS )	InternS1ImageEmbeddingInputsz
    Dimensions:
        - ni: Number of images
        - tifs: Total image feature size
        - hs: Hidden size (must match language model backbone)
    image_embedsrT   nitifshsdataNrL   rM   rN   r\   rT   r   r]   r   r^   r_   listr%   r`   rF   rE   rb   rb   i   f           %3D'.
!222
EL4#55{{4QU7V7VV
WWWWWWrF   rb   InternS1ImageInputsc                       e Zd ZU dZdZed         ed<   eej	         e
dddd          f         ed<   eej	         e
d	          f         ed
<   dS )InternS1VideoPixelInputsz
    Dimensions:
        - bnv: Batch size * number of videos * number of frames
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height
        - w: Width
    pixel_values_videosrT   bnvrV   rW   rX   rS   rY   rZ   Nr[   r`   rF   rE   rm   rm   x   s           ,AD''
(@@@EL++eQS*I*IIJJJJ5<T):)::;;;;;;rF   rm   c                       e Zd ZU dZdZed         ed<   eej	        e
ej	                 z   eddd          f         ed<   dS )	InternS1VideoEmbeddingInputsz
    Dimensions:
        - nv: Number of videos
        - tvfs: Total video feature size
        - hs: Hidden size (must match language model backbone)
    video_embedsrT   nvtvfsrf   rg   Nrh   r`   rF   rE   rq   rq      rj   rF   rq   InternS1VideoInputsmin_dynamic_patchmax_dynamic_patchdynamic_image_sizeuse_thumbnailreturnc                 <    |r| nd} |r|nd}|r|dk    r|dz  }| |fS )Nr&   r`   )rv   rw   rx   ry   s       rE   resolve_interns1_min_max_numr|      sQ     .@F))Q-?F))Q *a//Q///rF   min_nummax_numc                 j      fdt           dz             D             }t          |d           S )Nc                     h | ]E}t          d |d z             D ]/}t          d |d z             D ]}||z  cxk    rk    n n||f0FS )r&   )range).0nijr~   r}   s       rE   	<setcomp>z-get_interns1_target_ratios.<locals>.<setcomp>   s       q!a%  q!a%	  a!e&&&&w&&&&&	 
A '&&&&rF   r&   c                 $    | d         | d         z  S )Nr   r&   r`   )xs    rE   <lambda>z,get_interns1_target_ratios.<locals>.<lambda>   s    qtad{ rF   )key)r   sorted)r}   r~   target_ratioss   `` rE   get_interns1_target_ratiosr      sX        w!,,  M -%:%:;;;;rF   c            	           e Zd ZdZdedefdZdeee	dz  f         fdZ
ddde	d	e	d
edz  de	fdZddedz  fdZdefdZde	fdZde	deee	f         de	fdZdS )InternS1ProcessingInfoz)ProcessingInfo for InternS1-style models.kwargsrz   c                      | j         j        t          fi |}t          | j         j        ft
          |j        j        d||_        |S )N)processor_clssize)	ctxget_hf_processorr
   r#   model_configr   image_processorr   video_processor)rB   r   hf_processors      rE   r   z'InternS1ProcessingInfo.get_hf_processor   sa    0tx01BMMfMM'IH!(
0-2(
 (
 	(
 (
$ rF   Nc                     d d dS )Nimagevideor`   rB   s    rE   get_supported_mm_limitsz.InternS1ProcessingInfo.get_supported_mm_limits   s    ---rF   )	processorimage_widthimage_heightr   c                $   ||                                  j        }t          |t                    st	          dt          |                     |                    ||t                                }|                                  j        |z  }|S )Nz.GotOcr2ImageProcessorFast is expected but got )images_kwargs)	r   r   
isinstancer   
ValueErrorrT   get_number_of_image_patchesdictimage_seq_length)rB   r   r   r   num_image_patchesnum_image_tokenss         rE   get_num_image_tokensz+InternS1ProcessingInfo.get_num_image_tokens   s     --//?I)%>?? 	RiRR   &AA+TVV B 
 
  0022CFWWrF   ry   c                     |                                  j        }|j        }|j        }|j        }d}t          ||||          \  }}t          ||          S )NT)ry   )r   r   min_patchesmax_patchescrop_to_patchesr|   r   )rB   ry   r   rv   rw   rx   r}   r~   s           rE   resolve_target_ratiosz,InternS1ProcessingInfo.resolve_target_ratios   so    //11A+7+7 (7!7'	
 
 
 *'7;;;rF   c                 d   |                                  }| j                                        }|j        j        \  }}|                                 }d\  }}|D ]E\  }}	||z  ||	z  }}
|                     |
||j                  }||k    r|}t          |
|          }F|dk    s|
J d            |S )N)r   Nr   r   r   )widthheightr   z(Cannot have a largest feature size of 0!)	r   r   get_hf_configr7   
image_sizer   r   r   r   )rB   r   	hf_configbase_height
base_widthr   largest_feature_sizelargest_feature_pinpointwrhrr   r   	feat_sizes                rE   !get_image_size_with_most_featuresz8InternS1ProcessingInfo.get_image_size_with_most_features   s    ))++	H**,,	"+"9"DZ22449@66# 
	Q 
	QFB&O[2-=6E11!##3 2  I
 ///'0$+45+P+P+P((A--1I1Q1Q6 2R1QR ('rF   c                     |                                  }|                                 \  }}|                     |||j                  S )Nr   )r   r   r   r   )rB   r   target_widthtarget_heights       rE   get_max_image_tokensz+InternS1ProcessingInfo.get_max_image_tokens  sR    ))++	&*&L&L&N&N#m(($&/ ) 
 
 	
rF   seq_len	mm_countsc                    |                     dd          }|                     dd          }|                                 }|                                 |z  }||z
  |j        z  }|t	          |d          z  }t	          |d          S )Nr   r   r   r&   )getr   r   r   max)	rB   r   r   
max_images
max_videosr   max_image_tokensmax_total_framesmax_frames_per_videos	            rE   !get_num_frames_with_most_featuresz8InternS1ProcessingInfo.get_num_frames_with_most_features  s    
 ]]7A..
]]7A..
))++	4466C#&669;UU/3z13E3EE'+++rF   rH   )rL   rM   rN   r\   objectr
   r   r   strr9   r   r   r   boolr   r   r   r   r   r`   rF   rE   r   r      sH       33 4E    .cDj)A . . . . 7;        	 
 -t3  
       (< <4$; < < < <"(9 ( ( ( (4
c 
 
 
 
,, 38$, 
	, , , , , ,rF   r   c            	       t    e Zd ZdZdeeef         defdZ	 d	dedeeef         deeef         dz  de	fdZ
dS )
InternS1DummyInputsBuilderz-DummyInputsBuilder for InternS1-style models.r   rz   c                     |                     dd          }|                     dd          }| j                                        j        }| j                                        j        }||z  ||z  z   S )Nr   r   r   )r   infor   image_tokenvideo_token)rB   r   
num_images
num_videosr   r   s         rE   get_dummy_textz)InternS1DummyInputsBuilder.get_dummy_text   si    ]]7A..
]]7A..
i0022>i0022>Z'+
*BBBrF   Nr   
mm_optionsc                    | j                                         \  }}| j                             ||          }|                    dd          }|                    dd          }| j                                         }	|	j        j        \  }
}|r|                    d          nd }|r|                    d          nd }|                     ||||          |                     ||
|||          dS )Nr   r   r   )r   r   r   	overrides)r   r   
num_framesr   r   r   )	r   r   r   r   r   r7   r   _get_dummy_images_get_dummy_videos)rB   r   r   r   r   r   target_num_framesr   r   rC   image_size_himage_size_wimage_overridesvideo_overridess                 rE   get_dummy_mm_dataz,InternS1DummyInputsBuilder.get_dummy_mm_data(  s    '+i&Q&Q&S&S#m IGGY
 
 ]]7A..
]]7A..
((**%+%9%D"l5?I*..111T5?I*..111T ++"$%)	 ,   ++"#,%) ,  
 
 	
rF   rH   )rL   rM   rN   r\   r   r   r9   r   r   r   r   r`   rF   rE   r   r     s        77CS(9 Cc C C C C =A	!
 !
!
 38$!
 C!112T9	!

 
!
 !
 !
 !
 !
 !
rF   r   c            
            e Zd ZdZdedeeef         deeef         deeef         def
 fdZded	eeef         deee	f         fd
Z
ded	eeef         dedee         fdZ xZS )InternS1MultiModalProcessorz?Basic image-only MultiModalProcessor for InternS1-style models.promptmm_data	mm_kwargs
tok_kwargsrz   c                 t   t          |          }|                    dg           }|                    dg           }t          |t                    sJ t          |t                    sJ  | j        j        di |}|j        }|                    |j        d          }	t          |	          dk    sJ |	d         }	t          j        |j        d|          }t          j        |j        d|          }i }
|rg }|D ]}t                                          |j        d|i||	          }|                    |                    d
                     |                    d          }|                    |          d         }|                    d|d          }d |D             }t%          j        |          t%          j        |          t%          j        |j                  d}
i }|rg }|D ]}t                                          |j        d|i||	          }|                    |                    d
                     |                    d          }|	|||j        k    <   |                    |          d         }|                    d|d          }d |D             }t%          j        |          t%          j        |          t%          j        |	          d}t          j        d|j        |          }t          j        d|j        |          } ||fi |ddi}t-          i ||
|          S )NvideosimagesF)add_special_tokensr&   r   z<image_placeholder>z<video_placeholder>)r   r   r   r   rS   	input_idsc                 ,    g | ]}t          |          S r`   lenr   items     rE   
<listcomp>zBInternS1MultiModalProcessor._call_hf_processor.<locals>.<listcomp>w  s    DDD3t99DDDrF   )rS   image_num_patchesimage_token_idc                 ,    g | ]}t          |          S r`   r   r   s     rE   r   zBInternS1MultiModalProcessor._call_hf_processor.<locals>.<listcomp>  s    CCC#d))CCCrF   )rn   video_num_patchesvideo_token_idreturn_tensorsptr`   )r   popr   ri   r   r   	tokenizerencoder   r   resubr   r3   _call_hf_processorappendbatch_decodereplacer^   concattensorr   r	   )rB   r   r   r   r   r   r   r   r  r   image_outputsimage_pixel_valuesr   processed_outputsr   image_placeholderrZ   video_outputsvideo_pixel_valuesr   video_placeholderr   text_outputsrD   s                          rE   r  z.InternS1MultiModalProcessor._call_hf_processorO  s    w--Xr**Xr**&$'''''&$'''''1ty1>>I>> *	"))$ * 
 
 >""a'''''*02GPP02GPP 	!# U U$)GG$>$>'3%u-')	 %? % %! #))*;*?*?*O*OPPP-11+>>	$-$:$:9$E$Ea$H!(=?PRSTTDD1CDDDK %-? @ @%*\+%>%>"',|/J"K"K M  	!# U U$)GG$>$>'3%u-')	 %? % %! #))*;*?*?*O*OPPP-11+>>	FT	)|'BBC$-$:$:9$E$Ea$H!(=?PRSTTCC0BCCCJ',|4F'G'G%*\*%=%="',~">"> M -|/GPP-|/GPP yKK:KKdKKKN|N}NNOOOrF   	hf_inputshf_processor_mm_kwargsc                    |                     dt          j        d                    }|                     dt          j        d                    }t          |          }t          |          }t	          t          j        d|          t          j        d          t          j        d          t          j        d|          t          j        d|          t          j        d          t          j        d|                    S )Nr   r   r   r   r   )rS   r   rc   r   rn   r   r   )	r   r^   emptyr   r   r   flat_from_sizesbatchedshared)rB   r  r  r   r   r   r   s          rE   _get_mm_fields_configz1InternS1MultiModalProcessor._get_mm_fields_config  s    
 &MM*=u{1~~NN%MM*=u{1~~NN*++
*++
.>*  4;GDD.6w??07LL 5 E*! ! 4;GDD07LL
 
 
 	
rF   mm_itemsout_mm_kwargsc                    	
  | j         j        d	i |j        
j        j        j        |                                }d|v r9|d         t          t          j	                  sJ 
                                ng d|v r9|d         	t          	t          j	                  sJ 	
                                	ng 	dt          f	
fd}dt          ffd}t          d
|          t          d|          gS )
Nr   r   item_idxc                    
                     dt          t          f          }t          |t                    r|                    |           }n|          }|j        z  }	|z  }|z   z   }t          j        |	          S )Nr   )	get_itemsr   r   r   get_feature_sizer   r!   select_text)r  r   feature_sizerZ   repl_features	repl_fullend_image_tokenr   r   img_context_tokenr  start_image_tokens         rE   get_replacement_interns1_imagezWInternS1MultiModalProcessor._get_prompt_updates.<locals>.get_replacement_interns1_image  s    ''-/BC F &"566 K%66x@@/9*\-JJ-<M)M9OKI&29>OPPPrF   c                     |          }	j         z  }|z   z   d                    fdt          |          D                       }t          j        |	          S )N
c                 &    g | ]}d |dz    d S )Framer&   z: r`   )r   r   repl_features_with_seps     rE   r   zkInternS1MultiModalProcessor._get_prompt_updates.<locals>.get_replacement_interns1_video.<locals>.<listcomp>  s/    WWWq:Q::"8::WWWrF   )r   joinr   r!   r"  )
r  rZ   r$  r%  r.  r&  r   r(  r   r   s
       @rE   get_replacement_interns1_videozWInternS1MultiModalProcessor._get_prompt_updates.<locals>.get_replacement_interns1_video  sr    +H5K',*GGM%6%F%X"		WWWWE+DVDVWWW I '29kJJJrF   r   )modalitytargetreplacementr   r`   )r   r   r   r(  r&  r   get_datar   r^   r_   tolistr9   r   )rB   r  r  r  out_mm_datar)  r0  r&  r   r   r'  r(  r   r   s    `     @@@@@@@rE   _get_prompt_updatesz/InternS1MultiModalProcessor._get_prompt_updates  s    2ty1KK4JKK(4(:&6".#,,..+-- +,? @/>>>>> 1 8 8 : : "+-- +,? @/>>>>> 1 8 8 : : "	QS 	Q 	Q 	Q 	Q 	Q 	Q 	Q 	Q 	Q 	Q 	Q		KS 		K 		K 		K 		K 		K 		K 		K 		K 		K 		K  (:  
  ":  
 	
rF   )rL   rM   rN   r\   r   r   r   r	   r  r   r  r   r   r   r    r7  rO   rP   s   @rE   r   r   L  s=       IILPLP f%LP 3;'	LP
 CK(LP 
LP LP LP LP LP LP\

 !(V 4
 
++	,	
 
 
 
0@
%@
 !(V 4@
 -	@

 
,	@
 @
 @
 @
 @
 @
 @
 @
rF   r   )r   dummy_inputsc                       e Zd Z eddddd          Zededed	ed
z  fd            Zddde	ded	d
f fdZ
deded
z  defdZded	ej        fdZd2dZdej        d	ej        fdZded	ed
z  fdZded	ed
z  fdZdeez  d	eej        df         fdZded	efdZd ej        d	d
fd!Zded	efd"Z 	 d3d
d#d$d ej        d%ed
z  d&ej        d
z  d'e!d	ej        f
 fd(Z"	 	 d4d ej        d)ej        d*e#d
z  d+ej        d
z  ded	e#fd,Z$d-ej        d	ej        d
z  fd.Z%d/e&eeej        f                  d	e'e         fd0Z(d	e)fd1Z* xZ+S )5 InternS1ForConditionalGenerationzlanguage_model.lm_head.zlanguage_model.model.zvision_tower.zmulti_modal_projector.)zlm_head.zmodel.language_model.zmodel.vision_tower.zmodel.multi_modal_projector.)orig_to_new_prefixr1  r   rz   Nc                 |    |                     d          rdS |                     d          rdS t          d          )Nr   z<IMG_CONTEXT>r   z<video>z)Only image or video modality is supported)
startswithr   )clsr1  r   s      rE   get_placeholder_strz4InternS1ForConditionalGeneration.get_placeholder_str
  sI     w'' 	#"?w'' 	9DEEErF    )prefixvllm_configrA  c          	      >   t                                                       |j        j        }|j        }|j        j        }|| _        || _        |j        j        d         }|j        j	        d         }|| _	        t          ||z  dz  |j        dz  z            | _        |j        | _        |                     |ddh          5  |                     ||t          |d                    | _        |                     |          | _        d d d            n# 1 swxY w Y   |                     |          5  t)          ||j        t          |d                    | _        d d d            n# 1 swxY w Y   d | _        d | _        d | _        | j        j        | _        d S )	Nr   r2   r   r   vision_tower)quant_configrA  language_model)rB  r   rA  )r3   r4   r   r   rE  multimodal_configrC   r7   r   
patch_sizer9   r:   num_image_token_mark_tower_model_init_vision_modelr.   rD  
_init_mlp1multi_modal_projector_mark_language_modelr-   r=   rF  img_context_token_idvideo_context_token_idvisual_token_maskmake_empty_intermediate_tensors)	rB   rB  rA  rC   rE  rG  r   rH  rD   s	           rE   r4   z)InternS1ForConditionalGeneration.__init__  s8   )3"/'4F!2)4Q7
)4Q7
$":%!+v/F/IJ 
  
 !' 7##K'71CDD 	A 	A $ 7 7)#FN;; !8 ! !D
 *.)@)@D&	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A &&{33 	 	"<' ,#F,<==# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 %)!&*#!%? 	,,,s%   ADDD5+E,,E03E0rC   rE  c                J    |j         j        }t          |j         |||          S )N)rE  num_hidden_layers_overriderA  )r7   num_hidden_layersr   )rB   rC   rE  rA  rU  s        rE   rK  z3InternS1ForConditionalGeneration._init_vision_model=  s6     #0B" %'8	
 
 
 	
rF   c                      t          |          S rH   )r0   )rB   rC   s     rE   rL  z+InternS1ForConditionalGeneration._init_mlp1L  s    *6222rF         ?c           
         |                                 \  }}}}|                    ||t          ||z            t          ||z                      }|                    dddd                                          }|                    |t          ||z            t          ||z            t          |||z  z                      }|                    dddd                                          }|S )Nr   r2   r&   rV   )r   viewr9   permute
contiguous)rB   r   scale_factorr   rX   rW   cs          rE   pixel_shufflez.InternS1ForConditionalGeneration.pixel_shuffleO  s    VVXX
1aFF1aQ-..A4D0E0EFFIIaAq!!,,..FFL !!L !!\L0122	
 
 IIaAq!!,,..rF   rS   c                    |                      |          }|d d dd d d f         }t          |j        d         dz            x}}|                    |j        d         ||d          }|                     || j                  }|                    |j        d         d|j        d                   }|                     |          }|S )N)rS   r&   rW  r   )r\  )rD  r9   shapereshaper^  r:   rM  )rB   rS   
vit_embedsrW   rX   s        rE   extract_featurez0InternS1ForConditionalGeneration.extract_feature^  s    &&L&AA
122qqq)
J$Q'3.///A''
(8(;Q2FF
''
AV'WW
''
(8(;RAQRTAUVV
//
;;
rF   r   c                    |                     dd           }|                     dd           }|                     dd           }||d S |t          d|          S |d         }t          |t          j                  r8|                                                                                                }t          |t                    sJ || _	        |*| j
        j        j        \  }}t          d||||d          S t          d          )	NrS   r   rc   rT   rg   r   rW   rX   )rT   rS   rZ   resolve_bindings This line should be unreachable.)r   rb   r   r^   r_   flattenuniquer   r9   rO  rC   r7   r   rR   AssertionError)rB   r   rS   r   rc   r   rW   rX   s           rE   _parse_and_validate_image_inputz@InternS1ForConditionalGeneration._parse_and_validate_image_inputj  s3    zz.$77"JJ':DAAzz.$77L$84#/#!   
   01nel33 	F+3355<<>>CCEEN.#.....$2!#;,7DAq+#)-" "	    ?@@@rF   c                    |                     dd           }|                     dd           }|                     dd           }||d S |t          d|          S |d         }t          |t          j                  r8|                                                                                                }t          |t                    sJ || _	        |*| j
        j        j        \  }}t          d||||d          S t          d          )	Nrn   r   rr   rf  r   rg  )rT   rZ   rS   rh  ri  )r   rq   r   r^   r_   rj  rk  r   r9   rP  rC   r7   r   rm   rl  )rB   r   pixel_values_flat_videor   rr   r   rW   rX   s           rE   _parse_and_validate_video_inputz@InternS1ForConditionalGeneration._parse_and_validate_video_input  s5    #)**-BD"I"I"JJ':DAAzz.$77"*|/C4#/#!   
   01nel33 	F+3355<<>>CCEEN.#.....&4#".;,7DAq+*-4" "	    ?@@@rF   image_input.c                    |d         dk    s|d         dk    r|d         S |                      |d                   }|d         }t          |          dk    r&|                    d| j        j        j                  fS |j        d         |                    d| j        j        j                  }fd	|D             }|                    |          S )
NrT   rc   rr   rg   rS   rZ   r&   r`  c                     g | ]}|z  S r`   r`   )r   rZ   r#  s     rE   r   zJInternS1ForConditionalGeneration._process_vision_input.<locals>.<listcomp>  s+     
 
 
+6K,&
 
 
rF   )rd  r   rY  rC   r=   r8   ra  split)rB   rq  rc   rZ   image_feature_sizesr#  s        @rE   _process_vision_inputz6InternS1ForConditionalGeneration._process_vision_input  s    
 >116"n44v&&++K,GHH!-0 {q   %%b$+*A*MNNPP $)!,#((T[-D-PQQ
 
 
 
:E
 
 
 !!"5666rF   c                 t    i }|D ]2}|dv rd|vr | j         di ||d<   |dv rd|vr | j        di ||d<   3|S )N)rS   rc   r   )rn   r   r`   )rm  rp  )rB   r   
modalities	input_keys       rE   %_parse_and_validate_multimodal_inputszFInternS1ForConditionalGeneration._parse_and_validate_multimodal_inputs  s    
   	V 	VI===J..'Kt'K'U'Uf'U'U
8$4449S9S'Kt'K'U'Uf'U'U
8$rF   r   c                     d | _         d S rH   )rQ  )rB   r   s     rE   _set_visual_token_maskz7InternS1ForConditionalGeneration._set_visual_token_mask  s    !%rF   c                 
    | j         di |}|sg S d}|D ]l}|dk    r/|d         }|                     |          }|t          |          z  }|dk    r/|d         }|                     |          }|t          |          z  }m|S )Nr`   r   r   )rz  rv  tuple)	rB   r   rx  multimodal_embeddingsr1  rq  image_embeddingsvideo_inputvideo_embeddingss	            rE   embed_multimodalz1InternS1ForConditionalGeneration.embed_multimodal  s    ?T?II&II
 	I ;= # 	A 	AH8##(2#'#=#=k#J#J %/?)@)@@%8##(2#'#=#=k#J#J %/?)@)@@%$$rF   F)is_multimodalhandle_oov_mm_tokenr  r  r  c                    |(t          |          dk    r|                     |           ||!t                                          |          S t                                          ||||          S )Nr   )r  r  r  )r   r|  r3   embed_input_ids)rB   r   r  r  r  rD   s        rE   r  z0InternS1ForConditionalGeneration.embed_input_ids  s     !,5J1K1Ka1O1O''	222 !(M,A77**9555ww&&"7' 3	 ' 
 
 	
rF   	positionsintermediate_tensorsinputs_embedsc                 @    |d }||||d} | j         j        di |}|S )N)r   r  r  r  r`   )rF  model)rB   r   r  r  r  r   forward_kwargsrJ   s           rE   rK   z(InternS1ForConditionalGeneration.forward  sL      + M #"$8*	
 
 2+1CCNCCrF   rJ   c                 6    | j                             |          S rH   )rF  compute_logits)rB   rJ   s     rE   r  z/InternS1ForConditionalGeneration.compute_logits$  s     "11-@@@rF   weightsc                 X    t          |           }|                    || j                  S )N)mapper)r+   load_weightshf_to_vllm_mapper)rB   r  loaders      rE   r  z-InternS1ForConditionalGeneration.load_weights*  s+    "4((""743I"JJJrF   c                 0    t          j        ddd          S )z<
        Get the module prefix in multimodal models
        rF  rM  rD  )rF  	connectortower_model)r   from_string_fieldr   s    rE   get_mm_mappingz/InternS1ForConditionalGeneration.get_mm_mapping.  s'     /+-&
 
 
 	
rF   )rW  rH   )NN),rL   rM   rN   r,   r  classmethodr   r9   r?  r   r4   r   r   rK  r5   ModulerL  r^  r^   r_   rd  r   rk   rm  ru   rp  r~  rv  r   rz  r|  r'   r  r   r  r"   rK   r  r   setr  r   r  rO   rP   s   @rE   r:  r:    s        &1%<#2,D	
 
   F3 F3 F3: F F F [F BD &
 &
 &
z &
3 &
 &
 &
 &
 &
 &
 &
P
 
 )4/

 
 
 
 
3!1 3bi 3 3 3 3   
EL 
U\ 
 
 
 
#A#A	t	##A #A #A #AJ#A#A	t	##A #A #A #AJ7(+>>7 
u|S 	!7 7 7 76f      & & & & & &% %4H % % % %4 >B

 .2$)
 
 
<
  4d:

 |d*
 "
 

 
 
 
 
 
4 <@-1 < < 2D8	
 |d*  
   *A|A 
	A A A AKHU33D-E$F K3s8 K K K K
 
 
 
 
 
 
 
 
rF   r:  )Xcollections.abcr   r   r   typingr   r   r   regexr  r^   torch.nnr5   transformersr	   r
   r   transformers.activationsr   ;transformers.models.got_ocr2.image_processing_got_ocr2_fastr   6transformers.models.internvl.video_processing_internvlr   vllm.configr   vllm.config.multimodalr   'vllm.model_executor.layers.quantizationr   'vllm.model_executor.models.interns1_vitr   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   r   vllm.multimodal.processingr   r   r   r   r    r!   vllm.sequencer"   !vllm.transformers_utils.processorr#   vllm.utils.tensor_schemar$   r%   
interfacesr'   r(   r)   r*   utilsr+   r,   r-   r.   r  r0   rR   rb   rk   r]   rm   rq   ru   r9   r   r~  r|   ri   r   r   r   r   register_processorr:  r`   rF   rE   <module>r     sk   8 7 7 7 7 7 7 7 7 7 7 0 0 0 0 0 0 0 0 0 0            J J J J J J J J J J + + + + + +           # " " " " " 3 3 3 3 3 3 F F F F F F G G G G G G D D D D D D / / / / / /         
                           . - - - - - P P P P P P > > > > > > > >                          ")   .< < < < <| < < <	X 	X 	X 	X 	X< 	X 	X 	X ":<X!X Y X X X< < < < <| < < <	X 	X 	X 	X 	X< 	X 	X 	X ":<X!X Y X X X000 0 	0
 38_0 0 0 0<<< 
%S/< < < <g, g, g, g, g,/ g, g, g,T,
 ,
 ,
 ,
 ,
!78N!O ,
 ,
 ,
^i
 i
 i
 i
 i
"9:P"Q i
 i
 i
X ('	+  
y
 y
 y
 y
 y
I!:|y
 y
 
y
 y
 y
rF   