
    .`iKW                        U d dl Z d dlmZmZmZ d dlmZmZmZm	Z	m
Z
mZ d dlZd dlmZ d dlmZmZmZmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZmZ d d	lm Z m!Z! d d
l"m#Z# d dl$m%Z% d dl&m'Z'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2m3Z3 d dl4m5Z5m6Z6m7Z7m8Z8 d dl9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z? d dl@mAZA d dlBmCZCmDZD ddlEmFZF ddlGmHZHmIZImJZJ ddlKmLZL ddlMmNZNmOZOmPZPmQZQ ddlRmSZSmTZTmUZU  G d deC          ZV G d deC          ZWeVeWz  ZXe
eYd <    G d! d"e	          ZZ G d# d$ed%&          Z[ G d' d(e          Z\ G d) d*ej]                  Z^ G d+ d,e<          Z_ ed-e_.          Z` G d/ d0e,e`                   Za G d1 d2e;e`                   Zbd3e=d4e_fd5Zcdd6d7e`d8e:e`         d9e0dz  d4e;fd:Zddd;d<d=eZd>e*dz  d?eedz  d@efd4eFeLz  f
dAZg e.jh        edeceaB           G dC dDej]        eIeJ                      ZidS )E    N)IterableMappingSequence)	AnnotatedFinalLiteralProtocol	TypeAliasTypeVar)BatchFeatureCLIPVisionConfigPretrainedConfigSiglipVisionConfig)LlavaConfig)
ImageInputget_image_sizeto_numpy_array)LlavaProcessor)ProcessingKwargsUnpack)PreTokenizedInput	TextInput)
VllmConfig)
get_act_fn)ColumnParallelLinearRowParallelLinear)QuantizationConfig)LlavaDummyInputsBuilder)MULTIMODAL_REGISTRY)BaseMultiModalProcessorCache)MultiModalFieldConfigMultiModalKwargsItems)ImageEmbeddingItemsImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoInputProcessingContextPromptReplacementPromptUpdate)IntermediateTensors)TensorSchemaTensorShape   )CLIPVisionModel)MultiModalEmbeddingsSupportsMultiModal
SupportsPP)SiglipVisionModel)AutoWeightsLoaderget_layer_indexinit_vllm_registered_modelmaybe_prefix)VisionEncoderInfoget_num_selected_vision_tokensget_vision_encoder_infoc                   j    e Zd ZU dZdZed         ed<   eej	         e
dddd          f         ed<   dS )	TarsierImagePixelInputsz
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height
        - w: Width
    pixel_valuestypebn   hwN__name__
__module____qualname____doc__r@   r   __annotations__r   torchTensorr/        v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/tarsier.pyr>   r>   ?   s[           %3D'.
!222EL++dAsC*H*HHIIIIIIrN   r>   c                   h    e Zd ZU dZdZed         ed<   eej	         e
ddd          f         ed<   dS )	TarsierImageEmbeddingInputsz
    Dimensions:
        - bn: Batch size * number of images
        - ifs: Image feature size
        - hs: Hidden size (must match the hidden size of language model
          backbone)
    image_embedsr@   rA   ifshsdataNrE   rM   rN   rO   rQ   rQ   L   sY           %3D'.
!222
EL++dE4"@"@@
AAAAAArN   rQ   TarsierImageInputsc                       e Zd ZU ee         ed<   ee         ed<   ee         ed<   ee         ed<   eeee         z           ed<   ee         ed<   ee         ed<   ee         ed<   d	Z	e
ed
<   dS )TarsierHfConfigvision_configtext_configimage_token_indexvision_feature_select_strategyvision_feature_layerprojector_hidden_actimage_newline_idximage_new_idxTmultimodal_projector_biasN)rF   rG   rH   r   r   rJ   intstrlistra   boolrM   rN   rO   rX   rX   \   s         )****'((((Sz!!!$)#J...d3i0000*$$$Sz!!!:&*t*****rN   rX   c                       e Zd Zddii dZdS )TarsierProcessorKwargspaddingF)text_kwargsimages_kwargsN)rF   rG   rH   	_defaultsrM   rN   rO   rg   rg   h   s+         u
 	 IIIrN   rg   F)totalc            	       f    e Zd Z	 	 	 	 ddedeez  ee         z  ee         z  dee         de	fdZ
dS )TarsierProcessorNimagestextkwargsreturnc                 >   ||t          d           | j        t          fd| j        j        i|}| | j        |fi |d         }ni }t          |t                    r|g}n?t          |t                    s*t          |d         t                    st          d          |}|	                    d          |d         }	t          t          |	d                             \  }
}|
| j        z  || j        z  dz   z  | j        z   dz   }| j        dk    r|dz  }g }|D ]:}|                    | j        | j        |z            }|                    |           ;|d	                             d
d           } | j        |fi |d	         }t'          i |||          S )Nz7You have to specify at least one of `images` or `text`.tokenizer_init_kwargsrj   r   zAInvalid input text. Please provide a string, or a list of stringsr?   r0   defaultri   return_tensors)rU   tensor_type)
ValueError_merge_kwargsrg   	tokenizerinit_kwargsimage_processor
isinstancerc   rd   getr   r   
patch_sizenum_additional_image_tokensr\   replaceimage_tokenappendpopr   )selfro   rp   audiovideosrq   output_kwargsimage_inputsprompt_stringsr?   heightwidthnum_image_tokenssamplerv   text_inputss                   rO   __call__zTarsierProcessor.__call__r   s     >dlVWWW**"
 
"&."<
 
 

 /4/ '8 LL LdC   	6DDD$'' 	
47C0H0H 	S  
 N++7'7L*>,q/+J+JKKMFE4?*u/G!/KL23 
 2i?? A% N . .$d&69I&I  %%f----&}599:JDQQ$dn^TT}]7STT0K0<0n
 
 
 	
rN   )NNNN)rF   rG   rH   r   r   r   rd   r   rg   r   r   rM   rN   rO   rn   rn   q   s         " %):
 :
:
 

y/  
!":
 /0:
 
:
 :
 :
 :
 :
 :
rN   rn   c                   h     e Zd Z	 	 ddedededededz  def fd	Zd
ej	        dej	        fdZ
 xZS )TarsierMultiModalProjectorN vision_hidden_sizetext_hidden_sizer^   ra   quant_configprefixc                     t                                                       t          ||||| d          | _        t	          |          | _        t          ||||| d          | _        d S )Nz	.linear_1)biasr   r   z	.linear_2)super__init__r   linear_1r   actr   linear_2)r   r   r   r^   ra   r   r   	__class__s          rO   r   z#TarsierMultiModalProjector.__init__   s     	,*%'''
 
 
 233)*%'''
 
 
rN   image_featuresrr   c                     |                      |          \  }}|                     |          }|                     |          \  }}|S N)r   r   r   )r   r   hidden_states_s       rO   forwardz"TarsierMultiModalProjector.forward   sE    ==88q//==77qrN   )Nr   )rF   rG   rH   rb   rc   re   r   r   rK   rL   r   __classcell__r   s   @rO   r   r      s         37
 

 
 "	

 $(
 )4/
 
 
 
 
 
 
6el u|        rN   r   c                       e Zd ZdefdZdefdZdedefdZ	de
eedz  f         fdZded	edefd
ZdefdZdefdZdefdZdefdZdS )TarsierProcessingInforr   c                 @    | j                             t                    S r   )ctxget_hf_configHfLlavaConfigr   s    rO   r   z#TarsierProcessingInfo.get_hf_config   s    x%%m444rN   c                 D    t          |                                           S r   )r<   r   r   s    rO   r<   z-TarsierProcessingInfo.get_vision_encoder_info   s    &t'9'9';';<<<rN   rq   c                     |                                  }|                    d|                                            | j        j        t
          fi |S )Nr   )r<   
setdefaultget_patch_sizer   get_hf_processorrn   )r   rq   vision_infos      rO   r   z&TarsierProcessingInfo.get_hf_processor   sS    2244,(B(B(D(DEEE(tx()9DDVDDDrN   Nc                 
    dd iS )NimagerM   r   s    rO   get_supported_mm_limitsz-TarsierProcessingInfo.get_supported_mm_limits   s    rN   image_widthimage_heightc                   |                                  }|                                 }t          |                    ||          |j                  }|dk    r_|                                 }t          |                    |j        |j                  |j                  }|dk    rt          d          |}t          t          j        |                    }||z   dz   }	|	S )Nr   r   r   z4Could not determine a valid number of image patches.r0   )r   r<   r;   get_num_image_tokensr\   !get_image_size_with_most_featuresr   r   rx   rb   mathsqrt)
r   r   r   	hf_configvision_encoder_infonum_projected_patchesdefault_sizenum_projected_patches_defaultnum_height_patchestotal_image_tokens_for_llms
             rO   r   z*TarsierProcessingInfo.get_num_image_tokens   s
    &&((	"::<< >44') 5   4!
 !
 !A%%AACCL,J#88 , 2!-!4 9   8- -) -11 !WXXX$A! +@!A!ABB%:=O%ORS%S"))rN   c                 x    |                                  }|                                x}}t          ||          S )N)r   r   )r<   r   r%   )r   r   r   r   s       rO   r   z7TarsierProcessingInfo.get_image_size_with_most_features  s=    "::<<,;;===uV4444rN   c                 ^    |                                  \  }}|                     ||          S )Nr   )r   r   )r   target_widthtarget_heights      rO   get_max_image_tokensz*TarsierProcessingInfo.get_max_image_tokens  s;    &*&L&L&N&N#m(($& ) 
 
 	
rN   c                 4    |                                  j        S r   )r   r_   r   s    rO   get_image_newline_idxz+TarsierProcessingInfo.get_image_newline_idx  s    !!##55rN   c                 4    |                                  j        S r   )r   r`   r   s    rO   get_image_new_idxz'TarsierProcessingInfo.get_image_new_idx  s    !!##11rN   )rF   rG   rH   rX   r   r:   r<   objectrn   r   r   rc   rb   r   r   r%   r   r   r   r   rM   rN   rO   r   r      s:       5 5 5 5 5=): = = = =E E4D E E E EcDj)A    * * 	*
 
* * * *>59 5 5 5 5

c 
 
 
 
6s 6 6 6 623 2 2 2 2 2 2rN   r   
_I_Tarsier)boundc                       e Zd ZdS )TarsierDummyInputsBuilderN)rF   rG   rH   rM   rN   rO   r   r     s        DrN   r   c            	       v    e Zd Zdedeeef         deeef         fdZde	deeef         de
dee         fdZdS )	TarsierMultiModalProcessor	hf_inputshf_processor_mm_kwargsrr   c                 l    t          t          j        d          t          j        d                    S )Nr   )r?   rR   )dictr!   batched)r   r   r   s      rO   _get_mm_fields_configz0TarsierMultiModalProcessor._get_mm_fields_config  s7    
 .6w??.6w??
 
 
 	
rN   mm_itemsout_mm_kwargsc                       j                                         }|j        dt          f fd}t	          dg|          gS )Nitem_idxc                 r                        dt          t          f          }t          |t                    r?|                    |           }t          t          j        |                    }||z   dz   }n;|                    |           }j	        
                    |j        |j                  }g|z  S )Nr   r0   r   )	get_itemsr#   r$   r}   get_feature_sizerb   r   r   r   infor   r   r   )	r   ro   r   r   num_final_image_tokens
image_sizeimage_token_idr   r   s	         rO   get_replacementzGTarsierMultiModalProcessor._get_prompt_updates.<locals>.get_replacement0  s    ''-/BC F &"566 
(.(?(?(I(I%%(3H)I)I%J%J")>AS)SVW)W&&#228<<
)-)G)G * 0!+!2 *H * *&
 ##&<<<rN   r   )modalitytargetreplacement)r   r   r[   rb   r+   )r   r   r   r   r   r   r   s   ``    @rO   _get_prompt_updatesz.TarsierMultiModalProcessor._get_prompt_updates'  s{     I++--	"4	=c 	= 	= 	= 	= 	= 	= 	= 	=(  &'+  
 	
rN   N)rF   rG   rH   r   r   rc   r   r!   r   r&   r"   r   r,   r   rM   rN   rO   r   r     s        

 !(V 4
 
++	,	
 
 
 
"
%"
 !(V 4"
 -	"

 
,	"
 "
 "
 "
 "
 "
rN   r   r   rr   c                      t          |           S r   )r   )r   s    rO   _build_tarsier_hf_infor   L  s     %%%rN   cacher   dummy_inputsr   c                    t          | t                    rt          | ||          S t          t	          |                     )Nr   )r}   r   r   NotImplementedErrorr@   )r   r   r   s      rO   _build_tarsier_hf_processorr   P  sM     $-.. 
)
 
 
 	

 d4jj
)
))rN   r   )require_post_normr   r   r   r   r   c                   | j         }| j        }|j        t          |t                    rt          |          }nXt          |t          t          f          rt          fd|D                       }n t          dt          |           d          t          |t                    rt          |||||          S t          |t                    rt          |||||          S dt          |           }t          |          )Nc              3   8   K   | ]}t          |          V  d S r   )r7   ).0idxbase_num_hidden_layerss     rO   	<genexpr>z0init_vision_tower_for_tarsier.<locals>.<genexpr>p  s?       (
 (
=@OC!788(
 (
 (
 (
 (
 (
rN   zvision_layer_feature type:  is not supported)r   num_hidden_layers_overrider   r   z'Unsupported vision config for Tarsier: )rY   r]   num_hidden_layersr}   rb   r7   rd   tuplemax	TypeErrorr@   r   r1   r   r5   r   )	r   r   r   r   rY   feature_layersnum_hidden_layers_to_initmsgr   s	           @rO   init_vision_tower_for_tarsierr  _  s`    +M3N*<.#&& 
$32%
 %
!! 
NT5M	2	2 
$' (
 (
 (
 (
DR(
 (
 (
 %
 %
!! Q$~*>*>QQQ
 
 	
 -!122 
%'@/
 
 
 	
 
M#5	6	6 
 %'@/
 
 
 	
 JD4G4G
I
IC
c
"
""rN   )r   r   c                   b    e Zd Zg dddgdZededededz  fd	            Zd
ddededdf fdZ	de
dedz  fdZdeez  dej        eej                 z  dej        eej        df         z  fdZdej        dej        fdZdedej        eej        df         z  fdZdedej        eej        df         z  fdZde
defdZ	 	 d%dej        dej        dedz  dej        dz  de
dej        ez  fd Zd!ej        dej        dz  fd"Zd#eeeej        f                  dee         fd$Z xZ S )&TarsierForConditionalGeneration)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projr   irr   Nc                 N    |                     d          rdS t          d          )Nr   z<image>z Only image modality is supported)
startswithrx   )clsr   r  s      rO   get_placeholder_strz3TarsierForConditionalGeneration.get_placeholder_str  s,    w'' 	9;<<<rN   r   )r   vllm_configr   c                   t                                                       |j        j        }|j        }|| _        |                     |d          5  t          ||dt          |d                    | _	        t          |dd          }t          |j        j        |j        j        |j        ||t          |d                    | _        |                     d	t%          j        |j        gt$          j        
          d           |                     dt%          j        |j        gt$          j        
          d           d d d            n# 1 swxY w Y   |                     |          5  t1          ||j        t          |d                    | _        d d d            n# 1 swxY w Y   | j        j        | _        d S )Nr   Fvision_tower)r   r   r   ra   Tmulti_modal_projector)r   r   r^   ra   r   r   image_newline_idx_tensor)dtype)
persistentimage_new_idx_tensorlanguage_model)r  r   r   )r   r   model_configr   r   config_mark_tower_modelr  r9   r  getattrr   rY   hidden_sizerZ   r^   r  register_bufferrK   tensorr_   longr`   _mark_language_modelr8   r  make_empty_intermediate_tensors)r   r  r   r  r   projector_biasr   s         rO   r   z(TarsierForConditionalGeneration.__init__  sR   "-":"D"/##K99 	 	 =)"'#FN;;	! ! !D %V-H$OON)C#)#7#C!'!3!?%+%@*8)#F,CDD* * *D&   *f67uzJJJ  !   
   &f235:FFF  !   -	 	 	 	 	 	 	 	 	 	 	 	 	 	 	8 &&{33 	 	"<' ,#F,<==	# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ? 	,,,s%   C1EEE/+F&&F*-F*rq   c                     |                     dd           }|                     dd           }||d S |t          d|          S |t          d|          S t          d          )Nr?   rR   )r@   r?   )r@   rU   z This line should be unreachable.)r   r>   rQ   AssertionError)r   rq   r?   rR   s       rO   _parse_and_validate_image_inputz?TarsierForConditionalGeneration._parse_and_validate_image_input  s     zz.$77zz.$77L$84#*#)   
 #.#!   
 ?@@@rN   r  r?   .c                 0     ||| j         j                  S )N)feature_select_strategy)r  r\   )r   r  r?   s      rO   _image_pixels_to_featuresz9TarsierForConditionalGeneration._image_pixels_to_features  s(     |$(K$N
 
 
 	
rN   projected_image_featuresc                    |j         \  }}}t          t          j        |                    }||z  }|j        }| j        j        j        } || j        	                    |                    
                    d          }	 || j        	                    |                    
                    d          }
	 |                    ||||          }n=# t          $ r0}t          d|j          d| d| d| d| d| d| d          |d}~ww xY w|	                    ||d	|f          }t          j        ||gd
          }||z   }|                    |||          }|
                    |d	|f          }t          j        ||gd	          }|S )z@
        Implements Tarsier's `add_split_tokens` logic.
        r   z3Cannot reshape projected_image_features with shape z to (z, z[). Ensure num_projected_patches is compatible with a grid structure. num_projected_patches=z, derived num_height_patches=. Nr0      )dim)shaperb   r   r   devicer  modelembed_tokensr  tosqueezer  viewRuntimeErrorexpandrK   cat)r   r.  
num_imagesr   	embed_dimr   num_width_patchesr4  embedding_layerimage_newline_embimage_new_embcurrent_image_features_grideimage_newline_expandedfeatures_with_newlinesnew_num_patches_after_newlinefeatures_with_newlines_flatimage_new_expandedfinal_image_featuress                      rO   _add_tarsier_split_tokensz9TarsierForConditionalGeneration._add_tarsier_split_tokens  sT    8P7U4
)9 +@!A!ABB15GG)0-3@+O),,V44
 

'!** 	 ((A(D(DV(L(LMMUUVWXX	*B*G*G.0A9+ +''  
	 
	 
	E7=E E!E E%7E E &E E *3E E *?E E /AE E E	 	 	
	 "3!9!9+Q	:"
 "
 "'(*@A"
 "
 "
 )>@R(R%&<&A&A5y'
 '
# +11:q)2LMM$y(*<= 
  
  
 $#s   7C 
D
+DD
inputsc                    |d         }|                      | j        |          }t          |t          j                  r,|                     |          }|                     |          }|S t          dt          |           d          )Nr?   z _image_pixels_to_features type: r   )	r-  r  r}   rK   rL   r  rK  r  r@   )r   rL  r?   image_features_selectedprojected_featuresfinal_featuress         rO   _process_image_pixelsz5TarsierForConditionalGeneration._process_image_pixels"  s     n-"&"@"@|#
 #
 -u|<< 	!%!;!;<S!T!T!;;<NOON!!E011E E E  rN   image_inputc                     |d         dk    rW|d         }t          |t          j                  r|                     |          S t	          dt          |           d          |                     |          S )Nr@   rR   rU   z*Incorrect type of image_embeds. Got type: r0  )r}   rK   rL   rK  rx   r@   rQ  )r   rR  rO  s      rO   _process_image_inputz4TarsierForConditionalGeneration._process_image_input4  s     v.00!,V!4,el;; 556HIII >!%&8!9!9> > >  
 ))+666rN   c                 N     | j         di |}|g S |                     |          S )NrM   )r*  rT  )r   rq   rR  s      rO   embed_multimodalz0TarsierForConditionalGeneration.embed_multimodalD  s9    :d:DDVDDI((555rN   	input_ids	positionsintermediate_tensorsinputs_embedsc                 J    |d }| j                             ||||          }|S )N)rW  rX  rY  rZ  )r  r5  )r   rW  rX  rY  rZ  rq   r   s          rO   r   z'TarsierForConditionalGeneration.forwardJ  s@      + M+11!5'	 2 
 
 rN   r   c                 6    | j                             |          S r   )r  compute_logits)r   r   s     rO   r]  z.TarsierForConditionalGeneration.compute_logits]  s     "11-@@@rN   weightsc                 J    t          |           }|                    |          S r   )r6   load_weights)r   r^  loaders      rO   r`  z,TarsierForConditionalGeneration.load_weightsc  s#    "4((""7+++rN   )NN)!rF   rG   rH   packed_modules_mappingclassmethodrc   rb   r  r   r   r   rV   r*  r1   r5   rK   rL   rd   r   r-  rK  r>   rQ  rT  r2   rV  r-   r   r]  r   setr`  r   r   s   @rO   r  r    s        322$i0 
 =3 =3 =3: = = = [= BD .
 .
 .
z .
3 .
 .
 .
 .
 .
 .
 .
`AA	d	"A A A A.	
%(99	
 lT%,%77	
 
elC/0	0		
 	
 	
 	
/$(-/$	/$ /$ /$ /$b' 
elC/0	0   $7'7 
elC/0	07 7 7 7 6 64H 6 6 6 6 <@-1 < < 2D8	
 |d*  
+	+   &A|A 
	A A A A,HU33D-E$F ,3s8 , , , , , , , ,rN   r  )jr   collections.abcr   r   r   typingr   r   r   r	   r
   r   rK   torch.nnnntransformersr   r   r   r   r   r   transformers.image_utilsr   r   r   transformers.models.llavar   transformers.processing_utilsr   r   $transformers.tokenization_utils_baser   r   vllm.configr   %vllm.model_executor.layers.activationr   !vllm.model_executor.layers.linearr   r   'vllm.model_executor.layers.quantizationr    vllm.model_executor.models.llavar   vllm.multimodalr   vllm.multimodal.cacher    vllm.multimodal.inputsr!   r"   vllm.multimodal.parser#   r$   r%   r&   vllm.multimodal.processingr'   r(   r)   r*   r+   r,   vllm.sequencer-   vllm.utils.tensor_schemar.   r/   clipr1   
interfacesr2   r3   r4   siglipr5   utilsr6   r7   r8   r9   visionr:   r;   r<   r>   rQ   rV   rJ   rX   rg   rn   Moduler   r   r   r   r   r   r   re   rc   r  register_processorr  rM   rN   rO   <module>r     sW    7 7 7 7 7 7 7 7 7 7 J J J J J J J J J J J J J J J J                   6 5 5 5 5 5 O O O O O O O O O O 4 4 4 4 4 4 B B B B B B B B M M M M M M M M " " " " " " < < < < < < U U U U U U U U F F F F F F D D D D D D / / / / / / > > > > > > O O O O O O O O                           . - - - - - > > > > > > > > ! ! ! ! ! ! L L L L L L L L L L % % % % % %                    
J 
J 
J 
J 
Jl 
J 
J 
J
B 
B 
B 
B 
B, 
B 
B 
B !8:U U I U U U	+ 	+ 	+ 	+ 	+h 	+ 	+ 	+    -U    ;
 ;
 ;
 ;
 ;
~ ;
 ;
 ;
|               F@2 @2 @2 @2 @2. @2 @2 @2F W\)>???
	 	 	 	 	 7
 C 	 	 	-
 -
 -
 -
 -
!8!D -
 -
 -
`& 6 &;P & & & & 26	* * *
*(4* ($.	*
 * * * *& &*+# +# +#+#$t++# d{	+#
 +# ((+# +# +# +#\ ('	*  
S, S, S, S, S,bi1CZ S, S, 
S, S, S,rN   