
    .`i]e              	          U d Z ddlZddlZddlZddlmZmZmZmZm	Z	 ddl
mZmZ ddlmZmZmZ ddlZddlZddlmZ ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8m9Z9m:Z: ddl;m<Z< ddl=m>Z>m?Z?m@Z@mAZAmBZBmCZC ddlDmEZE ddlFmGZGmHZH ddlImJZJmKZKmLZLmMZM ddlNmOZOmPZPmQZQ  G d deG          ZR G d d eG          ZSeReSz  ZTeeUd!<    G d" d#ejV                  ZW G d$ d%ejV                  ZX G d& d'ejV                  ZY G d( d)ejV                  ZZ G d* d+ejV                  Z[ G d, d-eQ          Z\ ed.          d/ed0efd1            Z] G d2 d3          Z^ G d4 d5e@          Z_ G d6 d7e>e_                   Z` G d8 d9e?e_                   Za e6jb        eae_e`:           G d; d<eOeMeKeL                      ZcdS )=zAInference-only Qwen-VL model compatible with HuggingFace weights.    N)Callable
CollectionMappingSequenceSet)	lru_cachepartial)	AnnotatedLiteral	TypeAlias)nn)
transforms)InterpolationMode)BatchFeaturePretrainedConfigPreTrainedTokenizer
TensorType)
ImageInput)	TextInput)
VllmConfig)BaseDummyOptions)
get_act_fn)Conv2dLayer)ColumnParallelLinearReplicatedLinearRowParallelLinear)QuantizationConfig)
Resampler2get_abs_pos)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)MultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)QWenBaseModel	QWenBlock	QWenModelc                   j    e Zd ZU dZdZed         ed<   eej	         e
dddd          f         ed<   d	S )
QwenImagePixelInputsaj  
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height
        - w: Width

    Note that image_size is the value in the vision config to which we resize
    the image to in the normalization transform. Currently multi-image support
    can only be leveraged by passing image embeddings directly.
    pixel_valuestypebn   hwdataN__name__
__module____qualname____doc__r:   r   __annotations__r
   torchTensorr.        v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/qwen_vl.pyr8   r8   @   s[         
 
 %3D'.
!222
EL++dAsC"@"@@
AAAAAArI   r8   c                   h    e Zd ZU dZdZed         ed<   eej	         e
ddd          f         ed<   dS )	QwenImageEmbeddingInputsa  
    Dimensions:
        - bn: Batch size * number of images
        - ifs: Image feature size (256)
        - hs: Hidden size

    `hidden_size` must match the hidden size of the language model backbone
    and is stored in the visual config of the model if we have one.
    image_embedsr:   r;      hsr?   Nr@   rH   rI   rJ   rL   rL   Q   sY           %3D'.
!222
EL++dC">">>
??????rI   rL   QwenImageInputsc                        e Zd ZdZ	 	 	 	 ddededededz  d	edz  d
ef fdZ	 ddej	        dej	        dz  dej	        fdZ
 xZS )VisualAttentionzself-attention layer class.
    Self-attention layer takes input with size [s, b, h]
    and returns output of the same size.
    TN 	embed_dim	num_headsbiaskdimvdimprefixc                    t                                                       || _        ||n|| _        ||n|| _        | j        |k    o
| j        |k    | _        || _        ||z  dk    sJ ||z  | _        || _        || _	        | j        s
J d            t          |d|z  | d          | _        t          ||| d          | _        t          j        | j                  | _        d S )Nr   z<Visual Attention implementation only supports self-attentionr<   z.in_projrY   z	.out_proj)super__init__rT   rW   rX   _qkv_same_embed_dimrU   hidden_size_per_attention_head!num_attention_heads_per_partitionhidden_size_per_partitionr   in_projout_projmathsqrtnorm_factor)selfrT   rU   rV   rW   rX   rY   	__class__s          rJ   r]   zVisualAttention.__init__i   s,    	" ,DD)	 ,DD)	#'9	#9#Tdi9>T " 9$)))).79.D+1:.)2& ' 	
 	
J	
 	
' (q9}-@-@-@
 
 
 )yF)=)=)=
 
 
  9T%HIIrI   x	attn_maskreturnc                    |                                 \  }}}|                     |          \  }}|                                 d d         | j        d| j        z  fz   } |j        | }|                    | j        d          \  }}	}
|                    ||| j        z  | j                                      dd          }|	                    ||| j        z  | j                                      dd          }	|| j        z  }|+t          j	        |||	                    dd                    }n)t          j
        ||	                    dd                    }|                    d          }|
                    ||| j        z  | j                                      dd          }
t          j
        ||
          }|                    || j        || j                  }|                    dddd                                          }|                                 d d         | j        fz   } |j        | }|                     |          \  }}|S )Nr<   )dimr   r/      )sizerb   r`   r_   viewsplit	transposerf   rF   baddbmmbmmsoftmaxpermute
contiguousra   rc   )rg   ri   rj   sqb_mixed_x_layernew_tensor_shapequery_layer	key_layervalue_layerq_scaledattention_probscontext_layernew_context_layer_shapeoutputs                   rJ   forwardzVisualAttention.forward   s    6688Aq<<??q )--//42338
 
 +*,<= /<.A.A/R /B /
 /
+Y
 "&&66/
 
 )Aq//	 	 NN66/
 
 )Aq//	 	 !11 #m8Y%8%8R%@%@ OO $i)2E2Eb"2M2MNNO)11b199!&&66/
 
 )Aq//	 	 	/;?? &**2/	
 
 &--aAq99DDFF #0"4"4"6"6ss";*?
 #
 +*,CDMM-00	rI   )TNNrS   N)rA   rB   rC   rD   intboolstrr]   rF   rG   r   __classcell__rh   s   @rJ   rR   rR   c   s          !J !J!J !J 	!J
 Dj!J Dj!J !J !J !J !J !J !JL *.G G<G <$&G 
	G G G G G G G GrI   rR   c            	       F     e Zd ZdZ	 	 d
dedededz  def fdZd	 Z xZ	S )	QwenVLMLPz/MLP for the visual component of the Qwen model.NrS   hidden_sizeintermediate_sizequant_configrY   c                     t                                                       t          ||d|| d          | _        t	          d          | _        t          ||d|| d          | _        d S )NTz.c_fc)rV   r   rY   geluz.c_proj)r\   r]   r   c_fcr   act_fnr   c_proj)rg   r   r   r   rY   rh   s        rJ   r]   zQwenVLMLP.__init__   s     	(%###
 
 
	 !(('%%%%
 
 
rI   c                     |                      |          \  }}|                     |          }|                     |          \  }}|S r   )r   r   r   )rg   ri   r|   s      rJ   r   zQwenVLMLP.forward   s<    yy||1KKNN{{1~~1rI   )NrS   )
rA   rB   rC   rD   r   r   r   r]   r   r   r   s   @rJ   r   r      s        99 37
 

 
 )4/	

 
 
 
 
 
 
0      rI   r   c                        e Zd Zdej        ddfdedededeegej        f         de	dz  d	e
f fd
Z	 ddej        dej        dz  dej        fdZ	 ddej        dej        dz  dej        fdZ xZS )VisualAttentionBlock      @NrS   d_modeln_head	mlp_ratio
norm_layerr   rY   c                    t                                                        ||          | _         ||          | _        t	          ||z            }t          ||| d          | _        t          |||| d          | _        d S )Nz.attnr[   z.mlp)r   r   r   rY   )	r\   r]   ln_1ln_2r   rR   attnr   mlp)	rg   r   r   r   r   r   rY   	mlp_widthrh   s	           rJ   r]   zVisualAttentionBlock.__init__   s     	Jw''	Jw''	)+,,	#GVv<L<L<LMMM	'%???	
 
 
rI   ri   rj   rk   c                 l    ||                     |j                  nd }|                     ||          S N)rj   )todtyper   rg   ri   rj   s      rJ   	attentionzVisualAttentionBlock.attention  s7    
 .7-BILL)))	yyiy000rI   c                     ||                      |                     |          |          z   }||                     |                     |                    z   }|S r   )r   r   r   r   r   s      rJ   r   zVisualAttentionBlock.forward  sM    
 tyy||yAAA1&&&rI   r   )rA   rB   rC   r   	LayerNormr   floatr   Moduler   r   r]   rF   rG   r   r   r   r   s   @rJ   r   r      s)       
 1326
 

 
 	

 cUBI-.
 )4/
 
 
 
 
 
 
2 *.1 1<1 <$&1 
	1 1 1 1 *. < <$& 
	       rI   r   c                        e Zd Zdej        ddfdededededeegej        f         d	e	dz  d
e
f fdZdej        fdZdej        fdZ	 ddej        dej        dz  dej        fdZ xZS )TransformerBlockr   NrS   widthlayersheadsr   r   r   rY   c                     t                                                       | _        || _        t	          j        fdt          |          D                       | _        d S )Nc                 D    g | ]}t           d |           S )z.resblocks.r   r   rY   )r   ).0ir   r   r   rY   r   r   s     rJ   
<listcomp>z-TransformerBlock.__init__.<locals>.<listcomp>1  sZ     
 
 
  %)!-$4444  
 
 
rI   )r\   r]   r   r   r   
ModuleListrange	resblocks)	rg   r   r   r   r   r   r   rY   rh   s	    ` `````rJ   r]   zTransformerBlock.__init__"  s     	

 
 
 
 
 
 
 
 
 v
 
 

 
rI   rk   c                 D    | j         d         j        j        j        j        S Nr   )r   r   r   weightr   rg   s    rJ   get_cast_dtypezTransformerBlock.get_cast_dtype>  s    ~a $)066rI   c                 D    | j         d         j        j        j        j        S r   )r   r   r   r   devicer   s    rJ   get_cast_devicez TransformerBlock.get_cast_deviceA  s    ~a $)077rI   ri   rj   c                 4    | j         D ]} |||          }|S r   )r   )rg   ri   rj   rs       rJ   r   zTransformerBlock.forwardD  s0      	* 	*A!y)))AArI   r   )rA   rB   rC   r   r   r   r   r   r   r   r   r]   rF   r   r   r   r   rG   r   r   r   s   @rJ   r   r   !  s&        1326
 

 
 	

 
 cUBI-.
 )4/
 
 
 
 
 
 
87 7 7 7 78 8 8 8 8 AE */,*=	       rI   r   c                        e Zd Z	 	 	 	 	 ddededed	ed
ededededededz  def fdZdej	        dej	        fdZ
 xZS )VisionTransformerrN      1Q NrS   
image_size
patch_sizer   r   r   r   	n_queries
output_dimimage_start_idr   rY   c                 f   t                                                       ||fx\  }}| _        ||fx\  }}| _        ||z  ||z  f| _        || _        t          d|||d          | _        |dz  }t          j	        |t          j        d|          z            | _        t          t          j        d          } ||          | _        t!          ||||||
| d	          | _        t%          t'          t)          j        |                    ||d
z  ||dd| d                              | j        j        | j        j                  | _         ||          | _        t          j	        |dz  t          j        ||          z            | _        |	| _        |	dz   | _        |	dz   | _        d S )Nr<   F)in_channelsout_channelskernel_sizestriderV   g      rN   gư>)epsz.transformerr      z
.attn_pool)	grid_sizerT   rU   kv_dimr   adaptivedo_post_projectionrY   )r   r   r/   rp   )r\   r]   r   r   r   r   r   conv1r   	ParameterrF   randnpositional_embeddingr	   r   ln_prer   transformerr   r   rd   re   r   r   r   	attn_poolln_postprojr   image_end_idimage_pad_id)rg   r   r   r   r   r   r   r   r   r   r   rY   kwargsimage_heightimage_widthpatch_heightpatch_widthscaler   rh   s                      rJ   r]   zVisionTransformer.__init__M  s    	7A:6NN!kDO7A:6NN!kDO&,6{8RS$ "
 
 

 t$&LS%9P9P1P$Q$Q!R\t444
 j''+!%***
 
 
 $$)I..//  C'!$(((	
 	
 	
 ",3+1  
 
 	 "z*--LZ!D!DD
 
	 -*Q.*Q.rI   ri   rk   c                    |                     | j                                        | j                                                  }|                     |          }|                    |j        d         |j        d         d          }|                    ddd          }|t          | j	        t          t          j        |                    d                                        z   }|                     |          }|                    ddd          }|                     |          }|                    ddd          }|                     |          }|                     |          }|| j        z  }|S )N)r   r   r   r/   rm   rp   )r   r   r   r   r   reshapeshaperx   r   r   r   rd   re   rq   r   r   r   r   )rg   ri   s     rJ   r   zVisionTransformer.forward  s4   DD"1133#3355  
 
 JJqMMIIagaj!'!*b11IIaAD5s49QVVAYY;O;O7P7PQQQKKNNIIaAQIIaANN1LLOO	MrI   )rN   r   r   NrS   )rA   rB   rC   r   r   r   r   r]   rF   rG   r   r   r   s   @rJ   r   r   L  s         $26B/ B/B/ B/ 	B/
 B/ B/ B/ B/ B/ B/ )4/B/ B/ B/ B/ B/ B/ B/H %,        rI   r   c                   .     e Zd Zdddedef fdZ xZS )QwenVLModelrS   r[   vllm_configrY   c                    t                                          ||           |j        j        }|j        }t          di |j        || dd| _        d S )N)r   rY   z.visual)r   rY   rH   )r\   r]   model_config	hf_configr   r   visual)rg   r   rY   configr   rh   s        rJ   r]   zQwenVLModel.__init__  st    [@@@)3"/' 
 
m
*6&?Q?Q?Q
 
 
 
rI   )rA   rB   rC   r   r   r]   r   r   s   @rJ   r   r     sX        AC 
 
 
z 
3 
 
 
 
 
 
 
 
 
 
rI   r   )maxsize	tokenizerrk   c                     t          j        |           } G d d| j                  }| j        j         d|_        ||_        |S )a>  
    The logic of adding image pad tokens should only be applied in
    [`QwenVLProcessor`][vllm.model_executor.models.qwen_vl.QwenVLProcessor],
    so they are patched out here.

    The definition of the wrapped tokenizer can be found here:
    https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py
    c                       e Zd Z	 	 ddedee         ez  dee         ez  deeez           fdZ	 	 dd
e	ee	         z  de
ded	z  defdZd	S )B_get_tokenizer_without_image_pad.<locals>.TokenizerWithoutImagePadallrH   textallowed_specialdisallowed_specialrk   c                      t          j        d|          } fd j                            |||          D             S )NNFCc                 *    g | ]}j         |         S rH   )decoder)r   trg   s     rJ   r   z__get_tokenizer_without_image_pad.<locals>.TokenizerWithoutImagePad.tokenize.<locals>.<listcomp>  s/        Q  rI   )r   r   )unicodedata	normalizer   encode)rg   r   r   r   r   s   `    rJ   tokenizezK_get_tokenizer_without_image_pad.<locals>.TokenizerWithoutImagePad.tokenize  sb     (55D   ..$3'9 /     rI   FN	token_idsskip_special_tokenserrorsc                 x    t          |t                    r|g}| j                            ||p| j                  S )N)r  )
isinstancer   r   decoder  )rg   r
  r  r  r   s        rJ   _decodezJ_get_tokenizer_without_image_pad.<locals>.TokenizerWithoutImagePad._decode  sI     )S)) (&K	>((, )   rI   )r   rH   )FN)rA   rB   rC   r   r   r   listbytesr	  r   r   r  rH   rI   rJ   TokenizerWithoutImagePadr     s         /48:		 		 !X^	 !+3# 5		 %#+	 	 	 	* ).!%		 	T#Y	 "&	 $J		 	 	 	 	 	 	rI   r  WithoutImagePad)copydeepcopyrh   rA   )r   new_tokenizerr  s      rJ    _get_tokenizer_without_image_padr    si     M),,M         9#6      D ,5+>+G(X(X(X%6MrI   c            	            e Zd ZdZdededdf fdZedefd            Z	edefd            Z
edefd	            Z	 	 	 dd
eee         z  dz  deee         z  dz  deez  dz  defdZ xZS )QwenVLProcessorac  
    This model doesn't define its own HF processor,
    so we implement our own one here.

    We call the wrapped tokenizer to automatically insert image pad tokens:
    https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py#L245

    The image processor is defined here:
    https://huggingface.co/Qwen/Qwen-VL/blob/main/visual.py#L354
    r   r   rk   Nc                 D   t                                                       || _        || _        |j        }|d         }t          j        t          j        ||ft          j	                  t          j
                    t          j        dd          g          | _        d S )Nr   )interpolation)g3<4'?gwgM?gy{ ?)gB91?gwt.?g	U?)meanstd)r\   r]   r   r   r   r   ComposeResizer   BICUBICToTensor	Normalizeimage_transform)rg   r   r   vision_configr   rh   s        rJ   r]   zQwenVLProcessor.__init__  s    
 	""<0
)1!,"3";   #%%$<<  
 
  
rI   c                     | j         j        S r   )r   image_start_tagr   s    rJ   r'  zQwenVLProcessor.image_start_tag  s    ~--rI   c                     | j         j        S r   )r   image_end_tagr   s    rJ   r)  zQwenVLProcessor.image_end_tag      ~++rI   c                     | j         j        S r   )r   image_pad_tagr   s    rJ   r,  zQwenVLProcessor.image_pad_tag  r*  rI   r   imagesreturn_tensorsc                 <    |g }t          |t                    s|g}|g }t          |t                    s|g}                     |          }t          |          dk    ri }n$ fd|D             }dt	          j        |          i}t          i |||          S )Nr   c                 :    g | ]}                     |          S rH   )r$  )r   imagerg   s     rJ   r   z,QwenVLProcessor.__call__.<locals>.<listcomp>4  s'    LLLED0077LLLrI   r9   )tensor_type)r  r  r   lenrF   stackr   )rg   r   r-  r.  text_inputsimage_inputsr9   s   `      rJ   __call__zQwenVLProcessor.__call__   s     <D$%% 	6D>F&$'' 	XFnnT**v;;!LLLLLLVLLLL*EK,E,EFL '
 
 
 	
rI   )NNN)rA   rB   rC   rD   r   r   r]   propertyr   r'  r)  r,  r   r  r   r   r   r7  r   r   s   @rJ   r  r    sI       	 	
 
 '
 
	
 
 
 
 
 
6 . . . . X. ,s , , , X, ,s , , , X,
 487;26	
 
$y/)D0
 T*--4
 j(4/	

 

 
 
 
 
 
 
 
rI   r  c                   X    e Zd ZdefdZdedefdZdee	e
dz  f         fdZde
fdZdS )QwenVLProcessingInfork   c                     | j                                         }t          |t                    sJ t	          |          S r   )ctxget_tokenizerr  r   r  )rg   r   s     rJ   r=  z"QwenVLProcessingInfo.get_tokenizerA  s:    H**,,	)%899999/	:::rI   r   c                      | j         j        t          f|                                 |                                 d|S )N)r   r   )r<  init_processorr  get_hf_configr=  )rg   r   s     rJ   get_hf_processorz%QwenVLProcessingInfo.get_hf_processorG  sP    &tx&
%%''((**
 
 	
 
 	
rI   Nc                 
    dd iS )Nr1  rH   r   s    rJ   get_supported_mm_limitsz,QwenVLProcessingInfo.get_supported_mm_limitsO  s    rI   c                 r    |                                  }|j        }|d         }|d         }||z  dz  }||z  S )Nr   r   rp   )r@  r   )rg   r   r%  r   r   grid_lengths         rJ   get_num_image_tokensz)QwenVLProcessingInfo.get_num_image_tokensR  sI    &&((	!("<0
"<0
 J.!3[((rI   )rA   rB   rC   r   r=  objectr  rA  r   r   r   rC  rF  rH   rI   rJ   r:  r:  @  s        ;2 ; ; ; ;
 
O 
 
 
 
cDj)A    )c ) ) ) ) ) )rI   r:  c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	QwenVLDummyInputsBuilder	mm_countsrk   c                     |                     dd          }| j                                        }|j        |j        d                    fdt          d|dz             D                       S )Nr1  r   rS   c              3   .   K   | ]}d | d  dV  dS )Picture z: 
NrH   )r   r   img_end	img_starts     rJ   	<genexpr>z:QwenVLDummyInputsBuilder.get_dummy_text.<locals>.<genexpr>d  sM       
 
782q22I2w222
 
 
 
 
 
rI   r/   )getinforA  r'  r)  joinr   )rg   rJ  
num_imageshf_processorrO  rP  s       @@rJ   get_dummy_textz'QwenVLDummyInputsBuilder.get_dummy_text]  s    ]]7A..
y1133 0	,ww 
 
 
 
 
<A!ZRS^<T<T
 
 
 
 
 	
rI   Nseq_len
mm_optionsc                     | j                                         }|j        }|d         x}}|                    dd          }|r|                    d          nd }	d|                     ||||	          iS )Nr   r1  r   )r   heightrU  	overrides)rS  r@  r   rR  _get_dummy_images)
rg   rX  rJ  rY  r   r%  target_widthtarget_heightrU  image_overridess
             rJ   get_dummy_mm_dataz*QwenVLDummyInputsBuilder.get_dummy_mm_datah  s     I++--	!('4\'BB}]]7A..
5?I*..111T T++"$%)	 ,  
 	
rI   r   )
rA   rB   rC   r   r   r   rW  r   r"   ra  rH   rI   rJ   rI  rI  \  s        	
S(9 	
c 	
 	
 	
 	
 =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
rI   rI  c            
           e Zd Zdedeeef         deeef         deeef         def
 fdZdeded	eeef         d
eeef         de	f
dZ
ded	eeef         deeef         fdZded	eeef         dedee         fdZ xZS )QwenVLMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrk   c                 
   t          j        dd|          \  }}|                    d          }|.t          |t                    sJ t          |          }||k    sJ t                                          ||||          S )Nz"(Picture \d*: <img>).*?(<\/img>\n)z\1\2r-  )rd  re  rf  rg  )resubnrR  r  r  r3  r\   _call_hf_processor)	rg   rd  re  rf  rg  num_matched_images
image_datarU  rh   s	           rJ   rk  z,QwenVLMultiModalProcessor._call_hf_processor  s     &(W1&
 &
"" [[**
!j$/////ZJ%3333ww))!	 * 
 
 	
rI   prompt_textmm_itemshf_processor_mm_kwargstokenization_kwargsc                     dS )NFrH   )rg   rn  ro  rp  rq  s        rJ   _hf_processor_applies_updatesz7QwenVLMultiModalProcessor._hf_processor_applies_updates  s	     urI   	hf_inputsc                 l    t          t          j        d          t          j        d                    S )Nr1  )r9   rM   )dictr#   batched)rg   rt  rp  s      rJ   _get_mm_fields_configz/QwenVLMultiModalProcessor._get_mm_fields_config  s7    
 .6w??.6w??
 
 
 	
rI   out_mm_kwargsc                 b   | j                                         }|j        }| j                                         }||j                 }||j                 }||j                 }	| j                                         }
|	g|
z  }t          d||gt          j
        |g|z   |gz   |	                    gS )Nr1  )embed_token_id)modalitytargetreplacement)rS  r=  special_tokensrA  r'  r)  r,  rF  r)   r+   select_token_id)rg   ro  rp  ry  r   r  	processorimg_start_id
img_end_id
img_pad_idnum_image_tokensimage_tokenss               rJ   _get_prompt_updatesz-QwenVLMultiModalProcessor._get_prompt_updates  s     I++--	)2)AI..00	%i&?@#I$;<
#I$;<
999;;"|&66  $j1/?!N\1ZL@#-    	
 		
rI   )rA   rB   rC   r   r   rG  r   rk  r%   r   rs  r#   rx  r$   r   r*   r  r   r   s   @rJ   rc  rc    sj       

 f%
 3;'	

 CK(
 

 
 
 
 
 
: & !(V 4	
 %S&[1 
   

 !(V 4
 
++	,	
 
 
 

%
 !(V 4
 -	

 
,	
 
 
 
 
 
 
 
rI   rc  )rS  dummy_inputsc                   8    e Zd ZdgddgdZej        ZdefdZede	de
de	d	z  fd
            Zdeddede	dee         dd	f fdZdeded	z  fdZdedej        fdZdedefdZ	 	 ddej        dej        ded	z  dej        d	z  dedej        ez  fdZ xZS )QwenVLForConditionalGenerationc_attnw2w1)r  gate_up_projrk   c                 0    t          j        ddd          S )z<
        Get the module prefix in multimodal models
        ztransformer.hztransformer.visual.attn_poolztransformer.visual.transformer)language_model	connectortower_model)r    from_string_fieldr   s    rJ   get_mm_mappingz-QwenVLForConditionalGeneration.get_mm_mapping  s'     /*48
 
 
 	
rI   r|  r   Nc                 V    |                     d          rd| dS t          d          )Nr1  rM  z: <img></img>z Only image modality is supported)
startswith
ValueError)clsr|  r   s      rJ   get_placeholder_strz2QwenVLForConditionalGeneration.get_placeholder_str  s8    w'' 	/.a....;<<<rI   rS   )rY   transformer_typer   rY   r  c                    |                      |t          dt          i          5  t                                          |||           d d d            n# 1 swxY w Y   |  d S )Nr1  )language_targetstower_targets)r   rY   r  )_mark_composite_modelr5   r   r\   r]   )rg   r   rY   r  rh   s       rJ   r]   z'QwenVLForConditionalGeneration.__init__  s     ''&"$56 ( 
 
 		 		
 GG'!1    		 		 		 		 		 		 		 		 		 		 		 		 		 		 		 	%%%s   %AAAr   c                     |                     dd           }|                     dd           }|+| j        j        d         x}}||d}t          d||          S |t	          d|          S d S )Nr9   rM   r   )r=   r>   )r:   r?   resolve_bindings)r:   r?   )popr   r   r8   rL   )rg   r   r9   rM   
expected_h
expected_wr  s          rJ   _parse_and_validate_image_inputz>QwenVLForConditionalGeneration._parse_and_validate_image_input  s     zz.$77zz.$77#&*k&8&FFJ%/jAA'#!!1    #+#!   
 trI   image_inputc                 j    |d         dk    r|d         S | j                             |d                   S )Nr:   rM   r?   )r   r   )rg   r  s     rJ   _process_image_inputz3QwenVLForConditionalGeneration._process_image_input  s9    v.00v&&&&{6':;;;rI   c                 R     | j         di |}|g S |                     |          }|S )NrH   )r  r  )rg   r   r  vision_embeddingss       rJ   embed_multimodalz/QwenVLForConditionalGeneration.embed_multimodal#  s?    :d:DDVDDI 55kBB  rI   	input_ids	positionsintermediate_tensorsinputs_embedsc                 >    |d }|                      ||||          }|S r   )r   )rg   r  r  r  r  r   hidden_statess          rJ   r   z&QwenVLForConditionalGeneration.forward+  s6      + M((y"6
 
 rI   )NN)rA   rB   rC   packed_modules_mappingr2   embed_input_idsr    r  classmethodr   r   r  r   r   r:   r]   rG  rP   r  rF   rG   r  r0   r  r,   r   r   r   s   @rJ   r  r    s        *
  )8O
 
 
 
 
 =3 =3 =3: = = = [= .9& & &  & 	&
 {+& 
& & & & & &(	4	   0< <EL < < < <! !4H ! ! ! ! <@-1 < < 2D8	
 |d*  
+	+       rI   r  )drD   r  rd   r  collections.abcr   r   r   r   r   	functoolsr   r	   typingr
   r   r   regexri  rF   r   torchvisionr   torchvision.transformsr   transformersr   r   r   r   transformers.image_utilsr   $transformers.tokenization_utils_baser   vllm.configr   vllm.config.multimodalr   %vllm.model_executor.layers.activationr   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   $vllm.model_executor.layers.resamplerr   r   )vllm.model_executor.models.module_mappingr    vllm.multimodalr!   vllm.multimodal.inputsr"   r#   r$   vllm.multimodal.parser%   vllm.multimodal.processingr&   r'   r(   r)   r*   r+   vllm.sequencer,   vllm.utils.tensor_schemar-   r.   
interfacesr0   r1   r2   r3   qwenr4   r5   r6   r8   rL   rP   rE   r   rR   r   r   r   r   r   r  r  r:  rI  rc  register_processorr  rH   rI   rJ   <module>r     s   H G G       H H H H H H H H H H H H H H ( ( ( ( ( ( ( ( 0 0 0 0 0 0 0 0 0 0            " " " " " " 4 4 4 4 4 4 X X X X X X X X X X X X / / / / / / : : : : : : " " " " " " 3 3 3 3 3 3 < < < < < < 7 7 7 7 7 7         
 G F F F F F H H H H H H H H D D D D D D / / / / / /         
 6 5 5 5 5 5                . - - - - - > > > > > > > >            6 5 5 5 5 5 5 5 5 5B B B B B< B B B"@ @ @ @ @| @ @ @ 24LL L L Lp p p p pbi p p pf    	   D& & & & &29 & & &R( ( ( ( (ry ( ( (V\ \ \ \ \	 \ \ \~	
 	
 	
 	
 	
) 	
 	
 	
 12"22 2 2 2jP
 P
 P
 P
 P
 P
 P
 P
f) ) ) ) )- ) ) )8!
 !
 !
 !
 !
56JK !
 !
 !
HK
 K
 K
 K
 K
 78L M K
 K
 K
\ ('	)  
f f f f f:|-?f f 
f f frI   