
    .`ij              	       F   U d dl mZmZmZ d dlmZ d dlmZmZm	Z	 d dl
Zd dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z+ d dl,m-Z-m.Z. d dl/m0Z0m1Z1m2Z2m3Z3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z:m;Z; ddl<m=Z=m>Z>m?Z? ddl@mAZA ddlBmCZCmDZDmEZE  e%eF          ZG G d de:          ZHeHZIe	eJd<   ded eKd!eKd"ejL        dz  fd#ZMd$eKd%eKd&eKd"ejN        fd'ZO G d( d)ejP                  ZQ G d* d+ejP                  ZR G d, d-eA          ZS G d. d/e2          ZT G d0 d1e0eT                   ZU G d2 d3e1eT                   ZV e'jW        eVeTeU4           G d5 d6ejP        e>e?                      ZXdS )7    )IterableMappingSequence)partial)	AnnotatedLiteral	TypeAliasN)	rearrange)Image)LayerNorm2d)resample_abs_pos_embed)RegStage)nn)BatchFeature)BaseModelOutput)Qwen2VLVisionConfig)
VllmConfig)BaseDummyOptions)init_logger)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)resolve_obj_by_qualname)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModal
SupportsPP)Qwen2VisionTransformer)AutoWeightsLoaderinit_vllm_registered_modelmaybe_prefixc                       e Zd ZU dZed         ed<   eej         e	dd          f         ed<   eej         e	dd          f         ed<   d	S )
KananaVImagePixelInputsz
    Dimensions:
        - np: The total number of patches over all images in the batch
        - cps: Number of channels * patch_size * patch_size
        - ni: Number of images
    pixel_valuestypenpcpsni   vision_grid_thwN)
__name__
__module____qualname____doc__r   __annotations__r   torchTensorr$        w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/kanana_v.pyr.   r.   0   s           .
!!!!D%  	"   
 D!	     r>   r.   KananaVImageInputsconfignum_input_tokensvision_hidden_sizereturnc                     | j         rKt          j        t          j        d||                    }t          j                            |dd           nd}|S )z:Build positional embeddings for the visual encoder output.r%   g        g{Gz?)meanstdN)pos_embr   	Parameterr;   zerosinittrunc_normal_)rA   rB   rC   rH   s       r?   build_pos_embedsrM   H   sW     ~ ,u{1.>@RSSTT
gCT::::Nr>   depthhidden_sizeoutput_hidden_sizec                    t          j        ||          g}t          d|           D ]P}|                    t          j                               |                    t          j        ||                     Qt          j        | S )z6Simple SiLU-activated MLP used as a projector readout.r%   )r   LinearrangeappendSiLU
Sequential)rN   rO   rP   layers_s        r?   	build_mlprY   W   sz     i%7889F1e__ I Ibgii   bi 24FGGHHHH=&!!r>   c                   X     e Zd ZdZdeddf fdZ	 d
dej        dedej        fd	Z	 xZ
S )
PatchMergez9Merge neighboring patches spatially to reduce resolution.
merge_sizerD   Nc                 V    t                                                       || _        d S N)super__init__r\   )selfr\   	__class__s     r?   r`   zPatchMerge.__init__g   s$    $r>   Fxchannel_lastc                 |    |rt          |d          }|j        \  }}}}t          |d| j        | j                  }|S )z+Merge patches by `merge_size x merge_size`.zB H W D -> B D H Wz$B D (H h2) (W w2) -> B (D h2 w2) H W)h2w2)r
   shaper\   )ra   rc   rd   rX   HWmerged_xs          r?   forwardzPatchMerge.forwardk   sV      	3!122AW
1a2	
 
 
 r>   )F)r6   r7   r8   r9   intr`   r;   r<   boolrl   __classcell__rb   s   @r?   r[   r[   d   s        CC%3 %4 % % % % % % # <  
	       r>   r[   c                        e Zd ZdZdededdf fdZd fdZddZd	e	j
        d
e	j
        dedefdZde	j
        deeef         de	j
        fdZ xZS )DynamicCAbstractorz,Dynamic C-Abstractor based on RegNet blocks.rA   rB   rD   Nc                 @   t                                                       t          |d          s
J d            || _        |j        | _        |j        | _        |dk    r|j        }|| _        t          |||j                  | _	        | 
                                 d S )Nr\   zmerge_size must be provided.)r_   r`   hasattrrA   r\   pos_emb_sizerB   rM   encoder_hidden_sizerH   	build_net)ra   rA   rB   rb   s      r?   r`   zDynamicCAbstractor.__init__   s    
 	v|,,LL.LLL, +"/r!!%2 0'$f&@
 
 	r>   c                 b   |sd S | j         t          j        d          }d }|D ]}|                    |          r|} n|J ||         }|                    d          | j                             d          dz   k    r|d d dd f         ||<    t                      j        |g|R i | d S )Nz[\w,.]*abstractor[\w,.]*pos_embr%   )rH   recompilematchsizer_   _load_from_state_dict)	ra   
state_dictargskwargskey_repos_emb_keykeyrH   rb   s	           r?   r~   z(DynamicCAbstractor._load_from_state_dict   s     	F<#Z BCCFK!  <<$$ "%KE *** -G||A$,"3"3A"6"6":::*1!!!QRR%.
;'%%jB4BBB6BBBBBr>   c                    | j         j        }| j         j        }| j         j        }| j         j        }| j         j        }t          t          ddt          j	        t                    } ||||          }t          | j                  } ||| j        dz  |z  |          }	|r4t          j        |||	g          | _        t          |||          | _        d S || _        t          |||          | _        d S )Nr%   )stridedilation	act_layer
norm_layer)r\      )rA   rw   rO   rP   rN   	mlp_depthr   r   r   rU   r   r[   r\   
ModuleListnetrY   readout)
ra   rw   rO   rP   rN   r   RegBlocks1samplers2s
             r?   rx   zDynamicCAbstractor.build_net   s   "k=k-![;!K)	g"
 
 
 X
 

 888XOQ,
 
  	Y}b'2%677DH$Y=OPPDLLLDH$Y0CEWXXDLLLr>   flattened_visual_embedsgrid_thwunused_kwargsc           
      ~   t          j        |d          }t          j        ||                                          }g }t	          ||          D ]\  }}|\  }}	}
|dk    s
J d            t          |d||	|
          }|dddf         }| j        Xt          | j        t          t          | j
        dz            gd	z            |	|
fd
          }t          |d|	|
          }||z   }|                     ||	|
f          }|                    |           t          j        |d          }t          |          S )z>Apply the dynamic abstractor over flattened visual embeddings.r%   dimz(T must be 1. Video is not supported yet.z(t h w) d -> 1 t h w d)thwNr   g      ?r   )posembold_sizenew_sizenum_prefix_tokensz1 (h w) d -> 1 h w dr   r   )
input_size)last_hidden_state)r;   prodsplittolistzipr
   rH   r   tuplerm   rv   _forwardrT   catr   )ra   r   r   r   n_token_locsplit_visual_embeds_visual_embeds	_grid_thwTri   rj   reshaped_visual_embeds_local_pos_embs                r?   rl   zDynamicCAbstractor.forward   s    jq111#k*A;CUCUCWCWXX"$),-@()K)K 	C 	C%NIGAq!666E666%. 8Aa& & &" &<AAAqD%A"|'!7<"C(93(>$?$?#@1#DEEV&'	" " " "+"*	" " " *@.)P&%)]]&q6 &3 & &" $**+ABBBB!&+B!J!J!J1GHHHHr>   rc   r   c                 H   |\  }}t          |d||          }| j        j        rC | j        d         |          } | j        d         |          } | j        d         |          }n|                     |          }t          |d          }|                     |          }|S )Nz1 h w d -> 1 d h wr   r   r%   r   z1 d h w -> (h w) d)r
   rA   rN   r   r   )ra   rc   r   r   r   s        r?   r   zDynamicCAbstractor._forward   s    
 1a-a888; 	AAAAAAA Aa-..LLOOr>   )rD   N)r6   r7   r8   r9   r   rm   r`   r~   rx   r;   r<   objectr   rl   r   r   ro   rp   s   @r?   rr   rr   }   s       66#  
	     $C C C C C C* Y  Y  Y  YD*I!&*I ,*I  	*I
 
*I *I *I *IX< #s(O 
	       r>   rr   c                        e Zd ZdZdeddf fdZededd fd            Z	 	 ddej	        dej	        d	e
dz  d
e
dz  deez  f
dZdefdZ xZS )CustomQwen2VLVEzThin wrapper around the Qwen2-VL used as a vision encoder.

    This mirrors the original HF-based vision encoder used in Kanana-V, but
    reuses vLLM's optimized `Qwen2VisionTransformer` building blocks.
    rA   rD   Nc                     t                                          |t          |dd          d d           t          | d          r| `d S d S )Nrms_norm_epsgư> )vision_confignorm_epsquant_configprefixmerger)r_   r`   getattrru   r   )ra   rA   rb   s     r?   r`   zCustomQwen2VLVE.__init__  sc     V^T::	 	 	
 	
 	
 4"" 		 	r>   c                      | |          S )z:Drop-in replacement for the HF `_from_config` constructor.r=   )clsrA   s     r?   _from_configzCustomQwen2VLVE._from_config  s     s6{{r>   r/   r   output_hidden_statesreturn_dictc                 h   |s
J d            |                     | j        | j                  }|                     |          }t	          |t
                    r#|}t          j        |t          j                  }n:|	                                }|
                                                                }|                     |          \  }}	t          j        |dddf         |dddf         z  |dddf                                       dt          j                  }
t          j        t          j        dt          j                  |
g          }
t#          j        |
                               | j        d	
          }
|                    d          }|                     |
          }|rdnd}| j        D ]-}|r||                    d          fz   } |||
||	|          }.|                    d          }|r||fz   }|st/          d ||fD                       S t1          ||          S )a2  Run the vision transformer and optionally return intermediate states.

        Unlike the base `Qwen2VisionTransformer`, this wrapper exposes the
        pre-merger patch-level representations and a HF-style `BaseModelOutput`
        so that the existing projector / abstractor code can be reused.
        z#Only return_dict=True is supported.)devicedtype)r   Nr%   r   r   )axisr   T)non_blockingr=   )
cu_seqlensrotary_pos_emb_cosrotary_pos_emb_sin
max_seqlenc              3      K   | ]}||V  	d S r^   r=   ).0vs     r?   	<genexpr>z*CustomQwen2VLVE.forward.<locals>.<genexpr>e  s"      UUqq}}}}}UUr>   )r   hidden_states)tor   r   patch_embed
isinstancelistr1   arrayint32r   cpunumpyrot_pos_embrepeatcumsumconcatenaterJ   r;   
from_numpy	unsqueezecompute_attn_mask_seqlenblockssqueezer   r   )ra   r/   r   r   r   rc   grid_thw_listgrid_thw_npr   r   r   r   encoder_statesblkr   s                  r?   rl   zCustomQwen2VLVE.forward"  sc    AAAAA{ OO4;djOAAQ h%% 	1$M(828<<<KK$OO--M",,....00K151A1A-1P1P.. Y1AAAqD 111
 
 &arx&
(
( 	 ^RXarx%@%@%@*$MNN
%j1144K 5 
 

 KKNN 22:>>
3=; 	 	C# B!/199Q<</!A%#5#5%  AA 		! 	?+}.>>N 	VUU]N$CUUUUUU+(
 
 
 	
r>   c                     dS )Nrt   r=   ra   s    r?   get_num_tokenszCustomQwen2VLVE.get_num_tokensk  s    rr>   NN)r6   r7   r8   r9   r   r`   classmethodr   r;   r<   rn   r   r   rl   rm   r   ro   rp   s   @r?   r   r     s        2 t       "5 :K    [ -1#'G
 G
lG
 ,G
 #Tk	G

 D[G
 
	 G
 G
 G
 G
R        r>   r   c                       e Zd Zdeeedz  f         fdZdefdZdddded	ed
ede	de
eef         f
dZdedeeef         deeef         fdZdS )KananaVProcessingInforD   Nc                 
    dd iS )Nimager=   r   s    r?   get_supported_mm_limitsz-KananaVProcessingInfo.get_supported_mm_limitsq  s    r>   c                 <    |                      ddd          \  }}|S )N'  r%   image_widthimage_height
num_frames)_get_vision_info)ra   max_image_sizerX   s      r?   !get_image_size_with_most_featuresz7KananaVProcessingInfo.get_image_size_with_most_featurest  s2     11 2 
 

 r>   r%   T)r   	do_resizer   r   r   r   c                   | j                                         j        }t          t	          |          j         d          }|                                 }|j        }|j        }	|j	        }
|j
        }|r2 ||||	|
z  |j        |j                  \  }}t          ||          }nt          ||          }|||z  z   }t          ||z  d          }|j        |	z  }|j        |	z  }||z  |z  }||
dz  z  }||fS )Nz.smart_resize)heightwidthfactor
min_pixels
max_pixels)r   r   r%   r   )ctxget_hf_processorimage_processorr"   r0   r7   get_hf_configr   
patch_sizespatial_merge_sizetemporal_patch_sizer   r   r   maxr   r   )ra   r   r   r   r   r  smart_resize	hf_configr   r  r\   r  resized_heightresized_widthpreprocessed_sizepadded_num_framesgrid_tgrid_hgrid_wnum_patchesnum_vision_tokenss                        r?   r   z&KananaVProcessingInfo._get_vision_info|  sI    (3355E.O$$/>>>
 
 &&((	!/"-
"5
+? 
	R,8L#!!J.*5*5- - -)NM !*n U U U )L Q Q Q '6I)II&*==qAA")Z7"(J6vo.'JM: "333r>   seq_len	mm_countsc                 t    |                                  \  }}|                     ||d          d         }d|iS )Nr%   r   r   )r   r   )ra   r  r  target_widthtarget_heightr  s         r?   get_mm_max_tokens_per_itemz0KananaVProcessingInfo.get_mm_max_tokens_per_item  sV    
 '+&L&L&N&N#m 11$& 2 
 
 	
 *++r>   )r6   r7   r8   r   strrm   r   r   r   rn   r   r   r  r=   r>   r?   r   r   p  s        cDj)A    9     *4 *4 *4 *4 	*4
 *4 *4 
y#~	*4 *4 *4 *4X,, 38$, 
c		, , , , , ,r>   r   c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	KananaVDummyInputsBuilderr  rD   c                 8    |                     dd          }d|z  S )Nr   r   <image>)get)ra   r  
num_imagess      r?   get_dummy_textz(KananaVDummyInputsBuilder.get_dummy_text  s     ]]7A..
:%%r>   Nr  
mm_optionsc                 b    |                     dd          }d|                     dd|          iS )Nr   r   r   )r   r   r  )r  _get_dummy_images)ra   r  r  r!  r  s        r?   get_dummy_mm_dataz+KananaVDummyInputsBuilder.get_dummy_mm_data  sB     ]]7A..
T++4J ,  
 	
r>   r^   )
r6   r7   r8   r   r  rm   r   r   r   r$  r=   r>   r?   r  r    s        &S(9 &c & & & & =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
r>   r  c            
           e Zd ZdZedefd            Zdedeee	f         deee	f         deee	f         de
f
dZd	ed
eee	f         dedee         fdZde
d
eee	f         deeef         fdZdS )KananaVMultiModalProcessorz6vLLM multimodal processor for Kanana-V (text + image).rD   c                 N    | j                                         j        j        dz   S )Nr%   )infor  text_configeos_token_idr   s    r?   media_token_idz)KananaVMultiModalProcessor.media_token_id  s!    y&&((4AAEEr>   promptmm_data	mm_kwargs
tok_kwargsc           	         |r|                     dg           sL| j                                                            |          }t	          t          |g          d          S |                     dg           }g }t          |d         t          j                  sd |D             }| j                                        j	        fd|D             }d |D             }	d	 |D             fd
d         D             |	D ]"}
|
                    |
j        d                    #t          j        |	d          }	| j                                        }|                    | j        g          d         }|                    d|          }|                    |          }t          j        |          }t%          |          }t          j        d                   }|                    d                                          }t+          t-          d |D                                 }t+          || j        k                                                                              }||k    r||k    rg }d}|                                D ]_}|| j        k    r=||k     r7|                    | j        gt+          ||                   z             |dz  }J|
                    |           `|                    |          }t          |                    d          |	t          j        d                   t          j        d                   t          j        |                    }t	          |d          S )z7Run the underlying HF processor on text and image data.images)	input_idspt)tensor_typer   c                 6    g | ]}t          j        |          S r=   )r   	fromarray)r   r   s     r?   
<listcomp>zAKananaVMultiModalProcessor._call_hf_processor.<locals>.<listcomp>  s"    MMMuEOE22MMMr>   c                 &    g | ]} |          S r=   r=   )r   r   r  s     r?   r7  zAKananaVMultiModalProcessor._call_hf_processor.<locals>.<listcomp>  s#    MMMuOOE22MMMr>   c                     g | ]
}|d          S )r/   r=   r   os     r?   r7  zAKananaVMultiModalProcessor._call_hf_processor.<locals>.<listcomp>  s    DDDa.)DDDr>   c                     g | ]
}|d          S )
image_metar=   r:  s     r?   r7  zAKananaVMultiModalProcessor._call_hf_processor.<locals>.<listcomp>  s    @@@!ao@@@r>   c                 0    i | ]fd D             S )c                      g | ]
}|         S r=   r=   )r   dks     r?   r7  zLKananaVMultiModalProcessor._call_hf_processor.<locals>.<dictcomp>.<listcomp>  s    3331!A$333r>   r=   )r   rA  r=  s    @r?   
<dictcomp>zAKananaVMultiModalProcessor._call_hf_processor.<locals>.<dictcomp>  s1    KKKa3333
333KKKr>   r   r  image_token_thwr%   c              3   4   K   | ]}t          |          V  d S r^   )rm   )r   rc   s     r?   r   z@KananaVMultiModalProcessor._call_hf_processor.<locals>.<genexpr>  s(       H HAQ H H H H H Hr>   r5   )r2  r/   r5   rC  pixel_sizes)r  r(  get_tokenizerencoder   dictr   r   r  r  rT   rh   r;   concatconvert_ids_to_tokensr+  replacetensorlenr   r   rm   sumitemextend
new_tensorr   )ra   r,  r-  r.  r/  
prompt_idsimage_inputsrE  processor_outputr/   pixel_value	tokenizermedia_tokenprompt_replacedr2  r  rC  per_image_token_countsexpected_totaln_placeholdersexpandedimg_itokcombined_outputsr=  r  s                           @@r?   _call_hf_processorz-KananaVMultiModalProcessor._call_hf_processor  s     	Pgkk(B77 	P002299&AAJ
| < < <$OOOO {{8R00,q/5;77 	NMMMMML)4466FMMMMMMMDD3CDDD@@/?@@@
KKKKZ]KKK
' 	5 	5K{034444|La888I++--	55t7J6KLLQO ..K@@$$_55	L++	
 &&
,z2C'DEE!0!5!5!!5!<!<!C!C!E!ES H H1G H H HHHIIi4+>>CCEEJJLLMMZ''Nj,H,H"$HE '')) ) )$---%*2D2DOO,-4J54Q0R0RR   QJEEOOC((((!,,X66I))!,,%!L4E)FGG!L4E)FGG[11
 
 
 ,$????r>   mm_itemshf_processor_mm_kwargsout_mm_kwargsc                 l     dt           dt          t                    f fd}t          dd|          gS )NidxrD   c                     d         |          }|d         j         }t          |t          j                  sJ t	          |                                                                          }j        g|z  S )Nr   rC  )datar   r;   r<   rm   r   rO  r+  )re  out_itemrC  
num_tokensrc  ra   s       r?   get_replacementzGKananaVMultiModalProcessor._get_prompt_updates.<locals>.get_replacement  sn    $W-c2H&'89>Oou|<<<<<_113388::;;J'(:55r>   r   r  )modalitytargetreplacement)rm   r   r   )ra   ra  rb  rc  rj  s   `  ` r?   _get_prompt_updatesz.KananaVMultiModalProcessor._get_prompt_updates  s`    	6 	6# 	6 	6 	6 	6 	6 	6 	6   +  
 	
r>   	hf_inputsc                     |                     dt          j        d                    }t          t	          j        d|          t	          j        d          t	          j        d                    }|S )NrE  r   r   )r/   r5   rC  )r  r;   emptyrH  r   flat_from_sizesbatched)ra   ro  rb  rE  mm_fields_configs        r?   _get_mm_fields_configz0KananaVMultiModalProcessor._get_mm_fields_config+  sh    
  mmM5;q>>BB.>wTT19'BB19'BB
 
 

  r>   N)r6   r7   r8   r9   propertyrm   r+  r  r   r   r   r`  r   r   r   r    rn  r   ru  r=   r>   r?   r&  r&    sB       @@F F F F XFC@C@ f%C@ 3;'	C@
 CK(C@ 
C@ C@ C@ C@J
%
 !(V 4
 -	

 
,	
 
 
 
,   !(V 4  
++	,	           r>   r&  )r(  dummy_inputsc            
       "    e Zd Zededededz  fd            Zddded	ef fd
Zde	de
dz  fdZde
dej        fdZdeej                 deee         z  dej        fdZ	 d"dej        dedz  dej        fdZ	 d"dej        dedz  dej        fdZ	 d"dej        dedz  dej        fdZde	defdZ	 	 d#dej        dej        dedz  dej        dz  fdZdej        dej        fdZd eeeej        f                  dee         fd!Z xZS )$KananaVForConditionalGenerationrk  irD   Nc                 T    |                     d          rdS t          d|           )Nr   r  zUnsupported modality: )
startswith
ValueError)r   rk  rz  s      r?   get_placeholder_strz3KananaVForConditionalGeneration.get_placeholder_str@  s5    w'' 	B9@h@@AAAr>   r   )r   vllm_configr   c          	      J   t                                                       |j        j        }|| _        |                     |d          5  t                              |j                  | _	        t          |j        | j	                                                  | _        d d d            n# 1 swxY w Y   |                     |          5  t          ||j        t#          |d          dg          | _        d d d            n# 1 swxY w Y   | j        j        | _        d S )Nr   )rB   modelLlamaForCausalLM)r  r	  r   architectures)r_   r`   model_configr	  rA   _mark_tower_modelr   r   r   vision_modelrr   projector_configr   
abstractor_mark_language_modelr+   r)  r,   language_modelmake_empty_intermediate_tensors)ra   r  r   rA   rb   s       r?   r`   z(KananaVForConditionalGeneration.__init__G  s   )3##K99 	 	 / < <V=Q R RD0'!%!2!A!A!C!C  DO	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 &&{33 	 	"<' ,#FG4412	# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ? 	,,,s%   AB..B25B2-DDDr   c                 @   |                     dd           }|                     dd           }|d S |t          d          t          |t          j                  rN|j        dk    rnV|j        dk    r|                    dd          }n4t          d|j         d	|j         d
          t          j        |          }t          |t          j                  r"|j        dk    r|                    dd          }nt          j        |          }t          d||          S )Nr/   r5   z9vision_grid_thw is required when pixel_values is providedr   r4   r   r%   z:pixel_values should be 2D or batched 3D tensor. Got ndim: z (shape=))r0   r/   r5   )
popr}  r   r;   r<   ndimflattenrh   rI  r.   )ra   r   r/   r5   s       r?   _parse_and_validate_image_inputz?KananaVForConditionalGeneration._parse_and_validate_image_input`  sS    zz.$77 **%6==4"K  
 lEL11 	6 A%%"a''+33Aq99 4!-!24 4*04 4 4   !<55L ou|44 	<#q(("1"9"9!Q"?"?#l?;;O&%+
 
 
 	
r>   image_inputc                 h   |d         }|d         }d|i}|                      ||          }| j        j        }|                    d          }d}d}	t	          |          D ]R}
||
         d         ||
         d         |z  ||
         d         |z  }}}||z  |z  }||	|	|z            }||fz  }|	|z  }	S|S )Nr/   r5   r   r=   r%   r   )forward_and_project_visionr  r\   r}   rS   )ra   r  r/   r5   image_metasvisual_embedsr\   
batch_sizemulti_modal_embeddingssample_indexrz  r   r   r   ri  visual_embeds                   r?   _process_image_inputz4KananaVForConditionalGeneration._process_image_input  s    ">2%&78(/:77kRR_/
$))!,,
;=z"" 		' 		'A"1%"1%3"1%3 qA
 QJ(z8Q)QRL"|o5"J&LL%%r>   v_outputlayer_indexc                     t          |t          t          f          r!t          j        |d          d d |f         }n||         }|S )Nr%   r   )r   r   r   r;   stack)ra   r  r  visual_featuress       r?   _get_visual_feature_atz6KananaVForConditionalGeneration._get_visual_feature_at  sR    
 kD%=11 	4#k(:::;OO '{3Or>   r/   r  c                     |dd|d         d} | j         di |}| j        j        j        }|                     |j        |          }|S )NTr5   )r/   r   r   r   r=   )r  rA   r  feature_layer_indexr  r   )ra   r/   r  vision_model_args	v_outputsr  r  s          r?   forward_visionz.KananaVForConditionalGeneration.forward_vision  sp     )$(#$56	
 
 &D%::(9::	k2F55#[
 
 r>   r  c                 L    |                      ||d                   d         }|S )Nr5   )r   r   )r  )ra   r  r  r  s       r?   forward_projectorz1KananaVForConditionalGeneration.forward_projector  s:    
  !23 ( 
 
  r>   c                 j    |J |                      ||          }|                     ||          }|S )N)r  )r  r  )ra   r/   r  r  r  s        r?   r  z:KananaVForConditionalGeneration.forward_and_project_vision  sF    
 '''--l-TT..K.XXr>   c                 N     | j         di |}|g S |                     |          S )Nr=   )r  r  )ra   r   r  s      r?   embed_multimodalz0KananaVForConditionalGeneration.embed_multimodal  s9    :d:DDVDDI((555r>   r2  	positionsintermediate_tensorsinputs_embedsc                 @    |d }|                      ||||          }|S )N)r2  r  r  r  )r  )ra   r2  r  r  r  r   r   s          r?   rl   z'KananaVForConditionalGeneration.forward  s=      + M++!5'	 , 
 
 r>   r   c                 6    | j                             |          S r^   )r  compute_logits)ra   r   s     r?   r  z.KananaVForConditionalGeneration.compute_logits  s    "11-@@@r>   weightsc                 J    t          |           }|                    |          S r^   )r*   load_weights)ra   r  loaders      r?   r  z,KananaVForConditionalGeneration.load_weights  s#    "4((""7+++r>   r^   r   )r6   r7   r8   r   r  rm   r~  r   r`   r   r@   r  r;   r<   r  r   r  rH  r  r  r  r&   r  r!   rl   r  r   r   setr  ro   rp   s   @r?   ry  ry  :  s        B3 B3 B3: B B B [B BD 
 
 
z 
3 
 
 
 
 
 
2(
(
	d	"(
 (
 (
 (
T&0B &u| & & & &05<( 8C=( 
	     $( l D[ 
	   * $(	 		 D[	 
		 	 	 	 $( l D[ 
	   6 64H 6 6 6 6 <@-1 < < 2D8	
 |d*   (AEL AU\ A A A A,HU33D-E$F ,3s8 , , , , , , , ,r>   ry  )Ycollections.abcr   r   r   	functoolsr   typingr   r   r	   r   r1   regexrz   r;   einopsr
   PILr   timm.layersr   timm.layers.pos_embedr   timm.models.regnetr   r   transformersr   transformers.modeling_outputsr   3transformers.models.qwen2_vl.configuration_qwen2_vlr   vllm.configr   vllm.config.multimodalr   vllm.loggerr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   vllm.multimodal.processingr   r   r   r   r    vllm.sequencer!   vllm.utils.import_utilsr"   vllm.utils.tensor_schemar#   r$   
interfacesr&   r'   r(   qwen2_vlr)   utilsr*   r+   r,   r6   loggerr.   r@   r:   rm   rI   rM   rV   rY   Moduler[   rr   r   r   r  r&  register_processorry  r=   r>   r?   <module>r     s   8 7 7 7 7 7 7 7 7 7 7       0 0 0 0 0 0 0 0 0 0                      # # # # # # 8 8 8 8 8 8 ' ' ' ' ' '       % % % % % % 9 9 9 9 9 9 S S S S S S " " " " " " 3 3 3 3 3 3 # # # # # # / / / / / /         
 A @ @ @ @ @ @ @              . - - - - - ; ; ; ; ; ; > > > > > > > > L L L L L L L L L L , , , , , , N N N N N N N N N N	X		    l   * !8 I 7 7 7  \D	   
"
"
" 
" ]	
" 
" 
" 
"       2H H H H H H H HVe e e e e, e e ePC, C, C, C, C,. C, C, C,L
 
 
 
 
 67L M 
 
 
&n  n  n  n  n !89N!O n  n  n b ('	*  
w, w, w, w, w,bi1CZ w, w, 
w, w, w,r>   