
    .`iX                        U d Z ddlZddlmZ ddlmZmZmZmZ ddl	m
Z
 ddlmZ ddlmZmZmZmZ ddlZddlZddlZddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5m6Z6m7Z7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZC ddlDmEZE ddlFmGZGmHZHmIZImJZJmKZKmLZLmMZM ddlNmOZO ddlPmQZQ ddlRmSZS ddlTmUZUmVZV ddlWmXZX dd lYmZZZ dd!l[m\Z\m]Z]m^Z^m_Z_ dd"l`maZambZbmcZc d#Zd G d$ d%eU          Ze G d& d'eU          Zfeeefz  Zgeehd(<    e
eji        d)*          Zj G d+ d,e%          Zk G d- d.ek          Zld/ed0emend1f         fd2Zod3eepejq        f         fd4Zr G d5 d6e:          Zs G d7 d8e:          Zt G d9 d:eA          Zu G d; d<eH          Zv ed=evev>          Zw G d? d@eEew                   Zx G dA dBeGew                   Zy G dC dDejz        e^e_          Z{ G dE dFe{          Z| G dG dHe{e]          Z} G dI dJe{e]          Z~ G dK dLe{e]          Z G dM dNe{e]          Ze|e}e~eedOZ e3j        eyevexP           G dQ dRe{e^e]                      ZdS )SzCInference-only MiniCPM-V model compatible with HuggingFace weights.    N)defaultdict)CallableIterableMappingSequence)partial)chain)	AnnotatedAnyLiteral	TypeAlias)nn)trunc_normal_)BatchFeaturePretrainedConfig)TypeVar)
VllmConfig)BaseDummyOptions)QuantizationConfig)BaseResampler
Resampler2get_2d_sincos_pos_embedLlamaForCausalLMMiniCPMForCausalLM)MultiModelKeysQwen2ForCausalLMQwen3ForCausalLM)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItemsNestedTensors)
DictEmbeddingItems	ImageItemImageProcessorItems	ImageSizeModalityDataModalityDataItemsMultiModalDataItemsMultiModalDataParser	VideoItemVideoProcessorItems)BaseDummyInputsBuilder)BaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetailsResolvedPromptUpdate	_seq2text)current_platform)IntermediateTensors)flatten_2d_lists)TensorSchemaTensorShape)set_default_torch_dtype   )Idefics2VisionTransformer)MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)AutoWeightsLoader
flatten_bnmaybe_prefix   c            
           e Zd ZU dZdZed         ed<   eee	j
                  eddddddh          f         ed<   ee	j
         edd	          f         ed
<   ee	j
         ed          f         ed<   dS )MiniCPMVImagePixelInputsz
    Dimensions:
        - bns: Batch size * number of images * number of slices
        - bn: Batch size * number of images
        - c: Number of channels
        - h: Height
        - w: Width
    pixel_valuestypebnschwdynamic_dims   	tgt_sizesbn
num_slicesN)__name__
__module____qualname____doc__rL   r   __annotations__r
   listtorchTensorr=        w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/minicpmv.pyrJ   rJ   g   s           %3D'.
!222 U\E3SSzBBB	D    E1	    D	     r`   rJ   c                       e Zd ZU dZed         ed<   eej        e	ej                 z   e
ddddh          f         ed<   dS )	MiniCPMVImageEmbeddingInputsz
    Dimensions:
        - bn: Batch size * number of images
        - ns: Number of slices
        - hs: Hidden size (must match language model backbone)
    image_embedsrL   rU   nshsrQ   N)rW   rX   rY   rZ   r   r[   r
   r]   r^   r\   r=   r_   r`   ra   rc   rc      sx           .
!!!!tEL))D$D6:::	<     r`   rc   MiniCPMVImageInputsgư>)epsc                   (    e Zd Zdedddfdededededz  deegej        f         d	eeef         d
e	dz  de
ddf fdZ	 dd	eeef         dej        j        ddfdZdej        dej        j        ddfdZdej        dej        dej        fdZ xZS )Resampler2_5NF   rl    num_queries	embed_dim	num_headskv_dim
norm_layermax_sizequant_configprefixreturnc	           	          t                                          |||||||           || _        |                     | j                   d S )Nrt   ru   )super__init__rs   _set_2d_pos_cache)
selfrn   ro   rp   rq   rr   rs   rt   ru   	__class__s
            ra   rz   zResampler2_5.__init__   s`     	% 	 	
 	
 	
 !t}-----r`   cpudevicec                     t          | j        |d          }t          j        |                                                              |          }|                     d|d           d S )NrS      )version	pos_embedF
persistent)r   ro   r]   
from_numpyfloattoregister_buffer)r|   rs   r   pos_embed_arrr   s        ra   r{   zResampler2_5._set_2d_pos_cache   sn     0NHf
 
 
 $]3399;;>>vFF	[)FFFFFr`   rT   c                    |d d df                                                                          }|d d df                                                                          }t          |t                    rt          |t                    sJ || j        d         k    s|| j        d         k    rXt          || j        d                   t          || j        d                   f| _        |                     | j        |           d S d S )Nr   r?   )maxitem
isinstanceintrs   r{   )r|   rT   r   max_hmax_ws        ra   _adjust_pos_cachezResampler2_5._adjust_pos_cache   s     !!!Q$##%%**,,!!!Q$##%%**,,%%%@*UC*@*@@@@4=###ut}Q/?'?'?E4=+,,E4=+,,DM ""4=&99999 (@'?r`   xc                    |j         d         |j         d         k    sJ |j         d         }|j        }|j        }|d d df         |d d df         z  }|                     ||           |                                                                }t          |t                    sJ t          j	        ||ft          j
        |          }g }	t          |          D ]}
||
                                         \  }}|	                    | j        d |d |d d f                             ||z  df                              |                     d||
||
         d f<   t          j        j        j                            |	dd                              ddd	          }	|                     |          \  }}|                     |                              ddd	          }|                     | j                  }|                     |                     ||          ||	z   ||
          d         }|                    ddd	          }|                     |          }|| j        z  }|S )Nr   r?   r   dtyper   T        batch_firstpadding_valuerS   key_padding_mask)shaper   r   r   r   r   r   r   r]   zerosboolrangetolistappendr   reshaper   r   utilsrnnpad_sequencepermutekv_projln_kvln_qqueryattn_repeatln_postproj)r|   r   rT   bsr   r   	patch_lenmax_patch_lenr   r   itgt_htgt_w_qouts                   ra   forwardzResampler2_5.forward   sp   wqzY_Q/////WQZaaadOi1o5	y888!,,..------ ;uz&
 
 
 	r 	7 	7A$Q<..00LE5vvvvqqq019955=":MNNQQRWXX   37Q	!.//HN&334s 4 
 

'!Q

 	 ||A1JJqMM!!!Q**IIdj!!iiLLB	M-	  
 

  KK1a  LLOO	Mr`   r~   )rW   rX   rY   
DEFAULT_LNr   r   r   	LayerNormtupler   strrz   r]   typesDevicer{   r^   r   r   __classcell__r}   s   @ra   rj   rj      sy        "4>$,26. .. . 	.
 d
. cUBL01. S/. )4/. . 
. . . . . .2 GLG Gc3hG161CG	G G G G::/4{/A:	: : : :, ,%, ,5< , , , , , , , ,r`   rj   c                   X    e Zd Zdeddddfdededededz  d	eegej        f         d
eeef         dede	dz  de
ddf fdZdedej        fdZ	 ddedej        j        ddfdZ	 ddedej        j        fdZdej        ej        z  fdZ	 ddej        dej        dej        fdZ xZS )Resampler4_5Nrk   i  rm   rn   ro   rp   rq   rr   rs   max_temporal_sizert   ru   rv   c
           
          t                                          ||||||||	           t          | j        d           || _        |                     | j                   |                     | j                   d S )Nrx   {Gz?std)ry   rz   r   r   r   _set_temporal_pos_cacheapply_init_weights)r|   rn   ro   rp   rq   rr   rs   r   rt   ru   r}   s             ra   rz   zResampler4_5.__init__   s     	% 	 		
 		
 		
 	djd++++!2$$T%;<<<

4%&&&&&r`   posc                 X   |dz  dk    sJ t          j        |dz  t           j                  }||dz  z  }dd|z  z  }|                    d          }t          j        d||          }t          j        |          }t          j        |          }t          j        ||gd	
          }|S )z
        embed_dim: output dimension for each position
        pos: a list of positions to be encoded: size (M,)
        out: (M, D)
        rS   r   r   g       @      ?i'  r   zm,d->mdr?   )axis)nparangefloat32r   einsumsincosconcatenate)r|   ro   r   omegar   emb_sinemb_cosembs           ra   *get_1d_sincos_pos_embed_from_temporal_sizez7Resampler4_5.get_1d_sincos_pos_embed_from_temporal_size  s     1}!!!!	)q.
;;;S eUl"kk"ooi	3..&++&++ngw/a888
r`   r~   r   c                    t          j        |t           j                  }t          j        |                     | j        |                                                                        |          }| 	                    d|d           d S )Nr   temporal_pos_embedFr   )
r   r   r   r]   r   r   ro   r   r   r   )r|   r   r   temporal_sizer   s        ra   r   z$Resampler4_5._set_temporal_pos_cache)  s     	"32:FFF??NM  
 UWWRZZ 	 	19OOOOOr`   c                 d    || j         k    r$|| _         |                     | j         |           d S d S N)r   r   )r|   r   r   s      ra   _adjust_temporal_pos_cachez'Resampler4_5._adjust_temporal_pos_cache8  sA     t555%6D"(()?HHHHH 65r`   mc                    t          |t          j                  rbt          |j        d           t          |t          j                  r.|j        )t          j                            |j        d           d S d S d S t          |t          j                  rLt          j                            |j        d           t          j                            |j        d           d S d S )Nr   r   r   r   )	r   r   Linearr   weightbiasinit	constant_r   )r|   r   s     ra   r   zResampler4_5._init_weights?  s    a## 	-!(----!RY'' -AF,>!!!&!,,,,,- -,>,>2<(( 	-Gafa(((Gah,,,,,	- 	-r`   r   rT   c                 6
   |j         d         |j         d         k    sJ |j         d         }|j        }|j        }|d d df         |d d df         z  }|                     ||           d}d }	|[t	          t          j        |                    }	t          |	d          }
|
dk    rd}|
| j        k    r| 	                    |
|           |                                
                                }t          |t                    sJ t          j        ||ft          j        |          }|                     |          \  }}|                     |                              ddd	          }|                     | j                  }g }g }t+          |          D ]}||         \  }}|ru|	|         dk    r0|                    t          j        | j        ||                     n9|                    | j        |	|                                      |                     |                    | j        d |d |d d f                             ||z  df                              |                     d||||         d f<   t          j        j        j                            |dd
                              ddd	          }|}||z   }|r|t          j         |d          z  }tC          |          }g }g }g }d}|D ]}|tC          |          z   }|                    |d d ||d d f                             ddd	                              d| j                             |                    |d d ||d d f                             ddd	                              d| j                             |                    |||d d f                             dd                     |}t          j        j        j                            |dd
                              ddd	          }t          j        j        j                            |dd
                              ddd	          }t          j        j        j                            |dd          "                    d          }| #                    | $                    ||          |||          d         }|                    ddd	          }| %                    |          }|| j&        z  }|S )Nr   r?   r   F)defaultr   Tr   rS   r   r   )dimr   )'r   r   r   r   r\   r	   from_iterabler   r   r   r   r   r   r]   r   r   r   r   r   r   r   r   r   ro   r   r   r   r   r   r   r   r   stacklensqueezer   r   r   r   )r|   r   rT   temporal_idsr   r   r   r   temporal_pos_embtemporal_ids_flattenr   r   r   r   r   pos_embed_2dpos_embed_temporalr   r   r   kvmerge_kmerge_vmerge_key_padding_maskstarttpendr   s                                ra   r   zResampler4_5.forwardH  s{    wqzY_Q/////WQZaaadOi1o5	y888 ###'(;L(I(I#J#J  #$8! D D D 2%%#'  4#999//0A6JJJ!,,..------ ;uz&
 
 
 ||A1JJqMM!!!Q**IIdj!!r 	7 	7A$Q<LE5 '*b00&--DN%OOO    '--/0DQ0GHKKERR   vvvvqqq019955=":MNNQQRWXX   37Q	!.//x~)66d# 7 
 

'!Q

 	   	/Q7777A\""BGG%'"E"  c"ggo aaasAAAo&..q!Q77??DNSS   aaasAAAo&..q!Q77??DNSS   '--$U3Y\2::2qAA   "//T 0  gaA  "//T 0  gaA   %x~1>>&D  ?    gbkk  iiLLB-	  
 

  KK1a  LLOO	Mr`   r   r   )rW   rX   rY   r   r   r   r   r   r   r   r   rz   r   ndarrayr   r]   r   r   r   r   r   r   r^   r   r   r   s   @ra   r   r      s        "4>$,!&26' '' ' 	'
 d
' cUBL01' S/' ' )4/' ' 
' ' ' ' ' '8#%:   . DIP P!$P.3k.@P	P P P P  DII I!$I.3k.@I I I I-ry2<7 - - - - m m<m <m 
m m m m m m m mr`   r   configrv   .c                     t          | dd           }|| j        dk    r| j        dk    rdS dS t          |          }t	          d |                    d          D                       S )Nr    	  @   rS   r   r   c              3   4   K   | ]}t          |          V  d S r   r   .0r   s     ra   	<genexpr>z(get_version_by_config.<locals>.<genexpr>  s(      88AQ888888r`   .)getattrhidden_size	query_numr   r   split)r  version_floatversion_strs      ra   get_version_by_configr    sx    FIt44M %%&*:b*@*@6vm$$K88!2!23!7!7888888r`   	hf_inputsc                 P   t          t          j        d          t          j        d          t          j        d          t          j        d          t          j        d          t          j        d          t          j        d          t          j        d                    S )Nimagevideo)rK   image_sizesrT   rd   video_pixel_valuesvideo_image_sizesvideo_tgt_sizesvideo_embeds)dictr$   batched)r  s    ra   _minicpmv_field_configr     s    *27;;)1'::'/88*27;;08AA/7@@-5g>>*27;;	 	 	 	r`   c                        e Zd Zdeeej        f         deeeej        f         geeef         f         ddf fdZ	de
defdZ xZS )MiniCPMVImageEmbeddingItemsdatafields_factoryrv   Nc                 V    t                                          |dddh|           d S )Nr  rd   r  modalityrequired_fieldsr$  ry   rz   r|   r#  r$  r}   s      ra   rz   z$MiniCPMVImageEmbeddingItems.__init__  sA     	+];)	 	 	
 	
 	
 	
 	
r`   indexc                     |                      |          d                                         }t          |d         |d                   S )Nr  r   r?   widthheightgetr   r*   )r|   r+  
image_sizes      ra   get_image_sizez*MiniCPMVImageEmbeddingItems.get_image_size  s=    XXe__]3::<<
z!}Z]CCCCr`   )rW   rX   rY   r   r   r]   r^   r   r$   rz   r   r*   r3  r   r   s   @ra   r"  r"    s        
c5<'(
 !S%,&'(C../1

 

 
 
 
 
 
DC DI D D D D D D D Dr`   r"  c                        e Zd Zdeeej        f         deeeej        f         geeef         f         ddf fdZ	de
defdZde
de
fdZ xZS )	MiniCPMVVideoEmbeddingItemsr#  r$  rv   Nc                 V    t                                          |dddh|           d S )Nr  r  r  r&  r)  r*  s      ra   rz   z$MiniCPMVVideoEmbeddingItems.__init__  sB     	+-@A)	 	 	
 	
 	
 	
 	
r`   r+  c                     |                      |          d                                         }t          |d         |d                   S )Nr  r   r?   r-  r0  )r|   r+  
frame_sizes      ra   get_frame_sizez*MiniCPMVVideoEmbeddingItems.get_frame_size  s>    XXe__%89@@BB
z!}Z]CCCCr`   c                 R    t          |                     |          d                   S )Nr  )r   r1  )r|   r+  s     ra   get_num_framesz*MiniCPMVVideoEmbeddingItems.get_num_frames  s     488E??#67888r`   )rW   rX   rY   r   r   r]   r^   r   r$   rz   r   r*   r9  r;  r   r   s   @ra   r5  r5    s        
c5<'(
 !S%,&'(C../1

 

 
 
 
 
 
DC DI D D D D9C 9C 9 9 9 9 9 9 9 9r`   r5  c                        e Zd Zdeeej        f         ee         z  de	e
e
f         dz  f fdZdeeej        f         ee         z  de	e
e
f         dz  f fdZ xZS )MiniCPMVMultiModalDataParserr#  rv   Nc                     t          |t                    rt          |t                    S t	                                          |          S N)r$  )r   r  r"  r   ry   _parse_image_datar|   r#  r}   s     ra   r@  z.MiniCPMVMultiModalDataParser._parse_image_data   N     dD!! 	.5   
 ww((...r`   c                     t          |t                    rt          |t                    S t	                                          |          S r?  )r   r  r5  r   ry   _parse_video_datarA  s     ra   rD  z.MiniCPMVMultiModalDataParser._parse_video_data  rB  r`   )rW   rX   rY   r  r   r]   r^   r+   r(   r,   r   r@  r/   rD  r   r   s   @ra   r=  r=    s        
/3$%Y(??
/ 
38	$t	+
/ 
/ 
/ 
/ 
/ 
/
/3$%Y(??
/ 
38	$t	+
/ 
/ 
/ 
/ 
/ 
/ 
/ 
/ 
/ 
/r`   r=  c                   x   e Zd ZdZdZd ZdefdZdefdZd Z	de
eed	z  f         fd
Z	 	 	 d dededed	z  dedef
dZ	 d!deded	z  deeef         d	z  fdZ	 d!deded	z  defdZdefdZdefdZdefdZdefdZdede
eef         defdZdefdZdefdZdedefdZdede
eef         defdZd	S )"MiniCPMVProcessingInfo(<image>./</image>)(<video>./</video>)c                 4    | j                                         S r   )ctxget_hf_configr|   s    ra   rK  z$MiniCPMVProcessingInfo.get_hf_config  s    x%%'''r`   kwargsc                      | j         j        di |}|j        }dD ]O}t          ||          }t	          |t
          j                  r#t          |||                                           P|S )N)meanr   r_   )	rJ  get_hf_processorimage_processorr  r   r   r  setattrr   )r|   rM  hf_processorrQ  attrvals         ra   rP  z'MiniCPMVProcessingInfo.get_hf_processor   sy    0tx0::6:: '6# 	= 	=D/400C#rz** =szz||<<<r`   c                 &     | j         di |j        S Nr_   )rP  rQ  )r|   rM  s     ra   get_image_processorz*MiniCPMVProcessingInfo.get_image_processor-  s    $t$..v..>>r`   c                 D    t          |                                           S r   )r  rK  rL  s    ra   get_model_versionz(MiniCPMVProcessingInfo.get_model_version0  s    $T%7%7%9%9:::r`   rv   Nc                 D    dd i}|                                  dv rd |d<   |S )Nr     rS         r   r`  r   r  )rZ  )r|   	mm_limitss     ra   get_supported_mm_limitsz.MiniCPMVProcessingInfo.get_supported_mm_limits3  s3    dO	!!##'???!%Igr`   r   Tr2  	image_idxmax_slice_numsuse_image_idc                     |                                  }|                                 }|dk    s|dk    r|                    |          S |                    ||||          S )Nr  r   )rd  re  rf  )rX  rZ  get_slice_image_placeholder)r|   r2  rd  re  rf  rQ  r   s          ra   rh  z2MiniCPMVProcessingInfo.get_slice_image_placeholder:  sx     2244((**f6 1 1">>zJJJ::)%	 ; 
 
 	
r`   c                     |                                  }|                                 }|dk    s|dk    r|                    |          S ||j        }|                    ||          S )Nr  r   re  )rX  rZ  get_sliced_gridre  )r|   r2  re  rQ  r   s        ra   rk  z&MiniCPMVProcessingInfo.get_sliced_gridO  s     2244((**f6 1 1"22:>>>!,;N..) / 
 
 	
r`   c                     |                                  }|                     ||          }|dx}}n|\  }}||z  dz   |j        z  S )Nrj  r   r?   )rX  rk  image_feature_size)r|   r2  re  rQ  gridncolsnrowss          ra   get_num_image_tokensz+MiniCPMVProcessingInfo.get_num_image_tokensc  sh    
 2244##) $ 
 
 <EEELE5!_%GGGr`   c                 T    |                                  }|                     |          S r   )!get_image_size_with_most_featuresrq  )r|   r2  s     ra   get_max_image_tokensz+MiniCPMVProcessingInfo.get_max_image_tokensu  s'    ;;==
((444r`   c                 H    t          |                                 dd          S )Nmax_slice_num	   )r  rK  rL  s    ra   get_image_max_slice_numz.MiniCPMVProcessingInfo.get_image_max_slice_numy  s     t))++_a@@@r`   c                     t          |                                 dd          }|                                 }t          |||z            S Nr2  i  r-  )r  rK  rx  r*   r|   r2  rv  s      ra   rs  z8MiniCPMVProcessingInfo.get_image_size_with_most_features|  H    T//11<EE
4466z*}2LMMMMr`   c                 |    |                                  }|                     ||                                           S )Nrj  )'get_video_frame_size_with_most_featuresrq  get_video_max_slice_num)r|   r8  s     ra   get_max_video_frame_tokensz1MiniCPMVProcessingInfo.get_max_video_frame_tokens  sB    AACC
((7799 ) 
 
 	
r`   seq_len	mm_countsc                 `    |                      ||          }|                                 |z  }|S r   )!get_num_frames_with_most_featuresr  )r|   r  r  
num_framesnum_video_tokens_totals        ra   get_max_video_tokensz+MiniCPMVProcessingInfo.get_max_video_tokens  s6    
 ;;GYOO
!%!@!@!B!BZ!O%%r`   c                     dS )Nr?   r_   rL  s    ra   r  z.MiniCPMVProcessingInfo.get_video_max_slice_num  s    qr`   c                     t          |                                 dd          }|                                 }t          |||z            S rz  )r  rK  r  r*   r{  s      ra   r~  z>MiniCPMVProcessingInfo.get_video_frame_size_with_most_features  r|  r`   
max_tokensc                 8    |                                  }||z  }|S r   )r  )r|   r  num_frame_tokensr  s       ra   get_max_video_framesz+MiniCPMVProcessingInfo.get_max_video_frames  s%    ::<<#33
r`   c                 $   |                     dd          }|                     dd          }|                                 |z  }|                     ||z
            }t          |t	          |d          z  t
                    }t	          |d          S )Nr  r   r  r?   )r1  rt  r  minr   _MAX_FRAMES_PER_VIDEO)r|   r  r  
max_images
max_videosmax_image_tokensmax_total_framesmax_frames_per_videos           ra   r  z8MiniCPMVProcessingInfo.get_num_frames_with_most_features  s    
 ]]7A..
]]7A..
4466C44W?O5OPP"J 2 224I 
  
 '+++r`   )r   NTr   )rW   rX   rY   image_patternvideo_patternrK  objectrP  rX  rZ  r   r   r   rc  r*   r   rh  r   rk  rq  rt  rx  rs  r  r  r  r~  r  r  r_   r`   ra   rF  rF    s       )M)M( ( (    ?F ? ? ? ?; ; ;cDj)A     %)!
 

 	

 d

 
 

 
 
 
2 &*	
 

 d
	

 
sCx4	
 
 
 
. &*H HH d
H 
	H H H H$5c 5 5 5 5A A A A AN9 N N N N

C 
 
 
 
&& 38$& 
	& & & &    N N N N N
s s    
,, 38$, 
	, , , , , ,r`   rF  _I)boundr   c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	MiniCPMVDummyInputsBuilderr  rv   c                     |                     dd          }|                     dd          }| j        j        |z  }| j        j        |z  }||z   S )Nr  r   r  )r1  infor  r  )r|   r  
num_images
num_videosimage_prompt_textsvideo_prompt_textss         ra   get_dummy_textz)MiniCPMVDummyInputsBuilder.get_dummy_text  sT    ]]7A..
]]7A..
!Y4zA!Y4zA!$666r`   Nr  
mm_optionsc                    |                     dd          }|                     dd          }| j                                        \  }}| j                                        \  }}	| j                            ||          }
|r|                     d          nd }|r|                     d          nd }|                     ||||          |                     ||	|
|          g|z  dS )Nr  r   r  )r.  r/  r  	overridesr  r  )r1  r  rs  r~  r  _get_dummy_images)r|   r  r  r  r  r  image_widthimage_heightvideo_widthvideo_heightnum_video_framesimage_overridesvideo_overridess                ra   get_dummy_mm_dataz,MiniCPMVDummyInputsBuilder.get_dummy_mm_data  s    ]]7A..
]]7A..
$(I$O$O$Q$Q!\$(I$U$U$W$W!\9FFY
 
 6@I*..111T5?I*..111T ++!#%)	 ,   &&%'/-	 '   
 
 	
r`   r   )
rW   rX   rY   r   r   r   r  r   r#   r  r_   r`   ra   r  r    s        7S(9 7c 7 7 7 7 =A	"
 "
"
 38$"
 C!112T9	"

 
"
 "
 "
 "
 "
 "
r`   r  c                       e Zd ZdefdZd dededefdZdededefdZ	d	e
eef         d
e
eef         de
eef         de
eef         fdZd	e
eef         d
e
eef         de
eef         de
eef         fdZd	e
eef         d
e
eef         de
eef         de
eef         fdZdee         d	e
eee         f         d
e
eef         de
eef         dee         deeef         f fdZded	e
eef         d
e
eef         de
eef         def
dZdedede
eef         de
eef         def
dZdede
eef         dedee         fdZdededef fdZdede
eef         de
eef         fdZ  xZ!S )!MiniCPMVMultiModalProcessorrv   c                     t                      S r   )r=  rL  s    ra   _get_data_parserz,MiniCPMVMultiModalProcessor._get_data_parser  s    +---r`   r   r2  rd  c                 :    | j                             ||          S )N)rd  )r  rh  )r|   r2  rd  s      ra   get_image_prompt_textsz2MiniCPMVMultiModalProcessor.get_image_prompt_texts  s'    y44 5 
 
 	
r`   r  c                 r    | j                             |d| j                                         d          |z  S )Nr   F)r2  rd  re  rf  )r  rh  r  )r|   r2  r  s      ra   get_video_prompt_textsz2MiniCPMVMultiModalProcessor.get_video_prompt_texts  sE    I11%#y@@BB"	 2   	
r`   mm_data	mm_kwargs
tok_kwargsc                    |                     d          x}i S |                                                     d|i                              dt          t
          f          }t          |t                    ri }nC|                     | j        j	        gt          |          z  dd |D             i||h d          }|S )Nimagesr  c                     g | ]}|gS r_   r_   )r  r  s     ra   
<listcomp>z>MiniCPMVMultiModalProcessor.process_images.<locals>.<listcomp>  s    #G#G#GUG#G#G#Gr`   >   rT   r  rK   promptsr  r  r  out_keys)r1  r  parse_mm_data	get_itemsr"  r)   r   _base_call_hf_processorr  r  r   )r|   r  r  r  r  parsed_imagesimage_inputss          ra   process_imagesz*MiniCPMVMultiModalProcessor.process_images  s     kk(+++F4I !!##]GV,--Yw!<>Q RSS 	 m%@AA 		LL7701C4F4FF!#G#G#G#G#GH#%EEE 8  L r`   c                     |                     d          x}i S                                                      d|i                              dt          t
          f          }t          |t                    ri }nT                      fd|D             dt          |          ii |d j	        
                                i|h d          }d |                                D             }|S )	Nvideosr  c                 H    g | ]}j         j        t          |          z  S r_   )r  r  r   )r  r  r|   s     ra   r  z>MiniCPMVMultiModalProcessor.process_videos.<locals>.<listcomp>&  s6       =BDI+c%jj8  r`   r  re  >   rT   r  rK   r  c                      i | ]\  }}d | |S video_r_   r  r   r   s      ra   
<dictcomp>z>MiniCPMVMultiModalProcessor.process_videos.<locals>.<dictcomp>2  s$    IIIDAqaIIIr`   )r1  r  r  r  r5  r0   r   r  r\   r  r  items)r|   r  r  r  r  parsed_videosvideo_inputss   `      ra   process_videosz*MiniCPMVMultiModalProcessor.process_videos  s-    kk(+++F4I !!##]GV,--Yw!<>Q RSS 	 m%@AA 	LL77   FS   "4#6#67$di&G&G&I&I  &EEE 8  L JIL4F4F4H4HIIIr`   c                 b    i |                      |||          |                     |||          S r   )r  r  )r|   r  r  r  s       ra   process_mm_inputsz-MiniCPMVMultiModalProcessor.process_mm_inputs6  s@    
!!'9jAA
!!'9jAA
 	
r`   r  r  c                  
 | j                                         dv r&t                                          ||||          nt	          t
          t          t          j                 f         t                    t          |          D ]\  
}t                                          |
fd|
                                D             ||          }|
                                D ]R\  }}	t          |	          dk    sJ |t          |	          f            |                             |	d                    Sfd|D             S )Nr\  )promptr  r  r  c                 (    i | ]\  }}||         S r_   r_   )r  r   r   r   s      ra   r  zGMiniCPMVMultiModalProcessor._base_call_hf_processor.<locals>.<dictcomp>X  s#    AAAAQ!AAAr`   r?   r   c                 "    i | ]}||         S r_   r_   )r  r   inputss     ra   r  zGMiniCPMVMultiModalProcessor._base_call_hf_processor.<locals>.<dictcomp>a  s    ///6!9///r`   )r  rZ  ry   _call_hf_processorr   r   r\   r]   r^   	enumerater  r   r   )r|   r  r  r  r  r  r  
inputs_oner   r   r   r  r}   s             @@ra   r  z3MiniCPMVMultiModalProcessor._base_call_hf_processorA  sR    9&&((,DDDWW//#%	 0  FF !d5<&8!89$??F&w// 
+ 
+	6"WW77!AAAAAAA')	 8  
 ',,.. + +DAqq66Q;;;CFF;;;1I$$QqT****+ 0///h////r`   r  c                     | j                                         }t          j         |j        |fi |g          }|                     |||          }t          d|i|          S )N	input_ids)r  get_tokenizerr]   tensorencoder  r   )r|   r  r  r  r  	tokenizerr  	mm_inputss           ra   r  z.MiniCPMVMultiModalProcessor._call_hf_processorc  sz     I++--	L"2)"26"H"HZ"H"H!IJJ	**7IzJJ	Y
 
 	
r`   prompt_textmm_itemshf_processor_mm_kwargstokenization_kwargsc                     dS )NFr_   )r|   r  r  r  r  s        ra   _hf_processor_applies_updatesz9MiniCPMVMultiModalProcessor._hf_processor_applies_updatesv  s	     ur`   out_mm_kwargsc                 |    d j         j        fd j         j        fg}g } j                                         }|D ]L\  }}|                    |                    |d                    }	|	|k    r|                    ||	f           M||z  }dt          f fd}
dt          f fd}|
|dfd	|D             S )
Nr  r  F)add_special_tokensitem_idxc                                          dt          t          f          }|                    |           }t	          j                            ||           d          S )Nr  <unk>)r  r"  r)   r3  r6   select_textr  )r  r  r2  r  r|   s      ra   get_image_replacementzNMiniCPMVMultiModalProcessor._get_prompt_updates.<locals>.get_image_replacement  sd    ''57JK F  ..x88J&2++JAA  r`   c                                          dt          t          f          }|                    |           }|                    |           }t          j                            ||          d          S )Nr  r  )r  r5  r0   r9  r;  r6   r  r  )r  r  r8  r  r  r|   s       ra   get_video_replacementzNMiniCPMVMultiModalProcessor._get_prompt_updates.<locals>.get_video_replacement  sw    ''57JK F  ..x88J..x88J&2++J
CC  r`   r  c                 F    g | ]\  }}t          |||                    S ))r'  targetreplacement)r4   )r  r'  patternget_replacements      ra   r  zCMiniCPMVMultiModalProcessor._get_prompt_updates.<locals>.<listcomp>  sJ     
 
 
 "' !'x?X  
 
 
r`   )r  r  r  r  decoder  r   r   )r|   r  r  r  placeholdersadditional_placeholdersr  r'  r  sub_patternr  r  r  s   ``          @ra   _get_prompt_updatesz/MiniCPMVMultiModalProcessor._get_prompt_updates  sK    di-.di-.
 #%I++--	!- 	H 	HHg#**  U CC K g%%'..+/FGGG//
	C 
	 
	 
	 
	 
	 
	 
		C 	 	 	 	 	 	 	 +*
 


 
 
 
 &2	
 
 
 	
r`   cached_updatenew_item_idxc           
         t                                          ||          }|j        dk    r| j                                        }| j                                        }| j                                        }t          ||j        j	                  }|j
        }|dk    s|dk    r|j        }	|j        }
n|j        }	|j        }
|                    t!          j        |                    |	 | |
 |	 | |
 d          d                    }|S )Nr  r  r   r?   r  )ry   _recompute_cached_prompt_updater'  r  r  rX  rZ  r8   contentfullr  im_start_tokenim_end_tokenim_id_start	im_id_endwith_contentr6   r  replace)r|   r   r  
new_updater  rQ  r   textprev_item_idxim_startim_endr}   s              ra   r  z;MiniCPMVMultiModalProcessor._recompute_cached_prompt_update  s-   
 WW<<
 


 !W,,	//11I"i;;==Oi1133GY(=(BCCD)2M&  Gv$5$5*9(5*6(2#00#/LL#<]<F<<#;\;6;; 
  	 	J r`   r  c                      t          |          S r   )r   )r|   r  r  s      ra   _get_mm_fields_configz1MiniCPMVMultiModalProcessor._get_mm_fields_config  s    
 &i000r`   )r   )"rW   rX   rY   r.   r  r*   r   r   r  r  r   r  r&   r  r  r  r\   r   setr  r  r   r  r-   r   r  r%   r5   r  r7   r  r$   r  r   r   s   @ra   r  r    s       ."6 . . . .
 
 
s 
SV 
 
 
 
	
 	
 	
PS 	
 	
 	
 	
f% 3;' CK(	
 
m#	$   8!f%! 3;'! CK(	!
 
m#	$! ! ! !F	
f%	
 3;'	
 CK(		

 
m#	$	
 	
 	
 	
 0c 0 hv../ 0 3;'	 0
 CK( 0 c( 0 
c= 	! 0  0  0  0  0  0D

 f%
 3;'	

 CK(
 

 
 
 
& & !(V 4	
 %S&[1 
   9
%9
 !(V 49
 -	9

 
,	9
 9
 9
 9
v$+$ $ 
	$ $ $ $ $ $L11 !(V 41 
++	,	1 1 1 1 1 1 1 1r`   r  c                   h    e Zd ZdZdZededededz  fd            Zdd	d
e	def fdZ
dedededz  fdZdedefdZdedej        eej                 z  eej        df         z  fdZdefdZdedefdZ	 	 d)dej        dej        dedz  dej        dz  dedej        fdZdej        dej        dz  fdZdeeeej        f                  dee         fdZdefdZ 	 d*d
e	dede!j"        fd Z#	 d*d!e$d"e%dz  dede!j"        fd#Z&	 	 d+d$ed%ed"e%dz  dede!j"        f
d&Z'd'e(dej        fd(Z) xZ*S ),MiniCPMVBaseModelz_
    The abstract class of MiniCPMV can only be inherited, but cannot be
    instantiated.
    Tr'  r   rv   Nc                 |    |                     d          rdS |                     d          rdS t          d          )Nr  rG  r  rH  z)Only image or video modality is supported)
startswith
ValueError)clsr'  r   s      ra   get_placeholder_strz%MiniCPMVBaseModel.get_placeholder_str  sI    w'' 	)((w'' 	)((DEEEr`   rm   ru   vllm_configru   c          
      b   |j         j        }|j         j        }|j        }|j        dk    | _        t                                                       || _        || _        t          | j                  | _
        |                     |          5  |                     |t          |d                    | _        d d d            n# 1 swxY w Y   |                     |ddh          5  |                     ||t          |d                    | _        | j
        dk    r| j        j        n| j        j        j        | _        | j        j        | _        |                     | j        | j        |t          |d	          
          | _        d d d            n# 1 swxY w Y   | j        j        | _        d S )Nr#  llmr  ru   r  r  vpmr  r  	resamplerrx   )model_config	hf_configmultimodal_configrt   mm_encoder_tp_modeuse_data_parallelry   rz   r  r  r   _mark_language_modelinit_llmrG   r  _mark_tower_modelinit_vision_moduler   ro   
embeddings
vision_dimr  init_resamplerr!  make_empty_intermediate_tensors)r|   r  ru   r  r$  rt   r}   s         ra   rz   zMiniCPMVBaseModel.__init__  s"   )3'4F"/!2!E!O
 !2,T[99&&{33 	 	}}'VU0K0K %  DH	 	 	 	 	 	 	 	 	 	 	 	 	 	 	
 ##K'71CDD 	 	..\&%-H-H /  DH
 <6)) ""X(2 O
 "[4DN!00)#FK88	 1  DN	 	 	 	 	 	 	 	 	 	 	 	 	 	 	$ 04x/W,,,s%   +CCC'B FFFrM  c                 X   |                     dd           }|                     dd           }||d S |t          d|          S |                     d          }t          j        d |D                       }t	          |          }t	          |d          }t          d|||          S )	NrK   rd   )rL   rd   rT   c                 ,    g | ]}t          |          S r_   )r   )r  pss     ra   r  zFMiniCPMVBaseModel._parse_and_validate_vision_input.<locals>.<listcomp>4  s    'G'G'GBB'G'G'Gr`   T)concat)rL   rK   rT   rV   )poprc   r]   r  rF   rJ   )	r|   r'  rM  rK   rd   rT   num_slices_flatpixel_values_flattgt_sizes_flats	            ra    _parse_and_validate_vision_inputz2MiniCPMVBaseModel._parse_and_validate_vision_input!  s    
 zz.$77zz.$77L$84#/#)   
 JJ{++	,'G'G,'G'G'GHH&|44#Id;;;'*$&	
 
 
 	
r`   c                     i }|D ]P}|dv rd|vr | j         	 di ||d<   |dv r1d|vr- | j         	 di d |                                D             |d<   Q|S )N)rK   rd   r  )r  r  r  c                 @    i | ]\  }}|                     d           |S r  )removeprefixr  s      ra   r  zKMiniCPMVBaseModel._parse_and_validate_multimodal_inputs.<locals>.<dictcomp>Q  s*     X X XA!9!91 X X Xr`   )r  )r  )r7  r  )r|   rM  
modalities	input_keys       ra   %_parse_and_validate_multimodal_inputsz7MiniCPMVBaseModel._parse_and_validate_multimodal_inputs?  s    
   	 	I===J..'Lt'L( ( &( (
8$ CCCJ..'Lt'L( ( X X X X X( (
8$ r`   image_input.c                     |d         dk    r|d         S |                      |          }|d         }d |                    |                                          D             S )NrL   rd   rV   c                 :    g | ]}|                     d d          S )r   r?   )flatten)r  es     ra   r  z;MiniCPMVBaseModel._process_vision_input.<locals>.<listcomp>`  s$    XXXA		!QXXXr`   )get_vision_hidden_statesr  r   )r|   r>  image_features_flatrV   s       ra   _process_vision_inputz'MiniCPMVBaseModel._process_vision_inputV  sl     v.00~..";;KHH .
XX)<)B)B:CTCTCVCV)W)WXXXXr`   r;  c                     d}|D ]l}|dk    r/|d         }|                      |          }|t          |          z  }|dk    r/|d         }|                      |          }|t          |          z  }m|S )Nr_   r  r  )rE  r   )r|   r;  multimodal_embeddingsr'  r>  image_embeddingsvideo_inputvideo_embeddingss           ra   _process_multimodal_inputsz,MiniCPMVBaseModel._process_multimodal_inputsb  s     ;= # 	A 	AH8##(2#'#=#=k#J#J %/?)@)@@%8##(2#'#=#=k#J#J %/?)@)@@%$$r`   c                 N     | j         di |}|sg S |                     |          S rW  )r=  rK  )r|   rM  r;  s      ra   embed_multimodalz"MiniCPMVBaseModel.embed_multimodalu  s;    ?T?II&II
 	I..z:::r`   r  	positionsintermediate_tensorsinputs_embedsc                 J    |d }| j                             ||||          }|S )N)r  rN  rO  rP  )r  model)r|   r  rN  rO  rP  rM  hidden_statess          ra   r   zMiniCPMVBaseModel.forward|  s=      + M!5'	 ' 
 
 r`   rS  c                 6    | j                             |          S r   )r  compute_logits)r|   rS  s     ra   rU  z MiniCPMVBaseModel.compute_logits  s     x&&}555r`   weightsc                 J    t          |           }|                    |          S r   rE   load_weightsr|   rV  loaders      ra   rY  zMiniCPMVBaseModel.load_weights  s#    "4((""7+++r`   c                 0    t          j        ddd          S )z<
        Get the module prefix in multimodal models
        r  r!  r   )language_model	connectortower_model)r   from_string_fieldrL  s    ra   get_mm_mappingz MiniCPMVBaseModel.get_mm_mapping  s%     / KU
 
 
 	
r`   c                     t           r   NotImplementedErrorr|   r  ru   s      ra   r(  zMiniCPMVBaseModel.init_llm  s
    
 "!r`   r  rt   c                     t           r   rc  )r|   r  rt   ru   s       ra   r*  z$MiniCPMVBaseModel.init_vision_module  s
     "!r`   ro   r,  c                     t           r   rc  )r|   ro   r,  rt   ru   s        ra   r-  z MiniCPMVBaseModel.init_resampler  s
     "!r`   r#  c                     t           r   rc  )r|   r#  s     ra   rC  z*MiniCPMVBaseModel.get_vision_hidden_states  s    !!r`   )NNrm   Nrm   )+rW   rX   rY   rZ   supports_encoder_tp_dataclassmethodr   r   r  r   rz   r  rg   r7  r  r=  r]   r^   r\   r   rE  rK  rA   rM  r:   r   r   rU  r   r  rY  r   ra  r   Moduler(  r   r   r*  r-  rJ   rC  r   r   s   @ra   r  r    s        
  $F3 F3 F3: F F F [F BD &X &X &Xz &X3 &X &X &X &X &X &XP

 
 
t	#	
 
 
 
<f     .
Y(
Y 
U\*	*U5<3D-E	E
Y 
Y 
Y 
Y%T % % % %&; ;4H ; ; ; ; <@-1 < < 2D8	
 |d*  
   &6|6 
	6 6 6 6,HU33D-E$F ,3s8 , , , ,
 
 
 
 
 " "" " 
	" " " " 	" " " )4/" 	"
 
" " " " 37" "" " )4/	"
 " 
" " " ""-E "%, " " " " " " " "r`   r  c                        e Zd ZdZdddedef fdZ	 ddededej        fdZ		 dd	e
d
edz  dedej        fdZ	 	 ddeded
edz  dedej        f
dZdedej        fdZ xZS )MiniCPMV2_0Frm   r  r  ru   c                h    t                                          ||           | j        dk    sJ d S )Nr  r  ry   rz   r   r|   r  ru   r}   s      ra   rz   zMiniCPMV2_0.__init__  9    [@@@|v%%%%%%r`   rv   c                 $    t          ||          S Nr  r   re  s      ra   r(  zMiniCPMV2_0.init_llm  s    
 "k&IIIIr`   r  rt   Nc                    	 dd l }n## t          $ r t          d          t          w xY wt          t          j                  5  |                    ddddd          }d d d            n# 1 swxY w Y   |                    t          j                              }t          ||j	        j
                  r*|j        #t          j                                        |_        | j        j        r|j        d d         |_        |S )	Nr   zPlease install timm==0.9.10z#vit_so400m_patch14_siglip_384.webliFT)
pretrainednum_classesdynamic_img_sizedynamic_img_padr   r   )timmImportErrorr>   r]   float16create_modelr   get_default_dtyper   modelsVisionTransformer	attn_poolr   Identityr  drop_vision_last_layerblocks)r|   r  rt   ru   r{  rR  s         ra   r*  zMiniCPMV2_0.init_vision_module  sK   	NKKKK 	N 	N 	N;<<+M	N %U]33 	 	%%5 !% $ &  E	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 u68899 udk;<<	2+#h//11EO;- 	- <,ELs     'A**A.1A.ro   r,  c                 N   t          t          j                  5  t          ||dz  t	          t          j        | j        j                            |dd||          }d d d            n# 1 swxY w Y   |	                    t          j        t          j                              S )N   FT)ro   rp   	grid_sizerq   adaptivedo_post_projectionrt   ru   r   r   )r>   r]   r}  r   r   mathsqrtr  r  r   r9   device_typer  r|   ro   r,  rt   ru   r!  s         ra   r-  zMiniCPMV2_0.init_resampler  s     %U]33 
	 
	"##s*di(=>>??!#')	 	 	I
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 ||#/u7N7P7P  
 
 	
s   AA**A.1A.r#  c                    |d         }| j         j        j        \  }}| j         j        j        j        }t          | j         dd          }t          t          j	                             }|D ]}|d         j
        dd          \  }	}
t          j        |	|z            t          j        |
|z            f}| j                             |                    d                              |                    }|dk    r|d d |d f         }|                    |                     ||                     t          j        |          S )NrK   num_prefix_tokensr   )r   patch_embed
patch_sizer   r#  r   r  r\   r]   r^   r   r  ceilforward_features	unsqueezerL   r   r!  vstack)r|   r#  rK   P_hP_wr   r  respixel_valueHWtgt_sizevision_embeddings                ra   rC  z$MiniCPMV2_0.get_vision_hidden_states  s7   N+8'2S!X/4:#DH.A1EE5< ""' 		C 		CKq>',DAq	!c'**DIa#g,>,>?H#x88%%a((--e44    !1$$#3AAA7H7I7I4I#J JJt~~&6AABBBB|C   r`   ri  rj  )rW   rX   rY   rk  r   r   rz   r   rm  r(  r   r   r*  r   r-  rJ   r]   r^   rC  r   r   s   @ra   ro  ro    sk       $AC & & &z &3 & & & & & & J JJ J 
	J J J J 	      )4/  	 
 
       L 37
 

 
 )4/	

 
 

 
 
 
.!-E !%, ! ! ! ! ! ! ! !r`   ro  c                        e Zd Zg dddgdZdddedef fd	Z	 ddeded
ej        fdZ		 dde
dedz  ded
ej        fdZ	 	 ddedededz  ded
ej        f
dZded
ej        fdZ xZS )MiniCPMV2_5q_projk_projv_proj	gate_projup_projqkv_projgate_up_projrm   r  r  ru   c                h    t                                          ||           | j        dk    sJ d S )Nr  r   rq  rr  s      ra   rz   zMiniCPMV2_5.__init__'  rs  r`   rv   c                 $    t          ||          S ru  r   re  s      ra   r(  zMiniCPMV2_5.init_llm+      
  KGGGGr`   r  rt   Nc                     t          |j        ||| j                  }| j        j        r|j        j        d d         |j        _        |S N)rt   ru   r&  r   r@   vision_configr&  r  r  encoderlayersr|   r  rt   ru   rR  s        ra   r*  zMiniCPMV2_5.init_vision_module2  V     * %"4	
 
 
 ;- 	=#(=#7#<EM r`   ro   r,  c           	         t          t          j                  5  t          | j        j        ||dz  |||          }d d d            n# 1 swxY w Y   |                    t          j        t          j	                              S Nr  )rn   ro   rp   rq   rt   ru   r  
r>   r]   r}  rj   r  r  r   r9   r  r  r  s         ra   r-  zMiniCPMV2_5.init_resamplerB  s     %U]33 	 	$ K1##s*!)  I	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ||#/u7N7P7P  
 
 	
   #A		AAr#  c                    |d         }|d         }t          |          }|d         j        d         }t          d |D                       }|d         j        }|d         j        }t          j        |d||f||          }	t          |          D ]\  }
}|j        d         }||	|
d	d |f<   |                    d          }|                                	                                }t          |t                    sJ t          j        ||ft
          j        |          }t          |          D ]\  }
}d
||
d |f<   |                     |	|                    d          d           }|                     ||          S )NrK   rT   r   r  c              3   0   K   | ]}|j         d          V  dS r   Nr   r  r   s     ra   r  z7MiniCPMV2_5.get_vision_hidden_states.<locals>.<genexpr>]  (      884
2888888r`      r   r   .Tr?   patch_attention_maskrT   r   r   r   r   r   r]   r   r  prodr   r   r   r   r   r  r!  r|   r#  rK   rT   BPLr   r   all_pixel_valuesr   pixel_values_itemL_itemnum_patchesmax_patchespatch_attn_masknum_patches_itemr  s                     ra   rC  z$MiniCPMV2_5.get_vision_hidden_statesW  s   N+%	O!"%88<88888a'Q% ;1a|5PPP$-l$;$; 	B 	B A &,R0F0AQWfW_--nnR((!oo'',,..+s++++++q+&6ejQWXXX#,[#9#9 	9 	9A48OA0 0001188!0!:!:1!=!= $ 
 
 ~~.	:::r`   ri  rj  )rW   rX   rY   packed_modules_mappingr   r   rz   r   rm  r(  r   r   r*  r   r-  rJ   r]   r^   rC  r   r   s   @ra   r  r    s       
 
 
 

 
 BD & & &z &3 & & & & & & H HH H 
	H H H H 	   )4/ 	
 
   ( 37
 

 
 )4/	

 
 

 
 
 
*;-E ;%, ; ; ; ; ; ; ; ;r`   r  c                       e Zd Zg dddgdZdddedef fd	Z	 ddeded
ej        fdZ		 	 dde
dedz  ded
ej        fdZ	 	 ddedededz  ded
ej        f
dZded
ej        fdZdeeeej        f                  d
ee         fdZ xZS )MiniCPMV2_6r  r  r  r  rm   r  r  ru   c                h    t                                          ||           | j        dk    sJ d S )Nr  r]  rq  rr  s      ra   rz   zMiniCPMV2_6.__init__  rs  r`   rv   c                 $    t          ||          S ru  r   re  s      ra   r(  zMiniCPMV2_6.init_llm  r  r`   Nr  rt   c                     t          |j        ||| j                  }| j        j        r|j        j        d d         |j        _        |S r  r  r  s        ra   r*  zMiniCPMV2_6.init_vision_module  r  r`   ro   r,  c           	         t          t          j                  5  t          | j        j        ||dz  |||          }d d d            n# 1 swxY w Y   |                    t          j        t          j	                              S r  r  r  s         ra   r-  zMiniCPMV2_6.init_resampler       %U]33 		 		$ K1##s*!)  I		 		 		 		 		 		 		 		 		 		 		 		 		 		 		 ||#/u7N7P7P  
 
 	
r  r#  c                    |d         }|d         }t          |          }|d         j        d         }t          d |D                       }|d         j        }|d         j        }t          j        |d||f||          }	t          |          D ]\  }
}|j        d         }||	|
d	d |f<   |                    d          }|                                	                                }t          |t                    sJ t          j        ||ft
          j        |          }t          |          D ]\  }
}d
||
d |f<   |                     |	|                    d          |          }|                     ||          S )NrK   rT   r   r  c              3   0   K   | ]}|j         d          V  dS r  r  r  s     ra   r  z7MiniCPMV2_6.get_vision_hidden_states.<locals>.<genexpr>  r  r`   r  r   r   .Tr?   r  r  r  s                     ra   rC  z$MiniCPMV2_6.get_vision_hidden_states     N+%	O!"%88<88888a'Q% ;1a|5PPP$-l$;$; 	B 	B A &,R0F0AQWfW_--nnR((!oo'',,..+s++++++q+&6ejQWXXX#,[#9#9 	9 	9A48OA0 0001188!0!:!:1!=!= $ 
 
 ~~.	:::r`   rV  c                 R    t          | g d          }|                    |          S N)zapm.audiotts)skip_prefixesrX  rZ  s      ra   rY  zMiniCPMV2_6.load_weights  /    "47O7O7OPPP""7+++r`   ri  rj  rW   rX   rY   r  r   r   rz   r   rm  r(  r   r   r*  r   r-  rJ   r]   r^   rC  r   r   r  rY  r   r   s   @ra   r  r  w         
 
 
 

 
 BD & & &z &3 & & & & & & H HH H 
	H H H H 37	   )4/ 	
 
   ( 37
 

 
 )4/	

 
 

 
 
 
,;-E ;%, ; ; ; ;>,HU33D-E$F ,3s8 , , , , , , , ,r`   r  c                       e Zd Zg dddgdZdddedef fd	Z	 ddeded
ej        fdZ		 	 dde
dedz  ded
ej        fdZ	 	 ddedededz  ded
ej        f
dZded
ej        fdZdeeeej        f                  d
ee         fdZ xZS )MiniCPMV4_0r  r  r  r  rm   r  r  ru   c                h    t                                          ||           | j        dk    sJ d S )Nr  r_  rq  rr  s      ra   rz   zMiniCPMV4_0.__init__  rs  r`   rv   c                 $    t          ||          S ru  r   re  s      ra   r(  zMiniCPMV4_0.init_llm  r  r`   Nr  rt   c                     t          |j        ||| j                  }| j        j        r|j        j        d d         |j        _        |S r  r  r  s        ra   r*  zMiniCPMV4_0.init_vision_module  r  r`   ro   r,  c           	         t          t          j                  5  t          | j        j        ||dz  |||          }d d d            n# 1 swxY w Y   |                    t          j        t          j	                              S r  r  r  s         ra   r-  zMiniCPMV4_0.init_resampler  r  r  r#  c                    |d         }|d         }t          |          }|d         j        d         }t          d |D                       }|d         j        }|d         j        }t          j        |d||f||          }	t          |          D ]\  }
}|j        d         }||	|
d	d |f<   |                    d          }|                                	                                }t          |t                    sJ t          j        ||ft
          j        |          }t          |          D ]\  }
}d
||
d |f<   |                     |	|                    d          |          }|                     ||          S )NrK   rT   r   r  c              3   0   K   | ]}|j         d          V  dS r  r  r  s     ra   r  z7MiniCPMV4_0.get_vision_hidden_states.<locals>.<genexpr>  r  r`   r  r   r   .Tr?   r  r  r  s                     ra   rC  z$MiniCPMV4_0.get_vision_hidden_states  r  r`   rV  c                 R    t          | g d          }|                    |          S r  rX  rZ  s      ra   rY  zMiniCPMV4_0.load_weights6  r  r`   ri  rj  r  r   s   @ra   r  r    r  r`   r  c                       e Zd Zg dddgdZdddedef fd	Z	 ddeded
ej        fdZ		 	 dde
dedz  ded
ej        fdZ	 	 ddedededz  ded
ej        f
dZded
ej        fdZdeeeej        f                  d
ee         fdZ xZS )MiniCPMV4_5r  r  r  r  rm   r  r  ru   c                h    t                                          ||           | j        dk    sJ d S )Nr  ra  rq  rr  s      ra   rz   zMiniCPMV4_5.__init__H  rs  r`   rv   c                 $    t          ||          S ru  r    re  s      ra   r(  zMiniCPMV4_5.init_llmL  r  r`   Nr  rt   c                     t          |j        ||| j                  }| j        j        r|j        j        d d         |j        _        |S r  r  r  s        ra   r*  zMiniCPMV4_5.init_vision_moduleS  r  r`   ro   r,  c           	         t          t          j                  5  t          | j        j        ||dz  |||          }d d d            n# 1 swxY w Y   |                    t          j        t          j	                              S r  )
r>   r]   r}  r   r  r  r   r9   r  r  r  s         ra   r-  zMiniCPMV4_5.init_resamplerc  r  r  r#  c                 V   |d         }|d         }|                     dd           }t          |          }|d         j        d         }t          d |D                       }|d         j        }|d         j        }	t          j        |d||f|	|          }
|d nt          |          }t          |          D ]\  }}|j        d	         }||
|d
d |f<   |
                    d	          }|                                                                }t          |t                    sJ t          j        ||ft          j        |          }t          |          D ]\  }}d||d |f<   |                     |
|                    d          |          }|                     |||          S )NrK   rT   r   r   r  c              3   0   K   | ]}|j         d          V  dS r  r  r  s     ra   r  z7MiniCPMV4_5.get_vision_hidden_states.<locals>.<genexpr>  r  r`   r  r   r   .Tr?   r  )r1  r   r   r   r   r   r]   r   r;   r  r  r   r   r   r   r   r  r!  )r|   r#  rK   rT   r   r  r  r  r   r   r  all_temporal_idsr   r  r  r  r  r  r  r  s                       ra   rC  z$MiniCPMV4_5.get_vision_hidden_statesy  s   N+%	xx55O!"%88<88888a'Q% ;1a|5PPP (DD.>|.L.L 	 %.l$;$; 	B 	B A &,R0F0AQWfW_--nnR((!oo'',,..+s++++++q+&6ejQWXXX#,[#9#9 	9 	9A48OA0 0001188!0!:!:1!=!= $ 
 
 ~~.	;KLLLr`   rV  c                 R    t          | g d          }|                    |          S r  rX  rZ  s      ra   rY  zMiniCPMV4_5.load_weights  r  r`   ri  rj  r  r   s   @ra   r  r  ;  s       
 
 
 

 
 BD & & &z &3 & & & & & & H HH H 
	H H H H 37	   )4/ 	
 
   ( 37
 

 
 )4/	

 
 

 
 
 
,!M-E !M%, !M !M !M !MF,HU33D-E$F ,3s8 , , , , , , , ,r`   r  )r  r   r]  r_  ra  )r  dummy_inputsc                   (    e Zd ZdZdddedefdZdS )MiniCPMVz
    Different versions of MiniCPMV use different visual encoders and LLMs,
    which is not conducive to the current integration logic of LoRA and
    bitsandbytes in vLLM. Therefore, it is necessary to separate them.
    rm   r  r  ru   c                p   |j         j        }t          |d          s|j        dk    r|j        dk    rd}nCd}n@t          |j                                      d          }t          d |D                       }t          
                    |          }|Xd                    d	 t          t                                                    D                       }t          d
| d|           | j                            |j                   | j                            |j                    |||          S )Nr   r  r  r  r   r  c                 ,    g | ]}t          |          S r_   r	  r
  s     ra   r  z$MiniCPMV.__new__.<locals>.<listcomp>  s    555SVV555r`   z, c                 4    g | ]}|d           d|d          S )r   r  r?   r_   )r  r   s     ra   r  z$MiniCPMV.__new__.<locals>.<listcomp>  s.    KKKaAaD!!1Q4!!KKKr`   z+Currently, MiniCPMV only supports versions z. Got version: r  )r"  r#  hasattrr  r  r   r   r  r   _SUPPORT_VERSIONr1  joinsortedkeysr  r  updateembedding_modules)r  r  ru   r  r   instance_clssupported_versionss          ra   __new__zMiniCPMV.__new__  sW   )3vy)) 	7!T))f.>".D.D  &.))//44G55W55566G'++G44!%KK62B2G2G2I2I+J+JKKK" " @%@ @6=@ @   	")),*MNNN$$\%CDDD|FCCCCr`   N)rW   rX   rY   rZ   r   r   r  r_   r`   ra   r  r    s[          @B D D DZ D D D D D D Dr`   r  )rZ   r  collectionsr   collections.abcr   r   r   r   	functoolsr   	itertoolsr	   typingr
   r   r   r   numpyr   r]   torch.typesr   torch.nn.initr   transformersr   r   typing_extensionsr   vllm.configr   vllm.config.multimodalr   'vllm.model_executor.layers.quantizationr   $vllm.model_executor.layers.resamplerr   r   r    vllm.model_executor.models.llamar   "vllm.model_executor.models.minicpmr   )vllm.model_executor.models.module_mappingr    vllm.model_executor.models.qwen2r    vllm.model_executor.models.qwen3r!   vllm.multimodalr"   vllm.multimodal.inputsr#   r$   r%   r&   vllm.multimodal.parser'   r(   r)   r*   r+   r,   r-   r.   r/   r0   vllm.multimodal.processingr1   $vllm.multimodal.processing.processorr2   r3   r4   r5   r6   r7   r8   vllm.platformsr9   vllm.sequencer:   vllm.utils.collection_utilsr;   vllm.utils.tensor_schemar<   r=   vllm.utils.torch_utilsr>   idefics2_vision_modelr@   
interfacesrA   rB   rC   rD   r   rE   rF   rG   r  rJ   rc   rg   r[   r   r   rj   r   r   r   r  r   r^   r   r"  r5  r=  rF  r  r  r  rm  r  ro  r  r  r  r  r  register_processorr  r_   r`   ra   <module>r'     s(  2 J I I  # # # # # # A A A A A A A A A A A A             5 5 5 5 5 5 5 5 5 5 5 5                ' ' ' ' ' ' 7 7 7 7 7 7 7 7 % % % % % % " " " " " " 3 3 3 3 3 3 F F F F F F         
 > = = = = = A A A A A A D D D D D D = = = = = = = = = = = = / / / / / /                                   > = = = = =                  , + + + + + - - - - - - 8 8 8 8 8 8 > > > > > > > > : : : : : : < < < < < <            ? > > > > > > > > >      |   8    <    ":<X!X Y X X XWR\t,,,
\ \ \ \ \= \ \ \~    <   D
9"2 
9uS#X 
9 
9 
9 
9
gc5<.?&@ 
 
 
 
D D D D D"4 D D D*9 9 9 9 9"4 9 9 90/ / / / /#7 / / /4T, T, T, T, T,/ T, T, T,n WT/9OPPP,
 ,
 ,
 ,
 ,
!7!; ,
 ,
 ,
^C1 C1 C1 C1 C1"9""= C1 C1 C1LR" R" R" R" R"	#5z R" R" R"jZ! Z! Z! Z! Z!# Z! Z! Z!zZ; Z; Z; Z; Z;#\ Z; Z; Z;z_, _, _, _, _,#\ _, _, _,D_, _, _, _, _,#\ _, _, _,Dc, c, c, c, c,#\ c, c, c,N    ('	+  
 D  D  D  D  D "4l  D  D 
 D  D  Dr`   