
    .`i3i                     (   U d Z ddlmZmZmZmZ ddlmZmZm	Z	m
Z
 ddlZddlmZ ddlmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7 ddl8m9Z9m:Z:m;Z;  ej<        d          Z= G d de.          Z> G d de.          Z?e>e?z  Z@e
eAd<   deeBejC        f         fdZD G d de$          ZE G d de4          ZF G d d e6          ZG G d! d"e3eG                   ZH G d# d$e5eG                   ZI G d% d&ejJ                  ZK G d' d(ejJ                  ZL G d) d*e          ZM ejN        eIeGeH+           G d, d-e2                      ZOdS ).zCInference-only MiniCPM-O model compatible with HuggingFace weights.    )CallableIterableMappingSequence)	AnnotatedAnyLiteral	TypeAliasN)nn)BatchFeature)BaseModelOutputWithPast)ACT2FNWhisperAttentionWhisperConfigWhisperEncoder)
VllmConfig)BaseDummyOptions)MULTIMODAL_REGISTRYMultiModalKwargsItems)MultiModalDataDictMultiModalFieldConfigNestedTensors)	AudioItemAudioProcessorItemsDictEmbeddingItemsModalityDataModalityDataItemsMultiModalDataItemsMultiModalDataParser)PromptReplacementPromptUpdatePromptUpdateDetails)TensorSchemaTensorShape   )_MAX_FRAMES_PER_VIDEOMiniCPMV2_6MiniCPMVDummyInputsBuilderMiniCPMVMultiModalDataParserMiniCPMVMultiModalProcessorMiniCPMVProcessingInfo_minicpmv_field_config)AutoWeightsLoadercast_overflow_tensorsmaybe_prefixcpuc                       e Zd ZU dZdZed         ed<   eej	        e
ej	                 z   eddddh          f         ed<   	 eej	        e
ej	                 z   edd	          f         ed
<   dS )MiniCPMOAudioFeatureInputsz
    Dimensions:
        - bns: Batch size * number of audios * number of slices
        - bn: Batch size * number of audios
        - c: Number of channels
        - l: Length
        - s: Number of slices
    audio_featurestypebnscldynamic_dimsbnsaudio_feature_lensN__name__
__module____qualname____doc__r4   r	   __annotations__r   torchTensorlistr$        w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/minicpmo.pyr2   r2   O   s           '7D'"
#666tEL))E33%888	:    "tEL))D#	     rG   r2   c                       e Zd ZU dZdZed         ed<   eej	        e
ej	                 z   eddddh          f         ed<   dS )	MiniCPMOAudioEmbeddingInputsz
    Dimensions:
        - bn: Batch size * number of audios
        - s: Number of slices
        - h: Hidden size (must match language model backbone)

    Length of each slice may vary, so pass it as a list.
    audio_embedsr4   r:   r;   hr8   Nr=   rF   rG   rH   rJ   rJ   o   s|           %3D'.
!222tEL))D#s#777	9     rG   rJ   MiniCPMOAudioInputs	hf_inputsc           	          t          di t          |           t          j        d          t          j        d          t          j        d          dS )Naudio)r3   r<   rK   rF   )dictr,   r   batched)rN   s    rH   _minicpmo_field_configrS      sb      
 
+
+,4W==08AA*27;;	    rG   c                        e Zd Zdeeej        f         deeeej        f         geeef         f         ddf fdZ	 xZ
S )MiniCPMOAudioEmbeddingItemsdatafields_factoryreturnNc                 T    t                                          |ddh|           d S )NimagerK   )modalityrequired_fieldsrW   )super__init__)selfrV   rW   	__class__s      rH   r^   z$MiniCPMOAudioEmbeddingItems.__init__   s?     	+,)	 	 	
 	
 	
 	
 	
rG   )r>   r?   r@   r   strrC   rD   r   r   r^   __classcell__r`   s   @rH   rU   rU      s        
c5<'(
 !S%,&'(C../1

 

 
 
 
 
 
 
 
 
 
rG   rU   c                   j     e Zd Zdeeej        f         ee         z  de	e
e
f         dz  f fdZ xZS )MiniCPMOMultiModalDataParserrV   rX   Nc                     t          |t                    rt          |t                    S t	                                          |          S )N)rW   )
isinstancerQ   rU   rS   r]   _parse_audio_data)r_   rV   r`   s     rH   rh   z.MiniCPMOMultiModalDataParser._parse_audio_data   sN     dD!! 	.5   
 ww((...rG   )r>   r?   r@   rQ   ra   rC   rD   r   r   r   r   rh   rb   rc   s   @rH   re   re      ss        
/3$%Y(??
/ 
38	$t	+
/ 
/ 
/ 
/ 
/ 
/ 
/ 
/ 
/ 
/rG   re   c            	            e Zd ZdZdeeedz  f         f fdZ	 	 ddeded	edefd
Z	defdZ
defdZdefdZdefdZdefdZdefdZdedefdZdedeeef         defdZ xZS )MiniCPMOProcessingInfo(<audio>./</audio>)rX   Nc                 P    i t                                                      dd iS )NrP   )r]   get_supported_mm_limits)r_   r`   s    rH   rm   z.MiniCPMOProcessingInfo.get_supported_mm_limits   s(    C%''1133CWdCCCrG   Tr%   
audio_lenschunk_inputchunk_lengthc                 Z    |                                  }|                    |||          S N)ro   rp   )get_hf_processorget_audio_placeholder)r_   rn   ro   rp   hf_processors        rH   rt   z,MiniCPMOProcessingInfo.get_audio_placeholder   s;     ,,..11#% 2 
 
 	
rG   c                     dS )N   rF   r_   s    rH   get_default_audio_pool_stepz2MiniCPMOProcessingInfo.get_default_audio_pool_step   s    qrG   c                     dS )Ni>  rF   rx   s    rH   get_default_audio_sampling_ratez6MiniCPMOProcessingInfo.get_default_audio_sampling_rate   s    urG   c                 4    |                                  j        S N)get_hf_configaudio_chunk_lengthrx   s    rH   get_chunk_lengthz'MiniCPMOProcessingInfo.get_chunk_length   s    !!##66rG   c                 Z    |                                  }d}|dz
  dz  dz   }||z
  |z  dz   S )Nd   r%   rw   )ry   )r_   	pool_stepfbank_feat_in_chunkcnn_feat_in_chunks       rH   get_max_audio_tokens_per_chunkz5MiniCPMOProcessingInfo.get_max_audio_tokens_per_chunk   sC    4466	!014:Q>!I-);a??rG   c                     dS )N   rF   rx   s    rH   'get_max_audio_chunks_with_most_featuresz>MiniCPMOProcessingInfo.get_max_audio_chunks_with_most_features   s    rrG   c                 X    |                                  }|                                 |z  S r}   )r   r   )r_   
num_chunkss     rH   get_max_audio_tokensz+MiniCPMOProcessingInfo.get_max_audio_tokens   s*    AACC
2244zAArG   r   c                     |                                  }|                                 }t          ||z  |z            dz   S )Nr%   )r{   r   int)r_   r   sampling_ratenum_tokens_per_chunks       rH   get_audio_len_by_num_chunksz2MiniCPMOProcessingInfo.get_audio_len_by_num_chunks   sC    <<>>#BBDD:-0DDEEIIrG   seq_len	mm_countsc                    |                     dd          }|                     dd          }|                     dd          }|                                 |z  }|                                 |z  }|                     ||z
  |z
            }t	          |t          |d          z  t                    }	t          |	d          S )NrZ   r   videorP   r%   )getget_max_image_tokensr   get_max_video_framesminmaxr&   )
r_   r   r   
max_images
max_videos
max_audiosmax_image_tokensmax_audio_tokensmax_total_framesmax_frames_per_videos
             rH   !get_num_frames_with_most_featuresz8MiniCPMOProcessingInfo.get_num_frames_with_most_features   s    
 ]]7A..
]]7A..
]]7A..
4466C4466C44&&)99
 
  #J 2 224I 
  
 '+++rG   Tr%   )r>   r?   r@   audio_patternr   ra   r   rm   boolrt   ry   r{   r   r   r   r   r   r   rb   rc   s   @rH   rj   rj      s       )MDcDj)A D D D D D D !	
 

 
 	

 

 
 
 
S        7# 7 7 7 7@ @ @ @ @    Bc B B B BJc Jc J J J J
,, 38$, 
	, , , , , , , ,rG   rj   c            	       ~     e Zd Zdeeef         def fdZ	 ddedeeef         deeef         dz  def fdZ	 xZ
S )	MiniCPMODummyInputsBuilderr   rX   c                     |                     dd          }| j        j        |z  }t                                          |          |z   S )NrP   r   )r   infor   r]   get_dummy_text)r_   r   
num_audiosaudio_prompt_textsr`   s       rH   r   z)MiniCPMODummyInputsBuilder.get_dummy_text   sD    ]]7A..
!Y4zAww%%i003EEErG   Nr   
mm_optionsc                 J   |                     dd          }| j                                        | j                                        z  }|r|                     d          nd }d|                     |||          i}i t                                          |||          |S )NrP   r   )lengthr   	overrides)r   r   r   r{   _get_dummy_audiosr]   get_dummy_mm_data)	r_   r   r   r   r   	audio_lenaudio_overridesaudio_mm_datar`   s	           rH   r   z,MiniCPMODummyInputsBuilder.get_dummy_mm_data   s     ]]7A..
I==??i7799: 	
 6@I*..111T T++ Z? ,  

gg''JGG

 	
rG   r}   )r>   r?   r@   r   ra   r   r   r   r   r   rb   rc   s   @rH   r   r      s        FS(9 Fc F F F F F F =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
 
 
 
 
rG   r   c            
       ^    e Zd ZdefdZ	 	 ddedededefdZd	e	ee
f         d
e	ee
f         de	ee
f         de	eef         fdZd	e	ee
f         d
e	ee
f         de	ee
f         de	eef         f fdZdede	ee
f         dedee         f fdZdede	ee
f         de	eef         fdZ xZS )MiniCPMOMultiModalProcessorrX   c                 P    t          | j                                                  S )N)	target_sr)re   r   r{   rx   s    rH   _get_data_parserz,MiniCPMOMultiModalProcessor._get_data_parser  s*    +i??AA
 
 
 	
rG   Tr%   rn   ro   rp   c                 <    | j                             |||          S rr   )r   rt   )r_   rn   ro   rp   s       rH   get_audio_prompt_textsz2MiniCPMOMultiModalProcessor.get_audio_prompt_texts  s,     y..#% / 
 
 	
rG   mm_data	mm_kwargs
tok_kwargsc                    |                     d          x}i S |                                                     d|i                              dt          t
          f          }t          |t                    ri }nt|                     | j        j	        gt          |          z  dd |D             ii |ddi|ddh          }d	 t          |d         |d                   D             }||d<   |S )
NaudiosrP   c                     g | ]}|gS rF   rF   ).0rP   s     rH   
<listcomp>z>MiniCPMOMultiModalProcessor.process_audios.<locals>.<listcomp>:  s    #G#G#GUG#G#G#GrG   ro   Tr3   r<   )promptsr   r   r   out_keysc                 0    g | ]\  }}|d d d |f         S r}   rF   )r   featfeature_lens      rH   r   z>MiniCPMOMultiModalProcessor.process_audios.<locals>.<listcomp>B  s=     ' ' '%D+ QQQ_%' ' 'rG   )r   r   parse_mm_data	get_itemsrU   r   rg   _base_call_hf_processorr   r   lenzip)r_   r   r   r   r   parsed_audiosaudio_inputsunpadded_audio_featuress           rH   process_audiosz*MiniCPMOMultiModalProcessor.process_audios&  s1    kk(+++F4I !!##]GV,--Yw!<>Q RSS 	 m%@AA 	ELL7701C4F4FF!#G#G#G#G#GH<Y<t<<%*,@A 8  L' '), !12 !56* *' ' '# .EL)*rG   c                 |    i t                                          |||          |                     |||          S r}   )r]   process_mm_inputsr   )r_   r   r   r   r`   s       rH   r   z-MiniCPMOMultiModalProcessor.process_mm_inputsM  sE    
gg''JGG
!!'9jAA
 	
rG   mm_itemshf_processor_mm_kwargsout_mm_kwargsc                      t                                          ||          } j        j        }dt          f fd}g |t          d||          S )N)r   r   r   item_idxc                                         dt          t          f          }t          |t                    rV|                    |           d         }j                            t          t          t          |                              }n|
                    |           }t          j                            |          d          S )NrP   rK   z<unk>)r   rU   r   rg   r   r   r   summapr   get_audio_lengthr"   select_textr   )r   r   single_audio_embedsr   r   r_   s       rH   get_audio_replacementzNMiniCPMOMultiModalProcessor._get_prompt_updates.<locals>.get_audio_replacementf  s    ''57JK F &"=>> >&,jj&:&:>&J# IAAC!45566 		 #33H==	&2++I66  rG   rP   )r[   targetreplacement)r]   _get_prompt_updatesr   r   r   r    )r_   r   r   r   base_updatesaudio_placeholderr   r`   s   ``     rH   r   z/MiniCPMOMultiModalProcessor._get_prompt_updatesX  s     ww22#9' 3 
 
 !I3	C 	 	 	 	 	 	 	$

 (1  
 	
rG   rN   c                      t          |          S r}   )rS   )r_   rN   r   s      rH   _get_mm_fields_configz1MiniCPMOMultiModalProcessor._get_mm_fields_config  s    
 &i000rG   r   )r>   r?   r@   r   r   r   r   ra   r   r   objectr   r   r   r   r   r   r!   r   r   r   r   rb   rc   s   @rH   r   r     s       
"6 
 
 
 
 !	

 



 

 	


 


 

 

 

%f%% 3;'% CK(	%
 
m#	$% % % %N	
f%	
 3;'	
 CK(		

 
m#	$	
 	
 	
 	
 	
 	
'
%'
 !(V 4'
 -	'

 
,	'
 '
 '
 '
 '
 '
R11 !(V 41 
++	,	1 1 1 1 1 1 1 1rG   r   c                   L     e Zd Zdedef fdZdej        dej        fdZ xZS )MultiModalProjectorin_dimout_dimc                     t                                                       t          j        ||d          | _        t          j                    | _        t          j        ||d          | _        d S )NT)in_featuresout_featuresbias)r]   r^   r   Linearlinear1ReLUrelulinear2)r_   r   r   r`   s      rH   r^   zMultiModalProjector.__init__  s\    yV'PTUUUGII	yW7QUVVVrG   r3   rX   c                     |                      |                     |                    }|                     |          }|S r}   )r   r   r   )r_   r3   hidden_statess      rH   forwardzMultiModalProjector.forward  s6    		$,,~">">??]33rG   )	r>   r?   r@   r   r^   rC   rD   r   rb   rc   s   @rH   r   r     sy        Ws WS W W W W W Wel u|        rG   r   c                   Z     e Zd Zdedef fdZdej        dej        dej        fdZ xZ	S )MiniCPMWhisperEncoderLayerconfig	layer_idxc                    t                                                       |j        | _        t	          | j        |j        |j        ||          | _        t          j	        | j                  | _
        |j        | _        t          |j                 | _        |j        | _        t          j        | j        |j                  | _        t          j        |j        | j                  | _        t          j	        | j                  | _        d S )N)	embed_dim	num_headsdropoutr   r   )r]   r^   d_modelr   r   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normr   r   activation_functionactivation_fnactivation_dropoutr   encoder_ffn_dimfc1fc2final_layer_norm)r_   r   r   r`   s      rH   r^   z#MiniCPMWhisperEncoderLayer.__init__  s    )n4,
 
 
 %'L$@$@!~#F$>?"(";9T^V-CDD9V3T^DD "T^ < <rG   r   attention_maskrX   c                 z   |}|                      |          }|                     ||          \  }}t          j                            || j        | j                  }||z   }|}|                     |          }|                     |                     |                    }t          j                            || j	        | j                  }| 
                    |          }t          j                            || j        | j                  }||z   }|j        t          j        k    rt          |          }|f}|S )N)r   r  ptraining)r  r  r   
functionalr   r  r  r  r  r	  r  dtyperC   float16r.   )r_   r   r  residual_outputss         rH   r   z"MiniCPMWhisperEncoderLayer.forward  sC   
 !11-@@>>') * 
 
q --T\DM . 
 
 !=0 --m<<**488M+B+BCC--T4t} . 
 
 //--T\DM . 
 
 !=0%-//1-@@M "rG   )
r>   r?   r@   r   r   r^   rC   rD   r   rb   rc   s   @rH   r   r     s        =} = = = = = = =$!|! ! 
	! ! ! ! ! ! ! !rG   r   c                   V     e Zd Zdef fdZ	 ddej        dej        dz  defdZ xZ	S )	MiniCPMWhisperEncoderr   c                     t                                                     t          j        fdt	          j                  D                       | _        d S )Nc                 2    g | ]}t          |           S ))r   )r   )r   ir   s     rH   r   z2MiniCPMWhisperEncoder.__init__.<locals>.<listcomp>  s6        +6Q???  rG   )r]   r^   r   
ModuleListrangeencoder_layerslayers)r_   r   r`   s    `rH   r^   zMiniCPMWhisperEncoder.__init__  sd       m   v455  
 
rG   Ninput_featuresr  rX   c                 0   |                     | j        j        j        | j        j        j                  }t
          j                            |                     |                    }t
          j                            |                     |                    }|	                    ddd          }| j
        j        }|d |j        d         d d f         }||z   }t
          j                            || j        | j                  }d}t          | j                  D ]N\  }}||fz   }d}	| j        r!t!          j        g           }
|
| j        k     rd}	|	rd	}: |||          }|d         }O|                     |          }||fz   }t)          ||
          S )Nr  devicer   rw   r%   r  rF   FT)NN)last_hidden_stater   )toconv1weightr  r%  r   r  geluconv2permuteembed_positionsshaper   r  	enumerater!  rC   rand	layerdrop
layer_normr   )r_   r"  r  inputs_embeds	embed_posr   encoder_statesidxencoder_layerto_dropdropout_probabilitylayer_outputss               rH   r   zMiniCPMWhisperEncoder.forward  s    (***#)$*2C2J + 
 
 **4::n+E+EFF**4::m+D+DEE%--aA66(/	6 3A 669:	%	1--T\DM . 
 
 "+DK"8"8 	1 	1C+}.>>NG} #&+jnn#&77"G  1 , -!"! !
 !.a 066'=*::&+(
 
 
 	
rG   r}   )
r>   r?   r@   r   r^   rC   rD   r   r   rb   rc   s   @rH   r  r    s        
} 
 
 
 
 
 
 /33
 3
3
 t+3
 
!	3
 3
 3
 3
 3
 3
 3
 3
rG   r  )r   dummy_inputsc                       e Zd Zg dddgdZededededz  fd	            Zd
ddedef fdZ	d
ddedefdZ
deeeej        f                  dee         fdZdedfdedededej        dedej        fdZdej        fdZdedeej                 fdZdededz  fdZdedef fd Zd!edej        eej                 z  fd"Zd#ef fd$Z xZ S )%MiniCPMO)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projr[   r  rX   Nc                     |                     d          rdS |                     d          rdS |                     d          rdS t          d          )NrZ   z(<image>./</image>)r   z(<video>./</video>)rP   rk   z0Only image, video or audio modality is supported)
startswith
ValueError)clsr[   r  s      rH   get_placeholder_strzMiniCPMO.get_placeholder_str  se    w'' 	)((w'' 	)((w'' 	)((KLLLrG    )prefixvllm_configrK  c                    t                                          ||           |                     |d          5  |                     |t	          |d                    | _        d d d            d S # 1 swxY w Y   d S )N)rL  rK  rP   apm)r]   r^   _mark_tower_modelinit_audio_moduler/   rN  )r_   rL  rK  r`   s      rH   r^   zMiniCPMO.__init__*  s    [@@@##K99 	 	--'VU0K0K .  DH	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   +A33A7:A7c                   | j         j        }t          |          }t          |j        dz            }t          j        | j         j        | j         j                  | _        t          || j
                  | _        d| _        |S )N   )stride)r   r   )r   audio_configr  r   r
  r   	AvgPool1daudio_pool_stepaudio_avg_poolerr   r   audio_projection_layeraudio_encoder_layer)r_   rL  rK  rU  modelaudio_output_dims         rH   rP  zMiniCPMO.init_audio_module2  s    {/%l33|;q@AA "K'0K!
 !
 !
 ':#T^'
 '
 '
# $& rG   weightsc                 P    t          | dg          }|                    |          S )Ntts)skip_prefixes)r-   load_weights)r_   r]  loaders      rH   ra  zMiniCPMO.load_weights@  s*    "4w???""7+++rG   rT  r   size
chunk_sizenum_left_chunksr%  num_lookheadc                    t          j        |||t           j                  }t          j        ||          }||z  }|dk     rt          j        |          }	nt          j        ||z
  d          }
|
|z  }	|dz   }t          j        ||z  |z   |          }t          j        ||                              d          }|	                    d          }	|                    d          }||	k    ||k     z  }|S )N)r%  r  )r%  r   )r   r%   )r   )rC   zerosr   arange
zeros_likeclamp	unsqueeze)r_   rc  rd  re  r%  rf  retrow_indiceschunk_indicesstart_indicesstart_chunk_indicesend_chunk_indicesend_indicescol_indicess                 rH   subsequent_chunk_maskzMiniCPMO.subsequent_chunk_maskD  s    k$V5:FFFl4777#z1Q!,[99MM #(+mo.MST"U"U"U/*<M)A-k
*\9t
 
 
 l4777AA!DD%//22!++A..m+k0IJ
rG   input_lengthsc                     |dz
  dz  dz   }|| j         j        z
  | j         j        z  dz   }|                    t          j                  }||fS )Nr%   rw   )r  )r   rW  r'  rC   int32)r_   rv  input_lengths_after_cnninput_lengths_after_poolings       rH    _get_feat_extract_output_lengthsz)MiniCPMO._get_feat_extract_output_lengthsd  sd    #01#4":Q">#dk&AA[(')+,'-# 'B&D&D5;&D&W&W#&(CCCrG   rV   c           	         | j         j        }|d         }t          |t                    rt	          |          }|d         j        d         }t          d |D                       }|d         j        }|d         j        }t          j
        |||f||          }	t          |          D ]\  }
}|j        d         }||	|
dd |f<   n|}	|d         }t          |t          j                  r|                    d          }t          j        |          }|	j        \  }}}|d	z
  d
z  d	z   }t          j        d||j        |j                                      d                              ||          }|                    d	                              ||          }||k    }|                    |d	d	|                              |d	||          }|                    | j        j        j        j        | j        j        j        j                  }|dk    rWt/          |dz            }|                     ||d|j                  }t          j        |t          j        |                    }t7          d          ||<   |                     |	|          j        | j                 }|                     |          }|                    d	d
          }|                      |          }|                    d	d
          }| !                    |          \  }}|}t          t          j                             }d}tE          t	          |                    D ]}
t          t          j                             } tE          t	          ||
                             D ]/}| #                    ||d ||         d d f                    |d	z  }0|#                    t          j$        |                      |S )Nr3   r   c              3   0   K   | ]}|j         d          V  dS )rT  N)r.  )r   items     rH   	<genexpr>z3MiniCPMO.get_audio_hidden_states.<locals>.<genexpr>w  s(      <<tDJrN<<<<<<rG   r$  rT  .r<   r%   rw   2   )rc  rd  re  r%  z-inf)r  )%r   r   rg   rE   r   r.  r   r%  r  rC   rh  r/  rD   unbindhstackri  rl  expandviewr'  rN  r(  r)  r   ru  
logical_orlogical_notfloatr   rZ  rY  	transposerX  r{  r  appendcat)!r_   rV   rp   wavforms_rawBCLr%  r  wavformsr  wavforms_itemL_itemaudio_feature_lens_rawr<   
batch_sizer  max_mel_seq_lenmax_seq_len	seq_rangelengths_expandpadding_maskaudio_attention_mask_audio_attention_maskchunk_num_frame
chunk_maskaudio_statesrK   feature_lens_after_poolingnum_audio_tokensfinal_audio_embedsr6  target_audio_embeds_lsts!                                    rH   get_audio_hidden_statesz MiniCPMO.get_audio_hidden_statesm  s5    {5 ,-lD)) 	$L!!AQ%b)A<<|<<<<<A!!_+F O)E{Aq!9E&IIIH$-l$;$; : : =&,R0,9C&)): $H "&&:!;,el;; 	F%;%B%B1%E%E""\*@AA)1&
A&*q014 L(.)0	   Yq\\VJ,, 	 ,55a88??
KXX N2 , 1 1*aK P P W W;!
 !
  577(.'-dhn6K6R  8  
  
 !!,"344O33 * ",3	 4  J %*$4%u'8'D'D% %! 7<Fmm23xx%9   
 

02 22<@@#--a33,,\::#--a33(,(M(M)
 )
%% 6!%,/11s12233 	J 	JA&*5<&8&:&:#35a899::  '.. &=(8(=&=qqq!@A   q%%ei0G&H&HIIII!!rG   kwargsc                     |                     dd           }|                     dd           }||d S |t          d|          S |                     d          }t          d||          S )Nr3   rK   )r4   rK   r<   )r4   r3   r<   )poprJ   r2   )r_   r  r3   rK   r<   s        rH   _parse_and_validate_audio_inputz(MiniCPMO._parse_and_validate_audio_input  s      $4d;;zz.$77!l&:4#/#)   
 $ZZ(<==)!)1
 
 
 	
rG   c                 t     t                      j        di |}|D ]}|dv rd|vr | j        di ||d<   |S )N)r3   rK   r   rF   )r]   %_parse_and_validate_multimodal_inputsr  )r_   r  
modalities	input_keyr`   s       rH   r  z.MiniCPMO._parse_and_validate_multimodal_inputs  sm    BUWWBLLVLL
   	V 	VI???J..'Kt'K'U'Uf'U'U
8$rG   audio_inputc                 T    |d         dk    r|d         S |                      |          S )Nr4   rK   )r  )r_   r  s     rH   _process_audio_inputzMiniCPMO._process_audio_input  s3     v.00~..++K888rG   r  c                     t                                          |          }|D ]7}|dk    r/|d         }|                     |          }|t          |          z  }8|S )Nr   )r]   _process_multimodal_inputsr  tuple)r_   r  multimodal_embeddingsr[   r  audio_embeddingsr`   s         rH   r  z#MiniCPMO._process_multimodal_inputs  sp     % B B: N N" 	A 	AH8##(2#'#<#<[#I#I %/?)@)@@%$$rG   )!r>   r?   r@   packed_modules_mappingclassmethodra   r   rI  r   r^   rP  r   r  rC   rD   setra  
CPU_DEVICEr%  ru  
LongTensorr{  r2   rE   r  r   rM   r  rQ   r  r  r  rb   rc   s   @rH   r=  r=    s       
 
 
 

 
 M3 M3 M3: M M M [M BD   z 3       KM   
 C    ,HU33D-E$F ,3s8 , , , ,  ")   	
   
   @De>N D D D D\".\"	el	\" \" \" \"|

	t	#
 
 
 
.f       9(9 
U\*	*9 9 9 9	%T 	% 	% 	% 	% 	% 	% 	% 	% 	% 	%rG   r=  )PrA   collections.abcr   r   r   r   typingr   r   r	   r
   rC   r   transformersr   transformers.modeling_outputsr   ,transformers.models.whisper.modeling_whisperr   r   r   r   vllm.configr   vllm.config.multimodalr   vllm.multimodalr   r   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   r   r   r   r   vllm.multimodal.processingr    r!   r"   vllm.utils.tensor_schemar#   r$   minicpmvr&   r'   r(   r)   r*   r+   r,   utilsr-   r.   r/   r%  r  r2   rJ   rM   rB   ra   rD   rS   rU   re   rj   r   r   Moduler   r   r  register_processorr=  rF   rG   rH   <module>r     s  2 J I I A A A A A A A A A A A A 5 5 5 5 5 5 5 5 5 5 5 5        % % % % % % A A A A A A            # " " " " " 3 3 3 3 3 3 F F F F F F F F         
                          
 ? > > > > > > >                  J I I I I I I I I IU\%  
       @    <   & !== Y   
gc5<.?&@    
 
 
 
 
"4 
 
 
"/ / / / /#? / / /A, A, A, A, A,3 A, A, A,H
 
 
 
 
!;<R!S 
 
 
Dr1 r1 r1 r1 r1"=>T"U r1 r1 r1j
 
 
 
 
") 
 
 
4 4 4 4 4 4 4 4n=
 =
 =
 =
 =
N =
 =
 =
@ ('	+  
p% p% p% p% p%{ p% p% 
p% p% p%rG   