
    .`iB                        U d Z ddlmZmZmZ ddlmZmZmZm	Z	 ddl
Z
ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2 ddl3m4Z4m5Z5m6Z6 ddl7m8Z8m9Z9m:Z:  G d de1          Z; G d de1          Z<e;e<z  Z=e	e>d<    G d dej?                  Z@de
jA        fdZB G d de*          ZC G d de(eC                   ZDd eeEe
jA        f         fd!ZF G d" d#e&          ZG G d$ d%e)eC                   ZH ejI        eHeCeD&           G d' d(ej?        e5e6                      ZJdS ))zEInference-only Qwen2-Audio model compatible with HuggingFace weights.    )IterableMappingSequence)	AnnotatedAnyLiteral	TypeAliasN)BatchFeature)Qwen2AudioConfigQwen2AudioEncoderQwen2AudioProcessor)WhisperFeatureExtractor)
VllmConfig)BaseDummyOptions)MULTIMODAL_REGISTRY)	AudioItemModalityDataMultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)AudioProcessorItemsDictEmbeddingItemsModalityDataItemsMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModal
SupportsPP)AutoWeightsLoaderinit_vllm_registered_modelmaybe_prefixc                       e Zd ZU dZed         ed<   eej        e	ej                 z   e
ddd          f         ed<   eej         e
dd          f         ed<   d	S )
Qwen2AudioFeatureInputszV
    Dimensions:
        - na: Number of audios
        - nmb: Number of mel bins
    audio_featurestypenanmbi  input_featuresfeature_attention_maskN)__name__
__module____qualname____doc__r   __annotations__r   torchTensorlistr$        z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/qwen2_audio.pyr-   r-   H   s           "
####tEL))D%&&	(   
 &D$	!     r=   r-   c                   z    e Zd ZU dZdZed         ed<   eee	j
                  eddddh          f         ed<   dS )	Qwen2AudioEmbeddingInputsz
    Dimensions:
        - bn: Batch size
        - naf: Number of audio features
        - hs: Hidden size (must match the hidden size of language model
          backbone)
    audio_embedsr/   bnnafhs)dynamic_dimsN)r4   r5   r6   r7   r/   r   r8   r   r;   r9   r:   r$   r<   r=   r>   r@   r@   [   su           %3D'.
!222U\D%UG<<<	>     r=   r@   Qwen2AudioInputsc                   .     e Zd Zdedef fdZd Z xZS )Qwen2AudioMultiModalProjectoraudio_hidden_sizetext_hidden_sizec                     t                                                       t          j        ||d          | _        d S )NT)bias)super__init__nnLinearlinear)selfrI   rJ   	__class__s      r>   rN   z&Qwen2AudioMultiModalProjector.__init__r   s7    i 13C$OOOr=   c                 0    |                      |          }|S N)rQ   )rR   r.   hidden_statess      r>   forwardz%Qwen2AudioMultiModalProjector.forwardv   s    N33r=   )r4   r5   r6   intrN   rW   __classcell__rS   s   @r>   rH   rH   q   sb        P# P P P P P P P      r=   rH   input_lengthsc                 6    | dz
  dz  dz   }|dz
  dz  dz   }||fS )Nr%      r<   )r[   feat_lengthsoutput_lengthss      r>    _get_feat_extract_output_lengthsr`   |   s5    !A%!+a/L"Q&1,q0N''r=   c                   b    e Zd Zd ZdedefdZdedefdZde	fdZ
deee	dz  f         fdZdS )	Qwen2AudioProcessingInfoc                 @    | j                             t                    S rU   )ctxget_hf_configr   rR   s    r>   re   z&Qwen2AudioProcessingInfo.get_hf_config   s    x%%&6777r=   kwargsreturnc                 2     | j         j        t          fi |S rU   )rd   get_hf_processorr   )rR   rg   s     r>   rj   z)Qwen2AudioProcessingInfo.get_hf_processor   s     (tx()<GGGGGr=   c                 \     | j         di |}|j        }t          |t                    sJ |S Nr<   )rj   feature_extractor
isinstancer   )rR   rg   hf_processorrm   s       r>   get_feature_extractorz.Qwen2AudioProcessingInfo.get_feature_extractor   sA    ,t,66v66(:+-DEEEEE  r=   c                     dS )z;Return target audio channels for Qwen2 Audio models (mono).r%   r<   rf   s    r>   get_target_channelsz,Qwen2AudioProcessingInfo.get_target_channels   s    qr=   Nc                 
    dd iS )Naudior<   rf   s    r>   get_supported_mm_limitsz0Qwen2AudioProcessingInfo.get_supported_mm_limits   s    r=   )r4   r5   r6   re   objectr   rj   r   rp   rX   rr   r   strru   r<   r=   r>   rb   rb      s        8 8 8H H4G H H H H!f !9P ! ! ! !S    cDj)A      r=   rb   c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	Qwen2AudioDummyInputsBuilder	mm_countsrh   c                 x    |                     dd          }| j                                        }|j        }||z  S )Nrt   r   )getinforj   audio_token)rR   rz   
num_audiosro   r~   s        r>   get_dummy_textz+Qwen2AudioDummyInputsBuilder.get_dummy_text   s;    ]]7A..
y1133".Z''r=   Nseq_len
mm_optionsc                     | j                                         }|j        }|j        |z  }|                    dd          }|r|                    d          nd }d|                     |||          iS )Nrt   r   )lengthr   	overrides)r}   rp   sampling_ratechunk_lengthr|   _get_dummy_audios)	rR   r   rz   r   rm   r   	audio_lenr   audio_overridess	            r>   get_dummy_mm_dataz.Qwen2AudioDummyInputsBuilder.get_dummy_mm_data   s     !I;;==)7%2]B	]]7A..
5?I*..111T T++ Z? ,  
 	
r=   rU   )
r4   r5   r6   r   rw   rX   r   r   r   r   r<   r=   r>   ry   ry      s        (S(9 (c ( ( ( ( =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
r=   ry   	hf_inputsc                     t          t          j        d          t          j        d          t          j        d                    S )Nrt   )rA   r2   r3   )dictr   batched)r   s    r>   _qwen2audio_field_configr      sC    *27;;,4W==4<WEE   r=   c                   j     e Zd Zdeeej        f         ee         z  de	e
e
f         dz  f fdZ xZS )Qwen2AudioMultiModalDataParserdatarh   Nc                     t          |t                    rt          |ddht                    S t	                                          |          S )Nrt   rA   )modalityrequired_fieldsfields_factory)rn   r   r   r   rM   _parse_audio_data)rR   r   rS   s     r>   r   z0Qwen2AudioMultiModalDataParser._parse_audio_data   sW     dD!! 	% !/ 07	    ww((...r=   )r4   r5   r6   r   rw   r9   r:   r   r   r   r   r   rY   rZ   s   @r>   r   r      ss        /3$%Y(??/ 
38	$t	+/ / / / / / / / / /r=   r   c            
            e Zd ZdefdZdedeeef         deeef         deeef         de	f
 fdZ
de	d	eeef         deeef         fd
Zded	eeef         dedee         fdZ xZS )Qwen2AudioMultiModalProcessorrh   c                     | j                                         }t          |j        | j                                                   S )N)	target_srtarget_channels)r}   rp   r   r   rr   )rR   rm   s     r>   _get_data_parserz.Qwen2AudioMultiModalProcessor._get_data_parser   sD     I;;==-'5 I99;;
 
 
 	
r=   promptmm_data	mm_kwargs
tok_kwargsc                    |                     dg           }|r||d<   |                    dg           sa| j                                                            |          }|                     |          }t          t          |g          d          S  | j        j        di |}t          di |d|j	        i}t                                          ||||          S )	Naudiosrt   )	input_idspt)tensor_typer   )r   r   r   r   r<   )popr|   r}   get_tokenizerencode_apply_hf_processor_tokens_onlyr
   r   rp   r   rM   _call_hf_processor)	rR   r   r   r   r   r   
prompt_idsrm   rS   s	           r>   r   z0Qwen2AudioMultiModalProcessor._call_hf_processor   s    Xr** 	&%GG {{7B'' 	P002299&AAJ==jIIJ
| < < <$OOOO;DI;HHiHH 
 

 
+9
 
 
	
 ww))!	 * 
 
 	
r=   r   hf_processor_mm_kwargsc                      t          |          S rU   )r   )rR   r   r   s      r>   _get_mm_fields_configz3Qwen2AudioMultiModalProcessor._get_mm_fields_config   s    
 (	222r=   mm_itemsout_mm_kwargsc                 r    | j         j        di |}| j                                         }|                                }t	          |dd          }t	          |dd          }t	          |dd          }	||         ||         ||	         |                                                    d          }
|
g nUt          |
t          j	                  sJ t          |
                    d                    \  }}|                                d	t          ffd
}t          d||          gS )Nr~   z	<|AUDIO|>audio_bos_tokenz<|audio_bos|>audio_eos_tokenz<|audio_eos|>r3   item_idxc                 n   r	|          }n=d         |          }t          |j                  dk    s
J d            |j        d         }|dk    rC
                    dt                    }|                    |           }t          d| d          	g|z  }t          j        g|z   gz   	          S )	NrA   r]   z audio_embeds must be a 2D tensorr   rt   zThe audio (len=z1) is too short to be represented inside the model)embed_token_id)lenshape	get_itemsr   get_audio_length
ValueErrorr!   select_token_id)r   num_featuresrA   r   r   audio_tokensaudio_bos_idaudio_eos_idaudio_output_lengthsaudio_token_idr   out_mm_datas         r>   get_replacement_qwen2_audiozVQwen2AudioMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_qwen2_audio  s    # 53H=*>:8D<-..!3335W333+1!4q  !++G5HII"33H==	 9i 9 9 9  
 ++l:L&6->-   r=   rt   )r   targetreplacementr<   )r}   rj   r   	get_vocabgetattrget_datar|   rn   r9   r:   r`   sumtolistrX   r   )rR   r   r   r   	processor	tokenizervocabr~   r   r   r3   _audio_output_lensr   r   r   r   r   r   s    `            @@@@@r>   _get_prompt_updatesz1Qwen2AudioMultiModalProcessor._get_prompt_updates   s    /DI.HH1GHH	I++--	##%% iDD!)->PP!)->PP{+_-_-#,,..!,1I!J!J!)#%  4elCCCCC#C&**2..$ $ A  $5#;#;#=#= 	# 	 	 	 	 	 	 	 	 	 	 	2  "7  
 	
r=   )r4   r5   r6   r   r   rw   r   rv   r   r
   r   r   r   r   r   r   r    r   rY   rZ   s   @r>   r   r      s1       
"6 
 
 
 


 f%
 38$	

 CK(
 

 
 
 
 
 
B33 !(V 43 
++	,	3 3 3 3=
%=
 !(V 4=
 -	=

 
,	=
 =
 =
 =
 =
 =
 =
 =
r=   r   )r}   dummy_inputsc                       e Zd Zededededz  fd            Zddded	ef fd
Zde	de
dz  fdZde
dej        eej        df         z  fdZde	defdZ	 	 ddej        dej        dedz  dej        dz  de	dej        ez  fdZdej        dej        dz  fdZdeeeej        f                  dee         fdZ xZS )"Qwen2AudioForConditionalGenerationr   irh   Nc                 V    |                     d          rd| dS t          d          )Nrt   zAudio z%: <|audio_bos|><|AUDIO|><|audio_eos|>z Only audio modality is supported)
startswithr   )clsr   r   s      r>   get_placeholder_strz6Qwen2AudioForConditionalGeneration.get_placeholder_strC  s9    w'' 	EDADDDD;<<<r=    )prefixvllm_configr   c          	      d   t                                                       |j        j        }|j        }|j        j        }|| _        || _        || _        |                     |d          5  t          |j	                  | _
        t          |j	        j        |j        j                  | _        d d d            n# 1 swxY w Y   |                     |          5  t#          ||j        t%          |d          dg          | _        d d d            n# 1 swxY w Y   | j        j        | _        d S )Nrt   language_modelQwen2ForCausalLM)r   	hf_configr   architectures)rM   rN   model_configr   quant_configmultimodal_configconfig_mark_tower_modelr   audio_configaudio_towerrH   d_modeltext_confighidden_sizemulti_modal_projector_mark_language_modelr*   r+   r   make_empty_intermediate_tensors)rR   r   r   r   r   r   rS   s         r>   rN   z+Qwen2AudioForConditionalGeneration.__init__J  s   )3"/'4F!2(##K99 	 	01DEED)F#+V-?-K* *D&	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 &&{33 	 	"<' ,#F,<==12	# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ? 	,,,s%   ,AB;;B?B?-DDDrg   c                     |                     dd           }|                     dd           }|                     dd           }||d S |t          d|          S |t          d||          S t          d          )Nr2   rA   r3   )r/   rA   r.   )r/   r2   r3   z This line should be unreachable.)r   r@   r-   AssertionError)rR   rg   r2   rA   r3   s        r>   _parse_and_validate_audio_inputzBQwen2AudioForConditionalGeneration._parse_and_validate_audio_inpute  s      $4d;;zz.$77!',Dd!K!K!l&:4#,#,    %*%-'=    ?@@@r=   audio_input.c                    |d         dk    r|d         }t          |          S |d         }|d         }| j                            |                    d                    \  }}|j        \  }}}	|	dz
  dz  dz   }
t          j        d|
|j        |j        	          	                    d          
                    ||
          }|	                    d          
                    ||
          }||k    }|                    |dd|
          
                    |d|
|
          }|                    | j        j        j        j        | j        j        j        j        	          }t          d
          ||<   |                     ||          }|j        }|                     |          }|j        \  }}}|	                    d          }t          j        |          
                    ||                              |j                  |k     }||                             d|          }t          j        ||                                                                          S )Nr/   rA   r2   r3   r   r]   r%   r   )dtypedevicez-inf)attention_mask)tupler   r`   r   r   r9   aranger   r   	unsqueezeexpandviewtoconv1weightfloatlast_hidden_stater   splitflattenr   )rR   r   rA   r2   r3   audio_feat_lengthsr   
batch_sizer   max_mel_seq_lenmax_seq_len	seq_rangelengths_expandpadding_maskaudio_attention_mask_audio_attention_maskaudio_outputsselected_audio_featurer.   r   max_audio_tokens	embed_dimaudio_features_maskmasked_audio_featuress                           r>   _process_audio_inputz7Qwen2AudioForConditionalGeneration._process_audio_input}  s    v.00&~6L&&&$%56!,-E!F ==&**2..  	10 *8)=&
A&*q014 L(.)0	   Yq\\VJ,, 	 ,55b99@@
 
 !N2 , 1 1*aK P P W W;!
 !
  577"(/5#)07  8  
  
 7<Fmm23((+? ) 
 
 "/!@334JKK2@2F/
$i3==a@@L)**VJ 011R$+,,"# 	 !//B C H HY W W {!#7#?#?#A#A#H#H#J#J
 
 	
r=   c                 R     | j         di |}|g S |                     |          }|S rl   )r   r  )rR   rg   r   r  s       r>   embed_multimodalz3Qwen2AudioForConditionalGeneration.embed_multimodal  s?    :d:DDVDDI $ 9 9+ F F$$r=   r   	positionsintermediate_tensorsinputs_embedsc                 J    |d }| j                             ||||          }|S )N)r  )r   model)rR   r   r  r  r  rg   rV   s          r>   rW   z*Qwen2AudioForConditionalGeneration.forward  s>      + M+11y"6m 2 
 
 r=   rV   c                 6    | j                             |          S rU   )r   compute_logits)rR   rV   s     r>   r  z1Qwen2AudioForConditionalGeneration.compute_logits  s     "11-@@@r=   weightsc                 J    t          |           }|                    |          S rU   )r)   load_weights)rR   r  loaders      r>   r!  z/Qwen2AudioForConditionalGeneration.load_weights  s#    "4((""7+++r=   )NN)r4   r5   r6   classmethodrw   rX   r   r   rN   rv   rF   r   r9   r:   r   r  r&   r  r"   rW   r  r   setr!  rY   rZ   s   @r>   r   r   =  s        =3 =3 =3: = = = [= BD 
 
 
z 
3 
 
 
 
 
 
6AA	D	 A A A A0>
+>
	elC/0	0>
 >
 >
 >
@% %4H % % % % <@-1 < < 2D8	
 |d*  
+	+    A|A 
	A A A A,HU33D-E$F ,3s8 , , , , , , , ,r=   r   )Kr7   collections.abcr   r   r   typingr   r   r   r	   r9   torch.nnrO   transformersr
   transformers.models.qwen2_audior   r   r   transformers.models.whisperr   vllm.configr   vllm.config.multimodalr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   r   r   vllm.multimodal.parser   r   r   r   r   vllm.multimodal.processingr   r   r   r   r    r!   vllm.sequencer"   vllm.utils.tensor_schemar#   r$   
interfacesr&   r'   r(   utilsr)   r*   r+   r-   r@   rF   r8   ModulerH   r:   r`   rb   ry   rw   r   r   r   register_processorr   r<   r=   r>   <module>r7     sQ  0 L K K 7 7 7 7 7 7 7 7 7 7 5 5 5 5 5 5 5 5 5 5 5 5        % % % % % %         
 @ ? ? ? ? ? " " " " " " 3 3 3 3 3 3 / / / / / /                                          . - - - - - > > > > > > > > L L L L L L L L L L N N N N N N N N N N    l   &       " 68QQ ) Q Q Q
    BI   (EL ( ( ( (    1   *
 
 
 
 
#9:R#S 
 
 
<U\0A(B    / / / / /%9 / / / m
 m
 m
 m
 m
$;<T$U m
 m
 m
` ('!	!-  
Z, Z, Z, Z, Z,4F
 Z, Z, 
Z, Z, Z,r=   