
    .`iW              	          U d dl mZmZmZ d dlmZmZmZmZ d dl	Z	d dl
mZ d dlmZmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZm Z m!Z! d dl"m#Z#m$Z$m%Z%m&Z&m'Z' d dl(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z. d dl/m0Z0 d dl1m2Z2m3Z3 ddl4m5Z5m6Z6m7Z7m8Z8 ddl9m:Z:m;Z;m<Z< dZ= G d de2          Z> G d de2          Z?e>e?z  Z@eeAd<    G d de          ZB G d dejC                  ZD G d de+          ZE G d  d!e)eE                   ZFd"eeGe	jH        f         fd#ZI G d$ d%e'          ZJ G d& d'e*eE                   ZK ejL        eKeEeF(           G d) d*ejC        e7e8e6                      ZMdS )+    )IterableMappingSequence)	AnnotatedAnyLiteral	TypeAliasN)BatchFeaturePretrainedConfig)AudioFlamingo3ConfigAudioFlamingo3Processor)Qwen2AudioEncoder)
VllmConfig)BaseDummyOptions)
get_act_fn)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)DictEmbeddingItemsModalityDataModalityDataItemsMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)AutoWeightsLoaderinit_vllm_registered_modelmaybe_prefixiX  c                       e Zd ZU dZed         ed<   eej        e	ej                 z   e
ddd          f         ed<   eej         e
dd          f         ed<   eej         e
d	          f         ed
<   dS )AudioFlamingo3FeatureInputsz
    Dimensions:
        - num_chunks: Number of audio chunks (flattened)
        - nmb: Number of mel bins
        - num_audios: Number of original audio files
    audio_featurestype
num_chunksnmbi  input_featuresfeature_attention_mask
num_audioschunk_countsN)__name__
__module____qualname____doc__r   __annotations__r   torchTensorlistr$        }/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/audioflamingo3.pyr.   r.   L   s           "
####tEL))L%..	0   
 &L$''	)   
 L!!	#     r@   r.   c                   z    e Zd ZU dZdZed         ed<   eee	j
                  eddddh          f         ed<   dS )	AudioFlamingo3EmbeddingInputsz
    Dimensions:
        - bn: Batch size
        - naf: Number of audio features
        - hs: Hidden size (must match the hidden size of language model
          backbone)
    audio_embedsr0   bnnafhs)dynamic_dimsN)r7   r8   r9   r:   r0   r   r;   r   r>   r<   r=   r$   r?   r@   rA   rC   rC   e   su           %3D'.
!222U\D%UG<<<	>     r@   rC   AudioFlamingo3Inputsc                   ~     e Zd Zdef fdZ	 d	dej        eej                 z  dej        fdZdej        fdZ	 xZ
S )
AudioFlamingo3Encoderconfigc                     t                                          |           t          j        dd          | _        d S )N   )kernel_sizestride)super__init__nn	AvgPool1d
avg_poolerselfrL   	__class__s     rA   rR   zAudioFlamingo3Encoder.__init__|   s7     	   ,1Q???r@   Nr3   attention_maskc                    t          |t                    rt          j        |          }t          j                            |                     |                    }t          j                            |                     |                    }|	                    dd          }|| j
        j        d |                    d          d d f         z                       |j                  }| j        D ]} |||          }|d         }|                    ddd          }|                     |          }|                    ddd          }|                     |          }|S )Nr   rN   r%   )
isinstancer>   r<   stackrS   
functionalgeluconv1conv2	transposeembed_positionsweightsizetodtypelayerspermuterU   
layer_norm)rW   r3   rY   hidden_stateslayerlayer_outputss         rA   forwardzAudioFlamingo3Encoder.forward   sS    nd++ 	9"[88N**4::n+E+EFF**4::m+D+DEE%//B77D078P-:L:LR:P:P8PRSRSRS8STT
"] 
!
! 	 [ 	- 	-E!E-@@M)!,MM &--aA6666%--q!
 
 66r@   input_lengthsc                 6    |dz
  dz  dz   }|dz
  dz  dz   }||fS )z{
        Computes the output length of the convolutional layers and the output length
        of the audio encoder
        r%   rN   r?   )rW   rp   output_lengthss      rA    _get_feat_extract_output_lengthsz6AudioFlamingo3Encoder._get_feat_extract_output_lengths   s7    
 '*q014'!+1A5n,,r@   N)r7   r8   r9   r   rR   r<   r=   r>   ro   rs   __classcell__rX   s   @rA   rK   rK   {   s        @ @ @ @ @ @ @ (, tEL'99    >-el - - - - - - - -r@   rK   c                   *     e Zd Zdef fdZd Z xZS )!AudioFlamingo3MultiModalProjectorrL   c                 N   t                                                       t          j        |j        j        |j        j        |j                  | _        t          |j
                  | _        t          j        |j        j        |j        j        |j                  | _        d S )N)bias)rQ   rR   rS   Linearaudio_confighidden_sizetext_configprojector_biaslinear_1r   projector_hidden_actactlinear_2rV   s     rA   rR   z*AudioFlamingo3MultiModalProjector.__init__   s    	+*&
 
 

 f9::	**&
 
 
r@   c                     |                      |          }|                     |          }|                     |          }|S rt   )r   r   r   )rW   r/   rl   s      rA   ro   z)AudioFlamingo3MultiModalProjector.forward   s;    n55//m44r@   )r7   r8   r9   r   rR   ro   ru   rv   s   @rA   rx   rx      sT        
/ 
 
 
 
 
 
      r@   rx   c                   N    e Zd Zd ZdefdZdefdZdeee	dz  f         fdZ
dS )AudioFlamingo3ProcessingInfoc                 @    | j                             t                    S rt   )ctxget_hf_configr   rW   s    rA   r   z*AudioFlamingo3ProcessingInfo.get_hf_config   s    x%%&:;;;r@   kwargsc                 2     | j         j        t          fi |S rt   )r   get_hf_processorr   )rW   r   s     rA   r   z-AudioFlamingo3ProcessingInfo.get_hf_processor   s     (tx()@KKFKKKr@   c                 .     | j         di |}|j        }|S Nr?   )r   feature_extractor)rW   r   hf_processorr   s       rA   get_feature_extractorz2AudioFlamingo3ProcessingInfo.get_feature_extractor   s)    ,t,66v66(:  r@   returnNc                 
    dd iS )Naudior?   r   s    rA   get_supported_mm_limitsz4AudioFlamingo3ProcessingInfo.get_supported_mm_limits   s    r@   )r7   r8   r9   r   objectr   r   r   strintr   r?   r@   rA   r   r      s        < < <L L L L L!f ! ! ! !
cDj)A      r@   r   c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	 AudioFlamingo3DummyInputsBuilder	mm_countsr   c                 x    |                     dd          }| j                                        }|j        }||z  S )Nr   r   )getinfor   audio_token)rW   r   r5   r   r   s        rA   get_dummy_textz/AudioFlamingo3DummyInputsBuilder.get_dummy_text   s;    ]]7A..
y1133".Z''r@   Nseq_len
mm_optionsc                     | j                                         }|j        }t          |z  }|                    dd          }|r|                    d          nd }d|                     |||          iS )Nr   r   )lengthr5   	overrides)r   r   sampling_rateMAX_AUDIO_LENr   _get_dummy_audios)	rW   r   r   r   r   r   	audio_lenr5   audio_overridess	            rA   get_dummy_mm_dataz2AudioFlamingo3DummyInputsBuilder.get_dummy_mm_data   s     !I;;==)7!M1	]]7A..
5?I*..111T T++ %) ,  
 	
r@   rt   )
r7   r8   r9   r   r   r   r   r   r   r   r?   r@   rA   r   r      s        (S(9 (c ( ( ( ( =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
r@   r   	hf_inputsc           	         |                      d          }|at          t          j        d          t          j        d|d          t          j        d|d          t          j        d                    S t          t          j        d          t          j        d          t          j        d          t          j        d                    S )Nr6   r   r   dim)rD   r3   r4   r6   )r   dictr   batchedflat_from_sizes)r   r6   s     rA   _audioflamingo3_field_configr      s    ==00L.6w??0@1   $9#H1$ $ $ /6w??	
 	
 	
 		
 *27;;,4W==4<WEE*27;;	   r@   c                   j     e Zd Zdeeej        f         ee         z  de	eef         dz  f fdZ
 xZS )"AudioFlamingo3MultiModalDataParserdatar   Nc                     t          |t                    rt          |ddht                    S t	                                          |          S )Nr   rD   )modalityrequired_fieldsfields_factory)r]   r   r   r   rQ   _parse_audio_data)rW   r   rX   s     rA   r   z4AudioFlamingo3MultiModalDataParser._parse_audio_data  sW     dD!! 	% !/ 0;	    ww((...r@   )r7   r8   r9   r   r   r<   r=   r   r   r   r   ru   rv   s   @rA   r   r     ss        /3$%S(99/ 
38	$t	+/ / / / / / / / / /r@   r   c            
            e Zd ZdefdZdedeeef         deee	f         deeef         de
f
 fdZde
d	eeef         deeef         fd
Zded	eeef         dedee         fdZ xZS )!AudioFlamingo3MultiModalProcessorr   c                 ^    | j                                         }t          |j                  S )N)	target_sr)r   r   r   r   )rW   r   s     rA   _get_data_parserz2AudioFlamingo3MultiModalProcessor._get_data_parser  s3     I;;==1'5
 
 
 	
r@   promptmm_data	mm_kwargs
tok_kwargsc                    |                     dg           }|r||d<   |                    dg           sa| j                                                            |          }|                     |          }t          t          |g          d          S  | j        j        di |}t          di |d|j	        i}|                    d          }t          |t                    s|g}g }	|j	        }
|j        }t          |
|z            }t          t          |z            }|D ]i}t          |t                    rt          |          n|j        d         }t#          d||z   dz
  |z            }||k    r|}|	                    |           jt'                                          ||||	          }d
|v r|                     d
          |d<   t+          j        |	t*          j                  |d<   |S )Naudiosr   )	input_idspt)tensor_typer   r   r%   )r   r   r   r   input_features_maskr4   )rh   r6   r?   )popr   r   get_tokenizerencode_apply_hf_processor_tokens_onlyr
   r   r   r   r]   r>   chunk_lengthr   r   lenshapemaxappendrQ   _call_hf_processorr<   tensorlong)rW   r   r   r   r   r   
prompt_idsr   
audio_listr6   r   r   window_sizemax_windowsr   	n_samplesn_winoutputsrX   s                     rA   r   z4AudioFlamingo3MultiModalProcessor._call_hf_processor  s-    Xr** 	&%GG{{7B'' 	P002299&AAJ==jIIJ
| < < <$OOOO;DI;HHiHH 
 

 
+9
 
 
	 [[))
*d++ 	&$J)7(5-,677-<788 	' 	'E&0&=&=QE


5;q>II3a7KGHHE{""#&&&&'',,!	 - 
 
 !G++07<Q0R0RG,-"',|5:"N"N"Nr@   r   hf_processor_mm_kwargsc                      t          |          S rt   )r   )rW   r   r   s      rA   _get_mm_fields_configz7AudioFlamingo3MultiModalProcessor._get_mm_fields_configW  s    
 ,I666r@   mm_itemsout_mm_kwargsc                   	
  | j         j        d	i |}| j                                         }|                                }t	          |dd          }|                    |          		|j        	|                                                    d                              d          
dt          f	
fd}t          d||          gS )
Nr   z<sound>r4   r6   item_idxc                    ht          t          j                  r                                n}t	          |d |                    }||          }||z   }t          t
                    rg||         }t          |          dk    r5t          |d         t          j                  rt          j        |          }nYt          j        |          }nD||         }n9t          t
                    r	|          }n|          	                    d          }|                    d          }|dz
  dz  dz   }|dz
  dz  dz   }	|	                                
                                }
nd         |          }|j        d         }
|
dk    rt          d          gt          |
          z  }t          j        |          S )Nr   r[   r%   rN   rD   zAudio is too short)embed_token_id)r]   r<   r=   tolistsumr>   r   r^   r   	unsqueezeitemr   
ValueErrorr   r!   select_token_id)r   counts	start_idxcountend_idx	mask_listmaskrp   conv_lengthsaudio_output_lengthsnum_featuresrD   audio_tokensaudio_token_idr6   r4   out_mm_datas                rA   get_replacement_audioflamingo3z]AudioFlamingo3MultiModalProcessor._get_prompt_updates.<locals>.get_replacement_audioflamingo3r  s   %1+ &lELAA*++---) 
 !$F9H9$5 6 6I"8,E'%/G!"8$?? 	I$:9W;L$M	y>>A--*%aL%,3 3- $);y#9#9DD#(<	#:#:DD5i6GH ""8$?? M5h?5h?II!LL !% - 1a7!;(4q(8Q'>'B$37799>>@@*>:8D+1!4q   !5666*+c,.?.??L&6-   r@   r   )r   targetreplacementr?   )
r   r   r   	get_vocabgetattrr   r   get_datar   r   )rW   r   r   r   	processor	tokenizervocabr   r   r   r6   r4   r   s            @@@@rA   _get_prompt_updatesz5AudioFlamingo3MultiModalProcessor._get_prompt_updates^  s    /DI.HH1GHH	I++--	##%%i	BB;//!&5N#,,..!,1I!J!J"~66-	S -	 -	 -	 -	 -	 -	 -	 -	 -	`  ":  
 	
r@   )r7   r8   r9   r   r   r   r   r   r   r   r
   r   r   r   r   r   r   r    r  ru   rv   s   @rA   r   r     s=       
"6 
 
 
 
77 c6k"7 38$	7
 CK(7 
7 7 7 7 7 7r77 !(V 47 
++	,	7 7 7 7I
%I
 !(V 4I
 -	I

 
,	I
 I
 I
 I
 I
 I
 I
 I
r@   r   )r   dummy_inputsc                   |    e Zd ZdZg dddgdZdefdZdd	d
edef fdZ	de
dedz  fdZdedej        eej        df         z  fdZde
defdZ	 	 ddej        dej        dedz  dej        dz  de
dej        ez  fdZdej        dej        dz  fdZdeeeej        f                  dee         fdZ xZS )&AudioFlamingo3ForConditionalGenerationz
    AudioFlamingo3 model for conditional generation.

    This model integrates a Whisper-based audio encoder with a Qwen2 language model.
    It supports multi-chunk audio processing.
    )q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projr   c                 0    t          j        ddd          S )z<
        Get the module prefix in multimodal models
        zlanguage_model.zmulti_modal_projector.zaudio_tower.)language_model	connectortower_model)r   from_string_fieldr   s    rA   get_mm_mappingz5AudioFlamingo3ForConditionalGeneration.get_mm_mapping  s'     /,.&
 
 
 	
r@    )prefixvllm_configr  c          	      :   t                                                       |j        j        }|j        }|j        j        }|| _        || _        || _        |                     |d          5  t          |j	                  | _
        t          |          | _        d d d            n# 1 swxY w Y   |                     |          5  t          ||j        t!          |d          dg          | _        d d d            n# 1 swxY w Y   | j        j        | _        d S )Nr   r  Qwen2ForCausalLM)r  	hf_configr  architectures)rQ   rR   model_configr  quant_configmultimodal_configrL   _mark_tower_modelrK   r|   audio_towerrx   multi_modal_projector_mark_language_modelr+   r~   r,   r  make_empty_intermediate_tensors)rW   r  r  rL   r  r  rX   s         rA   rR   z/AudioFlamingo3ForConditionalGeneration.__init__  s   )3"/'4F!2(##K99 	S 	S4#   D *K6)R)RD&		S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S &&{33 	 	"<' ,#F,<==12	# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ? 	,,,s$   ,.B&&B*-B*-C??DDr   Nc                 ,   |                     dd           }|                     dd           }|                     dd           }|                     dd           }||d S |t          d|          S |t          d|||          S t          d          )	Nr3   rD   r4   r6   )r0   rD   r/   )r0   r3   r4   r6   z This line should be unreachable.)r   rC   r.   AssertionError)rW   r   r3   rD   r4   r6   s         rA   _parse_and_validate_audio_inputzFAudioFlamingo3ForConditionalGeneration._parse_and_validate_audio_input  s      $4d;;zz.$77!',Dd!K!Kzz.$77!l&:4#0#,    %.%-'=)	    ?@@@r@   audio_input.c                     |d         dk    r|d         }t          |          S |d         }|d         }|                    d          }t          |t                    r,t	          j        |d          }t	          j        |d          }|dg|j        d         z  }nrt          |t          j                  r|                                }nCt          |t                    r.|r,t          |d         t          j                  rd	 |D             }|	                    d
          }|dz
  dz  dz   }|dz
  dz  dz   }|j        \  }	}
}|dz
  dz  dz   }t	          j
        d||j        |j                                      d                              |	|          }|                    d
                              |	|          }||k    }|                    |	dd|                              |	d||          }|                    | j        j        j        j        | j        j        j        j                  }t)          d          ||<   |                     ||          }|                     |          }|j        \  }}}|                    d          }t	          j
        |                              ||                              |j                  |k     }||                             d
|          }t	          j        ||                                                                          }g }d}|D ]=}||||z            }|                    t	          j        |d                     ||z  }>t          |          S )Nr0   rD   r3   r4   r6   r   r   r%   c                 6    g | ]}|                                 S r?   )r   ).0cs     rA   
<listcomp>zOAudioFlamingo3ForConditionalGeneration._process_audio_input.<locals>.<listcomp>  s     ;;;AFFHH;;;r@   r[   rN   )rh   devicez-inf)rY   )tupler   r]   r>   r<   catr   r=   r   r   arangerh   r,  r   expandviewrg   r  ra   re   floatr   splitflattenr   )rW   r&  rD   r3   r4   r6   rp   r   r   
batch_size_max_mel_seq_lenmax_seq_len	seq_rangelengths_expandpadding_maskaudio_attention_mask_audio_attention_maskr/   r5   max_audio_tokens	embed_dimaudio_features_maskmasked_audio_featureschunk_embeddingsgrouped_embeddingscurrent_idxr   audio_chunkss                                rA   _process_audio_inputz;AudioFlamingo3ForConditionalGeneration._process_audio_input  s    v.00&~6L&&&$%56!,-E!F"~66nd++ 	N"Y~1===N%*Y/E1%M%M%M"3!5a!88LLel33 	<'..00LL|T**	<	< <?EL99	<
 <;l;;;L /22266%)a/!3 ,q 0Q6:)7)=&
A '*q014 L"(#*	   Yq\\VJ,, 	 &//33:::{SS N2 , 1 1*aK P P W W;!
 !
  577"(/5#)07  8  
  
 7<Fmm23 ))+? * 
 

 33NCC 3A2F/
$i3==a@@L)**VJ 011R$+,,"# 	 !//B C H HY W W !;!#7#?#?#A#A#H#H#J#J
 
  ! 	! 	!E+K+:M,MNL%%ei!&D&D&DEEE5 KK'(((r@   c                 R     | j         di |}|g S |                     |          }|S r   )r%  rF  )rW   r   r&  rA  s       rA   embed_multimodalz7AudioFlamingo3ForConditionalGeneration.embed_multimodal[  s?    :d:DDVDDI $ 9 9+ F F$$r@   r   	positionsintermediate_tensorsinputs_embedsc                 J    |d }| j                             ||||          }|S )N)rK  )r  model)rW   r   rI  rJ  rK  r   rl   s          rA   ro   z.AudioFlamingo3ForConditionalGeneration.forwardb  s@      + M+11 '	 2 
 
 r@   rl   c                 6    | j                             |          S rt   )r  compute_logits)rW   rl   s     rA   rO  z5AudioFlamingo3ForConditionalGeneration.compute_logitsu  s     "11-@@@r@   weightsc                 J    t          |           }|                    |          S rt   )r*   load_weights)rW   rP  loaders      rA   rR  z3AudioFlamingo3ForConditionalGeneration.load_weights{  s#    "4((""7+++r@   )NN)r7   r8   r9   r:   packed_modules_mappingr   r  r   r   rR   r   rI   r%  r<   r=   r-  rF  r&   rH  r"   ro   rO  r   setrR  ru   rv   s   @rA   r  r    s         322$i0 

 
 
 
 
 BD 
 
 
z 
3 
 
 
 
 
 
6AA		$A A A A4\)/\)	elC/0	0\) \) \) \)|% %4H % % % % <@-1 < < 2D8	
 |d*  
+	+   &A|A 
	A A A A,HU33D-E$F ,3s8 , , , , , , , ,r@   r  )Ncollections.abcr   r   r   typingr   r   r   r	   r<   torch.nnrS   transformersr
   r   "transformers.models.audioflamingo3r   r   transformers.models.qwen2_audior   vllm.configr   vllm.config.multimodalr   %vllm.model_executor.layers.activationr   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   r   r   vllm.multimodal.processingr   r   r   r   r    r!   vllm.sequencer"   vllm.utils.tensor_schemar#   r$   
interfacesr&   r'   r(   r)   utilsr*   r+   r,   r   r.   rC   rI   r;   rK   Modulerx   r   r   r   r=   r   r   r   register_processorr  r?   r@   rA   <module>rj     s  ( 8 7 7 7 7 7 7 7 7 7 7 5 5 5 5 5 5 5 5 5 5 5 5        7 7 7 7 7 7 7 7        > = = = = = " " " " " " 3 3 3 3 3 3 < < < < < < D D D D D D / / / / / /         
                             . - - - - - > > > > > > > >                         ,   2    L   $  "?? i   
/- /- /- /- /-- /- /- /-d    	   ,    #5    
 
 
 
 
78
 
 
<GC4E,F    */ / / / /)= / / /R
 R
 R
 R
 R
89R
 R
 R
j ('%	%1  
N, N, N, N, N,I!:|N, N, 
N, N, N,r@   