
    .`i^r              	          U d Z ddlZddlZddlmZmZmZ ddlmZ ddl	m
Z
mZmZmZ ddlZddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2m3Z3 ddl4m5Z5m6Z6m7Z7m8Z8m9Z9 ddl:m;Z; ddl<m=Z= ddl>m?Z?m@Z@ ddlAmBZBmCZCmDZDmEZE ddlFmGZGmHZHmIZImJZJmKZK dZLdZM G d de?          ZN G d  d!e?          ZOeNeOz  ZPeeQd"<    G d# d$e7          ZR G d% d&e5eR                   ZS G d' d(e6eR                   ZT G d) d*ejU                  ZV G d+ d,ejU                  ZW G d- d.ejU        e          ZX G d/ d0e          ZY e+jZ        eTeReS1           G d2 d3ejU        eDeEeC                      Z[d4ej\        e]ej\                 z  e]e]ej\                          z  d5ej\        fd6Z^dS )7zPyTorch Ultravox model.    N)IterableMappingSequence)SimpleNamespace)	AnnotatedAnyLiteral	TypeAlias)nn)
functional)BatchFeatureProcessorMixin)ModuleUtilsMixin)WhisperFeatureExtractor)WhisperEncoderWhisperEncoderLayer)
VllmConfig)BaseDummyOptions)
MulAndSilu
get_act_fn)RMSNorm)DefaultModelLoader)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItemsNestedTensors)MultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)UltravoxConfig)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)AutoWeightsLoaderWeightsMapper
flatten_bninit_vllm_registered_modelmaybe_prefix	<|audio|>   c                   N   e Zd ZU dZed         ed<   eej        e	ej                 z  e	e	ej                          z   e
ddd          f         ed<   eej         e
d          f         ed<   	 eej         e
d          f         ed	<   	 eej         e
d
          f         ed<   dS )UltravoxAudioFeatureInputszz
    Dimensions:
    - b: batch size
    - n: number of chunks
    - t: Time frames (M)
    - nmb: Number of mel bins
    audio_featurestypebnnmbtdatalens	token_lenn
num_chunksN__name__
__module____qualname____doc__r	   __annotations__r   torchTensorlistr)        w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/ultravox.pyr7   r7   C   s           "
####
tEL))Del1C,DDD%%%	'    EL++d"3"33
4444 {{4'8'889999W%,C(8(889999MMrL   r7   c                       e Zd ZU dZed         ed<   eej        e	ej                 z   e
dddd          f         ed<   d	S )
UltravoxAudioEmbeddingInputszx
    Dimensions:
    - b: batch size
    - na: number of audios
    - afs: audio feature size
    - hs: hidden size
    audio_embedsr9   bnaafshsr=   NrB   rK   rL   rM   rO   rO   [   sp           .
!!!!
tEL));;sD%+N+NN     rL   rO   UltravoxAudioInputsc                   \    e Zd ZdedefdZdedefdZdefdZ	de
eedz  f         fdZdS )UltravoxProcessingInfokwargsreturnc                 |    | j         j        j        } | j         j        di |}t          |_        |j        |_        |S NrK   )ctxmodel_config	hf_configget_hf_processor_AUDIO_PLACEHOLDER_OVERRIDEaudio_token_replacementaudio_token_indexaudio_replacement_token_id)selfrX   confighf_processors       rM   r_   z'UltravoxProcessingInfo.get_hf_processorp   sE    &00tx0::6::
 0K,282J/rL   c                      | j         di |}|j        }t          |t                    r|S |j        }t          |t                    sJ |S r[   )r_   audio_processor
isinstancer   feature_extractor)rd   rX   rf   rh   rj   s        rM   get_feature_extractorz,UltravoxProcessingInfo.get_feature_extractor|   sd    ,t,66v66 '6o'>?? 	#""+=+-DEEEEE  rL   c                     dS )z8Return target audio channels for Ultravox models (mono).r*   rK   rd   s    rM   get_target_channelsz*UltravoxProcessingInfo.get_target_channels   s    qrL   Nc                 
    dd iS )NaudiorK   rm   s    rM   get_supported_mm_limitsz.UltravoxProcessingInfo.get_supported_mm_limits   s    rL   )rC   rD   rE   objectr   r_   r   rk   intrn   r   strrq   rK   rL   rM   rW   rW   o   s        
 
N 
 
 
 

!f 
!9P 
! 
! 
! 
!S    cDj)A      rL   rW   c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	UltravoxDummyInputsBuilder	mm_countsrY   c                 8    |                     dd          }d|z  S )Nrp   r   r4   )get)rd   rw   
num_audioss      rM   get_dummy_textz)UltravoxDummyInputsBuilder.get_dummy_text   s     ]]7A..
Z''rL   Nseq_len
mm_optionsc                     | j                                         }|j        }|j        |z  t          z  }|                    dd          }|r|                    d          nd }d|                     |||          iS )Nrp   r   )lengthrz   	overrides)infork   sampling_ratechunk_length_MAX_ENCODER_BATCH_SIZEry   _get_dummy_audios)	rd   r|   rw   r}   rj   r   	audio_lenrz   audio_overridess	            rM   get_dummy_mm_dataz,UltravoxDummyInputsBuilder.get_dummy_mm_data   s     !I;;==)7*]:=TT 	 ]]7A..
5?I*..111T T++ Z? ,  
 	
rL   N)
rC   rD   rE   r   rt   rs   r{   r   r   r   rK   rL   rM   rv   rv      s        (S(9 (c ( ( ( ( =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
rL   rv   c            
            e Zd ZdefdZdedeeef         deeef         deeef         def
 fdZ	ded	eeef         deee
f         fd
Zded	eeef         dedee         fdZ xZS )UltravoxMultiModalProcessorrY   c                     | j                                         }t          |j        | j                                                   S )N)	target_srtarget_channels)r   rk   r    r   rn   )rd   rj   s     rM   _get_data_parserz,UltravoxMultiModalProcessor._get_data_parser   sD     I;;==#'5 I99;;
 
 
 	
rL   promptmm_data	mm_kwargs
tok_kwargsc                    |                     dg           sc| j                                                            |d          }|                     |          }t          t          |g          d          S t          |          }|                    dg           }t          |t                    sJ  | j        j
        di |}t          di ||j        dd}t          di |d|i}|                    d	d            |                    d
d            t                                          ||||          }	|	                    d          |	d<   |	S )NaudiosF)add_special_tokens)	input_idspt)tensor_typeT)r   include_audio_num_chunkspadding
truncation)r   r   r   r   audio_valuesr8   rK   )ry   r   get_tokenizerencode_apply_hf_processor_tokens_onlyr   dictpopri   rJ   rk   r   super_call_hf_processor)rd   r   r   r   r   
prompt_idsr   rj   item_processor_dataoutput	__class__s             rM   r   z.UltravoxMultiModalProcessor._call_hf_processor   s    {{8R(( 	P0022995 :  J ==jIIJ
| < < <$OOOOw--Xr**&$''''';DI;HHiHH 
 

+9%)
 
 
 
	 #<<W<<V<<< 	y$'''|T***++'!	 , 
 
 $*::n#=#= rL   	hf_inputshf_processor_mm_kwargsc           	      4   |                     dt          j        d                    }t          t	          j        d|          t	          j        d|          t	          j        d|          t	          j        d          t	          j        d                    S )Naudio_num_chunksr   rp   )r8   audio_token_len
audio_lensr   rP   )ry   rH   zerosr   r   flat_from_sizesbatched)rd   r   r   rA   s       rM   _get_mm_fields_configz1UltravoxMultiModalProcessor._get_mm_fields_config   s    
 ]]#5u{1~~FF
 1@*UU1A':VV,<WjQQ2:7CC.6w??

 

 

 
	
rL   mm_itemsout_mm_kwargsc                   	  | j         j        d
i |}|j        	|                                                    dt          j        d                    }t          j        |dt
          j                  t          j	        t          j
        dgt
          j                  g          dt          f	fd}t          dd|	          gS )Nr   r   )dimdtyper   item_idxc                     |          }| dz            }d         ||                                          }gt          |          z  S )Nr*   r   )sumrs   )r   startendr   chunks_start_idxout_mm_datareplacement_ids       rM   get_replacement_ultravoxzQUltravoxMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_ultravox  sQ    $X.E"8a<0C)*;<U3YGKKMMO"#c/&:&:::rL   rp   r4   )modalitytargetreplacementrK   )r   r_   rc   get_datary   rH   r   cumsumint32cattensorrs   r$   )
rd   r   r   r   rf   rA   r   r   r   r   s
          @@@rM   _get_prompt_updatesz/UltravoxMultiModalProcessor._get_prompt_updates   s     2ty1KK4JKK%@
 $,,.. __%7QHH
).AU[*
 *
 *
 !9\1#U[1113CD
 
	;s 	; 	; 	; 	; 	; 	; 	; 	;  "4  
 	
rL   )rC   rD   rE   r    r   rt   r   rr   r   r   r   r   r   r   r   r   r%   r   __classcell__r   s   @rM   r   r      s1       
"6 
 
 
 
(( f%( 3;'	(
 CK(( 
( ( ( ( ( (T

 !(V 4
 
++	,	
 
 
 
$"
%"
 !(S 1"
 -	"

 
,	"
 "
 "
 "
 "
 "
 "
 "
rL   r   c                   N     e Zd ZdZddef fdZdej        dej        fdZ xZ	S )	StackAudioFrameszk
    Stack the audio embedding frames to reduce the sequence length by a factor
    of `stack_factor`.
       stack_factorc                 V    t                                                       || _        d S r   )r   __init__r   )rd   r   r   s     rM   r   zStackAudioFrames.__init__  s'    (rL   rP   rY   c                     |j         \  }}}|| j        z   dz
  | j        z  | j        z  }t          j        |ddd||z
  f          }|j         \  }}}|                    ||| j        z  || j        z            }|S )Nr*   r   )shaper   Fpadview)rd   rP   BTCT_pads         rM   forwardzStackAudioFrames.forward   s    $1aT&&*t/@@4CTTu\Aq!UQY+?@@$1a#((qD%%q4+<'<
 
 rL   )r   )
rC   rD   rE   rF   rs   r   rH   rI   r   r   r   s   @rM   r   r     sy         
) )S ) ) ) ) ) )EL U\        rL   r   c                   V     e Zd Zdef fdZdej        dej        dej        fdZ xZS )UltravoxFeedForwardProjectorre   c                    t                                                       |j        | _        t	          |j                  | _        |j        j        |j        z  }t          |          | _	        t          j        || j        d          | _        | j        }|j        dk    rt                      | _        |dz  }nt!          |j                  | _        |j        j        }t          j        ||d          | _        |j        r.t          |          | _        t          j                    | _        d S t          j                    | _        t          |          | _        d S )NF)biasswiglu   )r   r   hidden_size
hidden_dimr   r   _pad_and_stackaudio_configr   ln_prer   Linearlinear_1projector_actr   actr   text_configlinear_2projector_ln_midln_midIdentityln_post)rd   re   dim_indim_middim_outr   s        rM   r   z%UltravoxFeedForwardProjector.__init__,  s    ,.v/BCC$063FFfoo	&$/FFF/8++!||DHlGG!&"677DH$0	'7??? " 	,%,W%5%5DK;==DLLL+--DK"7++DLLLrL   r8   r   rY   c                 ,   |                      |          }|                     |          }|                     |          }|                     |          }|                     |          }|                     |          }|                     |          }|S r   )r   r   r   r   r   r   r   )rd   r8   r   hidden_statess       rM   r   z$UltravoxFeedForwardProjector.forwardG  s     ,,^<<^44n55//M22m44]33rL   	rC   rD   rE   r'   r   rH   rI   r   r   r   s   @rM   r   r   +  sv        ,~ , , , , , ,6
#l
=B\
	
 
 
 
 
 
 
 
rL   r   c                   V     e Zd Zdef fdZdej        dej        dej        fdZ xZS )UltravoxTransformerProjectorre   c                    t                                                       t          d          | _        t	          |j                  | _        |j        j        |j        z  }t          j
        |j                  t          |          | _        t          j        |j                  | _        t          j        j        j                  | _        t          j        fdt+          |j                  D                       | _        t          j                  | _        t          j        j        |j        j                  | _        d S )NF)
is_decoderc                 .    g | ]}t                    S rK   )r   ).0_projector_audio_configs     rM   
<listcomp>z9UltravoxTransformerProjector.__init__.<locals>.<listcomp>g  s2        $$:;;  rL   )r   r   r   re   r   r   r   r   r   copydeepcopyr   r   r   r   d_model	linear_in	Embeddingmax_source_positionsembed_positions
ModuleListrangenum_projector_layerslayersr   r   
linear_out)rd   re   r   r   r   s      @rM   r   z%UltravoxTransformerProjector.__init__U  s)   %777.v/BCC$063FF!%v/B!C!Cfoo6+A+IJJ!|"7"* 
  

 m   v:;;  
 
 5=>>)"*F,>,J
 
rL   r8   r   rY   c                    |                      |          }|j        d         }t          j        ||j                  d d d f                             |d d d f                   }|                     ||j        |j                  }|                     |          }| 	                    |          }| 
                    t          j        |                    d          |j                            }||z   }i }dt          j        | j        d         j                  j        v rd |d<   | j        D ]}	 |	|fd|i|}
|
d         }|                     |          }|                     |          }|S )Nr*   devicelayer_head_maskr   attention_mask)r   r   rH   aranger  ltget_extended_attention_maskr   r   r  r  sizeinspect	signaturer  r   
parametersr   r  )rd   r8   r   max_len_stackedr  extended_attention_maskr   	positionsrX   layerlayer_outputss              rM   r   z$UltravoxTransformerProjector.forwardr  s    ,,^<<(.q1on>STTT!!!G

"_QQQW%
&
& 	 #'"B"BN0.2F#
 #
 N33}55((L++A..}7KLLL
 
	 &	1  1$+a.2H I I TTT(,F$%[ 	- 	-E!E 6  M
 *!,MM]3366rL   r   r   s   @rM   r   r   T  sv        
~ 
 
 
 
 
 
:%#l%=B\%	% % % % % % % %rL   r   c                        e Zd ZdZdZ fdZed             Zdej	        dz  dej	        fdZ
	 dd	ej	        dej	        dz  fd
Z xZS )ModifiedWhisperEncodera  
    Encoder portion of OpenAI's Whisper model.

    This implementation is a slightly modified version of HF Transformers'
    Whisper Encoder, with only a few fixes:
    1. base_model_prefix updated to allow for doing `.from_pretrained`
       directly on the encoder
    2. allow less than 30 second of audio padding to be passed in:
        - relaxed ValueError check for `input_features` length to be less
           than or equal to `expected_seq_length` instead of strictly equal
        - embed_pos is now sliced to match the length of `inputs_embeds`

    Original: https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py
    See commentary: https://github.com/huggingface/transformers/issues/25744
    zmodel.encoderc                 R     t                      j        |i | d| j        _        d S )NF)r   r   re   r   )rd   argsrX   r   s      rM   r   zModifiedWhisperEncoder.__init__  s/    $)&)))!&rL   c                 f    | j         j        | j        j        d         z  | j        j        d         z  S )Nr   )re   r  conv1strideconv2rm   s    rM   max_context_lengthz)ModifiedWhisperEncoder.max_context_length  s5     K,j"#j"#	
rL   r   Nr   c                 $   |dS |                      |          }|j        d         }t          j        ||j                  dddf                             |                    dd                    }|                     |d|j                  }|S )aH  
        Create attention mask based on audio lengths to mask out padding tokens
        For each sample in batch:
        - Convert raw audio length to feature length after convolutions
        - Create bool mask: True for valid positions and False for padding
        - Convert to attention mask format expected by transformer layers
        (1.0 for positions to attend to, large negative for positions to ignore)
        This masking ensures consistent behavior between training and inference
        by preventing the model from attending to padding tokens in both cases
        Nr*   r  r   )	 _get_feat_extract_output_lengthsr   rH   r  r  r  r   r  r   )rd   r   r   audio_feature_lenmax_seq_lenr  s         rM   get_attention_mask_by_audio_lenz6ModifiedWhisperEncoder.get_attention_mask_by_audio_len  s     4 AA*MM#)!,k-:NOOO!!!G

"##B**
+
+ 	 99% : 
 

 rL   input_featuresc           	         | j         }|j        d         |k    r$t          d| d|j        d          d| d          t          j                            |                     |                    }t          j                            |                     |                    }|                    ddd          }| j	        j
        d |                    d	                   }||z   }t          j                            || j        | j        
          }|                     ||          }i }dt          j        | j        d         j                  j        v rd |d<   | j        D ]}	 |	||fi |}
|
d         }|                     |          }|S )Nr(  z7Whisper expects the mel input features to be of length z or less, but found z-. Make sure to pad the input mel features to .r   r   r*   )ptrainingr  )r&  r   
ValueErrorr   r   gelur#  r%  permuter  weightr  dropoutr2  r,  r  r  r  r   r  
layer_norm)rd   r-  r   expected_seq_lengthinputs_embeds	embed_posr   r  rX   encoder_layerr  s              rM   r   zModifiedWhisperEncoder.forward  s   
 #5#&9996&6 6!'+6 6  36 6 6   **4::n+E+EFF**4::m+D+DEE%--aA66(/0H-2D2DR2H2H0HI	%	1--T\DM . 
 
 ==j-XX  1$+a.2H I I TTT(,F$%![ 	- 	-M)M   M *!,MM66rL   r   )rC   rD   rE   rF   base_model_prefixr   propertyr&  rH   rI   r,  r   r   r   s   @rM   r  r    s           (' ' ' ' ' 
 
 X
,->Cl   > +/+ ++ L4'+ + + + + + + +rL   r  )r   dummy_inputsc                   J    e Zd Zg dddgdZ eddi          Zeded	ed
edz  fd            Z	ddde
def fdZd
efdZdej        dej        dej        d
ej        fdZded
edz  fdZded
eeej        df         z  fdZded
efdZ	 d,ddddej        d edz  d!ej        dz  d"ed
ej        f
 fd#Z	 	 d-dej        d$ej        d%ej        dz  d&ej        dz  d
ej        ez  f
d'Zd(ej        d
ej        fd)Zd*eeeej        f                  d
ee         fd+Z  xZ!S ).UltravoxModel)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projzaudio_tower.model.encoder.audio_tower.)orig_to_new_prefixr   irY   Nc                 N    |                     d          rdS t          d          )Nrp   r4   z Only audio modality is supported)
startswithr3  )clsr   rK  s      rM   get_placeholder_strz!UltravoxModel.get_placeholder_str  s,    w'' 	;;<<<rL    )prefixvllm_configrQ  c          	      f   t                                                       |j        j        }|j        j        }|| _        || _        | j        sJ g | _        |j        4| j        	                    t          j        |j        d d                     |j        4| j        	                    t          j        |j        d d                     |                     |d          5  t          |j                  | _        |j        dk    rt%          |          | _        nt)          |          | _        d d d            n# 1 swxY w Y   |                     |          5  t-          ||j        t1          |d                    | _        d d d            n# 1 swxY w Y   | j        j        | _        d S )NrI  )model_or_pathrevisionrQ  language_model.rp   r   language_model)rR  r^   rQ  )r   r   r]   r^   multimodal_configre   multi_modal_configsecondary_weightsaudio_model_idappendr   Sourcetext_model_id_mark_tower_modelr  r   audio_towerr
  r   multi_modal_projectorr   _mark_language_modelr2   wrapped_model_configr3   rW  make_empty_intermediate_tensors)rd   rR  rQ  re   rX  r   s        rM   r   zUltravoxModel.__init__  sZ   !,!9!C'4F"3&&&&!# , "))")"("7!)     + "))")"("6!,     ##K99 	R 	R5f6IJJD*Q..-I&-Q-Q**-I&-Q-Q*	R 	R 	R 	R 	R 	R 	R 	R 	R 	R 	R 	R 	R 	R 	R &&{33 	 	"<' 5#F,<==# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ? 	,,,s%   $AD>>EE+FFFc                 0    t          j        ddd          S )z<
        Get the module prefix in multimodal models
        rV  zmulti_modal_projector.rI  )rW  	connectortower_model)r   from_string_fieldrm   s    rM   get_mm_mappingzUltravoxModel.get_mm_mappingI  s'     /,.&
 
 
 	
rL   r-  r   r   c                    |                     | j        j                  }|                    d          }g }t	          d|t
                    D ]}t          |t
          z   |          }|                     |||         |||                   }	|	                     | j        j                  }	|                     |	|||                   }
|                    |
           t          j
        |d          }|S )Nr   )r   )tor`  r   r  r	  r   minra  r\  rH   r   )rd   r-  r   r   r8   
batch_sizeaudio_embeddingsr   r   batch_featuresbatch_embeddingss              rM   _audio_features_to_embeddingsz+UltravoxModel._audio_features_to_embeddingsS  s    (**4+;+ABB#((++
 1j*ABB 	6 	6Ee55zBBC!--uSy):eCi+@ N ,..t/?/EFFN  $99c	 :    ##$45555 !9%51===rL   rX   c                 Z   |                     dd           }|                     dd           }|                     dd           }|                     dd           }|                     dd           }||d S |t          d||||          S |t          d|          S t          d          )	Nr8   rP   r   r   r   )r9   r=   r>   r?   rA   )r9   r=   z This line should be unreachable.)r   r7   rO   AssertionError)rd   rX   r8   rP   r   r   r   s          rM   _parse_and_validate_audio_inputz-UltravoxModel._parse_and_validate_audio_inputp  s      $4d;;zz.$77ZZd33
 **%6==!::&8$??!l&:4%-%#)+    #/^,WWWW?@@@rL   audio_input.c                    |d         dk    r|d         S t          |d                   }|d         }|d         }|                     |||          }|j        d         }t          j        ||j                                      |j        d         d	          }||d d d f         k     }||         }	d
 |                    |d                                                   D             }
|	                    |
          S )Nr9   rP   r=   r>   r?   r*   r  r   r(  c                 Z    g | ](}|                                                                 )S rK   )r   item)r   
chunk_lenss     rM   r   z6UltravoxModel._process_audio_input.<locals>.<listcomp>  s>     
 
 
 NN!!##
 
 
rL   rA   )	pad_and_concat_to_dim3rq  r   rH   r  r  expandsplittolist)rd   ru  r8   r   r   
embeddingsmax_lenindicesmaskflattened_embeddings
embed_lenss              rM   _process_audio_inputz"UltravoxModel._process_audio_input  s    v.00v&& 0F0CDD (
%k277J
 

 "1%,wz/@AAAHHQ
 
 D11)$/
 
-33K4M4T4T4V4VWW
 
 

 $))*555rL   c                 R     | j         di |}|g S |                     |          }|S r[   )rt  r  )rd   rX   ru  rn  s       rM   embed_multimodalzUltravoxModel.embed_multimodal  s?    :d:DDVDDI44[AArL   T)is_multimodalhandle_oov_mm_tokenr   multimodal_embeddingsr  r  c                    ||!t                                          |          S t                                          ||||          S )N)r  r  r  )r   embed_input_ids)rd   r   r  r  r  r   s        rM   r  zUltravoxModel.embed_input_ids  sU     !(M,A77**9555ww&&"7' 3	 ' 
 
 	
rL   r  intermediate_tensorsr:  c                 |    |d}| j         }t          |d          r|j         }|                    ||||          }|S )a  Run forward pass for Ultravox

        One key thing to understand is the `input_ids` already accounts for the
        positions of the to-be-inserted audio embeddings. The to-be-inserted
        audio has a size that is essentially 6.25 tokens per second of audio.

        This way, the `positions` and `attn_metadata` are consistent
        with the `input_ids`.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
            positions: Position indices for the input tokens.
            intermediate_tensors: Intermediate tensors from prior forward pass.
            inputs_embeds: Optional tensor of input embeddings.

        NrW  )r:  )rW  hasattrmodel)rd   r   r  r  r:  rX   rW  r   s           rM   r   zUltravoxModel.forward  s^    4  + M,>#344 	;+:N&,,y"6m - 
 
 rL   r   c                 6    | j                             |          S r   )rW  compute_logits)rd   r   s     rM   r  zUltravoxModel.compute_logits  s    "11-@@@rL   weightsc                 ^    t          | dg          }|                    || j                  S )NrI  )ignore_unexpected_prefixes)mapper)r/   load_weightshf_to_vllm_mapper)rd   r  loaders      rM   r  zUltravoxModel.load_weights  s3    "4^DTUUU""743I"JJJrL   r   )NN)"rC   rD   rE   packed_modules_mappingr0   r  classmethodrt   rs   rO  r   r   r   ri  rH   rI   rq  rr   rU   rt  r   tupler  r+   r  boolr  r&   r   r  r   setr  r   r   s   @rM   rA  rA    s        322$i0 
 &8.I   =3 =3 =3: = = = [= BD .
 .
 .
z .
3 .
 .
 .
 .
 .
 .
`
 
 
 
 
   L  	 
 
       :AA	t	#A A A A4$6($6 
u|S01	1$6 $6 $6 $6L   4H         >B

 .2$(
 
 
<
  4d:

 |d*
 "
 

 
 
 
 
 
0 59-1$ $<$ <$ $lT1	$
 |d*$ 
+	+$ $ $ $LAEL AU\ A A A AKHU33D-E$F K3s8 K K K K K K K KrL   rA  featuresrY   c                    t          | t          j                  r| j        dk    rt	          |           } | S d | D             } t          d | D                       d | D             } fd| D             } t          j        |           S )a  
    Pad and concatenate a list of tensors.

    output:
        Tensor of shape [B, C, M] where M is the maximum length of the input
        tensors, B is the sum of the batch sizes of the input tensors.
        C must be the same for all input tensors.
       c                 ,    g | ]}t          |          S rK   )rz  r   fs     rM   r   z*pad_and_concat_to_dim3.<locals>.<listcomp>  s!    <<<a&q))<<<rL   c              3   0   K   | ]}|j         d          V  dS )r(  N)r   r  s     rM   	<genexpr>z)pad_and_concat_to_dim3.<locals>.<genexpr>  s(      00!!'"+000000rL   c                 D    g | ]} |j         d g|j        dd         R  S )r(  r0  N)r   r   r  s     rM   r   z*pad_and_concat_to_dim3.<locals>.<listcomp>  s6    <<<ar)AGBCCL)))<<<rL   c           	      Z    g | ]'}t          j        |d |j        d         z
  f          (S )r   r(  )r   r   r   )r   r  r  s     rM   r   z*pad_and_concat_to_dim3.<locals>.<listcomp>  s5    GGGa!Wqwr{2344GGGrL   )ri   rH   rI   ndimr1   maxr   )r  r  s    @rM   rz  rz    s     (EL)) =1!(++H<<8<<<H00x00000G<<8<<<H HGGGhGGGH9XrL   )_rF   r  r  collections.abcr   r   r   typesr   typingr   r   r	   r
   rH   r   torch.nnr   r   transformersr   r   transformers.modeling_utilsr   transformers.models.whisperr   ,transformers.models.whisper.modeling_whisperr   r   vllm.configr   vllm.config.multimodalr   %vllm.model_executor.layers.activationr   r   $vllm.model_executor.layers.layernormr    vllm.model_executor.model_loaderr   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   r   vllm.multimodal.parser   r    vllm.multimodal.processingr!   r"   r#   r$   r%   vllm.sequencer&   (vllm.transformers_utils.configs.ultravoxr'   vllm.utils.tensor_schemar(   r)   
interfacesr+   r,   r-   r.   utilsr/   r0   r1   r2   r3   r`   r   r7   rO   rU   rG   rW   rv   r   Moduler   r   r   r  register_processorrA  rI   rJ   rz  rK   rL   rM   <module>r     s  
      7 7 7 7 7 7 7 7 7 7 ! ! ! ! ! ! 5 5 5 5 5 5 5 5 5 5 5 5        $ $ $ $ $ $ 5 5 5 5 5 5 5 5 8 8 8 8 8 8 ? ? ? ? ? ?       
 # " " " " " 3 3 3 3 3 3 H H H H H H H H 8 8 8 8 8 8 ? ? ? ? ? ? D D D D D D / / / / / /            L K K K K K K K              . - - - - - C C C C C C > > > > > > > >                         *  N N N N N N N N0    <     !== Y   
    /   B
 
 
 
 
!78N!O 
 
 
:f
 f
 f
 f
 f
"9:P"Q f
 f
 f
R    ry   *& & & & &29 & & &RC C C C C29.> C C CLf f f f f^ f f fR ('	+  
nK nK nK nK nKBI1:| nK nK 
nKblT%,//$tEL7I2JJ
\     rL   