
    .`it                        d dl mZmZmZ d dlmZmZmZmZ d dl	Z
d dlZd dlmZ d dlmZmZ d dlmZmZmZmZmZmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5m6Z6 d dl7m8Z8m9Z9m:Z: d dl;m<Z< d dl=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZF d dlGmHZH d dlImJZJmKZK ddlLmMZMmNZNmOZO ddlPmQZQmRZRmSZSmTZT  e"eU          ZVdZWdZX G d d eJ          ZY G d! d"eJ          ZZeYZ[ G d# d$e?          Z\ G d% d&e<e\                   Z] G d' d(e>e\                   Z^ G d) d*ej_                  Z` e2ja        e^e\e]+           G d, d-ej_        eNeO                      ZbdS ).    )IterableMappingSequence)	AnnotatedAnyLiteralcastN)nn)	AutoModelBatchFeature)Gemma3nAudioConfigGemma3nAudioFeatureExtractorGemma3nConfigGemma3nProcessorGemma3nTextConfigGemma3nVisionConfig)SiglipImageProcessorFast)ModelConfigSpeechToTextConfig
VllmConfig)BaseDummyOptions)
PromptType)init_logger)RMSNorm)RowParallelLinear)VocabParallelEmbedding)Gemma3nForCausalLM)(adjust_audio_features_to_expected_length)MultiModelKeys)ISO639_1_SUPPORTED_LANGS)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)ImageProcessorItemsMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilder)	BaseMultiModalProcessorBaseProcessingInfoMultiModalPromptUpdates"MultiModalPromptUpdatesApplyResultPlaceholderFeaturesInfoPromptReplacementPromptUpdatePromptUpdateDetailsreplace_token_matches)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModalSupportsTranscription)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix      c                   j    e Zd ZU dZdZed         ed<   eej	         e
dddd          f         ed<   dS )	Gemma3nImagePixelInputsz
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height of each patch
        - w: Width of each patch
    pixel_valuestypebn   hwN__name__
__module____qualname____doc__rB   r   __annotations__r   torchTensorr4        y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/gemma3n_mm.pyr@   r@   J   s[           %3D'.
!222EL++dAsC*H*HHIIIIIIrP   r@   c                       e Zd ZU dZdZed         ed<   eej	         e
ddd          f         ed<   eej	         e
dd          f         ed<   d	S )
Gemma3nAudioInputszs
    Dimensions:
        - bn: Batch size * number of audios
        - s: seq_length
        - f: num_features
    audiorB   rC   sfinput_features_paddedinput_features_maskNrG   rO   rP   rQ   rS   rS   W   s}           %D''
$$$$U\;;tS#3N3N%NOOOO"5<T31G1G#GHHHHHHrP   rS   c                       e Zd Zd ZdefdZdeeedz  f         fdZ	dedeeef         deeef         dz  fd	Z
d
edededz  defdZdedz  defdZdS )Gemma3nProcessingInfoc                 @    | j                             t                    S N)ctxget_hf_configr   selfs    rQ   r^   z#Gemma3nProcessingInfo.get_hf_configh   s    x%%m444rP   kwargsc                 2     | j         j        t          fi |S r\   )r]   get_hf_processorr   )r`   ra   s     rQ   rc   z&Gemma3nProcessingInfo.get_hf_processork   s     (tx()9DDVDDDrP   returnNc                     d d dS NimagerT   rO   r_   s    rQ   get_supported_mm_limitsz-Gemma3nProcessingInfo.get_supported_mm_limitsn   s    ---rP   seq_len	mm_countsc                      t           t          dS rf   )TOKENS_PER_IMAGETOKENS_PER_AUDIO)r`   rj   rk   s      rQ   get_max_tokens_per_itemz-Gemma3nProcessingInfo.get_max_tokens_per_itemq   s     *4DEEErP   image_widthimage_height	processorc                l    ||                                  }t          j        |j        |j                  S )z
        Get the replacement text for image tokens.

        For Gemma3n, this should return the full_image_sequence which includes
        BOI token, repeated image tokens, and EOI token.
        )rc   r0   select_token_idfull_image_sequenceimage_token_id)r`   rp   rq   rr   s       rQ   get_image_replz$Gemma3nProcessingInfo.get_image_replv   s;     --//I"2)9+C
 
 	
rP   c                l    ||                                  }t          j        |j        |j                  S )z
        Get the replacement text for audio tokens.

        For Gemma3n, this should return the full_audio_sequence which includes
        BOA token, repeated audio tokens, and EOA token.
        )rc   r0   rt   full_audio_sequenceaudio_token_id)r`   rr   s     rQ   get_audio_replz$Gemma3nProcessingInfo.get_audio_repl   s=     --//I #2)9+C
 
 	
rP   )rH   rI   rJ   r^   objectrc   r   strintri   ro   r   rw   r{   rO   rP   rQ   rZ   rZ   g   s#       5 5 5E E E E E.cDj)A . . . .FF'.sCx'8F	c	T	!F F F F

 
 	

 $d*
 

 
 
 
(
 $d*
 
	
 
 
 
 
 
rP   rZ   c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	Gemma3nDummyInputsBuilderrk   rd   c                     |                     dd          }|                     dd          }| j                                        }|j        }|j        }||z  ||z  z   S )Nrh   r   rT   )getinforc   image_tokenaudio_token)r`   rk   
num_images
num_audiosrr   r   r   s          rQ   get_dummy_textz(Gemma3nDummyInputsBuilder.get_dummy_text   s`    ]]7A..
]]7A..
I..00	++Z'+
*BBBrP   Nrj   
mm_optionsc                    |                     dd          }|                     dd          }| j                                        }|j        }|j        }|j        }	|	j                             dd          }
|	j                             dd          }|r|                     d          nd }|r|                     d          nd }|                     |
|||          |                     |||          d	S )
Nrh   r   rT   width   height)r   r   r   	overrides)lengthr   r   rg   )	r   r   rc   feature_extractor
fft_lengthimage_processorsize_get_dummy_images_get_dummy_audios)r`   rj   rk   r   r   r   rr   audio_feature_extractor	audio_lenr   	img_width
img_heightimage_overridesaudio_overridess                 rQ   get_dummy_mm_dataz+Gemma3nDummyInputsBuilder.get_dummy_mm_data   s    ]]7A..
]]7A..
I..00	' 	  ,6	4=4M#(,,Wc::	$)--h<<
5?I*..111T5?I*..111T ++!%)	 ,   ++ Z? ,  

 

 
	
rP   r\   )
rH   rI   rJ   r   r}   r~   r   r   r"   r   rO   rP   rQ   r   r      s        CS(9 Cc C C C C =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
rP   r   c            
       X    e Zd ZdefdZdedeeef         deeef         deeef         def
 fdZ	ded	eeef         deee
f         fd
Zded	eeef         dedee         fdZdee         dedeee         ef         f fdZdee         dedeeee         f         f fdZ xZS )Gemma3nMultiModalProcessorrd   c                 h    | j                                         j        }t          |j                  S )N)	target_sr)r   rc   r   r'   sampling_rate)r`   r   s     rQ   _get_data_parserz+Gemma3nMultiModalProcessor._get_data_parser   s.     I6688J#.?.MNNNNrP   promptmm_data	mm_kwargs
tok_kwargsc                     d|v r|                     d          |d<   t                                          ||||          }d|v r6|d         |d<   d t          |d         |d                   D             }||d<   |S )NaudiosrT   input_featuresrW   c                 $    g | ]\  }}||         S rO   rO   ).0rV   masks      rQ   
<listcomp>zAGemma3nMultiModalProcessor._call_hf_processor.<locals>.<listcomp>   s0     ! ! !At $! ! !rP   rX   )popsuper_call_hf_processorzip)r`   r   r   r   r   processed_outputsunpadded_features	__class__s          rQ   r   z-Gemma3nMultiModalProcessor._call_hf_processor   s     w&{{844GG!GG66	
 
 0009J :56! !"%&67%&;<   ! ! ! 3D./  rP   	hf_inputshf_processor_mm_kwargsc                     t          t          j        d          t          j        d          t          j        d                    S )Nrh   rT   )rA   rW   rX   )dictr#   batched)r`   r   r   s      rQ   _get_mm_fields_configz0Gemma3nMultiModalProcessor._get_mm_fields_config   sE    
 .6w??"7"?"H"H 5 =g F F
 
 
 	
rP   mm_itemsout_mm_kwargsc                 .   	   j         j        di |	g }dv r;	j        }dt          f	 fd}|                    t          d||                     dv r:	j        }dt          f	 fd}|                    t          d||                     |S )Nrh   item_idxc                                          dt                    }|                    |           }j                            |j        |j                  S )Nrh   )rp   rq   rr   )	get_itemsr%   get_image_sizer   rw   r   r   )r   images
image_sizehf_processorr   r`   s      rQ   get_replacement_imagezMGemma3nMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_image  sZ    !++G5HII#228<<
y// * 0!+!2* 0   rP   )modalitytargetreplacementrT   c                 :    j                                       S )N)rr   )r   r{   )r   r   r`   s    rQ   get_replacement_audiozMGemma3nMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_audio"  s%    y//* 0   rP   rO   )r   rc   r   r~   appendr.   r   )
r`   r   r   r   prompt_updatesr   r   r   r   r   s
   ``       @rQ   _get_prompt_updatesz.Gemma3nMultiModalProcessor._get_prompt_updates   s    2ty1KK4JKK h&2K         !!!$& 5     h&2K       
 !!!$& 5     rP   mm_prompt_updatesc                 h   t                                          ||          \  }}| j                                        }|                                }|d         }|d         }|d         }	|d         }
t          |||g|	g          }t          |||g|	g          }t          |||g|
g          }||fS )N









)r   _apply_token_matchesr   get_tokenizer	get_vocabr1   )r`   r   r   	token_idsres	tokenizervocab	newline_1	newline_2	newline_3	newline_4r   s              rQ   r   z/Gemma3nMultiModalProcessor._apply_token_matches1  s    
 55f>OPP	3 I++--	##%%$K	&M	(O	*%	)	"K
 
	
 *	"K
 
	
 *	"K
 
	 #~rP   new_token_idsc           	        
 | j                                         }|                                }|d         
|d         |d         |d         dt          dt          t                   f
fd}t	          t                               }t	          t                               t          |          D ]`\  } ||          }|                    |                               fdt          t          |                    D                        at                      
                    ||          }	fd	|	                                D             S )
Nr   r   r   r   tokrd   c                 2    | k    rgS | k    rgS | gS r\   rO   )r   r   r   r   r   s    rQ   get_repl_tokszGGemma3nMultiModalProcessor._find_mm_placeholders.<locals>.get_repl_toksb  s6    i!9--i!9--5LrP   c              3      K   | ]}V  d S r\   rO   )r   _orig_idxs     rQ   	<genexpr>zCGemma3nMultiModalProcessor._find_mm_placeholders.<locals>.<genexpr>o  s#      !J!Jq(!J!J!J!J!J!JrP   c                 4    i | ]\  }}|fd |D             S )c           	      v    g | ]5}t          |j        |j        |j                 |j        |j                   6S ))r   r   	start_idxtokensis_embed)r-   r   r   r   r   r   )r   prepl_orig_idxss     rQ   r   zOGemma3nMultiModalProcessor._find_mm_placeholders.<locals>.<dictcomp>.<listcomp>t  sX     	 	 	  (ZZ,Q[98Z  	 	 	rP   rO   )r   r   placeholdersr   s      rQ   
<dictcomp>zDGemma3nMultiModalProcessor._find_mm_placeholders.<locals>.<dictcomp>s  sW     
 
 
 ',  	 	 	 	 &	 	 	
 
 
rP   )r   r   r   r~   list	enumerateextendrangelenr   _find_mm_placeholdersitems)r`   r   r   r   r   r   repl_token_idsorig_tok	repl_toksreplsr   r   r   r   r   r   r   s             @@@@@@rQ   r   z0Gemma3nMultiModalProcessor._find_mm_placeholdersU  sy    I++--	##%%$K	&M	(O	*%		s 	tCy 	 	 	 	 	 	 	 	 	 cc"+M":": 	K 	KHh%h//I!!),,,!!!J!J!J!JE#i..4I4I!J!J!JJJJJ--n>OPP
 
 
 
 +0++--
 
 
 	
rP   )rH   rI   rJ   r'   r   r}   r   r|   r   r   r#   r   r&   r   r$   r   r/   r   r   r~   r+   tupler,   r   r-   r   __classcell__r   s   @rQ   r   r      s       O"6 O O O O#!#! f%#! 3;'	#!
 CK(#! 
#! #! #! #! #! #!J	
	
 !(V 4	
 
++	,		
 	
 	
 	
0%0 !(S 10 -	0
 
,	0 0 0 0d"S	" 3" 
tCy<<	=	" " " " " "H*
Cy*
 3*
 
d233	4	*
 *
 *
 *
 *
 *
 *
 *
 *
 *
rP   r   c                   v     e Zd ZdZdeez  def fdZ	 	 d
dej	        dz  dej
        dz  dej
        fd	Z xZS )Gemma3nMultimodalEmbedderzUEmbeds token ids or soft tokens for multimodal content into language
    model space.multimodal_configtext_configc                    t                                                       |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        t          | j        | j                  | _
        t          | j        | j                  | _        t          | j        | j                  | _        t          | j        | j        d          | _        t          | j        | j        d          | _        d S )N)epsF)bias)r  
has_weight)r   __init__hidden_sizemultimodal_hidden_sizerms_norm_epsr  vocab_offset
vocab_sizetext_hidden_sizer   	embeddingr   hard_embedding_normsoft_embedding_normr   embedding_projectionembedding_post_projection_norm)r`   r  r  r   s      rQ   r  z"Gemma3nMultimodalEmbedder.__init__  s	   
 	&7&C#$1-:+6 + 7/O'
 

 $+'$
 $
 $
 
 $+'$
 $
 $
 
 %6'!%
 %
 %
! /6!/
 /
 /
+++rP   N	input_idsinputs_embedsrd   c                     |du |duz  rt          d          ||                     |          }n2|                     || j        z
            }|                     |          }|                     |          \  }}|                     |          S )a  Embeds token ids or soft tokens for multimodal content into language model space.

        Args:
            input_ids: A torch.LongTensor containing the token ids to embed. Values should be in the range
                `[vocab_offset, vocab_offset + vocab_size)`.
            inputs_embeds: A torch.Tensor containing the soft tokens to embed.

        Returns:
            A torch.Tensor of embeddings with  shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
        Nz:You must specify exactly one of input_ids or inputs_embeds)
ValueErrorr  r  r  r  r  r  )r`   r  r  emb_normhard_embemb_norm_projr   s          rQ   forwardz!Gemma3nMultimodalEmbedder.forward  s     -t";< 	L   $//>>HH~~i$2C&CDDH//99H44X>>q22=AAArP   NN)rH   rI   rJ   rK   r   r   r   r  rM   
LongTensorrN   r  r   r   s   @rQ   r   r     s         &
-0CC&
 '&
 &
 &
 &
 &
 &
T .2-1B B#d*B |d*B 
	B B B B B B B BrP   r   )r   dummy_inputsc                       e Zd ZeZg dddgdZ eddddd	d
ddd          Zdddede	f fdZ
dededz  fdZdededz  fdZdedefdZdedeej                 fdZdedeej                 fdZdedefdZ	 d<ddd d!ej        d"edz  d#ej        dz  d$edej        f
 fd%Z	 	 d=d!ej        d&ej        d'edz  d(ej        dz  dedefd)Zd*ej        dej        dz  fd+Zd,eee	ej        f                  de e	         fd-Z!de"fd.Z#e$d/e	d0e%de	dz  fd1            Z&e$d2e'j(        d3e)d4e*d5e	dz  d6e+d7         d8e	d9e	dz  de,fd:            Z-e$d4e*d6e	de)fd;            Z. xZ/S )>Gemma3nForConditionalGeneration)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projzembed_audio.zembed_vision.zlanguage_model.model.zvision_tower.zaudio_tower.zmulti_modal_projector.zlanguage_model.lm_head.zlanguage_model.model)zmodel.embed_audio.zmodel.embed_vision.zmodel.language_model.zmodel.vision_tower.zmodel.audio_tower.zmodel.multi_modal_projector.zlm_head.model)orig_to_new_prefix )prefixvllm_configr*  c          	      &   t                                                       |j        j        }|j        }|j        j        }|| _        || _        || _        |j        j        | _        | 	                    |d          5  t          j        |j                  | _        t          |j        |j                  | _        d d d            n# 1 swxY w Y   | 	                    |d          5  t          j        |j                  | _        t          |j        |j                  | _        d d d            n# 1 swxY w Y   |                     |          5  t)          ||j        t+          |d          dg          | _        t/          j        |j        j        | j        j        j        | j        j        j        | j        j        j        j        j         | j        j        j        j        j!                  | _"        d d d            d S # 1 swxY w Y   d S )Nrh   )configrT   language_modelr   )r+  	hf_configr*  architectures)devicedtype)#r   r  model_configr/  quant_configr  r-  r  r  _mark_tower_modelr   from_configvision_configvision_towerr   embed_visionaudio_configaudio_towerembed_audio_mark_language_modelr;   r<   r.  rM   zerosscheduler_configmax_num_batched_tokensnum_hidden_layershidden_size_per_layer_inputr'  embed_tokensweightr1  r2  per_layer_embeddings)r`   r+  r*  r-  r4  r  r   s         rQ   r  z(Gemma3nForConditionalGeneration.__init__  s   )3"/'4F(!2 ,7##K99 	 	 ) 5V=Q R R RD 9$f&8! !D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ##K99 	 	(4F<OPPPD8#V%7   D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 &&{33 	 	6P' ,#F,<==34	7 7 7D ).,C'9'C*0=DK)/<CI) ) )D%	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s7   =?CCC)?D44D8;D8B%HH
H
ra   rd   Nc                     |                     dd           }|                     dd           }|
J d            |d S t          |          S )NrA   image_embedsz&Gemma3n does not support image_embeds.)rA   )r   r@   )r`   ra   rA   rG  s       rQ   _parse_and_validate_image_inputz?Gemma3nForConditionalGeneration._parse_and_validate_image_input  s[     zz.$77zz.$77##%M###4&LAAAArP   c                     |                     dd           }|d S |                     dd           }|d S t          ||          S )NrW   rX   )rW   rX   )r   rS   )r`   ra   rW   rX   s       rQ   _parse_and_validate_audio_inputz?Gemma3nForConditionalGeneration._parse_and_validate_audio_input$  sa     !'

+BD I I (4$jj)>EE&4!"7 3
 
 
 	
rP   c                 x    i }|D ]4}|dv rd|vr | j         di ||d<   |dk    rd|vr | j        di ||d<   5|S )N)rA   rG  rh   rW   rT   rO   )rH  rJ  )r`   ra   mm_input_by_modality	input_keys       rQ   %_parse_and_validate_multimodal_inputszEGemma3nForConditionalGeneration._parse_and_validate_multimodal_inputs4  s    !   	 	I===#7770T0T 1 11 1$W- 444#7770T0T 1 11 1$W- $#rP   image_inputc                    |d         }|                      |dd          j        }|                    |j        d         | j        j        j        | j        j                                      ddd          	                                }|| j        j        j        dz  z  }| 
                    |	                              d          S )
NrA   FT)rA   
do_poolingreturn_dictr      r5   g      ?r  )r8  last_hidden_statereshapeshaper-  r7  r  vision_soft_tokens_per_imagepermute
contiguousr9  unbind)r`   rO  rA   vision_outputss       rQ   _process_image_inputz4Gemma3nForConditionalGeneration._process_image_inputJ  s     #>2**%%T + 
 

 	 ""$Q')58 
 WQ1Z\\ 	 	$+3?DD  ~ >>EEaHHHrP   audio_inputc                 Z   |d                              d          }|d                              d          }|                     ||           \  }}|                     |          }t          j        | j        dz
  ggt          j        |j                  }|                     |          }t          j        |	                    d          ||          }| j
        j        }	t          ||	|          \  }}
|
dk    rt                              d	|
|	           |                    d          S )
NrW   r5   rX   rT  )r2  r1  )r  r   z\Gemma3n audio encoder produced %d extra tokens. Truncating to match placeholder count of %d.)squeezer;  r<  rM   tensorr  longr1  where	unsqueezer-  audio_soft_tokens_per_imager   loggerwarningr[  )r`   r^  r   rX   audio_outputs
audio_maskaudio_featuresaudio_padding_toksaudio_padding_embsexpected_tokenstokens_truncateds              rQ   _process_audio_inputz4Gemma3nForConditionalGeneration._process_audio_inputb  sR   
 %%<=EEaHH)*?@HHKK$($4$400%
 %
!z )))FF #\o!"#5:n>S
 
 
 "--8J-KK  $$&8.
 
 +A+SO-?,
 ,
(( aNN? 	   $$Q'''rP   c                     | j         di |}|g S g }|D ]j}||         }|dk    r*|                     |          }|                    |           |dk    r*|                     |          }|                    |           k|S )Nrh   rT   rO   )rN  r]  r   rp  )r`   ra   rL  multimodal_embeddingsr   multimodal_inputvision_embeddingsaudio_embeddingss           rQ   embed_multimodalz0Gemma3nForConditionalGeneration.embed_multimodal  s    ItISSFSS'I46 - 	? 	?H3H=7""$($=$=>N$O$O!%,,->???7""#'#<#<=M#N#N %,,-=>>>$$rP   F)is_multimodalhandle_oov_mm_tokenr  rr  rw  rx  c                   || j         j                            |          }|                    d| j        j        j        | j        j        j                  }| j        d |j	        d                  
                    |           ||!t                                          |          S t                                          ||||          S )Nr`  r   )rr  rw  rx  )r.  r'  get_per_layer_input_embeddingsrV  r-  r  rA  rB  rE  rW  copy_r   embed_input_ids)r`   r  rr  rw  rx  per_layer_inputsr   s         rQ   r|  z/Gemma3nForConditionalGeneration.embed_input_ids  s      #28WW     077'9'C   
 %&A(8(>q(A&ABHH   
 !(M,A77**9555ww&&"7' 3	 ' 
 
 	
rP   	positionsintermediate_tensorsr  c                 r    |d }| j         d |j        d                  } | j        j        ||f|||d|}|S )Nr   )r}  r  r  )rE  rW  r.  r'  )r`   r  r~  r  r  ra   r}  hidden_statess           rQ   r  z'Gemma3nForConditionalGeneration.forward  sq      + M  45M}7J17M5MN1+1
 .!5'
 
 
 
 rP   r  c                 6    | j                             |          S r\   )r.  compute_logits)r`   r  s     rQ   r  z.Gemma3nForConditionalGeneration.compute_logits  s     "11-@@@rP   weightsc                 X    t          |           }|                    || j                  S )N)mapper)r9   load_weightshf_to_vllm_mapper)r`   r  loaders      rQ   r  z,Gemma3nForConditionalGeneration.load_weights  s+    "4((""743I"JJJrP   c                 0    t          j        ddd          S )z<
        Get the module prefix in multimodal models
        r.  multi_modal_projectorr8  )r.  	connectortower_model)r   from_string_fieldr_   s    rQ   get_mm_mappingz.Gemma3nForConditionalGeneration.get_mm_mapping  s'     /+-&
 
 
 	
rP   r   ic                 F    |dk    rdS |dk    rdS t          d|           )Nrh   z<image_soft_token>rT   z<audio_soft_token>zUnsupported modality: )r  )clsr   r  s      rQ   get_placeholder_strz3Gemma3nForConditionalGeneration.get_placeholder_str  s;    w''  ''@h@@AAArP   rT   
stt_configr3  language	task_type)
transcribe	translaterequest_promptto_languagec                 @   d}||dk    rdndz  }|dz  }| j                             |d          }	| j                             |d          }
|dk    r|	r	|d|	 z  }n|dk    r|	r|d	|	 z  }|
r|d|
 z  }|d
z  }||j        f}d|i|d}t          t          |          S )z
        Gemma3n supports "free-form" transcription.
        We fix its prompt here to standardize transcriptions/translations
        requests.
        z<start_of_turn>user
r  
Transcribe	Translatez this audior)  z into r  z from z7: <audio_soft_token><end_of_turn>
<start_of_turn>model
rT   )multi_modal_datar   )supported_languagesr   sample_rater	   r   )r  rT   r  r3  r  r  r  r  r   full_lang_namefull_lang_name_toprompts_dicts               rQ   get_generation_promptz5Gemma3nForConditionalGeneration.get_generation_prompt  s    $ ))|";";,,L- 044XrBB377RHH$$$/~///FF+%% 43>333  76#4666MM
./-4e,<OOJ---rP   c                 &    t          ddd           S )N   i>  )max_audio_clip_sr  min_energy_split_window_size)r   )r  r3  r  s      rQ   get_speech_to_text_configz9Gemma3nForConditionalGeneration.get_speech_to_text_config'  s&     "  )-
 
 
 	
rP   r\   r  )0rH   rI   rJ   r    r  packed_modules_mappingr:   r  r   r}   r  r|   Gemma3nImageInputsrH  rS   rJ  r   rN  r   rM   rN   r]  rp  r6   rv  boolr|  r2   r  r  r   r   setr  r   r  classmethodr~   r  npndarrayr   r   r   r   r  r  r   r   s   @rQ   r  r    s5        3
 
 
 

 
 & #1#2%<#2"0,D1+

 

   BD & & &z &3 & & & & & &P
B
B	d	"
B 
B 
B 
B

	d	"
 
 
 
 $f $ $ $ $ $,I'I 
el	I I I I0+('+( 
el	+( +( +( +(Z% %4H % % % %, >B 

 .2$) 
  
  
< 
  4d: 

 |d* 
 " 
 
 
  
  
  
  
  
L <@-1 < < 2D8	
 |d*  
   :A|A 
	A A A AKHU33D-E$F K3s8 K K K K
 
 
 
 
 B3 B3 B3: B B B [B &.z&. '&. "	&.
 *&. 45&. &. 4Z&. 
&. &. &. [&.P 

&

36

	

 

 

 [

 

 

 

 

rP   r  )ccollections.abcr   r   r   typingr   r   r   r	   numpyr  rM   r
   transformersr   r   transformers.models.gemma3nr   r   r   r   r   r   transformers.models.siglipr   vllm.configr   r   r   vllm.config.multimodalr   vllm.inputs.datar   vllm.loggerr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   3vllm.model_executor.layers.vocab_parallel_embeddingr   "vllm.model_executor.models.gemma3nr   .vllm.model_executor.models.gemma3n_audio_utilsr   )vllm.model_executor.models.module_mappingr   "vllm.model_executor.models.whisperr    vllm.multimodalr!   vllm.multimodal.inputsr"   r#   r$   vllm.multimodal.parser%   r&   r'   vllm.multimodal.processingr(   $vllm.multimodal.processing.processorr)   r*   r+   r,   r-   r.   r/   r0   r1   vllm.sequencer2   vllm.utils.tensor_schemar3   r4   
interfacesr6   r7   r8   utilsr9   r:   r;   r<   rH   rg  rm   rn   r@   rS   r  rZ   r   r   Moduler   register_processorr  rO   rP   rQ   <module>r     s    8 7 7 7 7 7 7 7 7 7 0 0 0 0 0 0 0 0 0 0 0 0            0 0 0 0 0 0 0 0                @ ? ? ? ? ? C C C C C C C C C C 3 3 3 3 3 3 ' ' ' ' ' ' # # # # # # 8 8 8 8 8 8 ? ? ? ? ? ? V V V V V V A A A A A A      E D D D D D G G G G G G / / / / / /         
         
 > = = = = =
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 . - - - - - > > > > > > > > W W W W W W W W W W            
X		   
J 
J 
J 
J 
Jl 
J 
J 
J
I 
I 
I 
I 
I 
I 
I 
I - 4
 4
 4
 4
 4
. 4
 4
 4
n)
 )
 )
 )
 )
 67L M )
 )
 )
Xu
 u
 u
 u
 u
!89N!O u
 u
 u
pGB GB GB GB GB	 GB GB GBT ('	*  
a
 a
 a
 a
 a
I!#8a
 a
 
a
 a
 a
rP   