
    .`i<              	       (   d dl Z d dlZd dlmZmZmZ d dlmZ d dlm	Z	m
Z
mZ d dlZd dlZd dlmZ d dlmZmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z>m?Z?m@Z@ d dlAmBZBmCZC d dlDmEZEmFZFmGZGmHZHmIZI d dlJmKZK d dlLmMZM d dlNmOZOmPZP d dlQmRZR d d lSmTZT d!d"lUmVZVmWZWmXZX d!d#lYmZZZm[Z[m\Z\m]Z]m^Z^  e&e_          Z` G d$ d%e ja                  Zb G d& d'eO          Zc G d( d)e,          Zd G d* d+eje                  Zf G d, d-ejg                  Zh G d. d/eh          Zi G d0 d1ejg                  Zj G d2 d3ejg                  Zk G d4 d5ejg                  Zl G d6 d7ejg                  Zm ed d8d9:           G d; d<ejg                              Zn G d= d>ejg                  Zo G d? d@eF          Zp G dA dBeEep                   Zq G dC dDeGep                   Zr e<js        erepeqE           G dF dGejg        eXeW                      ZtdHeeuevejw        f                  dIevdJeeuevejw        f                  fdKZxdS )L    N)IterableMappingSequence)nullcontext)	AnnotatedLiteralcast)nn)BatchFeatureWhisperConfigWhisperFeatureExtractor)	sinusoids)	Attention)support_torch_compile)CacheConfigModelConfigSpeechToTextConfig
VllmConfig)BaseDummyOptions)$get_tensor_model_parallel_world_size)
PromptType)init_logger)
get_act_fn)CrossAttention)MMEncoderAttention)ColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)ParallelLMHead)default_weight_loader)ISO639_1_SUPPORTED_LANGS)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)MultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseProcessingInfoEncDecMultiModalProcessorPromptReplacementPromptUpdate)cached_processor_from_config)json_map_leaves)TensorSchemaTensorShape)set_default_torch_dtype)AttentionType   )MultiModalEmbeddingsSupportsMultiModalSupportsTranscription)AutoWeightsLoaderWeightsMappercast_overflow_tensorsmake_layersmaybe_prefixc                       e Zd ZdZdZdZdS )WhisperPosEmbedType
sinusoidalropelearnedN)__name__
__module____qualname__
SINUSOIDALROPELEARNED     v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/whisper.pyr?   r?   L   s        JDGGGrJ   r?   c                   `    e Zd ZU dZeeej                 dz   eddd          f         e	d<   dS )WhisperAudioInputszl
    Dimensions:
        - b: Batch size
        - nmb: Number of mel bins
        - t: Time frames (M)
    Nbnmbtinput_features)
rC   rD   rE   __doc__r   listtorchTensorr2   __annotations__rI   rJ   rK   rM   rM   R   s^           U\T!C$$	&     rJ   rM   c                   \     e Zd ZdZdej        dej        dej        dej        f fdZ xZS )WhisperEncoderAttentionzBMulti-headed attention for Whisper encoder with 2D tensor support.querykeyvaluereturnc                 .   |                                 dk    }|r?|                    d          }|                    d          }|                    d          }t                                          |||          }|r|                    d          }|S )zo
        Input shape: batch_size x seq_len x hidden_size
                     or seq_len x hidden_size
           r   )dim	unsqueezesuperforwardsqueeze)selfrY   rZ   r[   is_2dout	__class__s         rK   rb   zWhisperEncoderAttention.forwardc   s     		q  	'OOA&&E--""COOA&&E ggooeS%00 	!++a..C
rJ   )rC   rD   rE   rR   rT   rU   rb   __classcell__rg   s   @rK   rX   rX   `   sq        LL| \ |	
 
         rJ   rX   c                   .     e Zd Zdedef fdZd Z xZS )WhisperPositionalEmbeddingnum_positionsembedding_dimc                 L    t                                          ||           d S N)ra   __init__)rd   rl   rm   rg   s      rK   rp   z#WhisperPositionalEmbedding.__init__}   s#    66666rJ   c                     | j         |         S ro   )weight)rd   position_idss     rK   rb   z"WhisperPositionalEmbedding.forward   s    {<((rJ   )rC   rD   rE   intrp   rb   rh   ri   s   @rK   rk   rk   |   sZ        7c 7# 7 7 7 7 7 7) ) ) ) ) ) )rJ   rk   c                        e Zd Zdej        ddddfdedededededz  d	edz  d
edz  de	f fdZ
	 	 	 ddeded
edz  de	ddf
dZdej        fdZ xZS )WhisperAttentionTN 	embed_dim	num_headsbias	attn_typeper_layer_sliding_windowcache_configquant_configprefixc	                 F   t                                                       || _        t                      }	|| _        | j        |	z  dk    sJ | j        |	z  | _        | j        |	k    r| j        |	z  dk    sJ n|	| j        z  dk    sJ t          d| j        |	z            | _        | j        | j        z  | _        | j        | j        z  | _	        | j        | j        z  | _
        || _        | j        |z  | j        k    rt          d| j         d| d          | j        dz  | _        |                     ||||           t          ||||| d	          | _        |t"          j        k    r.t'          | j        | j        | j        | j        
          | _        d S | j        t"          j        k    r:t-          | j        | j        | j        | j        ||| d| j                  | _        d S t/          | j        | j        | j        | j        ||| d| j        |	  	        | _        d S )Nr   r5   z;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩r   z	.out_proj
input_sizeoutput_sizerz   r~   r   )num_kv_headsz.attn)r   r}   r~   r   r{   )r   r}   r~   r   r{   r|   )ra   rp   rx   r   total_num_headsry   maxr   head_dimq_sizekv_sizer{   
ValueErrorscaling	_init_qkvr   out_projr4   ENCODERrX   attnENCODER_DECODERr   r   )rd   rx   ry   rz   r{   r|   r}   r~   r   tp_sizerg   s             rK   rp   zWhisperAttention.__init__   s    	"688(#g-2222-87** ''1Q66666 T11Q66664#77#BCC$*>>nt}4(4=8"MI%$.88C>C C5>C C C   }d*y$VDDD) !%'''
 
 
 ---/!.	  DIII ^}<<<&!.)) '''.	 	 	DIII "!.)) '''.)A
 
 
DIIIrJ   r\   c           
      `    t          || j        | j        | j        ||| d          | _        d S )Nz	.qkv_projhidden_size	head_sizer   total_num_kv_headsrz   r~   r   )r   r   r   qkv_projrd   rx   rz   r~   r   s        rK   r   zWhisperAttention._init_qkv   sD     *!m 0#3%'''
 
 
rJ   hidden_statesc                     |                      |          \  }}|                    | j        | j        | j        gd          \  }}}|                     |||          }|                     |          \  }}|S Nr_   )r   splitr   r   r   r   )	rd   r   qkv_qkvattn_outputoutputs	            rK   rb   zWhisperAttention.forward   sp     }--Q))T[$,E2)NN1aii1a((MM+..	rJ   TNrw   )rC   rD   rE   r4   DECODERrt   boolr   r    strrp   r   rT   rU   rb   rh   ri   s   @rK   rv   rv      s<       
 #0#8/3+/26K KK K 	K
 !K #&*K "D(K )4/K K K K K K K` 26
 

 
 )4/	

 
 

 
 
 
"|       rJ   rv   c                        e Zd Z	 	 	 	 ddededededz  dedz  d	ef fd
Z	 	 	 ddedededz  d	eddf
dZ	de
j        de
j        dz  fdZ xZS )WhisperCrossAttentionTNrw   rx   ry   rz   r}   r~   r   c           	      l    t                                          ||||||t          j                   d S )N)rx   ry   rz   r}   r~   r   r{   )ra   rp   r4   r   )rd   rx   ry   rz   r}   r~   r   rg   s          rK   rp   zWhisperCrossAttention.__init__   sH     	%%#3 	 	
 	
 	
 	
 	
rJ   r\   c           
          t          ||||| d          | _        t          || j        d| j        ||| d          | _        d S )Nz.q_projr   r   z.kv_projr   )r   q_projr   r   r   kv_projr   s        rK   r   zWhisperCrossAttention._init_qkv  sq     + !%%%%
 
 
 )!m#3%&&&
 
 
rJ   r   encoder_hidden_statesc                    |                      |          \  }}|?|                     |          \  }}|                    | j        | j        gd          \  }}nd x}}|                     |||          }|                     |          \  }	}|	S r   )r   r   r   r   r   r   )
rd   r   r   r   r   kvr   r   r   r   s
             rK   rb   zWhisperCrossAttention.forward  s    
 {{=))1 !,LL!677EB88T\4<8b8AADAqqLAii1a((MM+..	rJ   )TNNrw   r   )rC   rD   rE   rt   r   r   r    r   rp   r   rT   rU   rb   rh   ri   s   @rK   r   r      s       
 +/26
 

 
 	

 "D(
 )4/
 
 
 
 
 
 
, 26
 

 
 )4/	

 
 

 
 
 
0|  %|d2       rJ   r   c                   V     e Zd Z	 	 ddededededz  def
 fdZd	ej        fd
Z	 xZ
S )
WhisperMLPNrw   rx   ffn_dimact_fnr~   r   c                     t                                                       t          |          | _        t	          |||| d          | _        t          |||| d          | _        d S )Nz.fc1)r   r   r~   r   z.fc2)ra   rp   r   activation_fnr   fc1r   fc2)rd   rx   r   r   r~   r   rg   s         rK   rp   zWhisperMLP.__init__4  s     	'//' %???	
 
 
 %!%???	
 
 
rJ   r   c                     |                      |          \  }}|                     |          }|                     |          \  }}|S ro   )r   r   r   )rd   r   r   s      rK   rb   zWhisperMLP.forwardL  sG    88M22q**=9988M22qrJ   )Nrw   )rC   rD   rE   rt   r   r    rp   rT   rU   rb   rh   ri   s   @rK   r   r   3  s         37
 

 
 	

 )4/
 
 
 
 
 
 
0U\        rJ   r   c                   D     e Zd Zdddedef fdZdej        fdZ xZ	S )WhisperEncoderLayerrw   r   vllm_configr   c          
         t                                                       |j        j        }t	          |dd           }|j        }|j        }|j        | _        t          | j        |j
        t          j        |||| d          | _        t          j        | j                  | _        t#          |j        |j        |j        || d          | _        t          j        | j                  | _        d S )Nsliding_window
.self_attn)rx   ry   r{   r|   r}   r~   r   .mlprx   r   r   r~   r   )ra   rp   model_config	hf_configgetattrr}   r~   d_modelrx   rv   encoder_attention_headsr4   r   	self_attnr
   	LayerNormself_attn_layer_normr   encoder_ffn_dimactivation_functionmlpfinal_layer_norm)rd   r   r   configr   r}   r~   rg   s          rK   rp   zWhisperEncoderLayer.__init__T  s    )3 )94@@"/"/)n4#+%3%%(((
 
 
 %'L$@$@!n*-%???
 
 
 !#T^ < <rJ   r   c                     |}|                      |          }|                     |          }||z   }|}|                     |          }|                     |          }||z   }t	          |          }|S )Nr   )r   r   r   r   r;   )rd   r   residuals      rK   rb   zWhisperEncoderLayer.forwardo  s     !11-@@]CC =0 --m<<// =0-m<<rJ   
rC   rD   rE   r   r   rp   rT   rU   rb   rh   ri   s   @rK   r   r   S  sv        AC = = =z =3 = = = = = =6|       rJ   r   c                   X     e Zd Zdddedef fdZdej        dej        dz  fd	Z xZ	S )
WhisperDecoderLayerrw   r   r   r   c          	      4   t                                                       |j        j        }|j        }|j        }t          |j        |j        t          j
        ||| d          | _        t          j        |j                  | _        t          |j        |j        ||| d          | _        t          j        |j                  | _        t%          |j        |j        |j        || d          | _        t          j        |j                  | _        d S )Nr   )rx   ry   r{   r}   r~   r   z.encoder_attn)rx   ry   r}   r~   r   r   r   )ra   rp   r   r   r}   r~   rv   r   decoder_attention_headsr4   r   r   r
   r   r   r   encoder_attnencoder_attn_layer_normr   decoder_ffn_dimr   r   r   )rd   r   r   r   r}   r~   rg   s         rK   rp   zWhisperDecoderLayer.__init__  s   )3"/"/)n4#+%%(((
 
 
 %'L$@$@!1n4%%+++
 
 
 (*|FN'C'C$n*-%???
 
 
 !#V^ < <rJ   r   r   Nc                 2   |}|                      |          }|                     |          }||z   }|}|                     |          }|                     ||          }||z   }|}|                     |          }|                     |          }||z   }|S )Nr   )r   r   )r   r   r   r   r   r   )rd   r   r   r   s       rK   rb   zWhisperDecoderLayer.forward  s    
 !11-@@]CC =0 44]CC))'"7 * 
 
 !=0 --m<<// =0rJ   r   ri   s   @rK   r   r     s        AC = = =z =3 = = = = = =@|  %|d2       rJ   r   c                   t     e Zd Zddddededef fdZdej        e	ej                 z  d	ej        fd
Z
 xZS )WhisperEncoderrw   F)r   init_in_fp32r   r   r   c                   t                                                       j        j        }|j        }t          t          |dd                    | _        |j        | _        |j	        | _	        |j
        rt          j        |          nd| _        t          j        | j        |dd          | _        t          j        ||ddd          | _        | j        j        d	         | j        j        d	         z  | _        t)          |j        fd
| d          \  | _        | _        | _        t          j        |j                  | _        | j        t
          j        t
          j        fvrt;          d| j                   |rt=          t>          j                   ntC                      }t?          j"                    5  |5  t          j#        | j	        |          | _$        | j$        j%        &                    tO          | j$        j%        j(                    d d d            n# 1 swxY w Y   d d d            d S # 1 swxY w Y   d S )N	pos_embedr@         ?   r5   )kernel_sizepaddingr^   )strider   r   r   c                 ,    t          |  d          S N.layersr   r   )r   r   r   s    rK   <lambda>z)WhisperEncoder.__init__.<locals>.<lambda>  %    .'60B0B0B   rJ   r   r   z\Only sinusoidal or learned position embeddings are supported for non-causal models, but got ))ra   rp   r   r   r   r?   r   pos_embed_typenum_mel_binsmax_source_positionsscale_embeddingmathsqrtembed_scaler
   Conv1dconv1conv2r   total_strider<   encoder_layersstart_layer	end_layerlayersr   
layer_normrF   rH   r   r3   rT   float32r   no_grad	Embeddingembed_positionsrr   copy_r   shape)rd   r   r   r   r   rx   maybe_fp32_init_ctxrg   s    `     rK   rp   zWhisperEncoder.__init__  s    	)3N	1FK66
 
 #/$*$?!393IR49Y///sYt0)TUVVV
Yy)A1VWXXX
 J-a04:3DQ3GG8C!    %%%9
 9
 9
5$.$+ ,v~66*''
 
 
 H262EH H   7CU#EM222 	
 MOO	 		 	 $&<0I9#U#UD  '--4/6<=  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s7   =H9 AH!H9!H%	%H9(H%	)H99H= H=rQ   r\   c                    g }d}|D ]}t           j                            |                     |                    }t           j                            |                     |                    }|                    dd          }|| j        j        d |                    d          d d f         z   	                    |j
                  }|                    |           |j        dk    }|rt          j        |          }nt          j        |d          }| j        D ]} ||          }|                     |          }|S )NFr   r^   r   r   )r
   
functionalgelur   r   	transposer  rr   sizetodtypeappendndimrT   catstackr   r   )rd   rQ   r   input_is_batchedfeaturesembedsencoder_layers          rK   rb   zWhisperEncoder.forward  sC     & 
	/ 
	/H]''

8(<(<==F]''

6(:(:;;F%%b"--Ft3:;LV[[__;Laaa;OPPTT F   (((%{Q 	>!Im44MM!K1===M![ 	9 	9M)M-88MM66rJ   )rC   rD   rE   r   r   r   rp   rT   rU   rS   rb   rh   ri   s   @rK   r   r     s        8:QV/ / /(/25/JN/ / / / / /b#lT%,-??	       rJ   r   r   )	input_ids	positions)dynamic_arg_dimsc                   |     e Zd Zdddedef fdZdej        dej        dz  fd	Zd
ej        dej        fdZ	 xZ
S )WhisperDecoderrw   r   r   r   c                >   t                                                       j        j        }|j        | _        |j        | _        |j        | _        |j	        | _	        |j
        rt          j        |j                  nd| _        t          j        |j        |j        | j                  | _        t'          | j        |j                  | _        t+          |j        fd| d          \  | _        | _        | _        t          j        |j                  | _        d S )Nr   c                 ,    t          |  d          S r   )r   r   s    rK   r   z)WhisperDecoder.__init__.<locals>.<lambda>  r   rJ   r   r   )ra   rp   r   r   decoder_layerdrop	layerdroppad_token_idpadding_idxmax_target_positionsr   r   r   r   r   r   r
   r  
vocab_sizeembed_tokensrk   r  r<   decoder_layersr   r   r   r   r   )rd   r   r   r   rg   s    `  rK   rp   zWhisperDecoder.__init__  s   )31!.$*$?!$*$?!8>8NW49V^444TWLv~t/?
 
  :%v~ 
  
 9D!    %%%9
 9
 9
5$.$+ ,v~66rJ   r  r   Nc                     |                      |          }|                     |          }||z   }| j        D ]} |||          }|                     |          }|S )N)r   )embed_input_idsr  r   r   )rd   r  r  r   inputs_embedsr   decoder_layers          rK   rb   zWhisperDecoder.forward&  s|     ,,Y77((33	%	1![ 	 	M)M&;  MM
 66rJ   r  r\   c                 ,    |                      |          S ro   )r#  )rd   r  s     rK   r&  zWhisperDecoder.embed_input_ids9  s      +++rJ   )rC   rD   rE   r   r   rp   rT   rU   rb   r&  rh   ri   s   @rK   r  r    s        AC 7 7 7z 73 7 7 7 7 7 70 <  %|d2	   &, ,%, , , , , , , , ,rJ   r  c                       e Zd Zdddedef fdZdej        dz  dej        d	eej                 d
ej        fdZ	dej        eej                 z  dz  d
ej        dz  fdZ
deeeej        f                  d
ee         fdZ xZS )WhisperModelrw   r   r   r   c                    t                                                       t          || d          | _        t	          || d          | _        d S )Nz.encoderr   z.decoder)ra   rp   r   encoderr  decoder)rd   r   r   rg   s      rK   rp   zWhisperModel.__init__>  sg    %#v,?,?,?
 
 
 &#v,?,?,?
 
 
rJ   r  Nr  encoder_outputsr\   c                     t          |          rt          j        |d          nd }|                     |||          }|S )Nr   r   )r  r  r   )lenrT   r  r.  )rd   r  r  r/  
enc_statesdecoder_outputss         rK   rb   zWhisperModel.forwardG  sT     ;>o:N:NXUYA6666TX
,,", ' 
 

 rJ   rQ   c                 4    |d S |                      |          S ro   )r-  )rd   rQ   s     rK   get_encoder_outputsz WhisperModel.get_encoder_outputsU  s!     !4||N+++rJ   weightsc                    g d}t          |                                           }t                      }|D ]\  }}|D ]X\  }}}	||vr|                    ||          }|                    d          r||vr;||         }
|
j        } ||
||	            nD|                    d          r||vrz||         }
t          |
dt                    } ||
|           |                    |           |S )N)).self_attn.qkv_projz.self_attn.q_projr   )r8  z.self_attn.k_projr   )r8  z.self_attn.v_projr   ).encoder_attn.kv_projz.encoder_attn.k_projr   )r9  z.encoder_attn.v_projr   z.biasweight_loader)	dictnamed_parameterssetreplaceendswithr:  r   r"   add)rd   r6  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr:  s               rK   load_weightszWhisperModel.load_weights]  s=   "
 "
 "
 4002233"%%%#* 	$ 	$D-5K 4 41
Kd**||K<<==)) d+.E.E#D) % 3e]H=== ==)) d+.E.E#D) '@U V Ve]333d####rJ   )rC   rD   rE   r   r   rp   rT   rU   rS   rb   r5  r   tupler=  rJ  rh   ri   s   @rK   r+  r+  =  s       AC 
 
 
z 
3 
 
 
 
 
 
<$& < el+	
 
   ,tEL'99D@, 
	, , , ,!HU33D-E$F !3s8 ! ! ! ! ! ! ! !rJ   r+  c                       e Zd ZdefdZedefd            Zdee	e
dz  f         fdZdedefdZde
fdZde
fd	ZdS )
WhisperProcessingInfor\   c                 @    | j                             t                    S ro   )ctxget_hf_configr   rd   s    rK   rP  z#WhisperProcessingInfo.get_hf_config  s    x%%m444rJ   c                     dS )NTrI   rQ  s    rK   skip_prompt_length_checkz.WhisperProcessingInfo.skip_prompt_length_check  s    trJ   Nc                 
    ddiS )Naudior5   rI   rQ  s    rK   get_supported_mm_limitsz-WhisperProcessingInfo.get_supported_mm_limits  s    |rJ   kwargsc                 \     | j         di |}|j        }t          |t                    sJ |S )NrI   )get_hf_processorfeature_extractor
isinstancer   )rd   rW  hf_processorrZ  s       rK   get_feature_extractorz+WhisperProcessingInfo.get_feature_extractor  sA    ,t,66v66(:+-DEEEEE  rJ   c                     dS )z7Return target audio channels for Whisper models (mono).r5   rI   rQ  s    rK   get_target_channelsz)WhisperProcessingInfo.get_target_channels  s    qrJ   c                 4    |                                  j        S ro   )rP  r   rQ  s    rK   get_num_audio_tokensz*WhisperProcessingInfo.get_num_audio_tokens  s    !!##88rJ   )rC   rD   rE   r   rP  propertyr   rS  r   r   rt   rV  objectr   r]  r_  ra  rI   rJ   rK   rM  rM    s        5} 5 5 5 5 $    XcDj)A    !f !9P ! ! ! !S    9c 9 9 9 9 9 9rJ   rM  c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	WhisperDummyInputsBuilder	mm_countsr\   c                 8    |                     dd          }d|z  S )NrU  r   z<|startoftranscript|>)get)rd   rf  
num_audioss      rK   get_dummy_textz(WhisperDummyInputsBuilder.get_dummy_text  s     ]]7A..
&33rJ   Nseq_len
mm_optionsc                     | j                                         }|j        }|j        |z  }|                    dd          }|r|                    d          nd }d|                     |||          iS )NrU  r   )lengthri  	overrides)infor]  sampling_ratechunk_lengthrh  _get_dummy_audios)	rd   rk  rf  rl  rZ  rq  	audio_lenri  audio_overridess	            rK   get_dummy_mm_dataz+WhisperDummyInputsBuilder.get_dummy_mm_data  s     !I;;==)7%2]B	]]7A..
5?I*..111T T++ Z? ,  
 	
rJ   ro   )
rC   rD   rE   r   r   rt   rj  r   r%   rv  rI   rJ   rK   re  re    s        4S(9 4c 4 4 4 4 =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
rJ   re  c            
           e Zd ZdefdZdeee         z  dedeee         z  fdZ	dede
eef         de
eef         de
eef         def
 fdZd	ed
e
eef         de
eef         fdZded
e
eef         dedee         fdZ xZS )WhisperMultiModalProcessorr\   c                     | j                                         }t          |j        | j                                                   S )N)	target_srtarget_channels)rp  r]  r)   rq  r_  )rd   rZ  s     rK   _get_data_parserz+WhisperMultiModalProcessor._get_data_parser  sD     I;;==#'5 I99;;
 
 
 	
rJ   promptmm_datac                     dgS )Nr   rI   )rd   r}  r~  s      rK   create_encoder_promptz0WhisperMultiModalProcessor.create_encoder_prompt  s     s
rJ   	mm_kwargs
tok_kwargsc                 "   |rJ | j         j        di |}t          |                    d                    }t          di |d|j        i}t                                          ||||          }d|v r|                    d          |d<   |S )Naudios)rU  rq  )r}  r~  r  r  labelsr  rI   )rp  r]  r;  poprq  ra   _call_hf_processor)rd   r}  r~  r  r  rZ  processed_outputsrg   s          rK   r  z-WhisperMultiModalProcessor._call_hf_processor  s      	 ?	 ? L L) L LX!6!6777G   /=  I "GG66!	 7 
 
 (((->-B-B8-L-Lk*  rJ   	hf_inputshf_processor_mm_kwargsc                 F    t          t          j        d                    S )NrU  rQ   )r;  r&   batched)rd   r  r  s      rK   _get_mm_fields_configz0WhisperMultiModalProcessor._get_mm_fields_config  s!    
 #8#@#I#IJJJJrJ   mm_itemsout_mm_kwargsc                 d    | j                                         }t          ddgdg|z            gS )NrU  r   )modalitytargetreplacement)rp  ra  r-   )rd   r  r  r  
num_tokenss        rK   _get_prompt_updatesz.WhisperMultiModalProcessor._get_prompt_updates  sE     Y3355
 sC*,  
 	
rJ   )rC   rD   rE   r)   r|  r   rS   rt   r%   r  r   rc  r   r  r&   r  r(   r'   r   r.   r  rh   ri   s   @rK   rx  rx    sw       
"6 
 
 
 
	d3i	 $	 
tCy		 	 	 	!! f%! 3;'	!
 CK(! 
! ! ! ! ! !0KK !(V 4K 
++	,	K K K K
%
 !(V 4
 -	

 
,	
 
 
 
 
 
 
 
rJ   rx  )rp  dummy_inputsc                       e Zd Zg dddgdZ eddd          Zd	Zd	ZeZ	e
d
edz  dedz  f fd            Ze
dej        deded
edz  ded         dededz  defd            Ze
dedededz  fd            Ze
dededefd            Ze
dededededz  fd            Zdddedef fd Z	 d2d!ej        d"ej        d#eej                 dz  dej        fd$Zd%ede fd&Z!	 d2dd'd(d!ej        d)e dz  d*ej        dz  d+e"dej        f
d,Z#d%ede$fd-Z%d.ej        dej        fd/Z&d0e'e(eej        f                  de)e         fd1Z* xZ+S )3WhisperForConditionalGeneration)zself_attn.q_projzself_attn.k_projzself_attn.v_projzencoder_attn.k_projzencoder_attn.v_proj)zself_attn.qkv_projzencoder_attn.kv_projz	.mlp.fc1.z	.mlp.fc2.)z.fc1.z.fc2.)orig_to_new_substrTlanguageNr\   c                     |t                               d           d}t                                          |          S )NzDefaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field in the TranscriptionRequest.en)loggerwarningra   validate_language)clsr  rg   s     rK   r  z1WhisperForConditionalGeneration.validate_language  sE     NN/  
 Hww((222rJ   rU  r   
stt_config	task_type)
transcribe	translaterequest_promptto_languagec                     |t          d          dd||j        fid|rd| ndd| dz   d| d	z   d
}t          t          |          S )Nz;Language must be specified when creating the Whisper promptrw   rU  )r}  multi_modal_dataz<|prev|>z<|startoftranscript|><|z|>z<|z|><|notimestamps|>)encoder_promptdecoder_prompt)r   sample_rater	   r   )	r  rU  r   r  r  r  r  r  r}  s	            rK   get_generation_promptz5WhisperForConditionalGeneration.get_generation_prompt  s     M   eZ%;<%  1?F,N,,,B8H88894y4445
 
 J'''rJ   r  ic                 N    |                     d          rd S t          d          )NrU  z Only audio modality is supported)
startswithr   )r  r  r  s      rK   get_placeholder_strz3WhisperForConditionalGeneration.get_placeholder_str>  s,    w'' 	4;<<<rJ   c                 j    t          |          }t          |j        j        |j        j                  S )N)max_audio_clip_sr  )r/   r   rZ  rr  rq  )r  r   r  	processors       rK   get_speech_to_text_configz9WhisperForConditionalGeneration.get_speech_to_text_configE  s:     1>>	!&8E!3A
 
 
 	
rJ   audio_duration_sc                 ~    t          |          }|j        j        }|J t          j        ||j        z  |z            S ro   )r/   rZ  
hop_lengthr   ceilr  )r  r  r  r   r  r  s         rK   ra  z4WhisperForConditionalGeneration.get_num_audio_tokensP  sH     1>>	0;
%%%
 y)J,BBZOPPPrJ   rw   r   r   r   c          	      V   t                                                       |j        j        }|j        }|| _        |j        j        | _        |                     |t          dt          i          5  t          ||          | _        d d d            n# 1 swxY w Y   t          |j        |j        |t          |d                    | _        | j                            | j        j        j                  | _        t)          |dd          }t+          |j        |          | _        d S )	NrU  )language_targetstower_targetsr   proj_out)r~   r   logit_scaler   )scale)ra   rp   r   r   r~   r   r  _mark_composite_modelr  r   r+  modelr!   r"  r   r=   r  tie_weightsr.  r#  r   r   logits_processor)rd   r   r   r   r~   r  rg   s         rK   rp   z(WhisperForConditionalGeneration.__init__`  sh   )3"/ -3
''+"N3 ( 
 
 	N 	N
 &+fMMMDJ	N 	N 	N 	N 	N 	N 	N 	N 	N 	N 	N 	N 	N 	N 	N 'N%
33	
 
 
 11$*2D2QRRfmS99 /0A U U Us   1BBBr  r  r/  c                 >    |g }|                      |||          }|S )N)r  r  r/  )r  )rd   r  r  r/  rW  r3  s         rK   rb   z'WhisperForConditionalGeneration.forwardx  s8     " O**+ % 
 

 rJ   rW  c                      | j         di |}| j                            |d                   }|                    d          S )NrQ   r   r   rI   )_parse_and_validate_audio_inputr  r5  unbind)rd   rW  audio_input
enc_outputs       rK   embed_multimodalz0WhisperForConditionalGeneration.embed_multimodal  sL    :d:DDVDDZ33K@P4QRR
  Q '''rJ   F)is_multimodalhandle_oov_mm_tokenmultimodal_embeddingsr  r  c                @    | j         j                            |          S ro   )r  r.  r&  )rd   r  r  r  r  s        rK   r&  z/WhisperForConditionalGeneration.embed_input_ids  s     z!11)<<<rJ   c                 z     |                     dd           }|t           fd|          }t          |          S )NrQ   c                 8    |                      j                  S ro   )r  r  )xrd   s    rK   r   zQWhisperForConditionalGeneration._parse_and_validate_audio_input.<locals>.<lambda>  s    qttDJ7G7G rJ   r  )r  r0   rM   )rd   rW  rQ   s   `  rK   r  z?WhisperForConditionalGeneration._parse_and_validate_audio_input  sG    $4d;;%,-G-G-G-GXXN!@@@@rJ   r   c                 <    |                      | j        |          }|S ro   )r  r  )rd   r   logitss      rK   compute_logitsz.WhisperForConditionalGeneration.compute_logits  s    &&t}mDDrJ   r6  c                 ~    t          | dg          }t          |d          }|                    || j                  S )Nz	proj_out.)skip_prefixesz.k_proj.weight)mapper)r9   _create_fake_bias_for_k_projrJ  hf_to_vllm_mapper)rd   r6  loaders      rK   rJ  z,WhisperForConditionalGeneration.load_weights  sD    "4}EEE /w8HII""743I"JJJrJ   ro   ),rC   rD   rE   packed_modules_mappingr:   r  supports_transcription_onlysupports_segment_timestampr#   supported_languagesclassmethodr   r  npndarrayr   r   r   r   r  rt   r  r  floatra  r   rp   rT   rU   rS   rb   rc  r6   r  r   r&  rM   r  r  r   rK  r=  rJ  rh   ri   s   @rK   r  r    s       
 
 

 "78M N  &%0;GG  
 #'!%23t 3d
 3 3 3 3 3 [3 (z( "( '	(
 *( 45( ( 4Z( 
( ( ( [(< =3 =3 =3: = = = [= 
&
36
	
 
 
 [
 QQ 'Q "	Q
 
tQ Q Q [Q BD V V Vz V3 V V V V V V8 6:	 < < el+d2	 
    ( (4H ( ( ( ( >B
=
 .2$)
= 
= 
=<
=  4d:
=
 |d*
= "
= 

= 
= 
= 
=A ACU A A A AEL U\    KHU33D-E$F K3s8 K K K K K K K KrJ   r  r6  fake_bias_key_namer\   c              #      K   | D ]k\  }}|                     |          rKt          j        |                    d                    }|                    dd          }||f||fgE d{V  ||fV  ldS )z
    Create full zeros bias for k_proj weight in self-attn and x-attn layers.
    So that the bias for k_proj in qkv_proj can be initialized with zeros.
    r   rr   rz   N)r?  rT   zerosr  r>  )r6  r  rD  rr   rz   	bias_names         rK   r  r    s          f==+,, 	;;v{{1~~..DXv66IvD(9::::::::Fl rJ   )yenumr   collections.abcr   r   r   
contextlibr   typingr   r   r	   numpyr  rT   r
   transformersr   r   r   ,transformers.models.whisper.modeling_whisperr   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr   r   r   r   vllm.config.multimodalr   vllm.distributedr   vllm.inputs.datar   vllm.loggerr   %vllm.model_executor.layers.activationr   4vllm.model_executor.layers.attention.cross_attentionr   9vllm.model_executor.layers.attention.mm_encoder_attentionr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr    3vllm.model_executor.layers.vocab_parallel_embeddingr!   -vllm.model_executor.model_loader.weight_utilsr"   (vllm.model_executor.models.whisper_utilsr#   vllm.multimodalr$   vllm.multimodal.inputsr%   r&   r'   vllm.multimodal.parser(   r)   vllm.multimodal.processingr*   r+   r,   r-   r.   !vllm.transformers_utils.processorr/   vllm.utils.jsontreer0   vllm.utils.tensor_schemar1   r2   vllm.utils.torch_utilsr3   vllm.v1.attention.backendr4   
interfacesr6   r7   r8   utilsr9   r:   r;   r<   r=   rC   r  Enumr?   rM   rX   r  rk   Modulerv   r   r   r   r   r   r  r+  rM  re  rx  register_processorr  rK  r   rU   r  rI   rJ   rK   <module>r     sk     7 7 7 7 7 7 7 7 7 7 " " " " " " + + + + + + + + + +                    
 C B B B B B * * * * * * = = = = = = P P P P P P P P P P P P 3 3 3 3 3 3 A A A A A A ' ' ' ' ' ' # # # # # # < < < < < < O O O O O O X X X X X X         
 H G G G G G F F F F F F N N N N N N O O O O O O      0 / / / / /         
 L K K K K K K K              K J J J J J / / / / / / > > > > > > > > : : : : : :      X W W W W W W W W W              
X		    $)              0   8) ) ) ) ) ) ) )j j j j jry j j jZ? ? ? ? ?, ? ? ?D       @+ + + + +") + + +\8 8 8 8 8") 8 8 8vM M M M MRY M M M` ab(I(IJJJ-, -, -, -, -,RY -, -, KJ-,`A A A A A29 A A AH9 9 9 9 9. 9 9 92
 
 
 
 
 67L M 
 
 
6?
 ?
 ?
 ?
 ?
!:;P!Q ?
 ?
 ?
D ('	*  
qK qK qK qK qKI$&8qK qK 
qKheC-./EHeC%&'     rJ   