
    .`i              
          d Z ddlZddlmZmZ ddlmZmZmZ ddl	Z
ddlZddlmc mZ ddlmZ ddlmZmZ ddlmZmZmZmZ ddlmZ dd	lmZ dd
lmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8 ddl9m:Z:m;Z; ddl<m=Z= ddl>m?Z?m@Z@mAZAmBZBmCZC ddlDmEZEmFZFmGZG ddddddZH G d d e:          ZI G d! d"e0          ZJ G d# d$e/eJ                   ZK G d% d&e.eJ                   ZL G d' d(ejM                  ZN G d) d*ejM                  ZO G d+ d,ejM                  ZP G d- d.ejM                  ZQ G d/ d0ejM                  ZR G d1 d2ejM                  ZS G d3 d4ejM                  ZT e$jU        eKeJeL5           G d6 d7ejM        eAeBe@eC                      ZVdS )8z(Inference-only IBM Granite speech model.    N)IterableMapping)	AnnotatedLiteralcast)nn)BatchFeaturePretrainedConfig)CacheConfigModelConfigSpeechToTextConfig
VllmConfig)BaseDummyOptions)
PromptType)ColumnParallelLinearRowParallelLinear)QuantizationConfig)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)AudioProcessorItemsMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)cached_tokenizer_from_config)cached_processor_from_config)TensorSchemaTensorShape   )Blip2QFormerModel)MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPPSupportsTranscription)AutoWeightsLoaderinit_vllm_registered_modelmaybe_prefixEnglishFrenchGerman
PortugueseSpanish)enfrdeptesc                       e Zd ZU dZeej         eddd          f         ed<   	 eej         edd          f         ed<   	 ee	e
          ed          f         ed<   d	S )
GraniteSpeechAudioInputsa(  
    Audio input features for Granite Speech model.

    Dimensions:
        - b: Batch size
        - fi: Number of input features from the Mel spectrogram.
        - fo: Number of output features, i.e. the embedding size.
        - 160: Fixed feature dimension for Mel spectrogram features
    bfi   input_featuresfoinput_features_maskaudio_embed_sizesN)__name__
__module____qualname____doc__r   torchTensorr%   __annotations__listint     }/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/granite_speech.pyr;   r;   Z   s           elKKT3,G,GGHHHH"5<S$1G1G#GHHHH2 cKK,<,<!<====??rM   r;   c                   <    e Zd Zdeeedz  f         fdZd Zd ZdS )%GraniteSpeechMultiModalProcessingInforeturnNc                 
    ddiS )Naudior&   rL   selfs    rN   get_supported_mm_limitsz=GraniteSpeechMultiModalProcessingInfo.get_supported_mm_limitsp   s    |rM   c                     dS )Ni  rL   rT   s    rN   get_max_audio_tokensz:GraniteSpeechMultiModalProcessingInfo.get_max_audio_tokensw   s    trM   c                     dS )Ni z rL   rT   s    rN   get_max_audio_lenz7GraniteSpeechMultiModalProcessingInfo.get_max_audio_lenz   s    wrM   )	rC   rD   rE   r   strrK   rV   rX   rZ   rL   rM   rN   rP   rP   o   sZ        cDj)A          rM   rP   c            
            e Zd ZdefdZdedeeef         deee	f         fdZ
dedeeef         dedee         fdZd	ed
eeef         deeef         deeef         def
 fdZ xZS ) GraniteSpeechMultiModalProcessorrQ   c                 x    | j                                         j        }|j        d         }t	          |          S )Nsample_rate)	target_sr)infoget_hf_processoraudio_processormelspec_kwargsr   )rU   feature_extractorsampling_rates      rN   _get_data_parserz1GraniteSpeechMultiModalProcessor._get_data_parser   s7     I6688H)8G#m<<<<rM   	hf_inputshf_processor_mm_kwargsc                 l    t          t          j        d          t          j        d                    S )NrS   )r?   rB   )dictr   batched)rU   rh   ri   s      rN   _get_mm_fields_configz6GraniteSpeechMultiModalProcessor._get_mm_fields_config   s7    
 08AA3;GDD
 
 
 	
rM   mm_itemsout_mm_kwargsc                   	
  | j         j        di |}| j                                         }|j        
|                                }t          |dd          }||         	dt          f	
fd}t          d	g|          gS )Naudio_token	<|audio|>item_idxc                                          dt                    }|                    |           }|j        d         }                    |g          d         }g|z  S )NrS   r   )	get_itemsr   getshape_get_num_audio_features)rs   audiosrS   audio_lengthnum_projector_featuresaudio_token_idre   rn   s        rN   get_replacementzMGraniteSpeechMultiModalProcessor._get_prompt_updates.<locals>.get_replacement   si    ''1DEEFJJx((E ;r?L%6%N%N& &&" ##&<<<rM   rS   )modalitytargetreplacementrL   )ra   rb   get_tokenizerrc   	get_vocabgetattrrK   r   )rU   rn   ri   ro   	processor	tokenizervocabrq   r~   r}   re   s    `       @@rN   _get_prompt_updatesz4GraniteSpeechMultiModalProcessor._get_prompt_updates   s     /DI.HH1GHH	I++--	%5##%% iDD{+	=c 	= 	= 	= 	= 	= 	= 	= 	=  &'+  
 	
rM   promptmm_data	mm_kwargs
tok_kwargsc                 2   t          |          }|                    dg           }|r||d<   t                                          ||||          }d|v r@| j                                        j        }|d         |k                        d          |d<   |S )Nrz   rS   )r   r   r   r   	input_idsru   rB   )rk   popsuper_call_hf_processorra   get_hf_configaudio_token_indexsum)	rU   r   r   r   r   rz   processed_outputsr   	__class__s	           rN   r   z3GraniteSpeechMultiModalProcessor._call_hf_processor   s     w--Xr** 	&%GG!GG66!	 7 
 
 g !%	 7 7 9 9 K!+.2CCc"gg 12 ! rM   )rC   rD   rE   r   rg   r	   r   r[   objectr   rm   r   r   rJ   r    r   r   __classcell__r   s   @rN   r]   r]      s1       ="6 = = = =


 !(V 4
 
++	,	
 
 
 

%
 !(V 4
 -	

 
l	
 
 
 
@!! f%! 3;'	!
 CK(! 
! ! ! ! ! ! ! ! ! !rM   r]   c            	       p    e Zd Z	 ddedeeef         deeef         dz  defdZdeeef         defdZ	dS )	GraniteSpeechDummyInputsBuilderNseq_len	mm_counts
mm_optionsrQ   c                     |                     dd          }|r|                     d          nd }d|                     | j                                        ||          iS )NrS   r   )length
num_audios	overrides)rw   _get_dummy_audiosra   rZ   )rU   r   r   r   r   audio_overridess         rN   get_dummy_mm_dataz1GraniteSpeechDummyInputsBuilder.get_dummy_mm_data   so     ]]7A..
5?I*..111T T++y2244%) ,  
 	
rM   c                     |                     dd          }| j                                        }t          |dd          }||z  S )NrS   r   rq   rr   )rw   ra   rb   r   )rU   r   r   hf_processorrq   s        rN   get_dummy_textz.GraniteSpeechDummyInputsBuilder.get_dummy_text   sD    ]]7A..
y1133lM;GGZ''rM   N)
rC   rD   rE   rK   r   r[   r   r   r   r   rL   rM   rN   r   r      s         =A	
 

 38$
 C!112T9	

 

 
 
 
"(S(9 (c ( ( ( ( ( (rM   r   c            	       `     e Zd Z	 	 ddedededz  def fdZdej	        d	ej	        fd
Z
 xZS )GraniteSpeechEncoderProjectorN configcache_configquant_configprefixc                    t                                                       |j        j        | _        |j        | _        |j        | _        |j        |j        z  | _        t          j        t          j
        d| j        |j        j                            | _        t          |j        ||| d          | _        t          j        |j        j        |j        j                  | _        d S )Nr&   z.qformer)r   r   r   )r   __init__projector_confighidden_sizedownsample_ratewindow_sizenum_queriesr   	ParameterrG   zerosqueryr'   qformerLineartext_configlinear)rU   r   r   r   r   r   s        rN   r   z&GraniteSpeechEncoderProjector.__init__   s     	!2>%5!-!-1GG\K4+V-D-PQQ
 

 )#%%&&&	
 
 
 i#/1C1O
 
rM   hidden_statesrQ   c                    |                                 \  }}}t          j        || j        z            }|| j        z  |z
  }t          j                            |ddd|fdd          }|                    ||z  | j        |          }|                     | j	        j
        |          }|                     |                    ||| j        z  | j        z  d                    }|S )Nr   constant)query_embedsencoder_hidden_statesru   )sizemathceilr   r   
functionalpadviewr   r   datar   r   )	rU   r   
batch_sizer   dimnblocksr   last_hidden_state
query_projs	            rN   forwardz%GraniteSpeechEncoderProjector.forward  s    #0#5#5#7#7 
GS)Gd&6677((72))-!Q3UVWW%**:+?AQSVWW LL"/ ) 
 

 [[""$**d.BB 
 

 rM   Nr   )rC   rD   rE   r
   r   r   r[   r   rG   rH   r   r   r   s   @rN   r   r      s        
 37
 
 
 "
 )4/	

 
 
 
 
 
 
<U\ el        rM   r   c                   `     e Zd ZdZ	 	 ddededz  def fdZdej	        d	ej	        fd
Z
 xZS )!GraniteSpeechConformerFeedForwardz0Feedforward module for conformer encoder blocks.Nr   r   r   r   c                 h   t                                                       t          j        |j                  | _        t          |j        |j        |j        z  || d          | _        t          j	                    | _
        t          |j        |j        z  |j        || d          | _        d S )Nz.up_proj)
input_sizeoutput_sizer   r   z
.down_proj)r   r   r   	LayerNorm
hidden_dimpre_normr   feedforward_multup_projSiLUsilur   	down_proj)rU   r   r   r   r   s       rN   r   z*GraniteSpeechConformerFeedForward.__init__(  s     	V%677+()F,CC%&&&	
 
 
 GII	*(6+BB)%(((	
 
 
rM   r   rQ   c                     |                      |          }|                     |          \  }}|                     |          }|                     |          \  }}|S r   )r   r   r   r   )rU   r   _s      rN   r   z)GraniteSpeechConformerFeedForward.forward@  sV    m44<<66q		-00>>-88qrM   r   )rC   rD   rE   rF   r
   r   r[   r   rG   rH   r   r   r   s   @rN   r   r   %  s        ::
 37	
 
 
 )4/
 	
 
 
 
 
 
0U\ el        rM   r   c                   `     e Zd ZdZd
dedef fdZdej        dej        dej        fd	Z	 xZ
S )GraniteSpeechConformerAttentionzAttention for conformer blocks using Shaw's relative positional
    embeddings. See the following [paper](https://arxiv.org/pdf/1803.02155)
    for more details.
    r   r   r   c                    t                                                       |j        |j        z  }|j        | _        |j        | _        |j        | _        |j        | _        | j        dz  | _        t          j        |j	                  | _
        t          j        |j	        |d          | _        t          j        |j	        |dz  d          | _        t          j        ||j	                  | _        t          j        d| j        z  dz   | j                  | _        | j        dk    s| j        | j        k    r t#          d| j         d| j         d	          d S )
Ng      Fbias   r&   r   z/Context size should be > 0 and <= max_pos_emb (z), got .)r   r   dim_head	num_headsmax_pos_embcontext_sizescaler   r   r   r   r   to_qto_kvto_out	Embeddingrel_pos_emb
ValueErrorrU   r   r   	inner_dimr   s       rN   r   z(GraniteSpeechConformerAttention.__init__N  sI   Of&66	!-"/)]D(
V%677If/GGG	Yv0)a-eLLL
i	6+<==<D,<(<q(@$-PP!!T%69I%I%I,#'#3, ,(, , ,   &J%IrM   r   attention_distsrQ   c                    |                      |          }|j        \  }}}t          j        || j        z            }|| j        z  }|dk    r1t
          j        j                            |ddd| j        |z
  f          }| 	                    |          }| 
                    |                              dd          \  }	}
|                    ||| j        | j        d                              dd          }|	                    ||| j        | j        d                              dd          }	|
                    ||| j        | j        d                              dd          }
|                    |j                  }|                     |          }|                    g dt'          |j                  z             }t          j        |                    d          |z  d          | j        z  }|dk    ryt          j        | j        | j        t0          |j                  }d|d |d |f<   t          j        |j                  j         }|d d dd d f                             ||           t
          j        j                            t
          j        j        j        j                   5  tC          j"        ||	|
|| j        	          }d d d            n# 1 swxY w Y   |                    dd                              ||j        d
         d          }| #                    |d d d |d d f                   S )Nr   r   ru   r      )r&   r&   r&   )dtypedevice)	attn_maskr   r&   )$r   rx   r   r   r   rG   r   r   r   r   r   chunkreshaper   	transposetor   r   r   rJ   r   	unsqueezer   onesboolfinfor   maxmasked_fill_	attentionsdpa_kernel
SDPBackendMATHFscaled_dot_product_attentionr   )rU   r   r   bsznum_featuresr   
num_blocks	remainderquery_states
key_statesvalue_statesdistr   rel_pos_emb_expandedpos_attnmask
mask_valueouts                     rN   r   z'GraniteSpeechConformerAttention.forwardd  sq    m44,2\1Y|d.??@@
 4#44	q==!H/331a):Y)FG M yy//#'::m#<#<#B#B1"#B#M#M 
L#++T.
 

)Aq// 	  ''T.
 

)Aq// 	 $++T.
 

)Aq// 	
 !!-"677&&t,,*//			DAR<S<S0STTIl,,R003GGRPPPj 	
 q==:!!$+	  D ,-D)ZiZ'(+hn5599JQQQAAAX++D*===X++EH,>,I,NOO 	 	0"j  C	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 mmAq!!))#}/B1/ErJJ{{3qqq-<-23444s   0KK"Kr   rC   rD   rE   rF   r
   r[   r   rG   rH   r   r   r   s   @rN   r   r   H  s         
 /       ,;5"\;5<AL;5	;5 ;5 ;5 ;5 ;5 ;5 ;5 ;5rM   r   c            	       Z     e Zd ZdZddedededef fdZdej        d	ej        fd
Z	 xZ
S )%GraniteSpeechConformerDepthWiseConv1dz,Wrapper for padded 1D pointwise convolution.r   chan_inchan_outkernel_sizer   c                     t                                                       |dz  }|dz   dz  }|||z
  f| _        t          j        ||||d          | _        d S )Nr   r&   F)groupsr   )r   r   paddingr   Conv1dconv)rU   r  r  r   r   r   
pad_offsetr   s          rN   r   z.GraniteSpeechConformerDepthWiseConv1d.__init__  si    Q!Ao*
S:-.IX{7
 
 
			rM   r   rQ   c                 `    t          j        || j                  }|                     |          S r   )r  r   r#  r%  rU   r   s     rN   r   z-GraniteSpeechConformerDepthWiseConv1d.forward  s'    mT\::yy'''rM   r  )rC   rD   rE   rF   rK   r[   r   rG   rH   r   r   r   s   @rN   r  r    s        66	
 	
 	
s 	
 	
c 	
 	
 	
 	
 	
 	
(U\ (el ( ( ( ( ( ( ( (rM   r  c                   R     e Zd ZdZd	dedef fdZdej        dej        fdZ	 xZ
S )
 GraniteSpeechConformerConvModulezZConformer conv module consisting of several 1D/depthwise 1D
    convolutional layers.
    r   r   r   c                    t                                                       |j        |j        z  }t	          j        |j                  | _        t	          j        |j        |dz  d          | _        t	          j	        d          | _
        t          |||j        | d          | _        t	          j                    | _        t	          j        |          | _        t	          j        ||j        d          | _        d S )Nr   r&   r   z.depth_conv)r   r   )r   r   r   conv_expansion_factorr   r   normr$  up_convGLUglur  conv_kernel_size
depth_convr   r   BatchNorm1d
batch_norm	down_convr   s       rN   r   z)GraniteSpeechConformerConvModule.__init__  s    %(DD	L!233	y!2IM1EE6a===?/)))	
 
 
 GII	.339f.?CCrM   r   rQ   c                 |   |                      |          }|                     |                    ddd                    }|                     |          }|                     |          }|                     |                     |                    }|                     |                              ddd          }|S )Nr   r   r&   )r-  r.  permuter0  r2  r   r4  r5  r(  s     rN   r   z(GraniteSpeechConformerConvModule.forward  s    		-00]%:%:1a%C%CDD//66		$//-"@"@AA}55==aAFFrM   r  r  r   s   @rN   r*  r*    s         D D/ D D D D D D D"U\ el        rM   r*  c                   `     e Zd ZdZd
dedef fdZdej        dej        dej        fd	Z	 xZ
S )GraniteSpeechConformerBlockz^Conformer block, consisting largely of linear layers,
    attention, and convolutional layers.r   r   r   c                 L   t                                                       t          || d          | _        t	          || d          | _        t          || d          | _        t          || d          | _        t          j
        |j                  | _        d S )Nz.ff1r   z.attnz.convz.ff2)r   r   r   ff1r   attnr*  r%  ff2r   r   r   	post_norm)rU   r   r   r   s      rN   r   z$GraniteSpeechConformerBlock.__init__  s    4VvOOOTTT3FfCSCSCSTTT	4VvDTDTDTUUU	4VvOOOTTTf&788rM   r   r   rQ   c                     d|                      |          z  |z   }|                     ||          |z   }|                     |          |z   }d|                     |          z  |z   }|                     |          }|S )Ng      ?r   )r<  r=  r%  r>  r?  )rU   r   r   s      rN   r   z#GraniteSpeechConformerBlock.forward  s     dhh}555EIIm_IEEU 	 		-00=@dhh}555E}55rM   r  r  r   s   @rN   r9  r9    s        , ,9 9/ 9 9 9 9 9 9 9
"\
<AL
	
 
 
 
 
 
 
 
rM   r9  c                   P     e Zd ZdZ	 d	dedededz  f fdZdej	        fdZ
 xZS )
GraniteSpeechCTCEncoderzECTC Encoder comprising conformer blocks and additional linear layers.Nr   r   r   c                    t                                                       | _        t          j        j                  }|                    dd          |                    dd          z
  }t          j        |j         j                  j        z   | _	        t          j        j        j        d          | _        t          j        fdt!          j                  D                       | _        t'          j        j        d| d          | _        t-          j        j        d| d          | _        t          j        d	          | _        j        | _        d S )
Nru   r&   Tr   c                 <    g | ]}t           d |           S )z.layers.r;  )r9  ).0idxr   r   s     rN   
<listcomp>z4GraniteSpeechCTCEncoder.__init__.<locals>.<listcomp>  sN       
 	 ,$33c33    rM   z.out)r   r   r   r   r   z.out_midr   )r   r   r   rG   aranger   r   clampr   r   r   r   	input_dimr   input_linear
ModuleListrange
num_layerslayersr   
output_dimr  r   out_midSoftmaxsoftmax)rU   r   r   r   seqrelpos_distr   s    ``   rN   r   z GraniteSpeechCTCEncoder.__init__  s    	 l6.//hhr1ooB7Kf&9%96;NOO ! 	
 If&68IPTUUUm    
 !!233  
 
 (()%???
 
 
 )()%&&&
 
 
 zb))) +rM   r   c                 d   |                      |          }t          | j        d          D ]\  }} ||| j                  }|| j        dz  k    r^|                                }|                     |          \  }}|                     |          }|                     |          \  }}||z  }|S )Nr&   )startrA  r   )	rL  	enumeraterP  r   rO  cloner  rT  rR  )rU   r   rG  layerhidden_states_midr   s         rN   r   zGraniteSpeechCTCEncoder.forward  s    ))-88#DKq999 	3 	3JC!E-AUVVVMdo***$1$7$7$9$9!'+xx0A'B'B$!1$(LL1B$C$C!'+||4E'F'F$!1!22rM   r   )rC   rD   rE   rF   r
   r[   r   r   rG   rH   r   r   r   s   @rN   rC  rC    s        OO 37	,, ,, ,, ,, )4/	,, ,, ,, ,, ,, ,,\U\        rM   rC  )ra   dummy_inputsc                       e Zd ZeZg dddgdZededededz  fd	            Z	d
dde
def fdZdededz  fdZdej        dej        fdZdeej                 dej        fdZdedeej                 fdZdedefdZ	 d4ddddej        dedz  dej        dz  dedej        f
 fdZ	 	 d5dej        dej        d edz  d!ej        dz  dedej        ez  fd"Zd#ej        dej        dz  fd$Zd%eeeej        f                  dee         fd&Zde fd'Z!ed(e"j#        d)e$d*e%d+edz  d,e&d-         d.ed/edz  de'fd0            Z(ed1e)d*e%d)e$dedz  fd2            Z*ed)e$d,ede%fd3            Z+ xZ,S )6%GraniteSpeechForConditionalGeneration)q_projk_projv_proj	gate_projr   )qkv_projgate_up_projr   irQ   Nc                 N    |                     d          rdS t          d          )NrS   rr   z Only audio modality is supported)
startswithr   )clsr   rf  s      rN   get_placeholder_strz9GraniteSpeechForConditionalGeneration.get_placeholder_strG  s,    w'' 	;;<<<rM   r   r;  vllm_configr   c          	      F   t                                                       |j        j        }|j        }|j        }|| _        || _        || _        |                     |          5  t          ||j	        t          |d                    | _        d d d            n# 1 swxY w Y   |                     |d          5  t          |j        || d          | _        t!          |||| d          | _        d d d            n# 1 swxY w Y   | j        j        | _        d S )Nlanguage_model)rk  	hf_configr   rS   z.encoder)r   r   r   z
.projector)r   r   r   r   )r   r   model_configrn  r   r   r   _mark_language_modelr.   r   r/   rm  _mark_tower_modelrC  encoder_configencoderr   	projectormake_empty_intermediate_tensors)rU   rk  r   r   r   r   r   s         rN   r   z.GraniteSpeechForConditionalGeneration.__init__N  s   )3"/"/((&&{33 	 	"<' ,#F,<==# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ##K99 	 	2,) ***  DL ;)) ,,,	  DN	 	 	 	 	 	 	 	 	 	 	 	 	 	 	" ? 	,,,s$   &+BB!$B!>;DD	D	kwargsc                    |                     dd           }|                     dd           }|                     dd           }|d S ||                     |          }t          |t          j        t
          f          st          dt          |                     |9t          |t          j                  st          dt          |                     t          |t          j                  rt          |j	                  dk    r|
                    d          }t          |j	                  dk    rt          d	|j	                   |                    | j        j        j        j                  }nHd
 |D             }|                     |                              | j        j        j        j                  }t#          |||                                                                          S )Nr?   rA   rB   z2Incorrect type of audio input features. Got type: z7Incorrect type of audio input features mask. Got type:    r&   r   z6Squeezed input features should be 3D but are of shape c                 P    g | ]#}|j         d k    |                    d          $S )r   r   r   )ndimr  )rF  feats     rN   rH  zYGraniteSpeechForConditionalGeneration._parse_and_validate_audio_input.<locals>.<listcomp>  s5       *.DIQRNN1%%NNNrM   )r?   rA   rB   )r   _build_input_features_mask
isinstancerG   rH   rJ   r   typelenrx   squeezer   rs  rL  weightr   _pad_and_stack_input_featuresr;   flattentolist)rU   rv  r?   rA   rB   s        rN   _parse_and_validate_audio_inputzEGraniteSpeechForConditionalGeneration._parse_and_validate_audio_inputt  s)     $4d;;$jj)>EE"JJ':DAA!4 &"&"A"ABS"T"T.5<*>?? 	4!.114 4  
 *:4
 4
* 9!"5669 9  
 nel33 	9 >'((A--!/!7!7!:!:>'((A-- .%+. .   ,..t|/H/O/UVVNN 2@  N
 "?? b*1788  () 3/7799@@BB
 
 
 	
rM   rB   c                     t          j        |                                          }t          j        ||j                                      dd          }||                    dd          k     }|S )a  Calculate the input features mask, which will generally be used
        to mask the padded features for all entries in the batch except
        for those with the most audio features.

        Args:
            audio_embed_sizes: torch.Tensor
                Tensor of num features in each seq in the batch.
        Returns:
            torch.Tensor: Mask of shape (bsz, num_features) to be applied to
            the audio features prior to splitting the audio embeddings.
        )r   r&   ru   )rG   r  itemrI  r   r   )rU   rB   most_audio_featuresmask_indicesrA   s        rN   r|  z@GraniteSpeechForConditionalGeneration._build_input_features_mask  sv     $i(9::??AA|$+
 
 
 $q"++ 	 +->-C-CB-J-JJ""rM   r?   c                     d |D             fdD             }d t          ||          D             }t          j        |d                              |d                   }|S )a  Given a list of input features of varying length, pad them to the
        same length and stack them into a torch.Tensor.

        NOTE: Usually, padding is done in the input processor/feature extractor
        and zero padded prior to the computation of the Mel features; the
        resulting values are only constant within a batch and generally nonzero
        (i.e., slightly negative nums); we should validate that this is okay
        since we don't use a feature attention mask, but the more important
        thing is that we apply the input_features_mask with variable len
        batches.

        Args:
            input_features: list[torch.Tensor]
                3D Input features to be coerced into a tensor.
        Returns:
            torch.Tensor: Tensor of shape [bsz, num_features, 160], where
            num_features is the max number of features of any entry in the
            batch.
        c                 (    g | ]}|j         d          S )r&   )rx   )rF  featss     rN   rH  zWGraniteSpeechForConditionalGeneration._pad_and_stack_input_features.<locals>.<listcomp>  s    @@@U[^@@@rM   c                 4    g | ]}t                    |z
  S rL   )r  )rF  r   	feat_lenss     rN   rH  zWGraniteSpeechForConditionalGeneration._pad_and_stack_input_features.<locals>.<listcomp>  s$    CCCv3y>>F*CCCrM   c                 j    g | ]0\  }}t           j        j                            |d d d |d d f          1S )r   )rG   r   r   r   )rF  r  r   s      rN   rH  zWGraniteSpeechForConditionalGeneration._pad_and_stack_input_features.<locals>.<listcomp>  sN     
 
 
s H##EAq!S!Q+?@@
 
 
rM   r   r   )ziprG   catr   )rU   r?   r#  paddedstacked_featuresr  s        @rN   r  zCGraniteSpeechForConditionalGeneration._pad_and_stack_input_features  s    . A@@@@	CCCCCCC
 
!.'::
 
 
 !9V33366~a7HIIrM   audio_inputc                     |                      |d                   }|                     |          }||d                  }t          j        ||d                   S )an  Compute the audio features to be merged into the LLM embeddings.

        Args:
            audio_input: GraniteSpeechAudioInputs
                Audio inputs object containing Mel features, an input features
                mask, and the (flattened) number of audio tokens per instance.
        Returns:
            tuple[torch.Tensor]: List of length bsz.
        r?   rA   rB   )rs  rt  rG   split)rU   r  encoder_embedsprojected_embedsmasked_embedss        rN   _process_audio_inputz:GraniteSpeechForConditionalGeneration._process_audio_input  sU     k2B&CDD>>.99(5J)KL{=+6I*JKKKrM   c                 R     | j         di |}|g S |                     |          }|S )z9Compute the audio embeddings if audio inputs are present.NrL   )r  r  )rU   rv  r  audio_featuress       rN   embed_multimodalz6GraniteSpeechForConditionalGeneration.embed_multimodal  s@    
 ;d:DDVDDI22;??rM   T)is_multimodalhandle_oov_mm_tokenr   multimodal_embeddingsr  r  c                    ||!t                                          |          S t                                          ||||          S )N)r  r  r  )r   embed_input_ids)rU   r   r  r  r  r   s        rN   r  z5GraniteSpeechForConditionalGeneration.embed_input_ids  sU     !(M,A77**9555ww&&"7' 3	 ' 
 
 	
rM   	positionsintermediate_tensorsinputs_embedsc                 >    |d }|                      ||||          }|S r   )rm  )rU   r   r  r  r  rv  model_outputs          rN   r   z-GraniteSpeechForConditionalGeneration.forward'  s6      + M**y"6
 
 rM   r   c                 6    | j                             |          S r   )rm  compute_logitsr(  s     rN   r  z4GraniteSpeechForConditionalGeneration.compute_logits7  s     "11-@@@rM   weightsc                 J    t          |           }|                    |          S r   )r-   load_weights)rU   r  loaders      rN   r  z2GraniteSpeechForConditionalGeneration.load_weights=  s%     #4((""7+++rM   c                 0    t          j        ddd          S )z+Get the module prefix in multimodal models.rm  rt  rs  )rm  	connectortower_model)r   from_string_fieldrT   s    rN   get_mm_mappingz4GraniteSpeechForConditionalGeneration.get_mm_mappingD  s%    /+!!
 
 
 	
rM   rS   ro  
stt_configlanguage	task_type)
transcribe	translaterequest_promptto_languagec                    |                      dd          }|dk    r#| j                            ||          }	| d|	 }
n|dk    r| d}
nt          d|           t	          |          }t          d|
	          g}|                    |d
d          }|                    |          }|d|id}t          t          |          S )z@Get the generation prompt to be used for transcription requests.rS   r   r  ztranslate the speech to r  z4can you transcribe the speech into a written format?zUnsupported task type user)rolecontentFT)tokenizeadd_generation_prompt)prompt_token_idsmulti_modal_data)
rj  supported_languagesrw   r   r"   rk   apply_chat_templateencoder   r   )ri  rS   ro  r  r  r  r  r  	audio_tokfull_lang_name_touser_promptr   chatr   r  s                  rN   get_generation_promptz;GraniteSpeechForConditionalGeneration.get_generation_promptM  s    ++GQ77	## # 7 ; ;K U U&SS@QSSKK,&&RRR K AiAABBB0>>	&+6667.."& / 
 
 %++F33 0!(% 0
 
 J'''rM   audio_duration_sc                     t          |          }|j        j        d         }|j        j        }|j        j        }||z  }||j        z  }	|	|z  dz   }
|
dz  }t          j        ||z            }||z  S )z<Get the number of audio tokens for an audio duration in sec.
hop_lengthr&   r   )r#   rc   rd   projector_window_sizeprojector_downsample_rater_   r   r   )ri  r  r  ro  r   r  proj_win_sizeds_rateeffective_window_size
raw_length
mel_lengthencoder_lengthr   s                rN   get_num_audio_tokensz:GraniteSpeechForConditionalGeneration.get_num_audio_tokensv  s     1>>	.=lK
!1G+E - 8%
(>>
  :-1
#q)N]:;;...rM   c                     t                      S )z"Get the stt config for this model.)r   )ri  ro  r  s      rN   get_speech_to_text_configz?GraniteSpeechForConditionalGeneration.get_speech_to_text_config  s     "###rM   r   )NN)-rC   rD   rE   ISO639_1_SUPPORTED_LANGSr  packed_modules_mappingclassmethodr[   rK   rj  r   r   r   r;   r  rG   rH   r|  rJ   r  tupler  r(   r  r  r  r!   r   r  r   setr  r   r  npndarrayr   r   r   r   r  floatr  r  r   r   s   @rN   r_  r_  -  s'        3
 
 
 

 
 =3 =3 =3: = = = [= BD $
 $
 $
z $
3 $
 $
 $
 $
 $
 $
LA
A
 
"D	(A
 A
 A
 A
F# <# 
# # # #.! U\*!  
!  !  !  ! FL-L 
u|	L L L L,

 

 
 
 
 >B

 .2$(
 
 
<
  4d:

 |d*
 "
 

 
 
 
 
 
0 <@-1 < < 2D8	
 |d*  
+	+    A|A 
	A A A A,%U\ 123, 
S, , , ,
 
 
 
 
 %(z%( "%( '	%(
 *%( 45%( %( 4Z%( 
%( %( %( [%(P // '/ "	/
 
t/ / / [/. $&$36$	$ $ $ [$ $ $ $ $rM   r_  )WrF   r   collections.abcr   r   typingr   r   r   numpyr  rG   torch.nn.functionalr   r   r  transformersr	   r
   vllm.configr   r   r   r   vllm.config.multimodalr   vllm.inputs.datar   !vllm.model_executor.layers.linearr   r   'vllm.model_executor.layers.quantizationr   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   vllm.multimodal.processingr   r   r   r   r    vllm.sequencer!   vllm.tokenizersr"   !vllm.transformers_utils.processorr#   vllm.utils.tensor_schemar$   r%   blip2r'   
interfacesr(   r)   r*   r+   r,   utilsr-   r.   r/   r  r;   rP   r]   r   Moduler   r   r   r  r*  r9  rC  register_processorr_  rL   rM   rN   <module>r     sP  2 / .  - - - - - - - - + + + + + + + + + +                     7 7 7 7 7 7 7 7 P P P P P P P P P P P P 3 3 3 3 3 3 ' ' ' ' ' ' U U U U U U U U F F F F F F D D D D D D / / / / / /         
         
              . - - - - - 8 8 8 8 8 8 J J J J J J > > > > > > > > $ $ $ $ $ $              O N N N N N N N N N 



  @ @ @ @ @| @ @ @*    ,>    O! O! O! O! O!ABO! O! O!d( ( ( ( (@A( ( (82 2 2 2 2BI 2 2 2p         	      FW5 W5 W5 W5 W5bi W5 W5 W5t( ( ( ( (BI ( ( (&    ry   @    ")   2< < < < <bi < < <~ ('$	.0  
d$ d$ d$ d$ d$Id$ d$ 
d$ d$ d$rM   