
    .`ixl                        U d Z ddlZddlZddlmZmZmZmZ ddlmZm	Z	m
Z
mZ ddlZddlZddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddl m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z- ddl.m/Z/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7 ddl8m9Z9 ddl:m;Z; ddl<m=Z=m>Z> ddl?m@Z@mAZAmBZB ddlCmDZDmEZEmFZF eGeHeGeGf         z  eeG         z  ZIe
eJd<   deIdeHeGeGf         fdZK	 	 	 	 	 dBd!eGd"eGd#eGd$eGd%eGdeGfd&ZL G d' d(ejM                  ZN G d) d*ejM                  ZO G d+ d,ejM                  ZP G d- d.ejM                  ZQ G d/ d0ejM                  ZR G d1 d2ejM                  ZS G d3 d4ejM                  ZT G d5 d6ejM                  ZU G d7 d8e=          ZV G d9 d:e4          ZW G d; d<e2eW                   ZX G d= d>e3eW                   ZY e)jZ        eYeWeX?           G d@ dAejM        eAeB                      Z[dS )CzEInference-only MiDashengLM model compatible with HuggingFace weights.    N)CallableIterableMappingSequence)	AnnotatedAny	TypeAliascast)scaled_dot_product_attention)BatchFeature)
VllmConfig)BaseDummyOptions)$get_tensor_model_parallel_world_size)
get_act_fn)Conv2dLayer)ColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)MultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)DashengConfig)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModal
SupportsPP)AutoWeightsLoaderinit_vllm_registered_modelmaybe_prefix_Tuple2xreturnc                    t          | t          j        j                  rdt	          |           dk    sJ d|  dt	          |                        t          t          t          t          f         t          |                     S | | fS )N   z%Expected a sequence of length 2, got z with length )
isinstancecollectionsabcr   lenr
   tupleint)r.   s    z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/midashenglm.py_resolve_tuple2r9   J   su    ![_-.. /1vv{{{LALLCFFLL {{ E#s(OU1XX...q6M             T   audio_length_samplesn_ffthop_sizedasheng_subsamplingmodel_subsamplingc                 L    |r| |z   } t          d| |z
  |z  z             |z  |z  S )z/Calculate the number of Mel-spectrogram frames.r&   )r7   )r?   r@   rA   rB   centerrC   s         r8   calculate_mel_frames_dashengrF   S   sI      <3e; 	A&.(:;<<		r:   c                   v     e Zd Z	 	 	 	 	 	 	 ddeded	ed
edededz  def fdZdej	        dej	        fdZ
 xZS )AudioPatchEmbed@      r&      NF
input_size
patch_sizepatch_stridein_chans	embed_dim
norm_layerflattenc                    t                                                       t          |          | _        t          |          | _        t          |          | _        | j        d         | j        d         z  | j        d         | j        d         z  f| _        | j        d         | j        d         z  | _        || _        t          ||| j        | j                  | _
        |r ||          nt          j                    | _        d S )Nr   r&   )kernel_sizestride)super__init__r9   rL   rM   rN   	grid_sizenum_patchesrR   r   projnnIdentitynorm)	selfrL   rM   rN   rO   rP   rQ   rR   	__class__s	           r8   rW   zAudioPatchEmbed.__init__g   s     	)*55)*55+L99OA$"3A"66OA$"3A"66
  >!,t~a/@@$	
 
 
	 .8JJJy)))R[]]			r:   r.   r/   c                     |                      |          }| j        r)t          j        t          j        |dd          d          }|                     |          }|S )Nr1      r   r1   r&   )rZ   rR   torchpermuter]   r^   r.   s     r8   forwardzAudioPatchEmbed.forward   sU    IIaLL< 	aA&&	 A IIaLLr:   )rI   rJ   rJ   r&   rK   NF)__name__
__module____qualname__r-   r7   r   boolrW   rc   Tensorrf   __classcell__r_   s   @r8   rH   rH   f   s         !  "&*K KK K 	K
 K K tOK K K K K K K: %,        r:   rH   c                   D     e Zd Zd fd	Zdej        dej        fdZ xZS )
LayerScaleh㈵>Fc                     t                                                       || _        t          j        |t          j        |          z            | _        d S N)rV   rW   inplacer[   	Parameterrc   onesgamma)r^   diminit_valuesrs   r_   s       r8   rW   zLayerScale.__init__   sB    \+
3"?@@


r:   r.   r/   c                 X    | j         r|                    | j                  n	|| j        z  S rr   )rs   mul_rv   re   s     r8   rf   zLayerScale.forward   s(    %)\Eqvvdj!!!q4:~Er:   )rp   F)rg   rh   ri   rW   rc   rk   rf   rl   rm   s   @r8   ro   ro      ss        A A A A A A
F F%, F F F F F F F Fr:   ro   c                   t     e Zd Z	 	 	 	 ddededz  dedz  dedz  def
 fdZd	ej        d
ej        fdZ	 xZ
S )
DashengMlpN in_featureshidden_featuresout_featuresquant_configprefixc                     t                                                       |p|}|p|}t          |||| d          | _        t	          d          | _        t          |||| d          | _        d S )Nz.fc1rL   output_sizer   r   geluz.fc2)rV   rW   r   fc1r   actr   fc2)r^   r~   r   r   r   r   r_   s         r8   rW   zDashengMlp.__init__   s     	#2{)8['"'%???	
 
 
 f%%$&$%???	
 
 
r:   r.   r/   c                     |                      |          \  }}|                     |          }|                     |          \  }}|S rr   )r   r   r   )r^   r.   _s      r8   rf   zDashengMlp.forward   s<    xx{{1HHQKKxx{{1r:   )NNNr}   )rg   rh   ri   r7   r   strrW   rc   rk   rf   rl   rm   s   @r8   r|   r|      s         '+#'26
 

 t
 Dj	

 )4/
 
 
 
 
 
 
2 %,        r:   r|   c                   p     e Zd Z	 	 	 	 ddededededz  d	ef
 fd
Zddej	        dej	        dz  fdZ
 xZS )DashengAttention   FNr}   rw   	num_headsqkv_biasr   r   c           
         t                                                       ||z  dk    s
J d            || _        t                      }|| _        | j        |z  dk    sJ | j        |z  | _        | j        |k    r| j        |z  dk    sJ n|| j        z  dk    sJ t          d| j        |z            | _        | j        | j        z  | _        | j        | j        z  | _	        | j        | j        z  | _
        | j        dz  | _        t          | j        | j        | j        | j        ||| d          | _        t          |||| d          | _        d S )	Nr   z$dim should be divisible by num_headsr&   g      z.qkv)hidden_size	head_sizetotal_num_headstotal_num_kv_headsbiasr   r   z.projr   )rV   rW   rP   r   r   r   maxnum_kv_headshead_dimq_sizekv_sizescaler   qkvr   rZ   )r^   rw   r   r   r   r   tp_sizer_   s          r8   rW   zDashengAttention.__init__   s    	Y!###%K###688(#g-2222-87** ''1Q66666 T11Q66664#77#BCC$*>>nt}4(4=8]D(
$m 0#3%???
 
 
 &%###	
 
 
			r:   r.   maskc                    |j         \  }}}|                     |          \  }}|                    ||d| j        || j        z            }|                    ddddd          }|                    d          \  }}	}
t          ||	|
||d d d d d d f         nd           }|                    dd                              |||          }|                     |          \  }}|S )Nra   r1   r   r&   r=   )	attn_mask)	shaper   reshaper   rd   unbindr   	transposerZ   )r^   r.   r   BNCr   r   qkvs              r8   rf   zDashengAttention.forward   s    '1a!Qkk!Q4>13FGGkk!Q1a((**Q--1a(040@d111dD!!!+,,d	
 
 
 KK1%%aA..yy||1r:   )r   FNr}   rr   )rg   rh   ri   r7   rj   r   r   rW   rc   rk   rf   rl   rm   s   @r8   r   r      s         26+
 +
+
 +
 	+

 )4/+
 +
 +
 +
 +
 +
 +
Z  U\D-@        r:   r   c                        e Zd Z	 	 	 	 	 ddedededed	edz  d
edz  def fdZ	 dde	j
        de	j
        dz  de	j
        fdZ xZS )DashengBlock      @FNr}   rw   r   	mlp_ratior   rx   r   r   c                    t                                                       t          j        |d          | _        t          ||||| d          | _        |rt          ||          nt          j                    | _	        t          j        |d          | _
        t          |t          ||z            || d          | _        |rt          ||          nt          j                    | _        d S )Nư>epsz.attn)r   r   r   r   )rx   z.mlp)r~   r   r   r   )rV   rW   r[   	LayerNormnorm1r   attnro   r\   ls1norm2r|   r7   mlpls2)	r^   rw   r   r   r   rx   r   r   r_   s	           r8   rW   zDashengBlock.__init__   s    	\#4000
$%###
 
 
	 9DVJs4444 	 \#4000
i00%???	
 
 
 9DVJs4444 	r:   r.   r   r/   c                     ||                      |                     |                     |          |                    z   }||                     |                     |                     |                              z   }|S rr   )r   r   r   r   r   r   )r^   r.   r   s      r8   rf   zDashengBlock.forward  sc    
 4::a==$77888$**Q--00111r:   )r   FNNr}   rr   )rg   rh   ri   r7   floatrj   r   r   rW   rc   rk   rf   rl   rm   s   @r8   r   r      s        
 $(26 
  
 
  
 	 

  
 T\ 
 )4/ 
  
  
  
  
  
  
L %) < lT! 
	       r:   r   c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )DashengFrontendconfigc                    t                                                       || _        t          j        | j        j                  }|                     d|d           |  t          j        | j        j	        dz  dz   | j        j
        | j        j        | j        j        | j        j                  }|                     d|d           |  d S )Nspectrogram_windowF)
persistentr1   r&   )n_freqsf_minf_maxn_melssample_ratemelscale_fbanks)rV   rW   r   rc   hann_window
win_lengthregister_bufferFr   r@   r   r   r   r   )r^   r   r   r   r_   s       r8   rW   zDashengFrontend.__init__*  s    ".t{/EFF  	 	
 	
 	

 	-+K%*Q.+#+#;%/
 
 
 	.ERRR***r:   waveformr/   c                    t          j        |                    t          j                  d| j        | j        j        | j        j        | j        j	        dd| j        j
        	  	        }|j        | j                            t          j                  z  j        }t          j        |                    d          dddd	                              d          }|                    |j                  S )
Nr   r1   F)	r   padwindowr@   
hop_lengthr   power
normalizedrE   r&   
   g|=x   )
multiplieramindb_multipliertop_db)r   spectrogramtorc   float32r   r   r@   r   r   rE   mTr   amplitude_to_DB	unsqueezesqueezedtype)r^   r   r   mel_spectrogramlog_mel_spectrograms        r8   rf   zDashengFrontend.forward@  s    m[[//*+#{-{-;%

 

 

 '>D,@,C,CEM,R,RRV  /%%a((
 
 
 '!** 	 #%%hn555r:   )	rg   rh   ri   r#   rW   rc   rk   rf   rl   rm   s   @r8   r   r   )  sj        +} + + + + + +,6 6 6 6 6 6 6 6 6 6r:   r   c            
            e Zd Z	 	 ddededz  def fdZ	 ddej        dej        dz  d	ej        fd
Z	dej        de
d	ej        fdZ	 ddej        dej        dz  d	eej        ej        dz  f         fdZ xZS )DashengAudioTransformerNr}   r   r   r   c           	         t                                                       j        | _        j        | _        t	                    | _        t          j        j        d          | _	        t          j        j        fj        j        j        dj                  | _        t          j        t#          j        dj        d| j        j        d                             | _        t          j        t#          j        dj        | j        j        d         d                    | _        t          j        fdt/          j                  D                       | _        t          j        j        d	          | _        d S )
Ng{Gz?)momentumF)rL   rP   rO   rM   rR   rN   r&   r   c              3      K   | ]8}t          j        j        j        j        j         d |           V  9dS )z.blocks.)rw   r   r   r   rx   r   r   N)r   rP   r   r   r   rx   ).0ir   r   r   s     r8   	<genexpr>z3DashengAudioTransformer.__init__.<locals>.<genexpr>}  sy       $
 $
  $ * *".) --!--  $
 $
 $
 $
 $
 $
r:   r   r   )rV   rW   target_lengthr   r   	front_endr[   BatchNorm2dr   init_bnrH   rP   input_channelsrM   rN   patch_embedrt   rc   emptyrX   time_pos_embedfreq_pos_embed
ModuleListrangedepthblocksr   r]   )r^   r   r   r   r_   s    ```r8   rW   z DashengAudioTransformer.__init___  sw    	#1 +(00~fmdCCC*v';<&*(,
 
 
 !lK6+Q0@0J10MNN
 
 !lK6+T-=-G-JANN
 
 m $
 $
 $
 $
 $
 $
 6<(($
 $
 $
 
 
 L!1t<<<			r:   r.   r   r/   c                 8   |j         d         }|| j        d d d d d d d |f         z   }|| j        d d d d d d d d f         z   }t          j        t          j        |dd          d          }| j        D ]} |||          }|                     |          }|S )Nr1   ra   rb   )r   r   r   rc   rd   rR   r   r]   )r^   r.   r   tblocks        r8   forward_featuresz(DashengAudioTransformer.forward_features  s    
 GBK#AAAqqq!!!RaRK00#AAAqqq!!!QQQJ// 	
 MM!Q""I
 
 [ 	 	EaAAIIaLLr:   lengths
max_lengthc                    t          |          }t          j        ||j                  }|                    |                              ||          }||                    d          k                                     }|S )Ndevicer   )r5   rc   aranger  repeatviewr   rj   )r^   r   r   
batch_sizeidxr   s         r8   _to_maskz DashengAudioTransformer._to_mask  sm    \\
l:gn===jj$$))*jAAg''+++1133r:   x_lengthc                    |                      |          }|                    | j        j                  }| j        dz  }|                    d          }t          j        |d          }|                     |          }t          j        |d          }| 	                    |          }|j
        d         }|                    |d          }|t          |          t          |          k    s
J d            |j        dk    s
J d            || j        dz  z                                  }|                     ||          }|                    |d          }nd }d gt          |          z  }g }	t#          ||          D ]/\  }
}i }||d	<    | j        |
fi |}
|	                    |
           0t          j        |	d          }||fS )
Nr=   r&   )r   r1   r&   ra   r   rw   z2batchsizes of input x and x_length need to be samezLengths are of size (B,))r   r   r   )r   r   r   r   r   r   rc   rd   r   r   r   splitr5   ndimr   longr	  zipr   appendcat)r^   r.   r
  target_length_in_patchesr   input_splitsscaled_lengthsr   split_masksoutputssplit_x
split_maskforward_kwargss                r8   rf   zDashengAudioTransformer.forward  s   
 NN1DD$*++#'#5#: KKNNM!\**LLOOM!\**QGBKww7Rw@@x==CFF***D +** =A%%%'A%%%&$/A*=>DDFFN==A~=FFD**%=2*FFKKD&3|#4#44K#&|[#A#A 	$ 	$GZN%/N6"+d+GFF~FFGNN7####Ig1%%%$wr:   )Nr}   rr   )rg   rh   ri   r#   r   r   rW   rc   rk   r   r7   r	  r6   rf   rl   rm   s   @r8   r   r   ^  s4        37	*= *=*= )4/*= 	*= *= *= *= *= *=^ %) < lT! 
	   $ # %,     )-& &<& ,%& 
u|U\D00	1	& & & & & & & &r:   r   c                   \     e Zd Z	 	 	 	 ddededej        dz  dedz  def
 fd	Zdd
Z	 xZ
S )AudioProjectorSubsampler>   Nr}   in_dimout_dimr   r   r   c                 
   t                                                       || _        t          j        t          || j        z  ||| dd          t          d          t          |||| dd                    | _        d S )Nz.net.0F)rL   r   r   r   return_biasr   z.net.2)	rV   rW   r   r[   
Sequentialr   r   r   net)r^   r  r  downsample_rater   r   r   r_   s          r8   rW   z AudioProjectorSubsample.__init__  s     	 = !DF?#) (((!   v"#) (((!  
 
r:   c                    |j         \  }}}|| j        z  }|dk    r#|d d d | d d f         }||d d d | f         }|3t          j        |j         d d         t          j        |j                  }|                    |d| j        |z            }| j        D ]} ||          }|                    |d| j                  }|                    d                                          }||fS )Nr   r   )r   r  r  )	r   r   rc   ru   r  r  r   r"  any)r^   r.   r   r  seq_lenrw   num_frames_to_discardlayers           r8   rf   zAudioProjectorSubsample.forward  s   #$7 
GS '$& 0 1$$!!!,,,,aaa/0AAAA6!6 6667<:agcrcl%*QXNNNDIIDFSL
 
 X 	 	EaAA||DF
 
 xxBx$$&&$wr:   )r>   NNr}   rr   )rg   rh   ri   r7   rc   r   r   r   rW   rf   rl   rm   s   @r8   r  r    s        
 $(26
 

 

 {T!
 )4/
 
 
 
 
 
 
:       r:   r  c                       e Zd ZU dZeej         edd          f         ed<   eej         ed          f         ed<   dS )MiDashengLMAudioInputszi

    Dimensions:
        - bn: Batch size * number of audios
        - p: Number of sampling points
    npinput_valuesaudio_lengthN)	rg   rh   ri   __doc__r   rc   rk   r%   __annotations__ r:   r8   r*  r*    sd           EL++c3*?*??@@@@EL++c*:*::;;;;;;r:   r*  c                   H    e Zd Zd Zd Zdeeedz  f         fdZd Z	d Z
dS )MiDashengLMProcessingInfoc                 4    | j                                         S rr   )ctxget_hf_configr^   s    r8   r6  z'MiDashengLMProcessingInfo.get_hf_config  s    x%%'''r:   c                 <    |                                  }|j        }|S rr   )get_hf_processorfeature_extractor)r^   hf_processorr:  s      r8   get_feature_extractorz/MiDashengLMProcessingInfo.get_feature_extractor  s"    ,,..(:  r:   r/   Nc                 
    dd iS )Naudior1  r7  s    r8   get_supported_mm_limitsz1MiDashengLMProcessingInfo.get_supported_mm_limits  s    r:   c                     dS )Ni  r1  r7  s    r8   get_min_audio_lenz+MiDashengLMProcessingInfo.get_min_audio_len  s    tr:   c                     dS )Ni q r1  r7  s    r8   get_max_audio_lenz+MiDashengLMProcessingInfo.get_max_audio_len  s    vr:   )rg   rh   ri   r6  r<  r   r   r7   r?  rA  rC  r1  r:   r8   r3  r3    sx        ( ( (! ! !
cDj)A          r:   r3  c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	MiDashengLMDummyInputsBuilder	mm_countsr/   c                     |                     dd          }| j                                        }|j        }|j        }|j        }| | | }||z  S )Nr>  r   )getinfor9  audio_tokenaudio_bos_tokenaudio_eos_token)r^   rF  
num_audiosr;  rJ  rK  rL  single_audio_texts           r8   get_dummy_textz,MiDashengLMDummyInputsBuilder.get_dummy_text!  sa    ]]7A..
y1133".&6&6.NN_NN :--r:   Nr&  
mm_optionsc                     |                     dd          }|r|                     d          nd }d|                     | j                                        ||          iS )Nr>  r   )lengthrM  	overrides)rH  _get_dummy_audiosrI  rC  )r^   r&  rF  rP  rM  audio_overridess         r8   get_dummy_mm_dataz/MiDashengLMDummyInputsBuilder.get_dummy_mm_data,  so     ]]7A..
5?I*..111T T++y2244%) ,  
 	
r:   rr   )
rg   rh   ri   r   r   r7   rO  r   r   rV  r1  r:   r8   rE  rE     s        	.S(9 	.c 	. 	. 	. 	. =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
r:   rE  c            
            e Zd ZdefdZdedeeef         deeef         deeef         de	f
 fdZ
de	d	eeef         deeef         fd
Zded	eeef         dedee         fdZ xZS )MiDashengLMMultiModalProcessorr/   c                 ^    | j                                         }t          |j                  S )N)	target_sr)rI  r<  r   sampling_rate)r^   r:  s     r8   _get_data_parserz/MiDashengLMMultiModalProcessor._get_data_parserB  s+     I;;==#.?.MNNNNr:   promptmm_data	mm_kwargs
tok_kwargsc                    |                     dg           }| j                                        fd|D             }|r||d<   |                    dg           sa| j                                                            |          }|                     |          }t          t          |g          d          S t          di |}t                      
                    ||||          S )	Naudiosc           	          g | ]W}t          |t          j                  r9|j        d          k     r(t          j        |d|j        d          z
  fdd          n|XS )r   r   constant)modeconstant_values)r2   npndarrayr   r   )r   r>  min_audio_lens     r8   
<listcomp>zEMiDashengLMMultiModalProcessor._call_hf_processor.<locals>.<listcomp>Q  s     

 

 

  %,, 27R=1P1P FMEKO34 !	    

 

 

r:   r>  )	input_idspt)tensor_type)r]  r^  r_  r`  r1  )poprI  rA  rH  get_tokenizerencode_apply_hf_processor_tokens_onlyr   dictrV   _call_hf_processor)
r^   r]  r^  r_  r`  rb  processed_audios
prompt_idsri  r_   s
           @r8   rs  z1MiDashengLMMultiModalProcessor._call_hf_processorF  s#    Xr** 	3355

 

 

 

  

 

 

  	0/GG{{7B'' 	P002299&AAJ==jIIJ
| < < <$OOOO 
 

 
	 ww))!	 * 
 
 	
r:   	hf_inputshf_processor_mm_kwargsc                 l    t          t          j        d          t          j        d                    S )Nr>  r-  r.  )rr  r   batched)r^   rv  rw  s      r8   _get_mm_fields_configz4MiDashengLMMultiModalProcessor._get_mm_fields_configp  s7    
 .6w??.6w??
 
 
 	
r:   mm_itemsout_mm_kwargsc                     | j         j        d	i |}| j                                         }|                                }t	          |dd          }||         |                                }|                    d          }	|	g nNt          |	t          j	                  r&|	
                                                                n|	}
d |
D             dt          ffd}t          d||          gS )
NrJ  z	<|AUDIO|>r.  c           
      b    g | ],}t          d t          t          |                              -S r&   r   rF   r7   r   rR  s     r8   rj  zFMiDashengLMMultiModalProcessor._get_prompt_updates.<locals>.<listcomp>  sA     $ $ $ A3CKK@@AA$ $ $r:   item_idxc                 L    |          }g|z  }t          j        |          S )N)embed_token_id)r!   select_token_id)r  num_featuresaudio_tokensaudio_output_lengthsaudio_token_ids      r8   get_replacement_midashenglmzWMiDashengLMMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_midashenglm  s:    /9L*+l:L&6-   r:   r>  )modalitytargetreplacementr1  )rI  r9  ro  	get_vocabgetattrget_datarH  r2   rc   rk   cpunumpyr7   r   )r^   r|  rw  r}  	processor	tokenizervocabrJ  out_mm_datar.  audio_length_npr  r  r  s               @@r8   _get_prompt_updatesz2MiDashengLMMultiModalProcessor._get_prompt_updatesz  s<    /DI.HH1GHH	I++--	##%%iDD{+#,,.."~66#%   lEL99"  ""((***! 
$ $-$ $ $ 
	# 	 	 	 	 	 	 	  "7  
 	
r:   )rg   rh   ri   r   r\  r   r   objectr   r   rs  r   r{  r   r   r   r    r  rl   rm   s   @r8   rX  rX  ?  s6       O"6 O O O O(
(
 f%(
 38$	(

 CK((
 
(
 (
 (
 (
 (
 (
T

 !(V 4
 
++	,	
 
 
 
+
%+
 !(V 4+
 -	+

 
,	+
 +
 +
 +
 +
 +
 +
 +
r:   rX  )rI  dummy_inputsc                       e Zd Zg dddgdZededededz  fd	            Zd
ddedef fdZ	de
dedz  fdZdedeej        df         fdZde
defdZ	 	 ddej        dej        dedz  dej        dz  de
dej        ez  fdZdej        dej        dz  fdZdeeeej        f                  dee         fdZ xZS )MiDashengLMModel)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projr  r   r/   Nc                 N    |                     d          rdS t          d          )Nr>  z#<|audio_bos|><|AUDIO|><|audio_eos|>z Only audio modality is supported)
startswith
ValueError)clsr  r   s      r8   get_placeholder_strz$MiDashengLMModel.get_placeholder_str  s-    w'' 	988;<<<r:   r}   )r   vllm_configr   c                   t                                                       |j        j        }|j        }|| _        || _        |                     |d          5  t          |j        |t          |d                    | _
        t          |j        j        |j        j        |j        |t          |d                    | _        d d d            n# 1 swxY w Y   |                     |          5  t%          ||j        t          |d          dg          | _        d d d            n# 1 swxY w Y   | j        j        | _        d S )	Nr>  audio_encoder)r   r   audio_projector)r  r  r#  r   r   decoderQwen2ForCausalLM)r  	hf_configr   architectures)rV   rW   model_configr  r   r   _mark_tower_modelr   audio_encoder_configr,   r  r  rP   text_configr   subsample_factorr  _mark_language_modelr+   r  make_empty_intermediate_tensors)r^   r  r   r   r   r_   s        r8   rW   zMiDashengLMModel.__init__  s   )3"/(##K99 	 	!8+)#FO<<" " "D
 $;2<*6 & 7)#F,=>>$ $ $D 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 &&{33 	 	5' ,#FI6612	  DL	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 L8 	,,,s%   A+CCC0-D))D-0D-kwargsc                    |                     dd           }|                     dd           }|d S t          |t                    r+t          j        j        j                            |d          }t          ||          S )Nr-  r.  T)batch_firstry  )	rn  r2   listrc   r[   utilsrnnpad_sequencer*  )r^   r  r-  r.  s       r8   _parse_and_validate_audio_inputz0MiDashengLMModel._parse_and_validate_audio_input  s     zz.$77zz.$774lD)) 	 8>-::  ;  L
 &%%
 
 
 	
r:   audio_input.c                    |d         }|d         }|                      ||          \  }}|                     ||          \  }}|                    |d         j                  }|j        \  }}	}
d |                                D             }t          j        ||j                  }t          j	        |	|j                  
                    d                              ||	          |
                    d          k     }||                             d|
          }t          j        ||                                          S )Nr-  r.  c           
      b    g | ],}t          d t          t          |                              -S r  r  r  s     r8   rj  z9MiDashengLMModel._process_audio_input.<locals>.<listcomp>  sA      
  
  
 /F<<== 
  
  
r:   r  r   r&   r   )r  r  r   r   r   tolistrc   tensorr  r  r   expandr  r  )r^   r  r-  r.  encoder_outencoder_attsaudio_embeddingsr   r  max_audio_tokensrP   r  audio_feature_maskmasked_audio_featuress                 r8   _process_audio_inputz%MiDashengLMModel._process_audio_input  s[   
 #>2">2$($6$6|\$R$R!\"22;MM!+..{>/J/PQQ2B2H/
$i 
  
&--// 
  
  
  %| #* 
  
  

 #\%5%<
 
 

)A,,vv(
 
 **1--	. !11C D I I"i X X{02F2M2M2O2OPPPr:   c                 N     | j         di |}|g S |                     |          S )Nr1  )r  r  )r^   r  r  s      r8   embed_multimodalz!MiDashengLMModel.embed_multimodal  s9    :d:DDVDDI((555r:   rk  	positionsintermediate_tensorsinputs_embedsc                 F    |d }| j                             ||||          S )N)r  )r  model)r^   rk  r  r  r  r  s         r8   rf   zMiDashengLMModel.forward  s:      + M|!! '	 " 
 
 	
r:   hidden_statesc                 6    | j                             |          S rr   )r  compute_logits)r^   r  s     r8   r  zMiDashengLMModel.compute_logits/  s     |**=999r:   weightsc                 J    t          |           }|                    |          S rr   )r*   load_weights)r^   r  loaders      r8   r  zMiDashengLMModel.load_weights5  s#    "4((""7+++r:   )NN)rg   rh   ri   packed_modules_mappingclassmethodr   r7   r  r   rW   r  r*  r  r6   rc   rk   r  r'   r  r"   rf   r  r   setr  rl   rm   s   @r8   r  r    s       
 
 
 

 
 =3 =3 =3: = = = [= BD 
 
 
z 
3 
 
 
 
 
 
B

	$	&
 
 
 
(Q+Q 
u|S 	!Q Q Q Q@6 64H 6 6 6 6 <@-1
 
<
 <
 2D8	

 |d*
 
 
+	+
 
 
 
$:|: 
	: : : :,HU33D-E$F ,3s8 , , , , , , , ,r:   r  )r;   r<   r=   Tr>   )\r/  r3   collections.abcr   r   r   r   typingr   r   r	   r
   r  rg  rc   torch.nnr[   torchaudio.functional
functionalr   torch.nn.functionalr   transformersr   vllm.configr   vllm.config.multimodalr   vllm.distributedr   %vllm.model_executor.layers.activationr   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   vllm.multimodal.processingr   r   r   r   r    r!   vllm.sequencer"   +vllm.transformers_utils.configs.midashenglmr#   vllm.utils.tensor_schemar$   r%   
interfacesr'   r(   r)   r  r*   r+   r,   r7   r6   r-   r0  r9   rF   ModulerH   ro   r|   r   r   r   r   r  r*  r3  rE  rX  register_processorr  r1  r:   r8   <module>r     s  0 L K K         A A A A A A A A A A A A 2 2 2 2 2 2 2 2 2 2 2 2            ! ! ! ! ! ! < < < < < < % % % % % % " " " " " " 3 3 3 3 3 3 A A A A A A < < < < < < 7 7 7 7 7 7         
 G F F F F F / / / / / /         
 L K K K K K K K                . - - - - - E E E E E E > > > > > > > > L L L L L L L L L L N N N N N N N N N N5c?*Xc]: : : :w 5c?         	  	   &% % % % %bi % % %PF F F F F F F F       B? ? ? ? ?ry ? ? ?D+ + + + +29 + + +\26 26 26 26 26bi 26 26 26jl l l l lbi l l l^0 0 0 0 0bi 0 0 0h	< 	< 	< 	< 	<\ 	< 	< 	<     2   &
 
 
 
 
$:;T$U 
 
 
>f
 f
 f
 f
 f
56f
 f
 f
R ('"	".  
J, J, J, J, J,ry"4j J, J, 
J, J, J,r:   