
    .`i              
       `   U d dl mZmZmZ d dlmZmZmZmZm	Z	 d dl
Zd dlZd dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZmZmZ d dlmZ d d	lmZ d d
lmZ d dl m!Z! d dl"m#Z# d dl$m%Z%m&Z&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2m3Z3 d dl4m5Z5m6Z6m7Z7m8Z8m9Z9 d dl:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@ d dlAmBZB d dlCmDZD d dlEmFZF d dlGmHZHmIZI ddlJmKZKmLZLmMZMmNZNmOZOmPZPmQZQ ddlRmSZSmTZTmUZUmVZVmWZW ddlXmYZYmZZZm[Z[ ddl\m]Z]  G d dej^                  Z_ G d  d!ej^                  Z` G d" d#ej^                  Za G d$ d%ej^                  Zb G d& d'          Zc G d( d)ej^                  Zd G d* d+eH          Ze G d, d-eH          Zfeeefz  Zgeehd.<    G d/ d0ej^                  Zi G d1 d2e=          Zj G d3 d4e;ej                   Zkd5eelejm        f         d6enele2f         fd7Zo G d8 d9e9          Zp G d: d;e<d2                   Zq e/jr        eqejek<           G d= d>ej^        eUeVeTeW                      ZsdS )?    )IterableMappingSequence)	AnnotatedAnyLiteral	TypeAliascastN)BatchFeature)GlmAsrConfigGlmAsrProcessor)WhisperFeatureExtractor)ModelConfigSpeechToTextConfig
VllmConfig)BaseDummyOptions)$get_tensor_model_parallel_world_size)
PromptType)
get_act_fn)MMEncoderAttention)ColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)ApplyRotaryEmb)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)DictEmbeddingItemsModalityDataModalityDataItemsMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)cached_tokenizer_from_config)cached_processor_from_config)TensorSchemaTensorShape   )DEFAULT_CONV_PARAMSDEFAULT_MAX_AUDIO_LEN_SDEFAULT_MERGE_FACTOR!_flatten_audio_features_by_length#_get_audio_output_lengths_for_tower_group_audio_embeddings_normalize_chunk_counts)MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPPSupportsTranscription)AutoWeightsLoaderinit_vllm_registered_modelmaybe_prefix)ISO639_1_SUPPORTED_LANGSc                   >     e Zd ZdZd fdZdedej        fdZ xZ	S )GlmAsrEncoderRotaryEmbeddinga$  
    Rotary Position Embedding for GLM-ASR encoder.

    Computes rotary position embeddings on-demand for efficiency.
    Only caches inv_freq as a buffer; cos/sin are computed during forward
    to avoid wasted computation during initialization and ensure correct
    device placement.
    returnNc                 Z   t                                                       t          |d|j        |j        z            }t          |d          rp|j        ri|j                            dd          }|j                            dd          }t          ||z            }|j                            dd          | _	        nt          |dd          }|}d| _	        || _
        || _        d|t          j        d|d	t          j        
          |z  z  z  }|                     d|d           d S )Nhead_dimrope_parameters
rope_thetag     @partial_rotary_factorg      ?attention_scalingr      dtypeinv_freqF)
persistent)super__init__getattrhidden_sizenum_attention_headshasattrrG   getintrJ   dimrF   torcharangefloatregister_buffer)selfconfigrF   baserI   rX   rN   	__class__s          u/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/glmasr.pyrQ   z%GlmAsrEncoderRotaryEmbedding.__init__T   sD    J 2f6P P
 

 6,-- 	)&2H 	))--lGDDD$*$:$>$>'% %! h!6677C%+%;%?%?#S& &D"" 6<99DC%(D"  $5<3#M#M#MPS#STUZeDDDDD    seq_lenc                     t          j        || j        j        | j        j                  }t          j        || j                  }|| j        z  S )a9  
        Compute rotary position frequencies for given sequence length.

        Args:
            seq_len: The sequence length to compute embeddings for.

        Returns:
            Frequency tensor with shape [seq_len, dim/2]. Use .cos() and
            .sin() to get the rotary embedding components.
        )devicerM   )rY   rZ   rN   re   rM   outerrJ   )r]   rc   seqfreqss       ra   forwardz$GlmAsrEncoderRotaryEmbedding.forwardr   sN     lDM08K
 
 
 C//t---rb   )rD   N)
__name__
__module____qualname____doc__rQ   rW   rY   Tensorri   __classcell__r`   s   @ra   rC   rC   J   sq         E E E E E E<.s .u| . . . . . . . .rb   rC   c                   x     e Zd ZdZ	 	 ddedz  def fdZdej        dej        d	ej        d
ej        fdZ	 xZ
S )GlmAsrEncoderAttentiona$  
    Optimized Multi-headed Grouped Query Attention for GLM-ASR encoder.

    Uses vLLM's QKVParallelLinear for fused projections, ApplyRotaryEmb for
    rotary position embeddings, and MMEncoderAttention for hardware-optimized
    attention computation with automatic backend selection.
    N quant_configprefixc           
      d   t                                                       || _        |j        | _        |j        | _        t          |d|j                  | _        | j        | j        z  | _        t                      | _
        | j        | j
        z  | _        t          d| j        | j
        z            | _        t          | j        | j        | j        | j        d|| d          | _        t!          | j        | j        d|| d          | _        t          |dd           }|r|                    dd	          }nt          |dd	          }t'          | j        |z            | _        t+          d
          | _        t/          | j        | j        | j        dz  | j        | d          | _        d S )Nnum_key_value_headsr1   Tz	.qkv_projbiasrt   ru   z.o_projrG   rI   g      ?)enforce_enableg      z.attn)	num_heads	head_sizescalenum_kv_headsru   )rP   rQ   r^   rS   rT   r{   rR   r~   rF   r   tp_sizenum_heads_per_rankmaxnum_kv_heads_per_rankr   qkv_projr   o_projrV   rW   
rotary_dimr   apply_rotary_embr   attn)r]   r^   rt   ru   rope_paramsrI   r`   s         ra   rQ   zGlmAsrEncoderAttention.__init__   s    	!-3#)6+E
 
 (DN:;=="&.DL"@%(D,=,M%N%N"
 *MN%'''
 
 
 (%%%%
 
 
 f&7>> 	R$/OO4KS$Q$Q!!$+F4KS$Q$Q!dm.CCDD .d C C C '-m-%3###
 
 
			rb   hidden_statesrotary_pos_emb_cosrotary_pos_emb_sinrD   c                    |j         \  }}}|                     |          \  }}| j        | j        z  }| j        | j        z  }	|                    ||	|	gd          \  }
}}|
                    ||| j        | j                  }
|                    ||| j        | j                  }|                    ||| j        | j                  }|                     |
dd| j        f         ||          |
dd| j        f<   |                     |dd| j        f         ||          |dd| j        f<   | 	                    |
||          }|                    ||d          }| 
                    |          \  }}|S )@  
        Args:
            hidden_states: [batch_size, seq_len, hidden_size]
            rotary_pos_emb_cos: [seq_len, rotary_dim/2] - cosine of rotary embeddings
            rotary_pos_emb_sin: [seq_len, rotary_dim/2] - sine of rotary embeddings

        Returns:
            [batch_size, seq_len, hidden_size]
        rX   .N)shaper   r   rF   r   splitviewr   r   r   r   )r]   r   r   r   
batch_sizerc   _qkvq_sizekv_sizeqkvattn_outputoutputs                  ra   ri   zGlmAsrEncoderAttention.forward   s    "/!4
GQ }--Q (4=8,t}<))VWg6B)??1a FF:w(?OOFF:w(BDMRRFF:w(BDMRR
 %)$9$9c$T_$$%'9;M%
 %
#  
 ! %)$9$9c$T_$$%'9;M%
 %
#  
 ! ii1a(( "&&z7B?? KK,,	rb   Nrs   rj   rk   rl   rm   r   strrQ   rY   rn   ri   ro   rp   s   @ra   rr   rr      s          37	:
 :
 )4/:
 	:
 :
 :
 :
 :
 :
x1|1 "L1 "L	1
 
1 1 1 1 1 1 1 1rb   rr   c                   \     e Zd ZdZ	 	 d
dedz  def fdZdej        dej        fd	Z	 xZ
S )GlmAsrEncoderMLPzk
    Optimized MLP for GLM-ASR encoder.
    Uses vLLM's parallel linear layers for better performance.
    Nrs   rt   ru   c                 P   t                                                       || _        |j        | _        |j        | _        t          | j        | j        d|| d          | _        t          |j                  | _	        t          | j        | j        d|| d          | _        d S )NTz.fc1rx   z.fc2)rP   rQ   r^   rS   intermediate_sizer   fc1r   
hidden_actact_fnr   fc2r]   r^   rt   ru   r`   s       ra   rQ   zGlmAsrEncoderMLP.__init__  s     	!-!'!9'"%???
 
 
 !!233$"%???
 
 
rb   r   rD   c                     |                      |          \  }}|                     |          }|                     |          \  }}|S N)r   r   r   )r]   r   r   s      ra   ri   zGlmAsrEncoderMLP.forward!  sE    88M22qM2288M22qrb   r   r   rp   s   @ra   r   r      s          37	
 
 )4/
 	
 
 
 
 
 
:U\ el        rb   r   c                   x     e Zd ZdZ	 	 ddedz  def fdZdej        dej        d	ej        d
ej        fdZ	 xZ
S )GlmAsrEncoderLayerz
    Optimized Transformer encoder layer for GLM-ASR.
    Combines attention and MLP with residual connections and layer norms.
    Nrs   rt   ru   c                 j   t                                                       |j        | _        t          ||| d          | _        t          ||| d          | _        t          |dd          }t          j	        | j        |          | _
        t          j	        | j        |          | _        d S )Nz
.self_attnrt   ru   z.mlplayer_norm_epsh㈵>eps)rP   rQ   rS   rr   	self_attnr   mlprR   nn	LayerNorminput_layernormpost_attention_layernormr]   r^   rt   ru   r   r`   s        ra   rQ   zGlmAsrEncoderLayer.__init__.  s     	!-/%(((
 
 
 $%???
 
 
 !)94@@!|D,<.QQQ(*.)
 )
 )
%%%rb   r   r   r   rD   c                     |}|                      |          }|                     |||          }||z   }|}|                     |          }|                     |          }||z   }|S )r   )r   r   r   )r   r   r   r   )r]   r   r   r   residuals        ra   ri   zGlmAsrEncoderLayer.forwardI  s      !,,];;'11 ' 
 

 !=0 !55mDD// =0rb   r   r   rp   s   @ra   r   r   (  s          37	
 
 )4/
 	
 
 
 
 
 
6| "L "L	
 
       rb   r   c                   ,    e Zd ZdZdZdej        fdZdS )_GlmAsrEncoderOutputa  
    Simple output container compatible with transformers' BaseModelOutput.

    This lightweight container holds the encoder output and is compatible
    with the transformers library's output format while being more efficient
    than a full dataclass.

    Attributes:
        last_hidden_state: Final layer hidden states from the encoder.
            Shape: [batch_size, seq_len, hidden_size]
    last_hidden_stater   c                     || _         d S r   r   )r]   r   s     ra   rQ   z_GlmAsrEncoderOutput.__init__z  s    !2rb   N)rj   rk   rl   rm   	__slots__rY   rn   rQ    rb   ra   r   r   k  sB        
 
 'I3%, 3 3 3 3 3 3rb   r   c                        e Zd ZdZdg diZ	 	 ddedz  def fdZd	ej	        d
e
ej	        ej	        f         fdZdej	        d
efdZdee
eej	        f                  d
ee         fdZ xZS )GlmAsrEncodera  
    Optimized GLM-ASR Audio Encoder with vLLM native implementation.

    This encoder processes audio features through convolutional layers
    followed by transformer layers with rotary position embeddings.
    Optimized for performance with:
    - QKVParallelLinear for fused attention projections
    - Tensor parallelism support via ColumnParallelLinear/RowParallelLinear
    - Quantization support
    - Flash Attention (SDPA)
    r   q_projk_projv_projNrs   rt   ru   c                    t                                                       | _        t          j        j        j        dd          | _        t          j        j        j        ddd          | _        t          j	        fdt          j                  D                       | _        t          dd          }t          j        j        |	          | _        t!                    | _        d S )
N   r1   )kernel_sizepaddingrK   )r   strider   c           	      >    g | ]}t           d |           S )z.layers.r   )r   ).0	layer_idxr^   ru   rt   s     ra   
<listcomp>z*GlmAsrEncoder.__init__.<locals>.<listcomp>  sQ         #!-$99i99    rb   r   r   r   )rP   rQ   r^   r   Conv1dnum_mel_binsrS   conv1conv2
ModuleListrangenum_hidden_layerslayersrR   r   normrC   
rotary_embr   s    ``` ra   rQ   zGlmAsrEncoder.__init__  s    	 Y	
 
 

 Y
 
 

 m      "'v'?!@!@  	
 	
 !)94@@L!3HHH	 7v>>rb   input_lengthsrD   c                 B    |dz   dz
  dz  dz   }|dz   dz
  dz  dz   }||fS )z
        Compute the output length after convolutions.

        Args:
            input_lengths: Input sequence lengths [batch_size]

        Returns:
            Tuple of (output after conv1, output after conv2)
        rK   r   r1   r   )r]   r   output_lengths_conv1output_lengths_conv2s       ra    _get_feat_extract_output_lengthsz.GlmAsrEncoder._get_feat_extract_output_lengths  sF     !. 5 9a?!C !5u <q @QFJ#%999rb   input_featuresc                 z   t           j        j                            |                     |                    }t           j        j                            |                     |                    }|                    dd          }|j        d         }|                     |          }|	                                
                    |j                  }|                                
                    |j                  }| j        D ]} ||||          }|                     |          }t          |          S )ag  
        Forward pass through the encoder.

        Args:
            input_features: [batch_size, num_mel_bins, seq_len]

        Returns:
            _GlmAsrEncoderOutput: Object with .last_hidden_state attribute                 containing [batch_size, seq_len', hidden_size] where seq_len'                 is the sequence length after convolutions
        r1   rK   rL   r   )rY   r   
functionalgelur   r   	transposer   r   costorM   sinr   r   r   )r]   r   r   output_seq_lenrotary_pos_embr   r   encoder_layers           ra   ri   zGlmAsrEncoder.forward  s$    +00N1K1KLL+00M1J1JKK &//155&,Q/ 88+//1144=;N4OO+//1144=;N4OO "[ 	 	M)M13E MM
 		-00 $mDDDDrb   weightsc                    ddl m} g d}t          |                                           }t	                      }|D ]\  }}|D ]X\  }}	}
|	|vr|                    |	|          }|                    d          r||vr;||         }|j        } ||||
            nD|                    d          r||vrz||vr||         }t          |d|          } |||           |	                    |           |S )zICustom weight loading to handle q_proj/k_proj/v_proj -> qkv_proj mapping.r   )default_weight_loader))r   r   r   )r   r   r   )r   r   r   z.biasweight_loader)
-vllm.model_executor.model_loader.weight_utilsr   dictnamed_parameterssetreplaceendswithr   rR   add)r]   r   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr   s                ra   load_weightszGlmAsrEncoder.load_weights  s]   WWWWWW"
 "
 "
 4002233"%%%#* 	$ 	$D-5K 4 41
Kd**||K<<==)) d+.E.E#D) % 3e]H=== ==)) d+.E.E{**#D) '@U V Ve]333d####rb   r   )rj   rk   rl   rm   packed_modules_mappingr   r   rQ   rY   rn   tupler   r   ri   r   r   r   ro   rp   s   @ra   r   r   ~  s       
 
 	222 37	)? )? )4/)? 	)? )? )? )? )? )?V:"\:	u|U\)	*: : : :(#Eel #E7K #E #E #E #EJ$HU33D-E$F $3s8 $ $ $ $ $ $ $ $rb   r   c                   2   e Zd ZU dZed         ed<   eej        e	ej                 z   e
ddddh          f         ed<   eej        e	ej                 z   e
dddh          f         ed	<   eej        e	ej                 z   e
d
          f         ed<   dS )GlmAsrFeatureInputsz
    Dimensions:
        - num_chunks: Number of audio chunks (flattened)
        - nmb: Number of mel bins
        - num_audios: Number of original audio files
    audio_featurestype
num_chunksnmbchunk_lengthdynamic_dimsr   feature_attention_mask
num_audioschunk_countsN)rj   rk   rl   rm   r   __annotations__r   rY   rn   listr0   r   rb   ra   r  r    s           "
####tEL))L%~FVWWW	Y    &tEL))L.?OPPP	R    tEL))L!!	#     rb   r  c                   z    e Zd ZU dZdZed         ed<   eee	j
                  eddddh          f         ed<   dS )	GlmAsrEmbeddingInputsz
    Dimensions:
        - bn: Batch size
        - naf: Number of audio features
        - hs: Hidden size (must match the hidden size of language model
          backbone)
    audio_embedsr  bnnafhsr
  N)rj   rk   rl   rm   r  r   r  r   r  rY   rn   r0   r   rb   ra   r  r  2  su           %3D'.
!222U\D%UG<<<	>     rb   r  GlmAsrInputsc                   `     e Zd ZdZ	 	 ddededz  def fdZdej	        d	ej	        fd
Z
 xZS )GlmAsrMultiModalProjectora  
    Projects audio encoder outputs to language model hidden space.

    This projector uses a two-layer MLP to map audio features from the
    encoder's intermediate size to the language model's hidden size.
    Uses vLLM's parallel linear layers for tensor parallelism support.

    Architecture:
        - Linear layer: intermediate_size -> hidden_size * 2
        - Activation function (e.g., GELU)
        - Linear layer: hidden_size * 2 -> hidden_size
    Nrs   r^   rt   ru   c                 B   t                                                       t          |j        j        |j        j        dz  || d          | _        t          |j	                  | _
        t          |j        j        dz  |j        j        || d          | _        d S )NrK   z	.linear_1)
input_sizeoutput_sizert   ru   z	.linear_2)rP   rQ   r   audio_configr   text_configrS   linear_1r   projector_hidden_actactr   linear_2r   s       ra   rQ   z"GlmAsrMultiModalProjector.__init__S  s     	,*<*6:%'''	
 
 
 f9::))59*6%'''	
 
 
rb   r  rD   c                     |                      |          \  }}|                     |          }|                     |          \  }}|S r   )r  r!  r"  )r]   r  r   r   s       ra   ri   z!GlmAsrMultiModalProjector.forwardh  sE    ==88q//==77qrb   r   )rj   rk   rl   rm   r   r   r   rQ   rY   rn   ri   ro   rp   s   @ra   r  r  E  s           37	
 

 )4/
 	
 
 
 
 
 
*el u|        rb   r  c                   `    e Zd ZdZdefdZdedefdZdede	fdZ
deeedz  f         fdZdS )	GlmAsrProcessingInfoz
    Processing information provider for GLM-ASR model.

    Provides access to model configuration, processor, and feature extractor
    needed for audio preprocessing and multimodal integration.
    rD   c                 @    | j                             t                    S r   )ctxget_hf_configr   r]   s    ra   r(  z"GlmAsrProcessingInfo.get_hf_configw  s    x%%l333rb   kwargsc                 2     | j         j        t          fi |S r   )r'  get_hf_processorr   r]   r*  s     ra   r,  z%GlmAsrProcessingInfo.get_hf_processorz  s    (tx(CCFCCCrb   c                 &     | j         di |j        S Nr   )r,  feature_extractorr-  s     ra   get_feature_extractorz*GlmAsrProcessingInfo.get_feature_extractor}  s    $t$..v..@@rb   Nc                 
    dd iS )Naudior   r)  s    ra   get_supported_mm_limitsz,GlmAsrProcessingInfo.get_supported_mm_limits  s    rb   )rj   rk   rl   rm   r   r(  objectr   r,  r   r1  r   r   rW   r4  r   rb   ra   r%  r%  o  s         4| 4 4 4 4D DO D D D DAf A9P A A A AcDj)A      rb   r%  c            	       t    e Zd ZdZdeeef         defdZ	 d	dedeeef         deeef         dz  de	fdZ
dS )
GlmAsrDummyInputsBuilderz
    Builder for dummy inputs used in profiling and testing.

    Generates dummy text prompts and audio data that match the expected
    format for GLM-ASR model inputs. Used for memory profiling and
    performance benchmarking.
    	mm_countsrD   c                 t    |                     dd          }| j                                        }|j        |z  S )Nr3  r   )rV   infor,  audio_token)r]   r8  r  hf_processors       ra   get_dummy_textz'GlmAsrDummyInputsBuilder.get_dummy_text  s6    ]]7A..
y1133'*44rb   Nrc   
mm_optionsc                 R   | j                                         }|j        }|                    dd          }|r|                    d          nd }t	          | j                                         dt                    }t          ||z            }	d|                     |	||          iS )Nr3  r   max_audio_len)lengthr  	overrides)	r:  r1  sampling_raterV   rR   r,  r3   rW   _get_dummy_audios)
r]   rc   r8  r>  r0  rC  r  audio_overridesr@  	audio_lens
             ra   get_dummy_mm_dataz*GlmAsrDummyInputsBuilder.get_dummy_mm_data  s     !I;;==)7]]7A..
5?I*..111TI&&((/;R
 
 566	 T++ Z? ,  
 	
rb   r   )rj   rk   rl   rm   r   r   rW   r=  r   r   rG  r   rb   ra   r7  r7    s         5S(9 5c 5 5 5 5 =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
rb   r7  	hf_inputsrD   c           	         |                      d          }|at          t          j        d          t          j        d|d          t          j        d|d          t          j        d                    S t          t          j        d          t          j        d          t          j        d          t          j        d                    S )a  
    Configure multimodal field batching strategy for GLM-ASR.

    Determines how to batch audio inputs based on whether chunking is used.
    When chunk_counts is present, features are flattened across chunks;
    otherwise, they are batched normally.

    Args:
        hf_inputs: Dictionary of preprocessed inputs from HuggingFace processor.

    Returns:
        Dictionary mapping field names to MultiModalFieldConfig objects             that specify batching behavior.
    r  Nr3  r   r   )r  r   r  r  )rV   r   r   batchedflat_from_sizes)rH  r  s     ra   _glmasr_field_configrL    s    " ==00L.6w??0@1   $9#H1$ $ $ /6w??	
 	
 	
 		
 *27;;,4W==4<WEE*27;;	   rb   c                   n     e Zd ZdZdeeej        f         ee	         z  de
e	e	f         dz  f fdZ xZS )GlmAsrMultiModalDataParserz
    Custom parser for GLM-ASR multimodal data.

    Extends the base parser to handle GLM-ASR specific audio data formats,
    including both pre-computed audio embeddings and raw audio features.
    datarD   Nc                     t          |t                    rt          |ddht                    S t	                                          |          S )Nr3  r  )modalityrequired_fieldsfields_factory)
isinstancer   r!   rL  rP   _parse_audio_data)r]   rO  r`   s     ra   rU  z,GlmAsrMultiModalDataParser._parse_audio_data  sW     dD!! 	% !/ 03	    ww((...rb   )rj   rk   rl   rm   r   r   rY   rn   r"   r   r#   rU  ro   rp   s   @ra   rN  rN    s}         /3$%S(99/ 
38	$t	+/ / / / / / / / / /rb   rN  c            
           e Zd ZdZdefdZdee         dede	dee
         fdZded	eeef         d
eeef         deeef         def
 fdZdedeeef         deeef         fdZdedeeef         dedee         fdZ xZS )GlmAsrMultiModalProcessorz
    GLM-ASR processor that inherits directly from BaseMultiModalProcessor
    for better performance and cleaner implementation.
    rD   c                 ^    | j                                         }t          |j                  S )N)	target_sr)r:  r1  rN  rC  )r]   r0  s     ra   _get_data_parserz*GlmAsrMultiModalProcessor._get_data_parser  s+     I;;==)4E4STTTTrb   
audio_listr0  	processorc                 ~   |j         }|j        }t          |dt                    }t	          ||z            }t	          ||z            }g }	|D ]o}
t          |
t                    rt          |
          n|
j        d         }t          d||z   dz
  |z            }|	
                    t          ||                     p|	S )Nr@  r   r1   )rC  r	  rR   r3   rW   rT  r  lenr   r   appendmin)r]   r[  r0  r\  rC  r	  r@  window_sizemax_windowsr  r3  	n_samplesn_chunkss                ra   _calculate_chunk_countsz1GlmAsrMultiModalProcessor._calculate_chunk_counts  s     *7(5	?<STT-,677-<788 	< 	<E&0&=&=QE


5;q>I1y;6:{JKKHHk : :;;;;rb   promptmm_data	mm_kwargs
tok_kwargsc                    d|v r|                     d          |d<   |                    dg           }|rt          |t                    s|gn|}|sa| j                                                            |          }|                     |          }t          t          |g          d          S  | j        j
        di |}t          di |d|j        i}t                                          ||||          }	d|	v r|	                     d          |	d	<   nfd	|	vrbd
|	v r^|	d
         }
t          |
t          j                  r<t          j        |
j        d         |
j        d         t          j                  }||	d	<    | j        j        di |}|                     ||j        |          }t          j        |t          j                  |	d<   |	S )Naudiosr3  )	input_idspt)tensor_typerC  )rf  rg  rh  ri  input_feature_maskr  r   r   r   rL   r  r   )poprV   rT  r  r:  get_tokenizerencode_apply_hf_processor_tokens_onlyr   r   r1  rC  rP   _call_hf_processorrY   rn   onesr   longr,  re  r0  tensor)r]   rf  rg  rh  ri  r3  r[  
prompt_idsr0  outputsr   maskr\  r  r`   s                 ra   rt  z,GlmAsrMultiModalProcessor._call_hf_processor  s+    w&{{844GGGR(( %Pj.E.EPeWW5
  	P002299&AAJ==jIIJ
| < < <$OOOO <DI;HHiHH 
 

 
+9
 
 
	 '',,!	 - 
 
  7**07<P0Q0QG,--%W449IW9T9T$%56N.%,77 9z"(+"(,*  
 5901 /DI.;;;;	 33	3Y
 
 #(,|5:"N"N"Nrb   rH  hf_processor_mm_kwargsc                      t          |          S r   )rL  )r]   rH  r{  s      ra   _get_mm_fields_configz/GlmAsrMultiModalProcessor._get_mm_fields_config?  s    
 $I...rb   mm_itemsout_mm_kwargsc                     | j         j        di |}| j                                         }|                                }| j                                         }t          |dd          }|                    |          |j        t          |dt                    }	t          |dt                    }
|
                                                    d          }                    d          }g |cddlm}m} |d	} ||          D ]}||z   }|||         }t          |t                    rt!          j        |          } |||	|
          }                    t'          |                                                                                     |}nt-          t/          |                    D ]}|||dz            }t          |t                    r't!          j        |                              d	          } |||	|
          }                    t'          |                                                                                     d
t&          ffd}t5          d||          gS )Nr;  <|pad|>merge_factorconv_paramsr  r  r1   )_as_list_chunk_counts#_get_audio_output_lengths_from_maskr   item_idxc                    r	|          }n<                     d          }|||          }|j        d         }nt          d          |dk    rt          d          gt          |          z  }t	          j        |          S )Nr  r   z>Either feature_attention_mask or audio_embeds must be providedzAudio is too short)embed_token_id)rV   r   
ValueErrorrW   r+   select_token_id)r  num_featuresr  embedaudio_tokensaudio_output_lengthsaudio_token_idout_mm_datas        ra   get_replacement_glmasrzMGlmAsrMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_glmasr}  s    # 
3H=*~>>+(2E#(;q>LL$X   q   !5666*+c,.?.??L&6-   rb   r3  )rQ  targetreplacementr   )r:  r,  rq  	get_vocabr(  rR   rV   r  r4   r2   get_dataglmasr_utilsr  r  rT  r  rY   stackr_  rW   sumitemr   r^  rw  	unsqueezer)   )r]   r~  r{  r  r\  	tokenizervocabr^   r;  r  r  r  r  r  r  	start_idxcountend_idxrz  lengthsidxr  r  r  r  s                         @@@ra   _get_prompt_updatesz-GlmAsrMultiModalProcessor._get_prompt_updatesF  s    /DI.HH1GHH	I++--	##%%((**i	BB;//!&5Nv~7KLLfm5HII#,,..!,1I!J!J"~66 +-!-       
 '	22<@@ 
( 
(E'%/G1)G2CDD!$-- 1${400AAlK G )//GKKMM4F4F4H4H0I0IJJJ 'II
( !%;!<!<== K KC1#a-@D!$-- ?$|D11;;A>>AAlK G )//GKKMM4F4F4H4H0I0IJJJJ	S 	 	 	 	 	 	 	 	0  "2  
 	
rb   )rj   rk   rl   rm   r%   rZ  r  r   r   r   rW   re  r   r   r5  r   r   rt  r   r}  r$   r    r   r*   r  ro   rp   s   @ra   rW  rW    s        
U"6 U U U UI 3 #	
 
c   &<< c6k"< 38$	<
 CK(< 
< < < < < <|// !(V 4/ 
++	,	/ / / /T
%T
 !(V 4T
 -	T

 
,	T
 T
 T
 T
 T
 T
 T
 T
rb   rW  )r:  dummy_inputsc                   D    e Zd ZeZg dddgdZdddedef fd	Ze	d
ede
dedz  fd            ZdefdZdededz  fdZdedej        eej        df         z  fdZdedefdZ	 	 d*dej        dej        dedz  dej        dz  dedej        ez  fdZdej        dej        dz  fdZdeeeej        f                  dee         fdZe	dedefd             Ze	ded!edefd"            Z e	d#e!j"        ded$ed%edz  d!e#d&         d'ed(edz  de$fd)            Z% xZ&S )+GlmAsrForConditionalGenerationr   	gate_projup_proj)r   gate_up_projrs   )ru   vllm_configru   c          	      ~   t                                                       |j        j        }|j        }|j        j        }|| _        || _        || _        |                     |d          5  t          |j	        |t          |d                    | _        t          ||t          |d                    | _        d d d            n# 1 swxY w Y   |                     |          5  t          ||j        t          |d          dg          | _        d d d            n# 1 swxY w Y   | j        j        | _        d S )Nr3  audio_towerr   multi_modal_projectorlanguage_modelLlamaForCausalLM)r  	hf_configru   architectures)rP   rQ   model_configr  rt   multimodal_configr^   _mark_tower_modelr   r  r@   r  r  r  _mark_language_modelr?   r  r  make_empty_intermediate_tensors)r]   r  ru   r^   rt   r  r`   s         ra   rQ   z'GlmAsrForConditionalGeneration.__init__  s   )3"/'4F!2(##K99 
	 
	,#)#FM::     D
 *C)#F,CDD* * *D&
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 &&{33 	 	"<' ,#F,<==12	# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ? 	,,,s%   ,ACCC(-D!!D%(D%rQ  irD   Nc                 N    |                     d          rdS t          d          )Nr3  z)<|begin_of_audio|><|pad|><|end_of_audio|>z Only audio modality is supported)
startswithr  )clsrQ  r  s      ra   get_placeholder_strz2GlmAsrForConditionalGeneration.get_placeholder_str  s-    w'' 	?>>;<<<rb   c                 0    t          j        ddd          S )Nzlanguage_model.zmulti_modal_projector.zaudio_tower.)r  	connectortower_model)r   from_string_fieldr)  s    ra   get_mm_mappingz-GlmAsrForConditionalGeneration.get_mm_mapping  s%    /,.&
 
 
 	
rb   r*  c           	          |                     dd           }|t          d|          S |                     dd           }|d S t          d||                     dd           |                     dd                     S )Nr  )r  r  r   r  r  r  )r  r   r  r  )rp  r  r  )r]   r*  r  r   s       ra   _parse_and_validate_audio_inputz>GlmAsrForConditionalGeneration._parse_and_validate_audio_input  s    zz.$77#(n<XXXX$4d;;!4"!)#)::.F#M#MND99	
 
 
 	
rb   audio_input.c                 @   |d         dk    rt          |d                   S |d         }|d         }t          |t                    r,t          j        |d          }t          j        |d          }|j        d         }t          |                    d          |          }|                    | j	        j
        j        j        	          }| 	                    |          j        }| j        j        j        }| j        j        j        }||z  }	|j        d
         }
|
|	z  |	z  }||
k     r|d d d |d d f         }|                    |d|          }|                     |          }t)          | j        dt*                    }t)          | j        dt,                    }t/          | j	        |                    d          ||          }t3          ||          }t          j        ||                                                                          }t;          ||          S )Nr  r  r   r  r   r   r  )r  rL   r1   r   r  r  )r  rT  r  rY   catr   r8   rV   r   r  r   weightrM   r   r^   r  rS   r   reshaper  rR   r4   r2   r6   r  r5   r   flattentolistr7   )r]   r  r   r  r  r  audio_hidden_statesrS   r   merge_ratiorc   seq_len_truncatedr  r  r  r  masked_audio_featureschunk_embeddingss                     ra   _process_audio_inputz3GlmAsrForConditionalGeneration._process_audio_input  sF    v.00^4555$%56!,-E!Fnd++ 	N"Y~1===N%*Y/E1%M%M%M"#)!,
.OON++

 
 

 (**1A1G1N1T*UU #..~>>P k.: K4F';6 &+A.$3{Bw&&"5aaa9K:K9KQQQ6N"O 299
 
 334GHHt{N<PQQdk=:MNNB"&&r**	 
  
 !B0!
 !
 !;!#7#?#?#A#A#H#H#J#J
 
 ''7FFFrb   c                 R     | j         di |}|g S |                     |          }|S r/  )r  r  )r]   r*  r  r  s       ra   embed_multimodalz/GlmAsrForConditionalGeneration.embed_multimodal+  s?    :d:DDVDDI $ 9 9+ F F$$rb   rl  	positionsintermediate_tensorsinputs_embedsc                 J    |d }| j                             ||||          }|S )N)r  )r  model)r]   rl  r  r  r  r*  r   s          ra   ri   z&GlmAsrForConditionalGeneration.forward4  s@      + M+11 '	 2 
 
 rb   r   c                 6    | j                             |          S r   )r  compute_logits)r]   r   s     ra   r  z-GlmAsrForConditionalGeneration.compute_logitsG  s     "11-@@@rb   r   c                 T    dg}t          | |          }|                    |          S )Nzaudio_tower.embed_positions)skip_prefixes)r>   r   )r]   r   r  loaders       ra   r   z+GlmAsrForConditionalGeneration.load_weightsM  s0    67"4}EEE""7+++rb   r  c                 B    t          |          }t          |dd          S )znGet the audio token from processor.

        Similar to get_placeholder_str but returns single token.
        r;  r  )r.   rR   )r  r  r\  s      ra   _get_audio_tokenz/GlmAsrForConditionalGeneration._get_audio_tokenR  s#     1>>	y-;;;rb   	task_typec                     t          |          }|j        }t          |dt                    }t	          ||j                  S )Nr@  )max_audio_clip_ssample_rate)r.   r0  rR   r3   r   rC  )r  r  r  r\  r0  r  s         ra   get_speech_to_text_configz8GlmAsrForConditionalGeneration.get_speech_to_text_config[  sM     1>>	%7"9o?VWW!-)7
 
 
 	
rb   r3  
stt_configlanguage)
transcribe	translaterequest_promptto_languagec                 v   t          |          }|                     |          }	|dk    r#| j                            ||          }
|	 d|
 }n|dk    r|	 d}nt	          d|           d|dg}|                    |dd	
          }|                    |          }|d|id}t          t          |          S )z@Get the generation prompt to be used for transcription requests.r  ztranslate the speech to r  z4can you transcribe the speech into a written format?zUnsupported task type user)rolecontentFT)tokenizeadd_generation_promptr3  )prompt_token_idsmulti_modal_data)	r-   r  supported_languagesrV   r  apply_chat_templaterr  r
   r   )r  r3  r  r  r  r  r  r  r  r;  full_lang_name_touser_contentmessagesrf  r  prompt_dicts                   ra   get_generation_promptz4GlmAsrForConditionalGeneration.get_generation_promptg  s    1>>	**<88## # 7 ; ;K U U)VVCTVVLL,&&TTT L AiAABBB#==>..uD / 
 
 %++F33 0!(% 0
 
 J,,,rb   )NN)'rj   rk   rl   rA   r  r  r   r   rQ   classmethodrW   r  r   r  r5  r  r  rY   rn   r  r  r9   r  r,   ri   r  r   r   r   r   r  r   r  npndarrayr   r   r  ro   rp   s   @ra   r  r    s&        3 322$i0 
 BD 
 
 
z 
3 
 
 
 
 
 
B =3 =3 =3: = = = [=
 
 
 
 

 
<RVCV 
 
 
 
 >G'>G	elC/0	0>G >G >G >G@% %4H % % % % <@-1 < < 2D8	
 |d*  
+	+   &A|A 
	A A A A,HU33D-E$F ,3s8 , , , ,
 <K <C < < < [< 	
&	
36	
		
 	
 	
 [	
 "-z"- ""- '	"-
 *"- 45"- "- 4Z"- 
"- "- "- ["- "- "- "- "-rb   r  )tcollections.abcr   r   r   typingr   r   r   r	   r
   numpyr  rY   torch.nnr   transformersr   transformers.models.glmasrr   r   transformers.models.whisperr   vllm.configr   r   r   vllm.config.multimodalr   vllm.distributed.parallel_stater   vllm.inputs.datar   %vllm.model_executor.layers.activationr   9vllm.model_executor.layers.attention.mm_encoder_attentionr   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   2vllm.model_executor.layers.rotary_embedding.commonr   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.inputsr   r   r    vllm.multimodal.parser!   r"   r#   r$   r%   vllm.multimodal.processingr&   r'   r(   r)   r*   r+   vllm.sequencer,   vllm.tokenizersr-   !vllm.transformers_utils.processorr.   vllm.utils.tensor_schemar/   r0   r  r2   r3   r4   r5   r6   r7   r8   
interfacesr9   r:   r;   r<   r=   utilsr>   r?   r@   whisperrA   ModulerC   rr   r   r   r   r   r  r  r  r  r  r%  r7  r   rn   r   rL  rN  rW  register_processorr  r   rb   ra   <module>r     s   8 7 7 7 7 7 7 7 7 7 7 ; ; ; ; ; ; ; ; ; ; ; ; ; ;            % % % % % % D D D D D D D D ? ? ? ? ? ? C C C C C C C C C C 3 3 3 3 3 3 P P P P P P ' ' ' ' ' ' < < < < < < X X X X X X         
 G F F F F F M M M M M M D D D D D D / / / / / /         
                             . - - - - - 8 8 8 8 8 8 J J J J J J > > > > > > > >                               O N N N N N N N N N - - - - - -8. 8. 8. 8. 8.29 8. 8. 8.vv v v v vRY v v vr' ' ' ' 'ry ' ' 'T@ @ @ @ @ @ @ @F3 3 3 3 3 3 3 3&Z Z Z Z ZBI Z Z Zz    ,   .    L     .0EEi E E E' ' ' ' '	 ' ' 'T    -   *"
 "
 "
 "
 "
56JK "
 "
 "
J"sEL()"	#$
$%" " " "J/ / / / /!5 / / /,v
 v
 v
 v
 v
 78N O v
 v
 v
r ('	)  
h- h- h- h- h-I!:|=Rh- h- 
h- h- h-rb   