
    .`i                     <   d dl mZ d dlmZ d dlZd dlmZ dZdZddgZ	dej
        d	ed
ededej
        f
dZdej
        ee         z  eej
                 z  dee         fdZdej
        ee         z  eej
                 z  dz  dedee         fdZdej
        dedeeeeef                  dej
        fdZdej
        dedeeeeef                  dej
        fdZdej        dej
        dedeeeeef                  dej
        f
dZdej
        dej
        dej
        fdZdeej
                 dee         deej
        df         fdZdej
        eej
                 z  dej
        fd Zd!ej
        eej
                 z  dej
        ee         z  dz  d"edej
        fd#ZdS )$    )Sequence)castNi     )      r   )r   r      input_lengthpaddingkernel_sizestridereturnc                 $    | d|z  z   |z
  |z  dz   S )z6Calculate Conv1d output length using standard formula.r   r    )r	   r
   r   r   s       {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/glmasr_utils.py_calculate_conv_output_lengthr      s"    
 1w;&4?!CC    chunk_countsc                    t          | t          j                  r|                                 S | rQt          | d         t          j                  r1t	          t
          t          j                 |           }d |D             S d | D             S )Nr   c                 P    g | ]#}t          |                                          $S r   )intitem.0cs     r   
<listcomp>z)_as_list_chunk_counts.<locals>.<listcomp>    s&    555!AFFHH555r   c                 ,    g | ]}t          |          S r   )r   r   s     r   r   z)_as_list_chunk_counts.<locals>.<listcomp>!   s    )))qCFF)))r   )
isinstancetorchTensortolistr   list)r   tensor_countss     r   _as_list_chunk_countsr#      s     ,-- %""$$$ 6
<?ELAA 6T%,/>>55}5555))L))))r   
num_chunksc                 0    | dg|z  S t          |           S Nr   )r#   )r   r$   s     r   _normalize_chunk_countsr'   $   s%     sZ ...r   audio_lengthsmerge_factorconv_paramsc                 N    |D ]\  }}}t          | |||          } | |z
  |z  dz   S r&   )r   )r(   r)   r*   r
   r   r   s         r   &_get_audio_output_lengths_from_lengthsr,   -   sK    
 )4 
 
$f57K
 
 L(\9A==r   maskc                 N    |                      d          }t          |||          S )N)sumr,   )r-   r)   r*   r(   s       r   #_get_audio_output_lengths_from_maskr1   9   s,    
 HHRLLM1|[  r   audio_towerc                     t          | d          r|                     |          \  }}n|}|D ]\  }}}t          ||||          }||z
  |z  dz   S )a  
    Calculate the output lengths after audio processing.

    The output length accounts for:
    1. Convolution layers (downsampling)
    2. Merge factor (further downsampling during projection)

    Args:
        audio_tower: The audio encoder module
        audio_lengths: Input feature lengths [batch_size]
        merge_factor: Factor for merging adjacent features
        conv_params: List of (padding, kernel_size, stride) for each conv layer

    Returns:
        Output lengths after all processing [batch_size]
     _get_feat_extract_output_lengthsr   )hasattrr4   r   )	r2   r(   r)   r*   _conv_output_lengthsr
   r   r   s	            r   #_get_audio_output_lengths_for_towerr8   D   s    . {>?? 	!,!M!M"
 "
 ,,7 	 	(G[&"?#Wk6# #  ,.<?!CCr   audio_featuresaudio_output_lengthsc                    | j         \  }}}|                    d          }t          j        |                              ||                              |j                  |k     }| |                             d|          S )Nr   r/   )shape	unsqueezer   arangeexpandtodeviceview)r9   r:   r$   max_audio_tokens	embed_dimaudio_features_masks         r   !_flatten_audio_features_by_lengthrF   k   s     /=.B+J )/99!<<%&&	
,	-	-	 '	(	(
	  -.33B	BBBr   chunk_embeddings.c                     g }d}|D ]=}| |||z            }|                     t          j        |d                     ||z  }>t          |          S )Nr   )dim)appendr   cattuple)rG   r   grouped_embeddingscurrent_idxcountaudio_chunkss         r   _group_audio_embeddingsrQ   z   sp     K  'kE6I(IJ!!%)La"@"@"@AAAu#$$$r   c                     t          | t                    rJ| r4t          | d         t          j                  rt          j        |           nt          j        |           S | S )z>Convert mask to tensor, handling both list and tensor formats.r   )r   r!   r   r   stacktensor)r-   s    r   _normalize_to_tensorrU      s^    $ 
 $"47EL99$EKd##	

 Kr   feature_attention_maskitem_idxc                 n   |F| |         }t          | t          j                  r|                    d          S t	          |          S t          |          }t          |d|                   }|||         z   }t          | t          j                  r
| ||         S | ||         }t	          |          S )z1Extract attention mask for a specific audio item.Nr   )r   r   r   r=   rU   r#   r0   )rV   r   rW   r-   counts	start_idxend_idx
mask_slices           r   _extract_mask_for_itemr]      s     %h/,el;; 	%>>!$$$#D))) #<00FF9H9%&&I&**G (%,77 9%i&788'	'(9:J
+++r   )collections.abcr   typingr   r   torch.nnnnDEFAULT_MAX_AUDIO_LEN_SDEFAULT_MERGE_FACTORDEFAULT_CONV_PARAMSr   r   r   r!   r#   r'   rL   r,   r1   Moduler8   rF   rQ   rU   r]   r   r   r   <module>rf      sC   % $ $ $ $ $                !), D,D),D;>DHKD
\D D D D*,c*T%,-??*	#Y* * * */,c*T%,-??$F// 
#Y/ / / /	><	>	> eCcM*+	> \		> 	> 	> 	>
, eCcM*+ \	   $D$D<$D $D eCcM*+	$D
 \$D $D $D $DNCLC,C \C C C C
%u|,
%3-
% 5<
% 
% 
% 
%u|d5<.@@ U\    ,!L4+==,,c*T1, , \	, , , , , ,r   