
    .`i                     p   d dl Z d dlZd dlmZmZ d dlZd dlZd dlm	c m
Z d dlmZm	Z	 d dlmZ d dlmZ d dlmZ d dlmZmZmZmZmZmZmZmZmZmZmZ  G d d	e	j                  Z  G d
 de j!        e	j                  Z" G d de"          Z# G d de	j                  Z$ G d de	j                  Z%dS )    N)AnyLiteral)Tensornn)CheckpointWrapper)FullyShardedDataParallel)PretrainedConfig)AbsolutePositionalEncoding
ConvModuleFeedForwardMeanVarianceNormLayerMultiHeadedAttentionMultiSequentialNemoConvSubsamplingT5RelativeAttentionLogitBiasadaptive_enc_mask
get_offsetunfold_tensorc            3       H    e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d3dedededededededededededededed ed!ed"ed#ed$ed%ed&ed'ed(ed)ed*d+f2 fd,Z	 d4d-e	j
        d.e	j
        d/e	j
        d0e	j
        d1e
d+z  d*ee	j
        e	j
        e	j
        e	j
        f         fd2Z xZS )5ConformerEncoderLayera  ConformerEncoder Layer module.
    for more details see conformer paper:
        https://arxiv.org/abs/2005.08100
    This module implement the Conformer block layer.

    Args:
        d_model: int
            attention dim.
        ext_pw_out_channel: int
            if > 0, ext_pw_out_channel is a dim channel size
             for the last pointwise conv after swish activation.
        depthwise_seperable_out_channel: int
            if set different to 0, the number of
             depthwise_seperable_out_channel will be used as a
             channel_out of the second conv1d layer.
             otherwise, it equals to 0, the second conv1d layer is skipped.
        depthwise_multiplier: int
            number of input_dim channels duplication. this value
             will be used to compute the hidden channels of the Conv1D.
        n_head: int
            the number of heads for multihead attention module.
        d_ffn: int
            output size of the feed_forward blocks.
        ext_pw_kernel_size: int
            kernel size of the conv pointwise of the conformer.
        kernel_size: int
            kernel size.
        dropout_rate: float
            dropout rate.
        causal: bool, optional
            if set to True, convolution have no access
             to future frames. default False.
        batch_norm: bool, optional
            if set to True, apply batchnorm before activation
            in ConvModule layer of the conformer.
            default False
        activation: str, optional
            activation function name,
            one of ["relu", "swish", "sigmoid"],
            sigmoid activation is only used with "glu_in_fnn=True",
            default "relu".
        chunk_se: int, optional
            0 for offline SE.
            1 for streaming SE, where mean is computed
             by accumulated history until current chunk_se.
            2 for streaming SE, where mean is computed
             by only the current chunk.
            default 0.
        chunk_size: int, optional
            chunk_size for cnn. default 18
        conv_activation: str, optional
            activation function used in ConvModule part
            of the conformer, default "relu".
        conv_glu_type: str, optional
            activation function used for the glu inside
            the ConvModule part of the conformer.
            default: "sigmoid".
        bias_in_glu: bool, optional
            if set to True, use additive bias in the weight module
             before GLU.
        linear_glu_in_convm: bool, optional
            if set to True, use GLULinear module,
             otherwise, used GLUPointWiseConv module.
              default to False.
        attention_inner_dim: int, optional
            if equal to -1, attention dim for linears k/q/v is
            equal to d_model. otherwise attention_inner_dim is used.
            default -1.
        attention_glu_type: str, optional
            activation function for glu used in the multihead attention,
             default "swish".
        activation_checkpointing: str, optional
            a dictionary of {"module","interval","offload"}, where
                "module": str
                    accept ["transformer", "attention"] to select
                    which module should do activation checkpointing.
                "interval": int, default 1,
                    interval of applying activation checkpointing,
                    interval = 1 means that we apply checkpointing
                    on every layer (if activation), otherwise,
                    we apply it every x interval.
                "offload": bool, default False,
                    if set to True, we offload activation to cpu and
                    reload it during backward, otherwise,
                    we recalculate activation in backward.
            default "".
        export: bool, optional
            if set to True, it removes the padding from convolutional layers
             and allow the onnx conversion for inference.
              default False.
        use_pt_scaled_dot_product_attention: bool, optional
            if set to True, use pytorch's scaled dot product attention
            implementation in training.
        attn_group_sizes: int, optional
            the number of groups to use for attention, default 1
            (Multi-Head Attention),
            1 = typical Multi-Head Attention,
            1 < attn_group_sizes < attention_heads = Grouped-Query Attention
            attn_group_sizes = attention_heads = Multi-Query Attention
       r                  皙?Frelu   sigmoidTswish d_modelext_pw_out_channeldepthwise_seperable_out_channeldepthwise_multipliern_headd_ffnext_pw_kernel_sizekernel_sizedropout_ratecausal
batch_norm
activationchunk_se
chunk_sizeconv_activationconv_glu_typebias_in_glulinear_glu_in_convmattention_inner_dimattention_glu_typeactivation_checkpointingexport#use_pt_scaled_dot_product_attentionattn_group_sizesreturnNc                    t                                                       t          |||	||          | _        t	          |||	|||||          | _        t          |||||||	|
||||||||          | _        t          |||	||          | _        t          j
        |          | _        t          j
        |          | _        d S )N)r$   d_innerr,   r/   r4   )r:   
group_size)r9   )super__init__r   feed_forward_inr   	self_attnr   convfeed_forward_outr   	LayerNormlayer_norm_att
layer_norm)selfr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   	__class__s                            {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/phi4mm_audio.pyrA   zConformerEncoderLayer.__init__   s
   6 	*%!# 
  
  
 .0S'	
 	
 	
 + !
 
 
	& !,%!#!
 !
 !
 !l733,w//    xpos_kpos_vmaskrelative_attention_biasc           
      <   |d|                      |          z  z   }|                     |          }||                     |||||||          z   }||                     |          z   }|d|                     |          z  z   }|                     |          }||||fS )a  ConformerEncoder forward.

        Args:
            x: input feature of shape (batch, max_time_in, size)
            pos_k: positional key embedding.
            pos_v: positional value embedding.
            mask: mask for x (batch, max_time_in)
            relative_attention_bias: bias added to attention logits w.r.t.
                relative positions (1, n_head, time1, time2)
        g      ?rQ   )rB   rG   rC   rD   rE   rH   )rI   rM   rN   rO   rP   rQ   norm_xouts           rK   forwardzConformerEncoderLayer.forward   s    $ d**1----$$Q''$;  
 
 
 		!d++A....ooa  E5$&&rL   )r   r   r   r   r   r   r   r   r   FFr   r   r   r   r    TFr!   r"   r#   FFr   N)__name__
__module____qualname____doc__intfloatboolstrrA   torchr   tuplerV   __classcell__rJ   s   @rK   r   r   %   s,       c cN "#/2$%"#!  %& $)#%")(*49 !3K0 K0K0  K0 *-	K0
 "K0 K0 K0  K0 K0 K0 K0 K0 K0 K0 K0  !K0" #K0$ %K0& "'K0( !)K0*  +K0, #&-K0. /K00 .21K02 3K04 
5K0 K0 K0 K0 K0 K0f 26#' #'<#' |#' |	#'
 l#' "($#' 
u|U\5<E	F#' #' #' #' #' #' #' #'rL   r   c            %           e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 d1dedeee         z  deee         z  dedededededededede	ee
f         dz  dede	ee
f         dz  ded         dede	ee
f         dz  ddf$ fdZdeej        z  deej        z  fd Zej        de
fd!            Z	 	 d2deee         z  dz  deee         z  dz  deeef         fd"Zd#ej        dej        fd$Zd%ej        d&ej        deej        ej        f         fd'Zd%ej        deej        dz  ej        dz  f         fd(Zd)ed*edeee         z  deee         z  dej        f
d+Z	 	 d2d,ej        d&ej        d-eee         z  dz  d.eee         z  dz  deej        ej        dz  ej        dz  ej        ej        f         eej        ej        dz  ej        dz  ej        ej        ej        f         z  f
d/Zdefd0Z xZS )3TransformerEncoderBaseaj  The Base class for Transformer based encoders

    Please set causal = True in streaming model
    Args:
        input_size: int
            input feature dimension.
        chunk_size: int, list(int)
            Number of frames for each chunk
            This variable can take 2 forms:
            int:  Used for inference, or single chunk size training
            list(int) : Used only for variable chunk size training
            Some examples for the 2 cases:
            chunk_size = 12
            chunk_size = [6, 8, 12, 24]
        left_chunk: int, list(int)
            Number of chunks used for masking in streaming mode.
            This variable can take 2 forms:
            int:  Used for inference, or single chunk size training
            list(int) : Used only for variable chunk size training. When
            chunk_size is a list, left_chunk must be a list with same length.
            Some examples for the 2 cases:
            left_chunk = 6
            left_chunk = [12, 9, 6, 3]
        attention_dim: int, optional
            attention dimension. default 256.
        attention_heads: int, optional
            the number of heads. default 4
        input_layer: str, optional
            input layer type before Conformer,
            one of ["linear", "conv2d", "custom", "vgg2l", "embed"],
            default "conv2d"
        cnn_out: int, optional
            the number of CNN channels before Conformer.
            default -1.
        cnn_layer_norm: bool, optional
            layer norm between Conformer and the first CNN.
            default False.
        time_reduction: int, optional
            time reduction factor
            default 4
        dropout_rate: float, optional
            dropout rate. default 0.1
        padding_idx: int, optional
            padding index for input_layer=embed
            default -1
        relative_attention_bias_args: dict, optional
            use more efficient scalar bias-based relative multihead attention
            (Q*K^T + B) implemented in cmb.basics.embedding.
            [T5/ALiBi]RelativeAttentionLogitBias
            usage: relative_attention_bias_args={"type": t5/alibi}
            additional method-specific arguments can be provided (see
            transformer_base.py)
        positional_dropout_rate: float, optional
            dropout rate after positional encoding. default 0.0
        nemo_conv_settings: dict, optional
            A dictionary of settings for NeMo Subsampling.
            default None
        conv2d_extra_padding: str, optional
            Add extra padding in conv2d subsampling layers. Choices are
            (feat, feat_time, none, True).
            if True or feat_time, the extra padding is added into non full
            supraframe utts in batch.
            Default: none
        attention_group_size: int, optional
            the number of groups to use for attention, default 1
            (Multi-Head Attention),
            1 = typical Multi-Head Attention,
            1 < attention_group_size < attention_heads = Grouped-Query
            Attention
            attention_group_size = attention_heads = Multi-Query Attention
    r   r   	nemo_convr!   F        Nnoner   
input_sizer1   
left_chunkattention_dimattention_headsinput_layercnn_outcnn_layer_normtime_reductionr,   padding_idxrelative_attention_bias_argspositional_dropout_ratenemo_conv_settingsconv2d_extra_paddingfeat	feat_timerh   Tattention_group_sizeencoder_embedding_configr<   c           	      d   t                                                       || _        || _        || _        || _        || _        || _        || _        |	| _	        || _
        || _        | j        dk    r]d| j	        ||ddt          j                    dd}|r(|                    |           dD ]}||vs
J d            t          di || _        nt#          d	|z             t%          ||          | _        |r|                    d
          nd | _        | j        dk    rj| j        | j        z  dk    s
J d            t-          | j        | j        z  |                    dd          |                    dd                    | _        nt0          t3          | j        d                   | _        d S )Nrf   dw_stridingr   r   Fsubsamplingsubsampling_factorfeat_infeat_outconv_channels subsampling_conv_chunking_factorr/   	is_causalr   r   r   6{i} should be specified outside of the NeMo dictionaryzunknown input_layer: typet5r   'attention_group_size must divide n_headt5_bias_max_distancei  t5_bias_symmetric)max_distance	symmetricri    )r@   rA   ri   rm   r1   rj   rk   	num_headsry   rp   rt   rz   r   ReLUupdater   embed
ValueErrorr
   pos_embgetrelative_attention_bias_typer   relative_attention_bias_layerNotImplementedErrorr   encoder_embedding)rI   ri   r1   rj   rk   rl   rm   rn   ro   rp   r,   rq   rr   rs   rt   ru   ry   rz   default_nemo_conv_settingsirJ   s                       rK   rA   zTransformerEncoderBase.__init__G  s$   ( 	$&$$*($8!,"4(@%{**,&*&9%)!$45 gii"	* 	*& " *112DEEEF  A$6666P 7666 -  , DJJ 4{BCCC12
 
 ,(,,V444 	)
 ,44>D$==BBB9 CBB 2N$";;9==*D  7::;NPUVV2 2 2D.. &%!6),7"
 "
rL   feature_lensc                 V   | j         dk    r| j                            dd          dv }| j                            dd          }|r|rt          |t                    r.t          j        || j        z                                            nt          j        || j        z            }|| j        z  }t          |t                    r||dk    xx         dz  cc<   n|dk    r|dz  }|S t          |t                    rt          j        nt
          j        } ||| j        z            S dS )	aj  feature_lens: int
        return updated feature lens.

        This used to return a different lambda function for each case that
        computed the right thing.  That does not work within Torchscript.
        If you really need this to be faster, create nn.Module()-s for all
        the cases and return one of them.  Torchscript does support that.
        rf   r~   r|   )r|   stridingstriding_conv1dr   Fr   N)rm   rt   r   
isinstancer   r`   ceilrp   longmathr\   )rI   r   subsampling_causal_condr   lens_changefeature_lens_remainder	ceil_funcs          rK   compute_lens_changez*TransformerEncoderBase.compute_lens_change  sT    {**&*&=&A&A}' ''# /33KGGI #4 # ",77GEJ|d.AABBGGIII<$2E#EFF 
 *68K)K&lF33 % 6! ;<<<A<<<<+q001$K""%/c%B%BR		
I9\D,??@@@/ +*rL   c                     dS )z'Abstract forward method implementation.Nr   rI   s    rK   rV   zTransformerEncoderBase.forward  s      rL   c                    || j         }|| j        }t          |t                    rt	          t          j        dt          |          d                    }||         }t          |t                    st          d          t          |          t          |          k    rt          d          ||         }n|}|}||fS )z>If chunk size is a list, we will randomly select a chunk size.Nr   )r   )lowhighsizez5Since chunk_size is a list, left_chunk must be a listzBThe length of left_chunk must be the same as length of chunk_size.)	r1   rj   r   listr\   r`   randintlenr   )rI   r1   rj   chunk_size_indexchunk_size_train_effleft_chunk_train_effs         rK   _chunk_size_selectionz,TransformerEncoderBase._chunk_size_selection  s     JJj$'' 	."!#j//EEE    $..>#? j$//  K   :#j//11 X   $..>#?  #- #- #%999rL   r   c                     t          |t                    }t          |t                    }|}|r|j        }|r|j        }|S rW   )r   r   r   _checkpoint_wrapped_modulemodule)rI   r   is_embed_using_act_chkptis_embed_fsdp_wrappedembed_classs        rK   _get_embed_classz'TransformerEncoderBase._get_embed_class  sN    #-e5F#G#G  *52J K K# 	;:K  	',KrL   input_tensormasksc                     |                      | j                  }t          |t                    sJ |                     ||          \  }}||fS rW   )r   r   r   r   )rI   r   r   r   s       rK   _forward_embeddings_corez/TransformerEncoderBase._forward_embeddings_core  sS     ++DJ77+':;;;;;"jju==eU""rL   c                 J    d }d }| j         |                     |          }||fS rW   )r   r   )rI   r   rN   rO   s       rK   _position_embeddingz*TransformerEncoderBase._position_embedding  s9     -5<< L e|rL   seq_len
batch_sizec                     |                      ||          \  }}t          j        d||          }t          |||                              d                              |ddg          }|S )Nr   )left_windowr!   )r   nparanger   	unsqueezeexpand)	rI   r   r   r1   rj   r   r   chunk_start_idxenc_streaming_masks	            rK   _streaming_maskz&TransformerEncoderBase._streaming_mask  s     6:5O5O
6
 6
22 )Aw0DEE 6J   Yq\\VZR()) 	 "!rL   xs_padchunk_size_ncleft_chunk_ncc                 |   t          |                     |j        d                             }|dk    rt          d| d          |j        d         }|                     ||| j        | j                  }|j        r(|                                }|                                }|}| 	                    ||          \  }}|}	|	|||	z  }
n||}
n|	}
|>|                     ||||          }|j        r|                                }|||z  }n|}nd}| 
                    |          \  }}|||||
|fS ||||
||fS )a  Forwarding the inputs through the top embedding layers

        Args:
            xs_pad: torch.Tensor
                input tensor
            masks: torch.Tensor
                input mask
            chunk_size_nc: (optional, default is None) chunk size for
                            non-causal layers
            left_chunk_nc: (optional, default is None) # of left chunks for
                            non-causal layers
        r   r   zFThe sequence length after time reduction is invalid: 
                z. Your input feature is too short. Consider 
                filtering out the very short sentence from data 
                loaderN)r\   r   shaper   r   r1   rj   is_cudacudar   r   )rI   r   r   r   r   r   r   r   r   streaming_maskhs_maskenc_streaming_mask_nc
hs_mask_ncrN   rO   s                  rK   forward_embeddingsz)TransformerEncoderBase.forward_embeddings  s   J d..v|A??@@a<<     \!_
!11Z$/
 
 > 	#!3!8!8!:!:[[]]F";;L%PPe+%%*;n,GGGG$G$$($8$8]M% %! ~ E(=(B(B(D(D% "%::

2

J//==u w==UE7E:EErL   c                 6    t          | j        | j                  S )a!  Returns offset used when retaining inputs for decoding.

        This is essentially, how many additional frames have to be added to
        the front-end CNN input to ensure it can produce a single output.
        So if the "padding" parameter is 0, typically offset will be > 0.
        )r   rm   rp   r   s    rK   r   z!TransformerEncoderBase.get_offseto  s     $*D,?@@@rL   )r   r   rf   r!   Fr   rg   r!   Nrg   Nrh   r   N)NN)rX   rY   rZ   r[   r\   r   r_   r^   r]   dictr   r   rA   r`   r   r   abcabstractmethodrV   ra   r   r   Moduler   r   r   r   r   r   rb   rc   s   @rK   re   re      sQ       F FZ ! &$!>B),48KQ$%:>%R
 R
R
 $s)OR
 $s)O	R

 R
 R
 R
 R
 R
 R
 R
 R
 '+38nt&;R
 "'R
 !cNT1R
  &&GH!R
" "#R
$ #'sCx.4"7%R
& 
'R
 R
 R
 R
 R
 R
h"A%,."A	u|	"A "A "A "AH 	6 6 6 6 6
 .2-1: :$s)Od*: $s)Od*: 
sCx	: : : :@	bi 	BI 	 	 	 	#!L#16#	u|U\)	*# # # #	!L		u|d"EL4$77	8	 	 	 	"" " $s)O	"
 $s)O" 
" " " "8 1504TF TFTF |TF T#Y-	TF
 T#Y-TF 	LL4L4LL		
 LL4L4LLL
	
TF TF TF TFlAC A A A A A A A ArL   re   c            O           e Zd ZU dZee         ed<   ddddddd	d
ddddddddddddd
ddddg dddddddddf#dedeee         z  deee         z  dedz  dedededededede	d e	d!ed"e	d#ed$ed%ed&ed'ed(ed)ed*ed+ed,e	d-e	d.ed/e	d0edee         d1ed2e
eef         dz  d3ed4e	d5e
eef         dz  d6ed7         d8e	d9ed:e
eef         dz  d;dfN fd<Zd=ej        d;ej        dz  fd>Zd?ej        d@ej        dAej        dz  d;ej        fdBZej        j        d?ej        dCej        d;eej        ej        f         fdD            Z xZS )EConformerEncoderar  ConformerEncoder module.
    see original paper for more details:
        https://arxiv.org/abs/2005.08100

    Please set causal = True in streaming model
    Args:
        input_size: int
            input feature dimension.
        chunk_size: int, list(int)
            Number of frames for each chunk
            This variable can take 2 forms:
            int:  Used for inference, or single chunk size training
            list(int) : Used only for variable chunk size training
            Some examples for the 2 cases:
            chunk_size = 12
            chunk_size = [6, 8, 12, 24]
        left_chunk: int, list(int)
            Number of chunks used for masking in streaming mode.
            This variable can take 2 forms:
            int:  Used for inference, or single chunk size training
            list(int) : Used only for variable chunk size training. When
            chunk_size is a list, left_chunk must be a list with same length.
            Some examples for the 2 cases:
            left_chunk = 6
            left_chunk = [12, 9, 6, 3]
        num_lang: int
            This parameter is used to store the number of languages in the
            lang_dict, only used for multiseed/multilingual models.
            default None.
        attention_dim: int, optional
            attention dimension. default 256.
        attention_heads: int, optional
            the number of heads. default 4
        linear_units:
            the number of units of position-wise feed forward.
            default 2048
        num_block:
            number of Transformer layer. default 6
        dropout_rate: float, optional
            dropout rate. default 0.1
        input_layer: str, optional
            input layer type before Conformer,
            one of ["linear", "conv2d", "custom", "vgg2l", "embed"],
            default "conv2d"
        causal: bool, optional
            if set to True, convolution have no access
             to future frames. default False.
        batch_norm: bool, optional
            if set to True, apply batchnorm before activation
            in ConvModule layer of the conformer.
            default False
        cnn_out: int, optional
            the number of CNN channels before Conformer.
            default -1.
        cnn_layer_norm: bool, optional
            layer norm between Conformer and the first CNN.
            default False.
        ext_pw_out_channel: int, optional
            the number of channel for CNN
            before depthwise_seperable_CNN.
            If 0 then use linear. default 0.
        ext_pw_kernel_size: int, optional
            kernel size of N before depthwise_seperable_CNN.
            only work for ext_pw_out_channel > 0.
            default 1
        depthwise_seperable_out_channel: int, optional
            the number of channel for
            depthwise_seperable_CNN.
            default 256.
        depthwise_multiplier: int, optional
            the number of multiplier for
            depthwise_seperable_CNN.
            default 1.
        chunk_se: int, optional
            0 for offline SE.
            1 for streaming SE, where mean is computed
             by accumulated history until current chunk_se.
            2 for streaming SE, where mean is computed
             by only the current chunk.
            default 0.
        kernel_size: int, optional
            the number of kernels for depthwise_seperable_CNN.
            default 3.
        activation: str, optional
            FeedForward block activation.
            one of ["relu", "swish", "sigmoid"]
            default "relu".
        conv_activation: str, optional
            activation function used in ConvModule part
            of the conformer, default "relu".
        conv_glu_type: str, optional
            activation used use glu in depthwise_seperable_CNN,
            default "sigmoid"
        bias_in_glu: bool, optional
            if set to True, use additive bias in the weight module
             before GLU. default True
        linear_glu_in_convm: bool, optional
            if set to True, use GLULinear module,
             otherwise, used GLUPointWiseConv module.
              default to False.
        attention_glu_type: str
            only work for glu_in_attention !=0
            default "swish".
        export: bool, optional
            if set to True, it removes the padding from convolutional layers
             and allow the onnx conversion for inference.
              default False.
        activation_checkpointing: str, optional
            a dictionarry of {"module","interval","offload"}, where
                "module": str
                    accept ["transformer", "attention"] to select
                    which module should do activation checkpointing.
                "interval": int, default 1,
                    interval of applying activation checkpointing,
                    interval = 1 means that we apply checkpointing
                    on every layer (if activation), otherwise,
                    we apply it every x interval.
                "offload": bool, default False,
                    if set to True, we offload activation to cpu and
                    reload it during backward, otherwise,
                    we recalculate activation in backward.
            default "".
        extra_layer_output_idx: int
            the layer index to be exposed.
        relative_attention_bias_args: dict, optional
            use more efficient scalar bias-based relative multihead attention
            (Q*K^T + B) implemented in cmb.basics.embedding.
            [T5/ALiBi]RelativeAttentionLogitBias
            usage: relative_attention_bias_args={"type": t5/alibi}
            additional method-specific arguments can be provided (see
            transformer_base.py)
        time_reduction: int optional
            time reduction factor
            default 4
        use_pt_scaled_dot_product_attention: whether to use pytorch scaled
            dot product attention in training.
            Default: False
        nemo_conv_settings: dict, optional
            A dictionary of settings for NeMo Subsampling.
            default: None
            usage: nemo_conv_settings=
                {
                    "subsampling":
                    dw_striding/striding/dw_striding_conv1d/striding_conv1d,
                    "conv_channels": int,
                    "subsampling_conv_chunking_factor": int,
                    "is_causal": True/False
                }
        conv2d_extra_padding: str, optional
            Add extra padding in conv2d subsampling layers. Choices are
            (feat, feat_time, none, True)
            Default: none
        replication_pad_for_subsample_embedding:  For batched-streaming
            decoding, use "replication" padding for the cache at start of
            utterance.
            Default: False
        attention_group_size: int, optional
            the number of groups to use for attention, default 1
            (Multi-Head Attention),
            1 = typical Multi-Head Attention,
            1 < attention_group_size < attention_heads = Grouped-Query
            Attention
            attention_group_size = attention_heads = Multi-Query Attention
    extra_multi_layer_output_idxsNr   r   r      r   rf   TFr!   r   r   r   r   r    r"   r#   rh   ri   r1   rj   num_langrk   rl   linear_units
num_blocksr,   rm   r-   r.   rn   ro   r%   r*   r&   r'   r0   r+   r/   r2   r3   r4   r5   r7   r9   extra_layer_output_idxr8   rr   rp   r:   rt   ru   rv   'replication_pad_for_subsample_embeddingry   rz   r<   c'                   	!% t                                          |||
||| 	|d|"|#%|&           || _        || _        | _        |$| _        | j        %z  dk    s
J d            | j        %z  | _        t          %	!fdt          |          D              | _
        || _        || _        |                     dt          j        d          d	           d S )
Nrg   )r,   rr   rs   rt   ru   ry   rz   r   r   c                     g | ]Q}t          di d ddddddddd	
d
dddddd	ddddddRS )r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r7   r8   r9   r:   r;   r   )r   ).0_r/   r8   rk   r7   ry   rl   r.   r4   r-   r0   r1   r2   r3   r'   r&   r,   r9   r*   r%   r+   r5   r   r:   s     rK   
<listcomp>z-ConformerEncoder.__init__.<locals>.<listcomp>h  sV      4 3 &   )M'9'9 5T4S *>)=	
 +? ', (:'9 !, ". "6  *z  *z &X  *z %4O  #0-!" !,#$ )<(;%& (:'9'( .F-E)* "6+, 9\8[-. &:%9/  rL   dev_typer   F)
persistent)r@   rA   r   r   r+   r   r   num_heads_kr   rangeencodersr   r   register_bufferr`   zeros)(rI   ri   r1   rj   r   rk   rl   r   r   r,   rm   r-   r.   rn   ro   r%   r*   r&   r'   r0   r+   r/   r2   r3   r4   r5   r7   r9   r   r   r8   rr   rp   r:   rt   ru   r   ry   rz   rJ   s(     `  ``` ` ``  `````````````  `  `   ` rK   rA   zConformerEncoder.__init__!  s   R 	%)E$'1!5!5%=! 	 	
 	
 	
$ % &3 	4 ~ 449995 :99  >-AA'                         4 z**5  
< '=#-J* 	ZRUKKKKKrL   r   c                 >    | j         r|                      |          S d S rW   )r   )rI   r   s     rK   init_relative_attention_biasz-ConformerEncoder.init_relative_attention_bias  s0     - 	D55lCCC	D 	DrL   r   devicerP   c                    |j         d         }|j         d         }|                     ||| j        | j                  }|                    |          }||S |                    d          }|}t          j        d||                              |	                    d          d          |
                    d          k     }	|	
                    d          }	|	|z  }	|	S )Nr   r   )r   r!   )r   r   r1   rj   tosumr`   r   r   r   r   )
rI   r   r   rP   max_audio_lengthr   r   r   padding_lengthpad_masks
             rK   calculate_hs_maskz"ConformerEncoder.calculate_hs_mask  s     "<?\!_
!11j$/4?
 
 0226::<%%xx{{%<#3FCCCJJ""B
 
$$Q''( %%a((00rL   r   c                    |                      |          }|                     ||          \  }}}}}d}|j        \  }}	}
d}|	|k    rd}|	|z  dk    r	||	|z  z
  }nd}|dk    r5t          j        |ddd|fdd          }|                    |j                  }t          ||          }||                    d          }t          j        |d|fdd          }|	                    d          
                                }t          ||          }|                    d                                          }nd}|                     ||j        |          }|                     |          }| j        dk    o|du }|r|                     ||||          ^}}n/t!          | j                  D ]\  }} ||||||	          \  }}}}|r<|j        d         }|                    |d|          }|dk    r|ddd| ddf         }||fS )
zConformer Forward function

        Args:
            xs_pad: torch.Tensor
                input tensor
            masks: torch.Tensor
                post-embedding input lengths
        Fi  Tr   constantNr   r!   rS   )r   r   r   Fpadr   r   r   squeezer   r]   r^   r   r   r   r   	enumeratereshape)rI   r   r   r   rN   rO   r   unfoldedori_bzr   Dmax_seq_lenchunk_pad_sizeinput_tensor_padsubsampled_pad_mask extra_padded_subsamlped_pad_maskmasks_unfoldrQ   _simplified_pathr   r   layer	embed_dims                          rK   rV   zConformerEncoder.forward  s    ''//595L5LE6
 6
2eUGU )/[   H $q((!,+0E!F!"!!#$5 1aN";Z$ $   022<3FGG({CCL  ',mm' '# 455'!^)<j%4 40 5>>rBBHHJJ 1  -4k     ,33   $&&   $,,l1< G #'"C"CL"Q"Q '2-Q2IT2Q 	  
	#}}\5%QQL11%dm44  5(- ,C) ) )%aAA  	D$*2.I'//IFFL!!+AAA/?/?,BCU""rL   )rX   rY   rZ   r[   r   r\   __annotations__r]   r_   r^   r   r   r   rA   r`   r   r   r   r   jitignorera   rV   rb   rc   s   @rK   r   r   y  s        c cJ $(9,,,  $   !& $"#"#/2$% %& $)")&(35(*>B4948KQ8=$%:>OhL hLhL $s)OhL $s)O	hL
 *hL hL hL hL hL hL hL hL hL hL hL   !hL"  #hL$ *-%hL& "'hL( )hL* +hL, -hL. /hL0 1hL2 3hL4 "5hL6  7hL8 9hL: !$;hL< (,Cy=hL> #&?hL@ '+38nt&;AhLB ChLD .2EhLF !cNT1GhLH &&GHIhLJ 26KhLL "MhLN #'sCx.4"7OhLP 
QhL hL hL hL hL hLTD!LD		D D D Dl,1L@Et@S	   * YY#lY#+0<Y#	u|U\)	*Y# Y# Y# Y# Y# Y# Y# Y#rL   r   c                        e Zd ZdZ	 	 	 	 	 	 	 	 dd	ed
ededededededef fdZ	 ddej	        dej	        dz  dedz  de
ej	        edz  f         fdZ xZS )WindowQformerzWindow-level Qformer   r      r   r   rg   Twindow_sizenum_queriesr   rk   rl   r   r,   normalize_beforec	                 d   t                                                       t          j        fdt	          |          D                       | _        t          j        t          j        d|                    | _	        rt          j
        d          nd | _        || _        d S )Nc                 F    g | ]}t          j        d d          S )r   T)r$   nheaddim_feedforwarddropoutr/   batch_first
norm_first)r   TransformerDecoderLayer)r   r   rk   rl   r,   r   r  s     rK   r   z*WindowQformer.__init__.<locals>.<listcomp>  sR         *))$0(% $/    rL   r   g-q=)eps)r@   rA   r   
ModuleListr   decoders	Parameterr`   r   queriesrF   
after_normr  )
rI   r  r  r   rk   rl   r   r,   r  rJ   s
       `````rK   rA   zWindowQformer.__init__  s     	        z**  
 
 |EK;$N$NOO6FPBLE2222D 	 'rL   Naudio_embedrP   	embed_lenr<   c                    |                     dd          }|j        d         | j        z  }|dk    r!t          j        |d| j        |z
  fdd          }t          j        |ddddf         d| j        fd| j        f          }|j        \  }}}|                    |d| j        |          }|                     dd	                                          }|                    ||z  | j        d          }| j        	                    ||z  dd          }	| j
        D ]}
 |
|	|d|
          }	| j        |                     |	          }	|
|| j        z  }|	                    ||d          }||fS )zforward decoderr   r  r!   r   r   .N)r+   strider   )tgtmemorytgt_maskmemory_mask)	transposer   r  r   r  unfoldview
contiguousr'  r   r%  r(  )rI   r)  rP   r*  paddingembed_chunkbszr   slenqr  rU   s               rK   rV   zWindowQformer.forward(  s    "++Aq11#B'$*::Q;;%a!1G!;<j! K hT111%D,-t'(
 
 

 #(Q!&&sB0@$GG!++Aq11<<>>!&&sTz43CRHHLd
B33] 	R 	RE!K$DQQQAA?&""A !T%55IffS$##I~rL   )r  r   r  r   r  r   rg   TrW   )rX   rY   rZ   r[   r\   r]   r^   rA   r`   r   ra   rV   rb   rc   s   @rK   r  r    s           !!% '  ' '  ' 	 '
  '  '  '  '  '  '  '  '  '  'L !%	* *\* lT!* :	*
 
u|S4Z'	(* * * * * * * *rL   r  c            	            e Zd ZdZdededdf fdZdej        ddfdZ	d	ej        ddfd
Z
	 	 ddej        dej        dz  dedej        fdZ	 	 ddej        dej        dz  dedej        fdZ xZS )AudioEmbeddingzImage embedding.configkwargsr<   Nc           	      B   t                                                       || _        t          |d          r|j        n|j        }d }d| _        t          |j        t                    r`|j        
                    dd           dk    rA|j        
                    dd           }|J t          di || _        |d         }|d         }nt          d          |
J d	            || _        || _        |
                    d
d          | _        |
                    dd          | _        |
                    dd          r-|
                    di           }||d<   t%          di || _        nd | _        |
                    dd          r| j        
J d            |
                    di           }d| j        ||ddt)          j                    dd}	|r(|	                    |           dD ]}
|
|vs
J d            t/          di |	| _        nd | _        |
                    dd          }|dk    rt)          j        ||          | _        n9|dk    r|}d}| j        s| j        rdn| j        | _        t)          j        || j        z  |          g}t9          d|          D ]=}|                    t)          j                    t)          j        ||          g           >t)          j        | | _        t)          j        || j        z  |          g}t9          d|          D ]=}|                    t)          j                    t)          j        ||          g           >t)          j        | | _         nt          d| d          |j!        | _!        d | _"        d | _#        d S )Nn_embdnamecascadesr<  rk   ri   r#   z(Remember to set values for audio_dim_outfreeze_audio_processorFdownsample_rater   use_qformerqformer_configuse_conv_downsamplez6don't support use qformer and conv downsample togetherrt   r|   r   r}   r   r   projection_clslinearmlpr  zprojection_cls = z, not implementedr   )$r@   rA   r<  hasattrr?  hidden_size	layer_idxr   audio_processorr   r   r   encoderr   audio_dim_outaudio_dim_inrC  rD  r  qformerr   r   r   r   conv_dsLinearaudio_projectionlinear_downsample_rater   extendGELU
Sequentialaudio_projection_for_vision
vocab_sizeinput_embedsaudio_embed_sizes)rI   r<  r=  rL  rP  encoder_confign_melsrF  rt   r   r   rH  dim_projectiondepthlayersr   rJ   s                   rK   rA   zAudioEmbedding.__init__X  s   '.vx'@'@XfmmfFX
  	  v-t44	*&**6488JFF#377$GGN!---+==n==DL*?;M#L1FF%b)))((*T(((*"&,jj1I5&Q&Q#%zz*;Q??::mU++ 	 #ZZ(8"==N.;N?+(::>::DLLDL::+U33 	 <''H ('' "(,@"!E!E,&*&:()!$45 gii"	* 	*& " *112DEEEF  A$6666P 7666 /  , DLL  DL$4h??X%%$&Im[$I$ID!!u$$ )NElMdlM9M ' 	-$*EE~VVF 1e__ V Vrwyy")NN*S*STUUUU$&M6$:D! 	-$*EE~VVF 1e__ V Vrwyy")NN*S*STUUUU/1}f/ED,,%ENEEE  
 !+ !%rL   r\  c                     || _         d S rW   )r\  )rI   r\  s     rK   set_audio_embedszAudioEmbedding.set_audio_embeds  s    (rL   r]  c                     || _         d S rW   )r]  )rI   r]  s     rK   set_audio_embed_sizesz$AudioEmbedding.set_audio_embed_sizes  s    !2rL   speechaudio_attention_maskaudio_projection_modec                 <   | j         rEt          j                    5  |                     ||          \  }}ddd           n# 1 swxY w Y   n|                     ||          \  }}| j        |                     |d          \  }}| j        1||                    d          }|                     ||          \  }}| j        dk    r|                                \  }}}	|| j        z  }
|
dk    r#t          j
        |ddd| j        |
z
  fdd          }|                    d          }|                    ||| j        z  |	| j        z            }|dk    r|                     |          }n/|dk    r|                     |          }nt          d| d	          |S )
zl
        arguments:
            input_embeds: audio features (B, T, D)  B: num audios in a sequence
        N)rP   r   r   r   rg  visionzaudio_projection_mode = z not implemented)rC  r`   no_gradrO  rR  rS  r  rV  r   r   r  r3  rU  rZ  r   )rI   r\  rh  ri  audio_featuresr   r   bsr   feat_dimr5  audio_set_tensors               rK   get_audio_featuresz!AudioEmbedding.get_audio_features  s.    & 	U Y Y(,\CW(X(X%Y Y Y Y Y Y Y Y Y Y Y Y Y Y Y %)LL?S$T$T!NE<# $^$ G GNA<# a(($(LLeL$L$L!NE&!++$2$7$7$9$9!B ;;G{{!""1d9GCD	" " %))!,,G+00466466 N !H,,#44^DD"h..#??OOR+@RRR    s   AAArm  c                     |                      |                    d          ||          }|                    d          S )z
        arguments:
            audio_features: audio features (T, D)

        returns:
            audio_embeds: audio embeddings (num_audio_tokens, hidden_dim)
        r   )rh  ri  )rq  r   r  )rI   rm  rh  ri  audio_embedss        rK   rV   zAudioEmbedding.forward  sK     ..$$Q''!5"7 / 
 

 ##A&&&rL   )Nrg  )rX   rY   rZ   r[   r	   r   rA   r`   r   rd  rf  r_   rq  rV   rb   rc   s   @rK   r;  r;  U  sP       f&/ f&3 f&4 f& f& f& f& f& f&P)U\ )d ) ) ) )3u| 3 3 3 3 3 59%-	4  4 l4  $lT14   #	4 
 
4  4  4  4 r 59%-	' '' $lT1'  #	'
 
' ' ' ' ' ' ' 'rL   r;  )&r   r   typingr   r   numpyr   r`   torch.nn.functionalr   
functionalr   r   ;torch.distributed.algorithms._checkpoint.checkpoint_wrapperr   2torch.distributed.fsdp.fully_sharded_data_parallelr   transformersr	   'vllm.model_executor.models.phi4mm_utilsr
   r   r   r   r   r   r   r   r   r   r   r   r   ABCre   r   r  r;  r   rL   rK   <module>r}     s_   


                                     X W W W W W ) ) ) ) ) )                         V' V' V' V' V'BI V' V' V'rxA xA xA xA xASWbi xA xA xAvG# G# G# G# G#- G# G# G#TO O O O OBI O O Ody' y' y' y' y'RY y' y' y' y' y'rL   