
    .`ih                       d dl Z d dlZd dlmc mZ d dlmZmZ  G d dej                  ZdGde	dej        j        fdZ
	 dHd	ed
ee         dededej        f
dZ G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Zdede	dededee	         dee	         d ee	         ddfd!Z G d" d#ej                  Z G d$ d%ej                  Z G d& d'ej                  Z G d( d)ej                  Z G d* d+ej                  Z G d, d-ej        j                  Z	 dId/ed0ed1ed2ed3ed4edefd5Z G d6 d7ej                  Z  G d8 d9ee           Z!d:ed;edz  defd<Z" G d= d>ej                  Z# G d? d@ej        j$                  Z%dAe	dBedefdCZ&dDedEedefdFZ'dS )J    N)Tensornnc                   0     e Zd ZdZdededdf fdZ xZS )	BlockBasezBlock abstract module
input_sizeoutput_sizereturnNc                 d    t                                                       || _        || _        d S N)super__init__r   r   )selfr   r   	__class__s      {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/phi4mm_utils.pyr   zBlockBase.__init__   s.    $&    )__name__
__module____qualname____doc__intr   __classcell__r   s   @r   r   r      sX        '3 'S 'T ' ' ' ' ' ' ' ' ' 'r   r   relunamer	   c                 F   |                                  } | dk    rt          j        d          S | dk    rt          j                    S | dk    rt          j                    S | dk    rt          j                    S | dk    rt          j                    S t          |           )zSelect an activation function by name

    Args:
        name: str
            activation function name,
            one of ["relu", "gelu", "swish", "sigmoid"],
            default "relu".
    r   T)inplacegeluswishsigmoididentity)lowerr   ReLUGELUSiLUSigmoidIdentityNotImplementedError)r   s    r   get_activationr(      s     ::<<Dv~~wt$$$$v~~wyywwyyyz||z{}}
d
#
##r   x_lenchunk_start_idxleft_windowright_windowc                    t          j        |                                          }t           j        j                            |d          }t           j        j                            |d|           }t          j        d|                               d          }||k     ||k    z                                  dddf         }t          j        d|                               d          	                    | d          }||z
  }	d|	|	dk     <   ||	         }
||
                    d          k    }||z   }t          |          ||t          |          k    <   ||         }||                    d          k     }||z  S )a,  
    The function is very important for Transformer Transducer Streaming mode
    Args:
        x_len: sequence length
        chunk_start_idx: first idx of each chunk, such as [0,18,36,48].
        It also supports adaptive chunk size [0,10,15,45]
        left_window: how many left chunks can be seen
        right_window: how many right chunks can be seen. It is used for
        chunk overlap model.
        Returns:
            mask (torch.Tensor): a mask tensor for streaming model
            Torch 1.0.1
            tensor([[1., 1., 0., 0.],
                    [0., 1., 1., 0.],
                    [0., 0., 1., 1.]])
            Torch 1.4.1
            tensor([[True., True., False., False.],
                    [False., True., True., False.],
                    [False., False., True., True.]])
    )   r   )r   r.   )valuer   Nr.   )torchr   longr   
functionalpadarange	unsqueezenonzeroexpandlen)r)   r*   r+   r,   	start_padend_pad	seq_rangeidxseq_range_expandidx_leftboundary_left	mask_left	idx_rightboundary_right
mask_rights                  r   adaptive_enc_maskrE   0   s   . l 
dff  #'' I h!%%u &  G Q&&0044II$:;
D
D
F
F	1C
 	Q((++225"==  [ HHX\h'M M$;$;B$?$??Il"I25o2F2FIi#o.../Y'N!N$<$<R$@$@@Jz!!r   c                   B     e Zd ZdZddededdf fdZd	edefd
Z xZ	S )GLUz(Implement Gated Linear Unit (GLU) moduler0   r   dimact_namer	   Nc                 ~    t                                                       || _        t          |          | _        d S r   )r   r   rH   r(   act_fn)r   rH   rI   r   s      r   r   zGLU.__init__f   s3    $X..r   xc                 p    |                     d| j                  \  }}||                     |          z  S )zGLU forward
        Apply Swish function on the first half of input matrices
        with sigmoid of the second half.

        Args:
            x: torch.Tensor
                Input.

           rH   )chunkrH   rK   )r   rL   half_xgates       r   forwardzGLU.forwardl   s6     wwqdhw//D))))r   )r0   r   )
r   r   r   r   r   strr   r   rS   r   r   s   @r   rG   rG   c   s        22/ /C / /D / / / / / /* *F * * * * * * * *r   rG   c                   X     e Zd ZdZ	 	 	 ddedededed	ed
eddf fdZdedefdZ	 xZ
S )GLUPointWiseConva  GLUPointWiseConv module
    used for conformer architecture,
    for more details see:
    https://arxiv.org/pdf/2005.08100v1.pdf

    Args:
        input_dim: int
            input channel size.
        output_dim: int
            output channel size.
        kernel_size: int
            kernel size
        glu_type: str, optional
            activation function one of
             ["sigmoid", "relu", "gelu"]
              default "sigmoid".
        bias_in_glu: bool, optional
            use addtive bias in glu
        causal: bool, optional
            if set to True, padding is set to the half of
             kernel size, ie, convolution can't see future frames.
              default False.

    r   TF	input_dim
output_dimkernel_sizeglu_typebias_in_glucausalr	   Nc                    t                                                       || _        || _        || _        |r%t          j        ||dz  |d|dz
            | _        n't          j        ||dz  |d|dz
  dz            | _        t          |          | _	        |r\t          j
        t          j        d|d                    | _        t          j
        t          j        d|d                    | _        d S d S )NrN   r.   padding)r   r   rZ   rX   r[   r   Conv1dext_pw_conv_1dr(   glu_act	Parameterr1   zerosb1b2)r   rW   rX   rY   rZ   r[   r\   r   s          r   r   zGLUPointWiseConv.__init__   s    	 $& 	"$)Q$q# # #D #%)Q$qQ.# # #D &h// 	Bl5;q*a#@#@AADGl5;q*a#@#@AADGGG	B 	Br   rL   c                    |                     g d          }|                     |          }| j        dk    r| j        rF|ddd| j        ddf         | j        z   |dd| j        | j        dz  ddf         | j        z   z  }n|ddd| j        ddf         |dd| j        | j        dz  ddf         z  }n| j        rY|ddd| j        ddf         | j        z   |                     |dd| j        | j        dz  ddf         | j        z             z  }nH|ddd| j        ddf         |                     |dd| j        | j        dz  ddf                   z  }|                     g d          }|S )z3
        Args:
            x: input tensor
        r   rN   r.   bilinearNr   rN   )permutera   rZ   r[   rX   re   rf   rb   r   rL   s     r   rS   zGLUPointWiseConv.forward   s    IIiii  ""=J&& qqq!do-qqq01DG;aaa4?Q+>>ABTWL
 qqq!do-qqq01DOdo.AA111DEG 
  qqq!do-qqq01DG;t||aaa4?Q+>>ABTWL@ @  qqq!do-qqq01T\\aaa4?Q+>>AB6 6  IIiii  r   )r   TFr   r   r   r   r   rT   boolr   r   rS   r   r   s   @r   rV   rV   {   s         < " #B #B#B #B 	#B
 #B #B #B 
#B #B #B #B #B #BJ F        r   rV   c                   P     e Zd ZdZ	 ddedededededd	f fd
ZdedefdZ xZS )DepthWiseSeperableConv1da+  DepthWiseSeperableConv1d module used in Convnet module
    for the conformer, for more details see:
    https://arxiv.org/pdf/2005.08100v1.pdf

    Args:
        input_dim: int
            input channel size.
        depthwise_seperable_out_channel: int
            if set different to 0, the number of
             depthwise_seperable_out_channel will be used as a channel_out
             of the second conv1d layer.
             otherwise, it equals to 0, the second conv1d layer is skipped.
        kernel_size: int
            kernel_size
        depthwise_multiplier: int
            number of input_dim channels duplication. this value
            will be used to compute the hidden channels of the Conv1D.
        padding: int, optional
            padding for the conv1d,
             default: 0.

    r   rW   depthwise_seperable_out_channelrY   depthwise_multiplierr_   r	   Nc                    t                                                       t          j        |||z  |d||          | _        |dk    r!t          j        ||z  |ddd          | _        nt          j                    | _        || _        d S )Nr.   )r_   groupsr   )r   r   r   r`   dw_convpw_convr&   rp   )r   rW   rp   rY   rq   r_   r   s         r   r   z!DepthWiseSeperableConv1d.__init__   s     	y,,
 
 
 +a//900/ DLL ;==DL/N,,,r   rL   c                 p    |                      |          }| j        dk    r|                     |          }|S )z4

        Args:
            x: input tensor
        r   )rt   rp   ru   rk   s     r   rS   z DepthWiseSeperableConv1d.forward  s5     LLOO/144QAr   )r   	r   r   r   r   r   r   r   rS   r   r   s   @r   ro   ro      s         : O OO *-O 	O
 "O O 
O O O O O O>	 	F 	 	 	 	 	 	 	 	r   ro   c            #            e Zd ZdZ	 	 	 	 	 	 	 	 	 dded	ed
ededededededededededededededdf" fdZddZ	de
de
fdZ xZS ) 
ConvModuleay	  ConvModule Module for the conformer block.
    for more details see:
    https://arxiv.org/pdf/2005.08100v1.pdf

    Args:
        input_dim: int
            input channel size.
        ext_pw_out_channel: int
            if > 0, ext_pw_out_channel is a dim channel size
             for the last pointwise conv after swish activation.
        depthwise_seperable_out_channel: int
            if set different to 0, the number of
             depthwise_seperable_out_channel
             will be used as a channel_out of the second conv1d layer.
             otherwise, it equal to 0, the second conv1d layer is skipped.
        ext_pw_kernel_size: int
            kernel size of the conv pointwise of the conformer.
        kernel_size: int
            kernel size.
        depthwise_multiplier: int
            number of input_dim channels duplication. this value
             will be used to compute the hidden channels of the Conv1D.
        dropout_rate: float
            dropout rate.
        causal: bool, optional
            if set to True, convolution have no access
             to future frames. default False.
        batch_norm: bool, optional
            if set to True, apply batchnorm before activation.
            default False
        chunk_se: int, optional
            0 for offline SE.
            1 for streaming SE, where mean is computed
             by accumulated history until current chunk_se.
            2 for streaming SE, where mean is computed
             by only the current chunk.
        chunk_size: int, optional
            chunk size for cnn. default 18
        activation: str, optional
            activation function used in ConvModule,
            default: "relu".
        glu_type: str, optional
            activation function used for the glu,
            default: "sigmoid".
        bias_in_glu: bool, optional
            if set to True, use additive bias in the weight module
             before GLU.
        linear_glu_in_convm: bool, optional
            if set to True, use GLULinear module,
             otherwise, used GLUPointWiseConv module.
              default to False.
        export: bool, optional,
            if set to True, padding is equal to 0.  This is for inference,
             or onnx export.  Typically this is set by the export program or
             the decoder program, and it isn't present in your config file.
             default False
    Fr      r   r   TrW   ext_pw_out_channelrp   ext_pw_kernel_sizerY   rq   dropout_rater\   
batch_normchunk_se
chunk_size
activationrZ   r[   linear_glu_in_convmexportr	   Nc                    t                                                       t          j        |          | _        || _        || _        || _        || _        || _	        || _
        || _        || _        |                                  |	| _        || _        |	rt          j        |          | _        t%          |          | _        t          j        |          | _        || _        |r
|rdn|dz
  }n|dz
  dz  }t/          |||||          | _        |dk    r$||k    rt          j        ||          | _        d S d S |dk    rt          j        ||z  |          | _        d S d S )Nr   r.   rN   r^   )r   r   r   	LayerNorm
layer_normrW   r{   r|   rp   rZ   r[   r   r\   _add_ext_pw_layerr~   rY   BatchNorm1dbn_layerr(   actDropoutdropoutr   ro   dw_sep_conv_1dLinearln2)r   rW   r{   rp   r|   rY   rq   r}   r\   r~   r   r   r   rZ   r[   r   r   r_   r   s                     r   r   zConvModule.__init__Z  s   & 	,y11""4"4/N, &#6    $& 	6N955DM!*--z,// 	-!6aa{QGG"Q1,G6+ 
 
 
 +a//;;;9%DiPP <; $q((9Y1E%EyQQ )(r   c                 (   t          j                    x| _        x| _        x| _        | _        t          j                    | _        dx| _        | _        | j	        dk    rF| j
        rPt          j        | j        | j	        | j        d| j        dz
            | _        | j        dk    rd| _        nGd| _        n?t          j        | j        | j	        | j        d| j        dz
  dz            | _        d| _        | j        r,t          | j        | j	        | j        | j                  | _        n7t%          | j        | j	        | j        | j        | j        | j
                  | _        | j        | j	        k    r-d| _        t          j        | j	        | j                  | _        dS d| _        dS t(          j                             t)          j        d                    | _        t(          j                             t)          j        d                    | _        dS )	z
        This function is an extension of __init__ function
        and dedicated to the convolution module creation
        of the conformer.
        Fr   r.   r^   TrN      N)r   r&   ln1glur   ra   squeeze_excitation	apply_ln1fix_len1r{   r\   r`   rW   r|   r   	GLULinearrZ   r[   rV   r   r1   rc   onespw_conv_simplify_wrd   pw_conv_simplify_br   s    r   r   zConvModule._add_ext_pw_layer  s    KMM	
 	
48 	
dmd.A #%+--).."a''{ &&(iN++!4q8' ' '# *Q..$(DMM$)DMM&(iN++!4q8Q>' ' '# !&' $N+M$	  ,N++M$K  ~!888!%9T%<dnMM!&&+h&8&8A&G&GD#&+h&8&8Q&H&HD###r   rL   c                 f   |                      |          }| j        dk    r^|                     |          }| j        r%| j        dk    r|ddd| j        dz
   ddf         }| j        r|                     |          }nA|| j        d         z  | j        d         z   }|| j        d         z  | j        d         z   }||z   }|	                    g d          }| 
                    |          }| j        r%| j        dk    r|ddddd| j        dz
   f         }t          | d          rC|	                    g d          }|                     |          }|	                    g d          }| j        r|                     |          }|                     |          }| j        dk    r|                     |          }| j        r|ddddd| j        dz
   f         }| j        rC|	                    g d          }|                     |          }|	                    g d          }|	                    g d          }n]|                    d          	                    g d          }|| j        d         z  | j        d         z   }|                    d          }|                     |          }|S )zHConvModule Forward.

        Args:
            x: input tensor.
        r   r.   Nrh   r   )r   r.   r   rN   rN   )r   r{   r   r\   r|   r   r   r   r   rj   r   rY   hasattrr   r~   r   r   ra   r   r6   squeezer   )r   rL   x_0x_1s       r   rS   zConvModule.forward  s    OOA"a''A{ >t6::aaa9T4q899111<=~  HHQKKd-a0043J13MMCd-a0043J13MMCc	AIIiii  ""; 	34+a//!!!QQQ1D,q01112A4 	%		)))$$AA		)))$$A? 	!a  AHHQKK"a''##A&&A} >aaa< 7! ;<<<=~ )IIiii((HHQKKIIiii((		)))$$AAA&&|||44AD+A..1H1KKA		!ALLOOr   )	FFr   rz   r   r   TFFr	   N)r   r   r   r   r   floatrm   rT   r   r   r   rS   r   r   s   @r   ry   ry     sf       8 8F   ! $)#<R <R<R  <R *-	<R
  <R <R "<R <R <R <R <R <R <R <R <R  "!<R" #<R$ 
%<R <R <R <R <R <R|;I ;I ;I ;Iz1 1F 1 1 1 1 1 1 1 1r   ry   c                   N     e Zd ZdZ	 	 ddededededd	f
 fd
ZdedefdZ	 xZ
S )r   a`  Linear + GLU module

    Args:
        input_dim: int
            input size
        output_dim: int
            output size.
        glu_type:
            activation function name used in glu module.
            default "sigmoid" (swish function).
        bias_in_glu: bool, optional
            If True, the addtive bias is added. Default False.
    r   TrW   rX   rZ   r[   r	   Nc                     t                                                       t          j        ||dz  |          | _        t          d|          | _        d S )NrN   r0   )r   r   r   r   linearrG   rb   )r   rW   rX   rZ   r[   r   s        r   r   zGLULinear.__init__  sI     	i	:>;GG2x((r   rL   c                 V    |                      |          }|                     |          S )zFGLULinear forward

        Args:
            x: input tensor.
        )r   rb   rk   s     r   rS   zGLULinear.forward#  s#     KKNN||Ar   r   Trl   r   s   @r   r   r   	  s         $ " 	) 	)	) 	) 		)
 	) 
	) 	) 	) 	) 	) 	) F        r   r   c                   R     e Zd ZdZ	 	 ddededededed	d
f fdZde	d	e	fdZ
 xZS )FeedForwarda  FeedForward Module.
    For more details see Conformer paper:
        https://arxiv.org/pdf/2005.08100.pdf

    Args:
        d_model: int
            input size.
        d_inner: int
            output size.
        dropout_rate: float,
            dropout rate.
        activation: str,
            activation function name,
            one of ["relu", "swish", "sigmoid"],
            sigmoid activation is only used with "glu_in_fnn=True",
            default "sigmoid".
        bias_in_glu: bool, optional
    r   Td_modeld_innerr}   r   r[   r	   Nc                 `   t                                                       || _        || _        t	          j        |          | _        t          ||||          }t	          j        |t	          j	        |          t	          j
        ||          t	          j	        |                    | _        d S r   )r   r   r   r   r   r   r   r   
Sequentialr   r   net)r   r   r   r}   r   r[   moduler   s          r   r   zFeedForward.__init__A  s     	,w//7GZEE=J|$$Igw''J|$$	
 
r   rL   c                 V    |                      |                     |                    }|S )zRFeedForward forward function.

        Args:
            x: input tensor.
        )r   r   )r   rL   outs      r   rS   zFeedForward.forwardV  s&     hhtq))**
r   r   )r   r   r   r   r   r   rT   rm   r   r   rS   r   r   s   @r   r   r   -  s         0 $ 
 

 
 	

 
 
 

 
 
 
 
 
* F        r   r   
state_dictprefixlocal_metadatastrictmissing_keysunexpected_keys
error_msgsc                 F    |dz   }|| v r|                      |           dS dS )zPerform pre-hook in load_state_dict for backward compatibility.

    Note:
        We saved self.pe until v.0.5.2 but we have omitted it later.
        Therefore, we remove the item "pe" from `state_dict` for backward
        compatibility.

    peN)pop)r   r   r   r   r   r   r   ks           r   	_pre_hookr   b  s5    " 	AJq r   c                   `     e Zd ZdZ	 	 	 ddedededed	d
f
 fdZded	efdZded	efdZ	 xZ
S )T5RelativeAttentionLogitBiasaS  
    This module implements the relative position bias described in Section
    2.1 of the T5 paper: https://arxiv.org/pdf/1910.10683.pdf

    The Huggingface implementation is used as a reference
    https://github.com/huggingface/transformers/blob/v4.30.0/src/
    transformers/models/t5/modeling_t5.py#L435

    Modifies attention as Q*K^T + B, where B is a learned scalar bias based
    on relative position of the query and key. It is HxNxN, where H is the
    number of heads, N is the sequence length.

    I've made these modifications to the original T5 bias:
    - Skipping of the bucketing step. Original T5 bias converted rel
      position distances into logarithmically increasing buckets. This is
      supposed to help with length generalization.
    - I just directly use rel position index as bias values, as we don't
      need length generalization (40s max is good enough for ASR encoder),
      and it keeps ONNX export simple.
    - I've also extended it so that biases can be asymmetric, the default
      implementation treats L->R and R->L the same. Asymmetric was found to
      yield better results in my experiments.

    Args:
        num_heads: int
            Number of attention heads
        num_buckets: int
            Number of buckets to use for relative attention bias. This is the
            size of the learnable bias parameter. Bucketing is not yet
            supported, so this defaults to -1 which means no bucketing is
            used (max_distance determines size of bias param).
        max_distance: int
            Maximum distance to use for relative attention bias. With
            num_buckets=-1, this directly controls the max size of the bias
            parameter. When num_buckets > 0 is supported, this will control
            the maximum distance for logarithmic bucketing after which all
            positions are in the same bucket.
        symmetric: bool
            Whether to use symmetric or asymmetric biases. symmetric=False uses
            2x number of bias params to distinguish L->R from R->L. This was
            found to be better for the encoder.
    r0     F	num_headsnum_bucketsmax_distance	symmetricr	   Nc                 R   t                                                       || _        || _        || _        || _        | j        dk     | _        | j        r|| _        nt          d          | j        s| xj        dz  c_        t          j	        | j        | j                  | _
        d S )Nr   z;T5 attention bias with bucketed positions is not yet testedrN   )r   r   r   r   r   r   _skip_bucketingr'   r   	Embeddingbias_values)r   r   r   r   r   r   s        r   r   z%T5RelativeAttentionLogitBias.__init__  s     	"&("#/!3 	+D%M   ~ 	"!<(8$.IIr   rL   c                    |                     d          }t          j        ||j        t          j                  d d d f         }t          j        ||j        t          j                  d d d f         }||z
  }|                    || j         k     | j                   }|                    || j        dz
  k    | j        dz
            }| j        r|}n|                     |          }| j	        r|
                                }n|| j        dz  z  }|                     |          }|                    ddd                              d          }|S )Nr.   devicedtyperN   r   )sizer1   r5   r   r2   masked_fillr   r   _bucket_relative_positionr   absr   r   rj   r6   )r   rL   maxposcontext_positionmemory_positionrelative_positionbias_idxt5_rel_att_biass           r   rS   z$T5RelativeAttentionLogitBias.forward  sh    <qxuzRRRAAtG
  ,vahejQQQ!!!G
 ,.>> .99!2 22T5F4F
 
 .99 1A 55t7H17L
 

  	I(HH556GHHH> 	.||~~HH(A--H**844)11!Q::DDQGGr   r   c                    d}| j         sS| xj        dz  c_        ||dk                        t          j                  | j        z  z  }t          j        |          }n(t          j        |t          j        |                     }| j        dz  }||k     }|t          j        |	                                |z            t          j        | j        |z            z  | j        |z
  z                      t          j                  z   }t          j        |t          j        || j        dz
                      }|t          j        |||          z  }|S )Nr   rN   r.   )r\   r   tor1   r2   r   min
zeros_likelogr   mathr   	full_likewhere)r   r   relative_buckets	max_exactis_smallrelative_position_if_larges         r   r   z6T5RelativeAttentionLogitBias._bucket_relative_position  sr    { 		"!2Q!6 : :
! ! !! ! !&	*; < <!&!5#34E#F#F" " ! $)	$y0 &/I'--//);<<ht(94556)+- "UZ..	&"
 &+Y&O68H18LMM&
 &
"
 	EK')C
 
 	
  r   )r0   r   F)r   r   r   r   r   rm   r   r   rS   r   r   r   s   @r   r   r   x  s        ) )\  J JJ J 	J
 J 
J J J J J J.   F        D$ 6 $ f $  $  $  $  $  $  $  $ r   r   c            	       t     e Zd ZdZddedededdf fdZd	ej        ddfd
Z	d	ej        dej        fdZ
 xZS )AbsolutePositionalEncodingai  Absolute Positional encoding module.
    This module implement Absolute sinusoidal positional encoding
    from: https://arxiv.org/pdf/1706.03762.pdf

    Args:
        d_model: int
            Input embedding size.
        dropout_rate: float
            dropout rate
        max_len: int, optional
            Maximum input length sequence, Default 5000

      r   r}   max_lenr	   Nc                    t                                                       || _        t          j        | j                  | _        t          j                            |          | _	        d| _
        |                     t          j        d                              d|                     |                     t                     dS )z'Construct an PositionalEncoding object.pN        r.   )r   r   r   r   sqrtxscaler1   r   r   r   r   	extend_petensorr8   "_register_load_state_dict_pre_hookr   )r   r   r}   r   r   s       r   r   z#AbsolutePositionalEncoding.__init__  s    i--x'','77u|C((//7;;<<<//	:::::r   rL   c                    | j         | j                             d          |                    d          k    rW| j         j        |j        k    s| j         j        |j        k    r+| j                             |j        |j                  | _         dS t          j        |                    d          | j                  }t          j        d|                    d          t
          j	                  
                    d          }t          j        t          j        d| j        dt
          j	                  t          j        d          | j        z   z            }t          j        ||z            |dddddf<   t          j        ||z            |dddddf<   |
                    d          }|                    |j        |j                  | _         dS )	zSReset the positional encodings.

        Args:
            x: input tensor
        Nr.   )r   r   r   r   rN   g     @r   )r   r   r   r   r   r1   rd   r   r5   float32r6   expr   r   sincos)r   rL   r   positiondiv_terms        r   r   z$AbsolutePositionalEncoding.extend_pe  s    747<<??affQii#?#?w}''47>QX+E+E'**1718*DDF[DL11<166!99EMBBBLLQOO9LDL!5=AAA!!DL012
 
 i8 344111add7i8 344111add7\\!__%%qxqw%77r   c                     |                      |           || j        z  | j        ddd|                    d          f         z   }|                     |          S )zAdd positional encoding.

        Args:
            x: Input tensor. shape is (batch, time, ...)

        Returns:
            Encoded tensor. Its shape is (batch, time, ...)

        Nr.   )r   r   r   r   r   rk   s     r   rS   z"AbsolutePositionalEncoding.forward2  sS     	qOdgaaa166!99n55||Ar   )r   )r   r   r   r   r   r   r   r1   r   r   rS   r   r   s   @r   r   r     s         ; ; ;5 ;3 ;RV ; ; ; ; ; ;85< 8D 8 8 8 8* %,        r   r   c                   <     e Zd ZdZdeddf fdZdedefdZ xZS )MeanVarianceNormLayerzMean/variance normalization layer.

    Will subtract mean and multiply input by inverted standard deviation.
    Typically used as a very first layer in a model.

    Args:
        input_size: int
            layer input size.
    r   r	   Nc                    t                                                       || _        t          j        t          j        |                    | _        t          j        t          j        |                    | _	        d S r   )
r   r   r   r   rc   r1   rd   global_meanr   global_invstd)r   r   r   s     r   r   zMeanVarianceNormLayer.__init__M  s^    $<J(?(?@@\%*Z*@*@AAr   input_c                 &    || j         z
  | j        z  S )zWMeanVarianceNormLayer Forward

        Args:
            input_: input tensor.
        )r   r   )r   r   s     r   rS   zMeanVarianceNormLayer.forwardS  s     ))T-???r   rw   r   s   @r   r   r   B  s         B3 B4 B B B B B B@f @ @ @ @ @ @ @ @ @r   r   c                        e Zd ZdZ	 	 	 	 	 	 	 	 ddeded	ed
edeez  dededededdf fdZ	 ddededz  de	eedz  f         fdZ
	 ddededz  dee	eedz  f         z  f fdZ xZS )CausalConv1Da  
    A causal version of nn.Conv1d where each step would have limited access to
    locations on its right or left
    All arguments are the same as nn.Conv1d except padding.

    If padding is set None, then paddings are set automatically to make it a
    causal convolution where each location would not see any steps on its right.

    If padding is set as a list (size of 2), then padding[0] would be used as
    left padding and padding[1] as right padding.
    It would make it possible to control the number of steps to be accessible
    on the right and left.
    This mode is not supported when stride > 1. padding[0]+padding[1] should
    be equal to (kernel_size - 1).
    r.   r   Trd   Nin_channelsout_channelsrY   strider_   dilationrs   biaspadding_moder	   c                    d | _         ||dz
  | _        |dz
  | _        n|dk    r||dz
  k    rt          d          t	          |t
                    r|| _        || _        nnt	          |t                    rFt          |          dk    r3|d         |d         z   |dz
  k    r|d         | _        |d         | _        nt          d| d          | j        | _        t                      
                    ||||d||||	|
|           d S )Nr.   z3No striding allowed for non-symmetric convolutions!rN   r   zInvalid padding param: !)r  r  rY   r  r_   r  rs   r  r  r   r   )cache_drop_size_left_padding_right_padding
ValueError
isinstancer   listr9   _max_cache_lenr   r   r   r  r  rY   r  r_   r  rs   r  r  r   r   r   s               r   r   zCausalConv1D.__init__m  sE     $?!,qD"(1*D{{w+/99 !VWWW'3'' G%,"&-##7D))GLLA%%AJ+{Q>>%,QZ"&-aj## !E7!E!E!EFFF"0#%#% 	 	
 	
 	
 	
 	
r   rL   cachec                 V   |%t          j        || j        | j        f          }|}nt          j        |d| j        f          }t	          j        ||gd          }| j        dk    r|d d d d d | j         f         }n|}|d d d d |                    d           d f         }||fS )Nr4   r   r0   rO   )Fr4   r  r  r1   catr
  r   )r   rL   r  new_x
next_caches        r   update_cachezCausalConv1D.update_cache  s     =E!$"4d6I!JKKKEJJE!!T%8!9:::EIuen"555E#a''"111aaa)@D,@+@)@#@A

"
#AAAqqq5::b>>/*;*;$;<Jj  r   c                     |                      ||          \  }}t                                          |          }||S ||fS )N)r  )r  r   rS   )r   rL   r  r   s      r   rS   zCausalConv1D.forward  sI     $$Qe$445GGOOA=He8Or   r.   r   r.   r.   Trd   NNr   )r   r   r   r   r   rT   rm   r   r   tupler  rS   r   r   s   @r   r  r  \  sl        * #0
 0
0
 0
 	0

 0
 s0
 0
 0
 0
 0
 
0
 0
 0
 0
 0
 0
f 15! !! &!	vv}$	%! ! ! !" 15  &	%-.	.         r   r  c                   x     e Zd ZdZ	 	 	 	 	 	 	 	 ddeded	ed
edeez  dededededdf fdZdedef fdZ	 xZ
S )CausalConv2Dz
    A causal version of nn.Conv2d where each location in the 2D matrix would
    have no access to locations on its right or down
    All arguments are the same as nn.Conv2d except padding which should be
    set as None
    r.   r   Trd   Nr  r  rY   r  r_   r  rs   r  r  r	   c                     |t          d          |dz
  | _        |dz
  | _        d}t                                          |||||||||	|
|           d S )Nz8Argument padding should be set to None for CausalConv2D.r.   r   )r  r  r  r   r   r  s               r   r   zCausalConv2D.__init__  s~     WXXX(1_$qj	
 	
 	
 	
 	
r   rL   c                     t          j        || j        | j        ddf          }t	                                          |          }|S )Nr   r  )r  r4   r  r  r   rS   )r   rL   r   s     r   rS   zCausalConv2D.forward  sL     E#T%8!Q?
 
 
 GGOOAr   r  rl   r   s   @r   r  r    s          # 
  
 
  
 	 

  
 s 
  
  
  
  
 
 
  
  
  
  
  
D		 
	 	 	 	 	 	 	 	 	 	r   r  c                   X    e Zd ZdZdddd ej                    dfdeded	ed
edededej        j	        de
ddf fdZdee         fdZdee         fdZdededz  deeedz  f         fdZddZdedeee
f         fdZdedefdZdej        j	        dededefdZdeddfdZ xZS )NemoConvSubsamplinga|  Convlutional subsampling module, taken from NeMo ASR
    (https://github.com/NVIDIA/NeMo/blob/b367413645d5c72db3c2c96e46e95a
    34501479cf/nemo/collections/asr/parts/submodules/subsampling.py)

    Striding Subsampling: "Speech-Transformer: A No-Recurrence
    Sequence-to-Sequence Model for Speech Recognition" by Linhao Dong
    et al. (https://ieeexplore.ieee.org/document/8462506)


    Compared with the EncoderConv2D (`input_layer: custom`), this is a
    much simplified approach, and uses no LayerNorm and far fewer Conv2Ds.
    Moreover, depthwise convolutions are used to reduce FLOPs, but the first
      layer is kept as a regular convolution so as not to degrade accuracy.

    `Striding` and `dw_striding` are the same except that the latter uses
    depthwise convolutions after the first layer, whereas the former does not.

    Args:
        subsampling_factor (int): Time reduction factor
        feat_in (int): size of the input features
        feat_out (int): size of the output features
        subsampling (str): The subsampling technique, choose from
            {"striding", "dw-striding", "striding_conv1d",
            "dw_striding_conv1d"}
        conv_channels (int): Number of channels for the convolution layers,
                            default is 256.
        subsampling_conv_chunking_factor (int): Input chunking factor which
            can be -1 (no chunking) 1 (auto) or a power of 2. Default is 1
        activation (Module): activation function, default is nn.ReLU()
        is_causal (bool): whether to use causal Conv1/2D, where each step will
            have limited access to locations on its right or left
       dw_striding   r.   Ffeat_infeat_outsubsampling_factorsubsamplingconv_channels subsampling_conv_chunking_factorr   	is_causalr	   Nc	                 (   t                                                       || _        || _        || _        || _        |dz  dk    rt          d          t          t          j	        |d                    | _
        || _        || _        |dv | _        |dk    r|dk    r|dz  dk    rt          d          || _        d}	g }
|dk    rd| _        d	| _        d
| _        | j        r)| j        dz
  | _        | j        dz
  | _        |dz   | _        n+| j        dz
  dz  | _        | j        dz
  dz  | _        d| _        | j        r2|
                    t-          |	|| j        | j        d                      nF|
                    t.          j                            |	|| j        | j        | j                             |}	|
                    |           t5          | j
        dz
            D ]}| j        r3|
                    t-          |	|	| j        | j        d |	                     nG|
                    t.          j                            |	|	| j        | j        | j        |	                     |
                    t.          j                            |	|dddd                     |
                    |           |}	Ӑn|dk    rd| _        d	| _        d
| _        | j        r)| j        dz
  | _        | j        dz
  | _        |dz   | _        n+| j        dz
  dz  | _        | j        dz
  dz  | _        d| _        t5          | j
                  D ]}| j        r2|
                    t-          |	|| j        | j        d                      nF|
                    t.          j                            |	|| j        | j        | j                             |
                    |           |}	n|dk    rA|}	d| _        d| _        d
| _        | j        r)| j        dz
  | _        | j        dz
  | _        |dz   | _        n+| j        dz
  dz  | _        | j        dz
  dz  | _        d| _        t5          | j
                  D ]}| j        rB|
                    t7          |	| j
        |dz   k    r|n|| j        | j        d                      nV|
                    t.          j                            |	| j
        |dz   k    r|n|| j        | j        | j                             |
                    |           |}	n|dk    ry|}	d| _        d| _        d
| _        | j        dz
  dz  | _        | j        dz
  dz  | _        |
                    t.          j                            |	|	| j        | j        | j        |	          t.          j                            |	| j
        dk    r|n|dddd          g           |}	|
                    |           t5          | j
        dz
            D ]}|
                    t.          j                            |	|	| j        | j        | j        |	          t.          j                            |	| j
        |dz   k    r|n|dddd          g           |
                    |           |}	nt          d| d          |dv rt/          j        |t.          j                  }tA          || j        | j        z   | j        | j        | j        | j
                  }t.          j        !                    |t          |          z  |          | _"        d| _#        n&|dv rd | _"        d
| _#        nt          d| d          t/          j        j$        |
 | _%        d S )NrN   r   z*Sampling factor should be a multiply of 2!)r$  stridingstriding_conv1dr0   r.   Asubsampling_conv_chunking_factor should be -1, 1, or a power of 2r$  r   F)r  r  rY   r  r_   )r  r  rY   r  r_   rs   r.  r/     dw_striding_conv1dzNot valid sub-sampling: r	  )r$  r.  r   )lengthsall_paddingsrY   r  	ceil_mode
repeat_numT)r/  r2  )&r   r   _subsampling_conv_channels_feat_in	_feat_outr  r   r   r   _sampling_numr(  r,  subsampling_causal_condr+  _stride_kernel_size
_ceil_moder  r  r  appendr  r1   r   Conv2dranger  r`   extendr   r   calc_lengthr   r   conv2d_subsamplingr   conv)r   r&  r'  r(  r)  r*  r+  r   r,  r  layersi	in_length
out_lengthr   s                 r   r   zNemoConvSubsampling.__init__  s    	'+!!Q&&IJJJ *<a!@!@AA"4"'2 7
 (
$ -220A5501499S   1Q--''DL !D#DO~ (%)%6%:"&*lQ&6#&81&<##&*&7!&;%A"'+'81'<&B#&'# ~  $/%2$($5#| $      HOO$/%2$($5#| $ 2 $     (KMM*%%%4-122 #, #,> MM$(3)4(,(9#'<$(#.  	 	 	 	 MM(3)4(,(9#'<$($6#. (  	 	 	 HOO$/%2$%  !  $  	 	 	 j)))+G#,J J&&DL !D#DO~ (%)%6%:"&*lQ&6#&81&<##&*&7!&;%A"'+'81'<&B#&'#4-.. , ,> MM$(3)6(,(9#'<$(      MM(3)6(,(9#'<$($6 (     j)))+-,0 ---!KDL !D#DO~ (%)%6%:"&*lQ&6#&81&<##&*&7!&;%A"'+'81'<&B#&'#4-.. , ,> MM$(3 $(#5Q#>#> !)%2(,(9#'<$(
 
 
    MM(3 $(#5Q#>#> !)%2(,(9#'<$($6 ( 
 
   j)))+=,@ 000!KDL !D#DO"&"3a"7A!=D#'#4q#8Q">D MMHOO$/%0$($5#| $ 2* $   HOO$/(,(:a(?(?HH]$%  !  $ 	 	  , (KMM*%%%4-122 , ,(3)4(,(9#'<$($6#. (   (3 $(#5Q#>#> !)%2()#$$%#$ (    0 j)))+5,: FFFFGGG555WEK@@@I$!!/$2EE -|/-  J x}s:'FQQDH&*D##EEEDH&+D##FFFFGGGH'0			r   c                     d| j         gS )Nr.   r(  r   s    r   get_sampling_framesz'NemoConvSubsampling.get_sampling_frames3  s    4*++r   c                     d| j         dz   gS )Nr   r.   rL  r   s    r   get_streaming_cache_sizez,NemoConvSubsampling.get_streaming_cache_size6  s    4*Q.//r   rL   maskc                    | j         r|                    d          n|                    dd          }| j        dk    r| j         r| j        dk    r3d| j        z  | j        z  | j        z  }t          j        |          |k    }nd}|rQ|                     |          \  }}|s6| j	        dk    r| 
                    |          }nA|                     |          }n+|                     |          }n|                     |          }| j         rX|                                \  }}}}	|                     |                    dd                              ||d                    }n|                    dd          }||dfS |j        d         }
|                    d          }t          j        || j        z            }| j        r%| j        r|| j        z  }||dk    xx         dz  cc<   t          j        d|
|j        	                              |                    d          d          |                    d          k     }||                    d          fS )
al  
        Forward method for NeMo subsampling.

        Args:
            x: input tensor
            mask: input mask

        Returns:
            x: Resulting tensor from subsampling (B, T //
                time_reduction_factor, feat_out)
            pad_mask: tensor of padded hidden state sequences (B, 1, T //
                time_reduction_factor)
        r.   rN   r0           Tr$  Nr   )r   )rE  r6   	transposer+  r8  r=  r1   numelconv_split_by_batchr7  conv_split_by_channelrF  r   r   reshapeshapesumceilr(  r,  r<  r5   r   r8   )r   rL   rP  x_ceilneed_to_splitsuccessbctfmax_audio_lengthfeature_lenspadding_lengthfeature_lens_remainderpad_masks                  r   rS   zNemoConvSubsampling.forward9  s`    #5LAKKNNN1;;q!;L;L 0B664;R6499 !44t|CdlR %A 7 !% !!55a88
7 )(M99 66q99 IIaLLIIaLL		!A " 	"JAq!QQ**221a<<==AA Aq!!A<d7N71:xx{{L43J$JKK> 	=d: 	=%1D4K%K"1Q67771<777<#3AHEEELL""B
 
$$Q''( ($$Q''''r   c                    | j         dk    rAt          j                    5  d| j        z  }| j        dz  dz  }| j        dz  }t          j        j                            | j        d         j	        | |           t          j        j                            | j        d         j
        | |           t          dt          | j                  d          D ]}t          j        j                            | j        |         j	        | |           t          j        j                            | j        |         j
        | |           t          j        j                            | j        |dz            j	        | |           t          j        j                            | j        |dz            j
        | |           | j        | j        z  | j        z  dz  }t          j        j                            | j        j	        | |           t          j        j                            | j        j
        | |           d d d            d S # 1 swxY w Y   d S d S )Nr$        ?rN   g      r   r   r.   )r7  r1   no_gradr>  r8  r   inituniform_rF  weightr  rB  r9   r:  r9  r;  r   )r   scaledw_maxpw_maxr=   fc_scales         r   reset_parametersz$NemoConvSubsampling.reset_parametersy  sF   -- K Kd//+Q.47,d2&&ty|':UFEJJJ&&ty|'85&%HHH C	NNA66 U UCHM**49S>+@6'6RRRHM**49S>+>PPPHM**49S1W+=+DvgvVVVHM**49S1W+=+BVGVTTTT
 !NT]:T=OOTXX&&tx	8LLL&&tx}xiJJJ)K K K K K K K K K K K K K K K K K K .-s   HIIIc                     |                                 \  }}}}|dk    r|dfS  j        dk    r j        }n[d j        z   j        z   j        z  }t	          j        t	          j        t          j        |          |z  d                    }d|z  }||z  }|dk    r|dfS t          j	         fdt          j
        ||d          D                       dfS )z:Tries to split input by batch, run conv and concat resultsr.   FrR  rN   r   c                 :    g | ]}                     |          S  rF  ).0rP   r   s     r   
<listcomp>z;NemoConvSubsampling.conv_split_by_batch.<locals>.<listcomp>  s%    QQQe5!!QQQr   T)r   r+  r8  r=  r   rZ  r   r1   rT  r  split)r   rL   r^  _cfr[  r   new_batch_sizes   `       r   rU  z'NemoConvSubsampling.conv_split_by_batch  s    VVXX
1a66e8O01446BB T004<?$,NF	$(5;q>>F#:A>>??AABbQe8O IQQQQu{1na/P/PQQQ  	
 	
r   c           	         	   j         d         |          }  j         d         |          }t           j        dz
            D ]	|                                \  }}}} j        dk    r j        }nAt          j        t          j        t          j	        |          dz  d                    }d|z  }t          ||z            }|dk    rd}t          ||z            }|dk    rd}                      j         	dz  dz            ||          }t          j        	 fdt          j        ||d          D             d          }  j         	dz  dz            |          } |S )zOFor dw convs, tries to split input by time, run conv and concat
        resultsr   r.   rR  rN   r   c                 H    g | ]} j         d z  d z            |          S )r   ru  )rv  rP   rH  r   s     r   rw  z=NemoConvSubsampling.conv_split_by_channel.<locals>.<listcomp>  s3    SSS%1q519%e,,SSSr   r#  )rF  rB  r;  r   r+  r   rZ  r   r1   rT  r   channel_chunked_convr  rx  )
r   rL   ry  r_  r`  rz  r   new_cnew_trH  s
   `        @r   rV  z)NemoConvSubsampling.conv_split_by_channel  sv    DIaLOODIaLOOt)A-.. 	( 	(AJAq!Q4q88:
 Idhu{1~~'=qAABBTRLLEzzRLLEzz))	!a%!)$eQ A
 	SSSSS%+aPQ:R:RSSS A %	!a%!)$Q''AAr   rF  r   c           
         d}g }t          j        ||d          D ]1}|                                d         }| j        rt          j                            || j        dz
  | j        dz
  | j        dz
  | j        dz
  f          }t          j        	                    ||j
        |||z   ddddddf         |j        |||z            | j        d|          }nYt          j        	                    ||j
        |||z   ddddddf         |j        |||z            | j        | j        |          }|                    |           ||z  }3t          j        |d          S )z$Performs channel chunked convolutionr   r.   r  N)r  r  r_   rs   )r1   rx  r   r,  r   r3   r4   r>  r=  conv2drl  r  r  r@  r  )	r   rF  r   rL   ind
out_chunksrP   stepch_outs	            r   r~  z(NemoConvSubsampling.channel_chunked_conv  s   
 
[J22 	 	E::<<?D~ )))A-q()A-q(	 *   --KcDj 0!!!QQQ 9:3t#34< .   --KcDj 0!!!QQQ 9:3t#34< . .   f%%%4KCCyQ'''r   c                 \    |dk    r|dk    r|dz  dk    rt          d          || _        d S )Nr0   r.   rN   r   r0  )r  r+  )r   r+  s     r   'change_subsampling_conv_chunking_factorz;NemoConvSubsampling.change_subsampling_conv_chunking_factor  sP     -220A5501499S   1Q---r   r   )r   r   r   r   r   r"   r   rT   r1   Modulerm   r   r  rM  rO  r   r  rS   rq  rU  rV  r~  r  r   r   s   @r   r"  r"    s        J #$( 01&-bgii_1 _1_1 _1  	_1
 _1 _1 +._1 HO_1 _1 
_1 _1 _1 _1 _1 _1B	,T#Y , , , ,0$s) 0 0 0 0>( >(v} >(vvPT}?T9U >( >( >( >(@K K K K2
V 
fdl0C 
 
 
 
4$v $& $ $ $ $L((HO((14((9?((	(( (( (( ((TQ03Q	Q Q Q Q Q Q Q Qr   r"  r.   r3  r4  rY   r  r5  r6  c                 :   ||z
  }d}t          |          D ]e}t          j        |                     t          j                  |z   |          |z   } |rt          j        |           nt          j        |           } f|                     t          j                  S )z^Calculates the output length of a Tensor passed through a convolution or
    max pooling layerrh  r   )rB  r1   divr   r   rZ  floorr   )	r3  r4  rY   r  r5  r6  add_padonerH  s	            r   rD  rD  
  s     "K/GC: M M)GJJU[J99GCVLLsR)2L%*W%%%G8L8L::EI:&&&r   c                        e Zd ZdZd fdZddeddfdZ	 	 	 dded	edz  d
edz  dedz  deeeedz  edz  f         f
dZ	 xZ
S )	AttModulezAttention abstraction moduler	   Nc                 V    t                                                       d| _        d S )NF)r   r   export_mode)r   r   s    r   r   zAttModule.__init__   s'     r   Tmodec                     || _         dS )zset the export modeN)r  )r   r  s     r   
set_exportzAttModule.set_export$  s    r   rL   memorypos_embatt_maskc                     ||||fS )zAttModule forward

        Args:
            x: input tensor.
            memory: memory tensor.
            pos_emb: positional encoder embedding.
            att_mask: attention mask tensor.
        rt  )r   rL   r  r  r  s        r   rS   zAttModule.forward(  s     &'8++r   r   )T)NNN)r   r   r   r   r   rm   r  r   r  rS   r   r   s   @r   r  r    s        &&! ! ! ! ! !   t  t         !%!%"&, ,, , $	,
 4-, 
vvv}ftm;	<, , , , , , , ,r   r  c                   4    e Zd ZdZddedeeef         fdZdS )AttBlockzBAttention Block module to support both Attention and Block module.Fr   r	   c                     d| j         fS )zmemory dimensionsr.   )r   )r   r   s     r   memory_dimszAttBlock.memory_dims=  s    4?##r   N)F)r   r   r   r   rm   r  r   r  rt  r   r   r  r  :  sI        LL$ $4 $E#s(O $ $ $ $ $ $r   r  scoresrP  c                    |t|                     d                              d          }|                     |t          j                   } t          j        | d                              |d          }nt          j        | d          }|S )Nr.   r   r0   rO   r   )r6   eqr   r1   infsoftmax)r  rP  attns      r   masked_softmaxr  B  s     ~~a  ##A&&##D59*55}V,,,88#
 
 }V,,,Kr   c                   H    e Zd ZU dZej        j        e         ed<   ej        j        e	         ed<   ej        j        e	         ed<   ej        j        e	         ed<   	 	 	 	 	 	 dde	de	dede	de
dedede	de	ddf fdZ	 d dededededz  dedz  dedz  dedz  defdZ xZS )!MultiHeadedAttentiona  Multi-Head Attention layer with optional relative position embedding
    and GLU.

    Args:
        n_head: int
            the number of heads.
        n_feat: int
            input size features.
        dropout_rate: float
            dropout rate.
        attention_inner_dim: int, optional
            the attention dimension used in the class,
            it can be different from the input dimension n_feat.
            default: -1 (equal to n_feat).
        use_pt_scaled_dot_product_attention: bool, optional
            if set True, use pytorch scaled dot product attention in training.
            NOTE: this will NOT be used in ONNX decoding due to a lack of
            support.  In that case, we use the original attention
            implementation, which shows no regression.
            default: False.
        n_value: int, optional
            if set to values other than -1, use a different dimension for
            value. With the default value (i.e. -1), it is backward compatible.
        group_size: int, optional. must divide `n_head`
            if group_size > 1:       GQA
            if group_size = 1:       MHA
            if group_size = n_head:  MQA
    inv_sqrt_d_khh_kgr0   r   TFr.   n_headn_featr}   attention_inner_dimrZ   r[   #use_pt_scaled_dot_product_attentionn_value
group_sizer	   Nc
                 .   t                                                       |dk    r|}|dk    r|}||z  dk    sJ ||z  | _        dt          j        | j                  z  | _        || _        ||	z  dk    s
J d            |	| _        ||	z  | _        t          j
        ||          | _        t          j
        |||	z            | _        t          j
        |||	z            | _        t          j
        ||	z  |          | _        t          j                            d t$          d z            | _        t          j        |          | _        || _        || _        |r|	dk    rt1          d          t          j        j                                        | _        t          j        j                                        | _        t          j        j                                        | _        t          j        j	        j         !                                | _"        d S )Nr0   r   rh  zgroup_size must divide n_headr   r.   z'Cannot use PT Scaled Attention with GQA)#r   r   d_kr   r   r  r  r  r  r   r   linear_qlinear_klinear_v
linear_outr1   jit	Attributer   r  r   r   r}   r  r  aoquantization	QuantStubquant_qquant_xDeQuantStubdequant	quantizedFloatFunctionalffunc)r   r  r  r}   r  rZ   r[   r  r  r  r   s             r   r   zMultiHeadedAttention.__init__t  s    	b==G"$$"("V+q0000 '&0$)DH"5"55
"a''')H'''Z'	&*=>>	&*=*KLL	'+>*+LMM)$7:$EwOOI''ftm<<	zL111(3V0. 	H:>>FGGG
 x,6688x,6688x,88::X[*::<<


r   querykeyr/   pos_kpos_vrP  relative_attention_biasc                 (   |                     d          }|                     |                              |d| j        | j                  }	|                     |                              |d| j        | j                  }
|                     |                              |d| j        | j                  }| j        r4t          j
                                        s|	                    dd          n|	                    dd          | j        z  }	|
                    dd          }
|                    dd          }| j        r@t          j
                                        s!d}|I|                    d          }|||z   }n|}|j        |	j        k    r|                    |	j                  }t          j        j                            t          j        j        j        j        t          j        j        j        j        t          j        j        j        j        t          j        j        j        j        g          5  t          j        j                            |	|
||| j                  }ddd           n# 1 swxY w Y   n| j        | j        k    r?|	                    || j        | j        d| j                  }	t          j        d|	|
          }n)t          j        |	|
                    dd                    }|| j        | j        k    rt          j        d	|	|          }n|	                                                     || j        z  d| j                                      dd          }t          j        ||                    dd                    }|                    dd                              || j        |                     d          |                     d                    }||z   }n|}|||z   }tC          ||          }|| _"        | #                    |          }t          j        |                    |j                  |          }||                                                     || j        z  |                     d          |                     d                                        dd          }t          j        ||                              dd                                                               || j        |                     d          | j                  }||z   }|                    dd                                                               |d| j        | j        z            }| $                    |          S )
a4  Compute 'Scaled Dot Product Attention'.

        Args:
            query: query tensor (batch, time1, size)
            key: key tensor (batch, time2, size)
            value: value tensor (batch, time1, size)
            pos_k: key tensor used for relative positional embedding.
            pos_v: value tensor used for relative positional embedding.
            mask: mask tensor (batch, time1, time2)
            relative_attention_bias: bias added to attention logits w.r.t.
                relative positions
                (1, n_head, time1, time2)
        r   r0   r.   rN   N)	attn_mask	dropout_pzb g h t d, b h s d -> b h t szb g h t d, t s d -> b h t s)%r   r  viewr  r  r  r  r  r  r1   r  is_scriptingrS  r  r6   r   r   r   	attentionsdpa_kernel
SDPBackendFLASH_ATTENTIONEFFICIENT_ATTENTIONMATHCUDNN_ATTENTIONr3   scaled_dot_product_attentionr}   rW  r  einsummatmul
contiguousr  r  r   r  )r   r  r  r/   r  r  rP  r  n_batchqr   vr  rL   AB	reshape_qr  r  p_attnreshape_attnattn_vs                         r   rS   zMultiHeadedAttention.forward  s   . **Q--MM%  %%gr4648DDMM###GR48DDMM%  %%gr48TXFF 77@E	@V@V@X@X7AKK1Q""T%66 	

 KK1KK13 I	EI<R<R<T<T I	I~~a((*6 $'> >II $I:(( )QW 5 5I#//H&1AH&1EH&16H&1A	    H'DD'"/ E                  v!!IIgtvtxTXFFL!@!QGGLAKKB$7$788 6TX%%%BAuMMAA g.DH=="1a 
 !5??2r#:#: A Aq))..A

1 A Q&2"99!&$//DDI\\$''FVYYqw//33A %%''T'DF*EJJqMM5::a==IIYq!__  Lu55Yq!__Z\\T'465::a==$(CC	  JKK1((**//TX=PQQ 	
 q!!!s   :/I55I9<I9)r0   r   TFr0   r.   r   )r   r   r   r   r1   r  Finalr   __annotations__r   rT   rm   r   r   rS   r   r   s   @r   r  r  Q  s         : )/%((((ys		ys $& 49.= .=.= .= 	.=
 !.= .= .= .2.= .= .= 
.= .= .= .= .= .=p 26r" r"r" r" 	r"
 }r" }r" tmr" "($r" 
r" r" r" r" r" r" r" r"r   r  c                   B    e Zd ZdZej        j        defd            ZdS )MultiSequentialz,Multi-input multi-output torch.nn.Sequentialr	   c                     | D ]} || }|S )zForward method implementation.rt  )r   argsms      r   rS   zMultiSequential.forward  s%      	 	A1d8DDr   N)	r   r   r   r   r1   r  ignorer  rS   rt  r   r   r  r    sE        66
Y      r   r  input_layertime_reductionc                 N    | dv r|dk    rdS | dv r|dk    rdS | dv r|dk    rdS d	S )
a   Get an offset. We will use the offset for determining #frames of a
    subsampled feature.

    Args:
        input_layer: Type of an input layer
        time_reduction: time reduction factor for downsampling a feature
    Returns:
        int: offset
    )r  	nemo_convr#  r   )r     r.         r   rt  )r  r  s     r   
get_offsetr  $  sY     ---.A2E2Eqk!!n&9&9q---.A2E2Eq1r   xs_padmax_seq_lenc                 \   | j         \  }}}|                     dd          } t          j        | ddddf         d|fd|f          } | j         \  }}}|                     |d||          } |                     ddd	d                                          } |                     d||          } | S )
a(  
    For a given tensor with shape of (N, T, D), if sequence length T is
    longer than max_seq_len, this function unfold it to a
    (NT', max_seq_len, D) where T' is T // max_seq_len.
    Args:
        xs_pad: input tensor with shape (N, T, D)
        max_seq_len: maximum sequence length
    r0   r  .Nr.   )rY   r  r   r   rN   )rX  rS  r  unfoldr  rj   r  )r  r  ry  Dnew_bszslens         r   unfold_tensorr  7  s     lGAq!b"%%FXsD!!!|$;  F
 |GQ[["k488F^^Aq!Q''2244F[[[!,,FMr   )r   )r   r   )r.   )(r   r1   torch.nn.functionalr   r3   r  r   r  r   rT   r(   r   r  rE   rG   rV   ro   ry   r   r   dictrm   r   r   r   r   r`   r  rA  r  r"  rD  r  r  r  r  r   r  r  r  rt  r   r   <module>r     sd                     ' ' ' ' '	 ' ' '$ $ $%(/ $ $ $ $2 WX0" 0"0"!%c0"9<0"PS0"
\0" 0" 0" 0"f* * * * *") * * *0^ ^ ^ ^ ^ry ^ ^ ^B@ @ @ @ @ry @ @ @Fg g g g g g g gT! ! ! ! !	 ! ! !H1 1 1 1 1") 1 1 1j  	
 s) #Y S	 
   ,I  I  I  I  I 29 I  I  I X: : : : : : : :|@ @ @ @ @BI @ @ @4[ [ [ [ [29 [ [ [|3 3 3 3 329 3 3 3lWQ WQ WQ WQ WQ%(/ WQ WQ WQ@ ' ''' ' 	'
 ' ' ' ' ' '&, , , , ,	 , , ,:$ $ $ $ $y) $ $ $
4-    E" E" E" E" E"29 E" E" E"P    eh)   C      && s v      r   