
    0`iS                        d dl Z d dlmZmZmZ d dlZd dlmZmZ ddl	m
Z
 ddZddZ G d	 d
ej        j                  Z	 	 	 	 	 ddeeej        f         deej                 dee         fdZeZ G d dej        j                  Z	 	 	 	 	 ddeeej        f         dee         fdZ G d dej        j                  Zej        Z	 	 ddeeej        f         fdZ G d dej        j                  ZdS )    N)OptionalTupleUnion)	rearrangerepeat   )apply_rotaryFc                     |s3|                      dd          \  }}t          j        | |fd          S | dd d df         | ddd df         }}t          t          j        | |fd          dd          S )Nr   dim.   z... d two -> ... (d two))two)chunktorchcatr   stack)xinterleavedx1x2s       v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/vllm_flash_attn/layers/rotary.pyrotate_halfr      s     \##By2#r++++3!8aQTT	lBrc2YB7779SYZ[[[[    c           	      2   |j         d         dz  }|| j         d         k    sJ t          ||sdnd          }t          ||sdnd          }t          j        | dd|f         |z  t	          | dd|f         |          |z  z   | d|df         gd          S )z
    x: (batch_size, seqlen, nheads, headdim)
    cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
    r   r   z... d -> ... 1 (2 d)z... d -> ... 1 (d 2).Nr   )shaper   r   r   r   )r   cossinr   ro_dims        r   apply_rotary_emb_torchr       s    
 Yr]QFQWR[    
K[,,E[
\
\C
K[,,E[
\
\C9	
3<3	QsGVG|_k!J!JS!P	PRSTWY_Y`Y`T`Rab   r   c            	           e Zd Ze	 	 	 	 	 d	deeej        f         deej                 dee         fd            Z	ed             Z
dS )
ApplyRotaryEmbFr   Nseqlen_offsets
cu_seqlens
max_seqlenc	           
         t          ||||||||          }	t          |t                    r|                     |||           || _        n|                     ||||           d | _        || _        || _        || _        |s|	n|S )N)r#   r$   r%   r   inplace)r	   
isinstanceintsave_for_backwardr#   r   r'   r%   )
ctxr   r   r   r   r'   r#   r$   r%   outs
             r   forwardzApplyRotaryEmb.forward&   s     )!!#	
 	
 	
 nc** 	&!!#sJ777!/C!!#sJGGG!%C%#!(ssq(r   c                     | j         }|| j        \  }}}}n| j        \  }}}| j        s| j        s|                                }t          |||||| j        | j        | j        d	  	        }|d d d d d d d fS )NT)r#   r$   r%   r   r'   	conjugate)r#   saved_tensorsr   r'   cloner	   r%   )r+   dor#   r   r   r$   dxs          r   backwardzApplyRotaryEmb.backwardG   s    +!363D0Cj..#&#4 Cj  	s{ 	B)!~K

 

 

 4tT4t;;r   FFr   NN)__name__
__module____qualname__staticmethodr   r)   r   Tensorr   r-   r4    r   r   r"   r"   %   s         34-1$() ) c5</0) U\*) SM) ) ) \)@ < < \< < <r   r"   r#   r$   r%   c           
      D    t                               | |||||||          S )aM  
    Arguments:
        x: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
            else (total_seqlen, nheads, headdim)
        cos, sin: (seqlen_rotary, rotary_dim / 2)
        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
            of 1st half and 2nd half (GPT-NeoX style).
        inplace: if True, apply rotary embedding in-place.
        seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
            Most commonly used in inference when we have KV cache.
        cu_seqlens: (batch + 1,) or None
        max_seqlen: int
    Return:
        out: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
            else (total_seqlen, nheads, headdim)
    rotary_dim must be <= headdim
    Apply rotary embedding to the first rotary_dim of x.
    )r"   apply)r   r   r   r   r'   r#   r$   r%   s           r   apply_rotary_embr>   `   s-    8 	3['>:z  r   c                   v    e Zd Ze	 	 	 	 	 ddeeej        f         dee         fd            Zed             Z	dS )	ApplyRotaryEmbQKV_NFr   r#   num_heads_qc	                 v   |||                                 r|                                dk    r=|j        \  }	}
}}}|dk    sJ |d d d d d df                             |	|
d|          }n^|                                dk    sJ |J |j        d         |z
  dz  }|j        d         |d|z  z   k    sJ |d d d d d ||z   f         }t	          |||||d           n||n|}||n|}|                                dk    r|d d d d df         |d d d d d	f         }}no|                                dk    sJ |J |j        d         |z
  dz  }|j        d         |d|z  z   k    sJ |d d d d d |f         |d d d d |||z   f         }}t	          |||||d
           t	          |||||d
           |                     ||||           t          |t                    r |                     ||||           || _        n |                     |||||           d | _        || _	        || _
        |S )N      r   r      Tr#   r   r'   r   r   )r   r'   )is_contiguousr   r   reshaper	   r*   r(   r)   r#   r   rA   )r+   qkvr   r   cos_ksin_kr   r#   rA   batchseqlenthreenheadsheaddimqknum_heads_kqks                     r   r-   zApplyRotaryEmbQKV_.forward   s    =U]s/@/@/B/B] wwyyA~~8;	5vufgzzzzAAArr]**5&"gFFwwyyA~~~~"..."y|k9a?y|{Q_'DDDDDAAA9k 999:C^^b     !=CCeE =CCeEwwyyA~~111aaa7|SAAAq\1wwyyA~~~~"..."y|k9a?y|{Q_'DDDDD111aaa+-.AAAqqq+VaHa:a4a0b1Cn+W[\\\\E5.k[_````!!#sE5999nc** 	&!!#sE5999!/C!!#sE5.III!%C%%
r   c           	      &   | j         }|| j        \  }}}}}n| j        \  }}}}|||                                r|                                dk    r t	          |d d d d d df         d          }nr|                                dk    sJ | j        J |j        d         | j        z
  dz  }|j        d         | j        d|z  z   k    sJ |d d d d d | j        |z   f         }t          ||||| j        dd           n||n|}||n|}|                                dk    r|d d d d df         |d d d d df         }
}	n|                                dk    sJ | j        J |j        d         | j        z
  dz  }|j        d         | j        d|z  z   k    sJ |d d d d d | j        f         }	|d d d d | j        | j        |z   f         }
t          |	|||| j        dd	           t          |
|||| j        dd	           |d d d d d d d fS )
NrC   r   zb s t h d -> b s (t h) drE   Tr#   r   r'   r/   r   r   )r   r'   r/   )	r#   r0   rG   r   r   rA   r   r	   r   )r+   dqkvr#   r   r   rJ   rK   dqkrR   dqdks              r   r4   zApplyRotaryEmbQKV_.backward   s   +!585F2CeUNN%(%6"CeU=U]t/A/A/C/C] xxzzQQQQ2A2X0JKKxxzzQ222#z!}s>1Dz!}!k/(IIIII111aaa!@3?[#@!@@A-O     !=CCeE =CCeExxzzQaaaAgQQQ1WBxxzzQ222#z!}s>1Dz!}!k/(IIIII!!!QQQ 1#/ 112!!!QQQ#/K2O OOPO    O    T4tT4==r   NNFr   N
r6   r7   r8   r9   r   r)   r   r:   r-   r4   r;   r   r   r@   r@      s         34"&3 3 c5</03 3Z3 3 3 \3j 9> 9> \9> 9> 9>r   r@   rA   c           
      D    t                               | |||||||          S )a  
    Arguments:
        qkv: (batch_size, seqlen, 3, nheads, headdim) or (batch_size, seqlen, num_heads_q + 2 * num_heads_k, headdim).
            If qkv has shape (batch_size, seqlen, num_heads_q + 2 * num_heads_k, headdim) (e.g. MQA / GQA),
            then num_heads_q must be provided.
        cos, sin: (seqlen, rotary_dim / 2)
        cos_k, sin_k: (seqlen, rotary_dim / 2), optional
        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead of
            1st half and 2nd half (GPT-NeoX style).
        seqlen_offsets: (batch_size,) or int. Each sequence in Q and K is shifted by this amount.
            Most commonly used in inference when we have KV cache.
    Return:
        qkv: (batch_size, seqlen, 3, nheads, headdim) or (batch_size, seqlen, num_heads_q + 2 * num_heads_k, headdim)
    rotary_dim must be <= headdim
    Apply rotary embedding *inplace* to the first rotary_dim of Q and K.
    )r@   r=   )rI   r   r   rJ   rK   r   r#   rA   s           r   apply_rotary_emb_qkv_r^      s-    4 ##S#ue[.+  r   c                   \    e Zd Zeddeeej        f         fd            Zed             Z	dS )ApplyRotaryEmbKV_Fr   r#   c                 (   |j         \  }}}}	}
|dk    sJ |d d d d df         }t          |||||d           t          |t                    r|                     ||           || _        n|                     |||           d | _        || _        |S )Nr   r   TrF   )r   r	   r(   r)   r*   r#   r   )r+   kvr   r   r   r#   rL   rM   r   rO   rP   rT   s               r   r-   zApplyRotaryEmbKV_.forward  s    .0h+vsFGaxxxxqqq!!!QwKsCKY]	
 	
 	
 	
 nc** 	&!!#s+++!/C!!#sN;;;!%C%	r   c           	          | j         }|| j        \  }}}n
| j        \  }}t          |d d d d df         |||| j        dd           |d d d d fS )Nr   TrV   )r#   r0   r	   r   )r+   dkvr#   r   r   s        r   r4   zApplyRotaryEmbKV_.backward*  s    +!'*'8$Cnn(HC111aL)	
 	
 	
 	
 D$d**r   NFr   r\   r;   r   r   r`   r`     sg         eCQVQ]L]F^    \  + + \+ + +r   r`   c                 >    t                               | ||||          S )aR  
    Arguments:
        kv: (batch_size, seqlen, 2, nheads, headdim)
        cos, sin: (seqlen, rotary_dim / 2)
        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead of
            1st half and 2nd half (GPT-NeoX style).
        seqlen_offsets: (batch_size,) or int. Each sequence in Q and K is shifted by this amount.
            Most commonly used in inference when we have KV cache.
    Return:
        kv: (batch_size, seqlen, 2, nheads, headdim)
    rotary_dim must be <= headdim
    Apply rotary embedding *inplace* to the first rotary_dim of K.
    )r`   r=   )rb   r   r   r   r#   s        r   apply_rotary_emb_kv_rg   @  s     ( ""2sCnMMMr   c                        e Zd ZdZ	 	 	 	 	 ddef fdZddZdd	Z	 	 	 	 ddej	        de
ej	                 deeej	        f         de
e         de
e         deej	        eej	        ej	        f         f         fdZ xZS )RotaryEmbeddinga5  
    The rotary position embeddings from RoFormer_ (Su et. al).
    A crucial insight from the method is that the query and keys are
    transformed by rotation matrices which depend on the relative positions.

    Other implementations are available in the Rotary Transformer repo_ and in
    GPT-NeoX_, GPT-NeoX was an inspiration

    .. _RoFormer: https://arxiv.org/abs/2104.09864
    .. _repo: https://github.com/ZhuiyiTechnology/roformer
    .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox

    If scale_base is not None, this implements XPos (Sun et al., https://arxiv.org/abs/2212.10554).
    A recommended value for scale_base is 512: https://github.com/HazyResearch/flash-attention/issues/96
    Reference: https://github.com/sunyt32/torchscale/blob/main/torchscale/component/xpos_relative_position.py
         @FNTr   c                    t                                                       || _        t          |          | _        || _        |                     |          }|                     d|d           || _        || _	        |/t          j        d|d|t          j                  d|z  z   d	|z  z  nd}|                     d
|d           d| _        d| _        d| _        d| _        d| _        dS )a  
        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
            of 1st half and 2nd half (GPT-NeoX style).
        pos_idx_in_fp32: if True, the position indices [0.0, ..., seqlen - 1] are in fp32,
            otherwise they might be in lower precision.
            This option was added because previously (before 2023-07-02), when we construct
            the position indices, we use the dtype of self.inv_freq. In most cases this would
            be fp32, but if the model is trained in pure bf16 (not mixed precision), then
            self.inv_freq would be bf16, and the position indices are also in bf16.
            Because of the limited precision of bf16 (e.g. 1995.0 is rounded to 2000.0), the
            embeddings for some positions will coincide.
            To maintain compatibility with models previously trained in pure bf16,
            we add this option.
        inv_freqF)
persistentNr   r   devicedtypeg?gffffff?scale)super__init__r   floatbasepos_idx_in_fp32_compute_inv_freqregister_bufferr   
scale_baser   arangefloat32_seq_len_cached_cos_cached_sin_cached_cos_k_cached_sin_k_cached)
selfr   ru   r   ry   rv   ro   rl   rq   	__class__s
            r   rs   zRotaryEmbedding.__init__i  s   . 	$KK	.))&11ZeDDD&$ % \!S!F%-HHH3QT9TY\_bYbcc 	
 	We>>> !!r   c           	      x    d| j         t          j        d| j        d|t          j                  | j        z  z  z  S )Ng      ?r   r   rn   )ru   r   rz   r   r{   )r   ro   s     r   rw   z!RotaryEmbedding._compute_inv_freq  s?    IQ!F%-PPPSWS[[]
 	
r   c                    || j         k    sI| j        B| j        j        |k    s2| j        j        |k    s"| j        rH| j                                        r0|| _         | j        rZt          j        ||t          j	                  }| j
        j        t          j	        k    r|                     |          }n0| j
        }n(t          j        ||| j
        j                  }| j
        }t          j        ||          }| j        Zt          j        |                              |          | _        t          j        |                              |          | _        d S t          j        || j        j        | j        j                  |dz  z
  | j        z  }| j                            |j                  t'          |d          z  }t          j        |          |z                      |          | _        t          j        |          |z                      |          | _        t          j        |          |z                      |          | _        t          j        |          |z                      |          | _        d S d S d S )Nrn   )ro   )rp   ro   r   zs -> s 1)r|   r}   ro   rp   trainingis_inferencerv   r   rz   r{   rl   rw   outerrq   r   tor   r~   ry   r   r   r   )	r   rM   ro   rp   trl   freqspowerrq   s	            r   _update_cos_sin_cachez%RotaryEmbedding._update_cos_sin_cache  sN   
 T)))'&&00%.. /"&"2"?"?"A"A / $*D  # )LemLLL
 =&%-77#55V5DDHH#}HHLdm>QRRR= K8,,Ez!#(9U#3#3#6#6u#=#= #(9U#3#3#6#6u#=#=    Ltz/?
HYZZZk"O$ 
U\::iz>Z>ZZ$)Ie$4$4u$<#@#@#G#G $)Ie$4$4u$<#@#@#G#G &+i&6&6&>%B%B5%I%I"&+i&6&6&>%B%B5%I%I"""I /...r   r   rI   rb   seqlen_offsetr%   rA   returnc           
         |j         d         }|#|                     ||j        |j                   n:t	          |t
                    r%|                     ||z   |j        |j                   |[| j        $t          || j        | j	        | j
        ||          S t          || j        | j	        | j        | j        | j
        ||          S |}t          || j        | j	        | j
        d|          }| j        $t          || j        | j	        | j
        |          }n#t          || j        | j        | j
        |          }||fS )a*  
        qkv: (batch, seqlen, 3, nheads, headdim) or (batch, seqlen, num_heads_q + 2 * num_heads_k, headdim)
            if kv is none, else it's just q of shape (batch, seqlen, nheads, headdim).
            If qkv has shape (batch, seqlen, num_heads_q + 2 * num_heads_k, headdim) (e.g. MQA / GQA),
            then num_heads_q must be provided.
        kv: (batch, seqlen, 2, nheads, headdim)
        seqlen_offset: (batch_size,) or int. Each sequence in x is shifted by this amount.
            Most commonly used in inference when we have KV cache.
            If it's a tensor of shape (batch_size,), then to update the cos / sin cache, one
            should pass in max_seqlen, which will update the cos / sin cache up to that length.
        Apply rotary embedding *inplace* to qkv and / or kv.
        r   Nrn   )r   r#   rA   T)r   r'   r#   )r   r#   )r   r   ro   rp   r(   r)   rq   r^   r}   r~   r   r   r   apply_rotary_emb_funcrg   )r   rI   rb   r   r%   rA   rM   rS   s           r   r-   zRotaryEmbedding.forward  s   ( 1!&&z#*CI&VVVVs++ 	c&&v'=cjX[Xa&bbb:z!,$$ $ 0#0 +    -$$&& $ 0#0 +	 	 	 	 A%   ,,  A z!)$$ $ 0#0   *&& $ 0#0   b5Lr   )rj   FNTN)N)NN)Nr   NN)r6   r7   r8   __doc__r)   rs   rw   r   r   r:   r   r   r   r-   __classcell__)r   s   @r   ri   ri   W  s:        ( +" +"+" +" +" +" +" +"Z
 
 
 
,J ,J ,J ,Jb &*23$(%)H H\H U\"H S%,./	H
 SMH c]H 
u|U5<#=>>	?H H H H H H H Hr   ri   )Fr5   r[   re   )mathtypingr   r   r   r   einopsr   r   ops.triton.rotaryr	   r   r    autogradFunctionr"   r)   r:   r>   r   r@   r^   r`   r=   rg   nnModuleri   r;   r   r   <module>r      se  
  ) ) ) ) ) ) ) ) ) )  $ $ $ $ $ $ $ $ , , , , , ,\ \ \ \   8< 8< 8< 8< 8<U^, 8< 8< 8<~ /0)- $  #u|+, &    D ) q> q> q> q> q>0 q> q> q>p 
/0!%  #u|+, #   >"+ "+ "+ "+ "+/ "+ "+ "+J ).  /0N N
 #u|+,N N N N.{ { { { {eho { { { { {r   