
     `iP                     >   d dl Z d dlmZ d dlZd dlmZ ddlmZmZ ddlm	Z	m
Z
 ddlmZ ddlmZ dd	lmZ d
dlmZ d
dlmZmZmZmZmZmZmZmZ d
dlmZ ddlmZ  ej         e!          Z"dZ#dZ$ G d de          Z%d Z& G d dej'                  Z( G d de(          Z) G d de(          Z*e(e)e*dZ+ G d de          Z, G d de          Z- G d  d!e          Z. G d" d#e          Z/ G d$ d%e          Z0 G d& d'e          Z1 G d( d)e          Z2g d*Z3dS )+    N)Optional)nn   )CacheStaticCache)_flash_attention_forward!flash_attn_supports_top_left_mask)PreTrainedModel)logging)deprecate_kwarg   )GemmaForCausalLM)LlamaDecoderLayerLlamaForQuestionAnsweringLlamaForSequenceClassificationLlamaForTokenClassification
LlamaModelLlamaPreTrainedModelapply_rotary_pos_emb	repeat_kv)
MistralMLP   )DiffLlamaConfigzkajuma/DiffLlama-0.3B-handcutr   c                       e Zd ZdS )DiffLlamaMLPN__name__
__module____qualname__     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/diffllama/modular_diffllama.pyr   r   2           Dr!   r   c                 <    ddt          j        d| z            z  z
  S )Ng?g333333?g333333ӿ)mathexp)	layer_idxs    r"   lambda_init_fnr(   6   s!    txy 011111r!   c                   f    e Zd ZdZddedee         f fdZ eddd	          	 	 	 	 	 dde	j
        dee	j
        e	j
        f         dee	j
                 dee	j                 dee         dedee	j                 dee	j
        ee	j
                 eee	j
                          f         fd            Z xZS )DiffLlamaAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNconfigr'   c                    t                                                       || _        || _        |(t                              d| j        j         d           |j        | _        |j	        | _	        |j
        | _        t          |d| j	        | j        z            | _        |j        | _        | j        | j        z  | _        |j        | _        |j        | _        d| _        t'          j        | j	        | j        | j        z  |j                  | _        t'          j        | j	        | j        | j        z  |j                  | _        t'          j        | j	        | j        | j        z  |j                  | _        t'          j        | j        | j        z  | j	        |j                  | _        t5          |          | _        t'          j        t;          j        d|j        | j        f                    | _         t'          j        t;          j        d|j        | j        f                    | _!        t'          j        t;          j        d|j        | j        f                    | _"        t'          j        t;          j        d|j        | j        f                    | _#        t'          j$        d| j        z  |j%        d	
          | _&        d S )NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.head_dimT)biasr   )sizer   F)epselementwise_affine)'super__init__r+   r'   loggerwarning_once	__class__r   attention_dropouthidden_sizenum_attention_heads	num_headsgetattrr-   num_key_value_headsnum_key_value_groupsmax_position_embeddings
rope_theta	is_causalr   Linearattention_biasq_projk_projv_projo_projr(   lambda_init	Parametertorchnormallambda_std_dev	lambda_q1	lambda_k1	lambda_q2	lambda_k2RMSNormrms_norm_eps	groupnormselfr+   r'   r6   s      r"   r3   zDiffLlamaAttention.__init__=   sa   ",!8 , , ,   "(!9!-3
D4D4VWW#)#= $(Nd6N$N!'-'E$ +i 0$.4=2PW]Wlmmmi 0$2JT]2Zagavwwwi 0$2JT]2Zagavwwwi >@PW]Wlmmm))44el1f6KSWS`Rb&c&c&cddel1f6KSWS`Rb&c&c&cddel1f6KSWS`Rb&c&c&cddel1f6KSWS`Rb&c&c&cddA$56;Nchiiir!   past_key_valuepast_key_values4.58new_nameversionFhidden_statesposition_embeddingsattention_maskposition_ids	use_cachecache_positionreturnc                    |                                 \  }	}
}|
}|                     |          }|                     |          }|                     |          }|                    |	|| j        | j                                      dd          }|                    |	|| j        | j                                      dd          }|                    |	|| j        | j                                      dd          }|\  }}t          ||||          \  }}|&|||d}|
                    ||| j        |          \  }}t          || j                  }t          || j                  }t          j        t          j        |dd          d          }|                    dddd          }t          j        ||                    dd                    t'          j        | j                  z  }|$|d d d d d d d |j        d         f         }||z   }t,          j                            |dt          j                                      |j                  }t,          j                            || j        | j        	          }t          j        t          j         | j!        | j"        z  dt          j                                                |j                  }t          j        t          j         | j#        | j$        z  dt          j                                                |j                  }||z
  | j%        z   }t          j        ||          }t          j        |dd          \  }}|||z  z
  }d| j%        z
  | &                    |          z  }|                    dd          '                                }|(                    |	|d          }| )                    |          }||fS )
Nr   r   sincosr`   dimr   rg   dtype)ptraining)*r/   rC   rD   rE   viewr:   r-   	transposer<   r   updater'   r   r=   rI   catchunkrepeatmatmulr%   sqrtshaper   
functionalsoftmaxfloat32tork   dropoutr7   rm   r&   sumrL   rM   rN   rO   rG   rR   
contiguousreshaperF   )rT   r[   r\   r]   r^   rV   r_   r`   kwargsbsz
target_len_q_lenquery_states
key_statesvalue_statesre   rd   cache_kwargsattn_weightscausal_masklambda_1lambda_2lambda_fullattn_outputattn_output1attn_output2s                              r"   forwardzDiffLlamaAttention.forward_   s    +//11Z{{=11[[//
{{=11#((eT^T]SS]]^_abcc__S%1I4=YYccdeghii
#((eT5Mt}]]gghiklmm&S#7jRUWZ#[#[ j&#&snUUL'6'='=j,X\Xfht'u'u$Jz4+DEE
 t/HIIy\1!!D!D!D"MMM#**1aA66|L*2F2Fq!2L2LMMPTPYZ^ZgPhPhh%(AAAqqq2HJ4DR4H2H)HIK'+5L },,\r,WWZZ[g[mnn},,\T=S^b^k,ll9UYt~'FBV[Vcdddeehh
 
 9UYt~'FBV[Vcdddeehh
 
 )D,<<l<>>%*[aQ%G%G%G"l"[<%??4++t~~k/J/JJ!++Aq11<<>>!))#ub99kk+..L((r!   NNNNFN)r   r   r   __doc__r   r   intr3   r   rI   Tensortuple
LongTensorr   boolr   __classcell__r6   s   @r"   r*   r*   :   sI       GG j  j  j8C=  j  j  j  j  j  jD _%0A6RRR
 2637+/59<) <)|<) #5<#=><) !.	<)
 u/0<) "%<) <) !!12<) 
u|Xel3XeEL>Q5RR	S<) <) <) SR<) <) <) <) <)r!   r*   c                       e Zd ZdZ fdZ eddd          	 	 	 	 	 dd	ej        d
eej        ej        f         de	ej
                 de	ej
                 de	e         dede	ej
                 deej        df         fd            Z xZS )DiffLlamaFlashAttention2aN  
    DiffLlama flash attention module. This module inherits from `DiffLlamaAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    c                 `     t                      j        |i | t                      | _        d S r   )r2   r3   r	   _flash_attn_uses_top_left_mask)rT   argsr   r6   s      r"   r3   z!DiffLlamaFlashAttention2.__init__   s6    $)&)))
 /P.Q.Q+++r!   rU   rV   rW   rX   NFr[   r\   r]   r^   r_   r`   ra   c                 
   t          |t                    rt          d          |                                \  }}	}
|                     |          }|                     |          }|                     |          }|                    ||	| j        | j	                  
                    dd          }|                    ||	| j        | j	                  
                    dd          }|                    ||	| j        | j	                  
                    dd          }|4t                              d           |                     ||          \  }}n|\  }}t          ||||          \  }}|&|||d}|                    ||| j        |          \  }}|
                    dd          }|
                    dd          }|
                    dd          }| j        r| j        nd}|j        }|j        j        dk    r|j        j        nd}|t.          j        k    rt/          j                    r=t5          t.          d	          rt/          j        |          nt/          j                    }n3t5          | j        d
          r| j        j        }n| j        j        j        }t                              d| d           |                     |          }|                     |          }|                     |          }t/          j!        |dd          \  }}|"                    dddd          }|"                    dddd          }tG          |||||	||tI          | dd           | j%        | j&        
  
        }tG          |||||	||tI          | dd           | j%        | j&        
  
        }t/          j'        ||gd          }t/          j!        |dd          \  }}t/          j(        t/          j)        | j*        | j+        z  dt.          j                                                 |j                  }t/          j(        t/          j)        | j,        | j-        z  dt.          j                                                 |j                  }||z
  | j.        z   }|||z  z
  }d| j.        z
  | /                    |          z  }|0                    ||	d          1                                }| 2                    |          }|d fS )Nz`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformersr   r   aY  The attention layers in this model are transitioning from computing the RoPE embeddings internally through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed `position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be removed and `position_embeddings` will be mandatory.rc           mpscpuget_autocast_dtype_pre_quantization_dtypezThe input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in .rf   sliding_window)r^   r{   r   use_top_left_maskr@   rh   rj   )3
isinstancer   
ValueErrorr/   rC   rD   rE   rn   r:   r-   ro   r<   r4   r5   
rotary_embr   rp   r'   rm   r7   rk   devicetyperI   ry   is_autocast_enabledhasattrr   get_autocast_gpu_dtyper+   r   weightrz   rr   rs   r   r;   r   r@   rq   r&   r|   rL   rM   rN   rO   rG   rR   r~   r}   rF   )rT   r[   r\   r]   r^   rV   r_   r`   r   r   r   r   r   r   re   rd   r   dropout_rateinput_dtypedevice_typetarget_dtypevalue_states1value_states2r   r   r   r   r   r   s                                r"   r   z DiffLlamaFlashAttention2.forward   s"    o{33 	}  
 &**,,UA{{=11[[//
{{=11
 $((eT^T]SS]]^_abcc__S%1I4=YYccdeghii
#((eT5Mt}]]gghiklmm&G   |\BBHC*HC#7jRUWZ#[#[ j&#&snUUL'6'='=j,X\Xfht'u'u$J $--a33))!Q//
#--a3315Gt--C #(2>2E2Je2S2Sl)..Y^%-''(** 
8 u&:;;8E,[999577  &?@@ 8#{B#{17$ $ $ $   (??<88L#|44J'??<88L',{<'J'J'J$}%,,Q1a88%,,Q1a88/% "4)94@@"An
 
 
 0% "4)94@@"An
 
 
 i| <"EEE%*[aQ%G%G%G"l9UYt~'FBV[Vcdddeehh
 
 9UYt~'FBV[Vcdddeehh
 
 )D,<<"[<%??4++t~~k/J/JJ!))#ub99DDFFkk+..D  r!   r   )r   r   r   r   r3   r   rI   r   r   r   r   r   r   r   r   r   s   @r"   r   r      s)        R R R R R _%0A6RRR
 6:37+/59B! B!|B! #5<#=>B! !!12	B!
 u/0B! "%B! B! !!12B! 
u|T!	"B! B! B! SRB! B! B! B! B!r!   r   c                   >   e Zd ZdZ eddd          	 	 	 	 	 ddej        d	eej        ej        f         d
eej                 deej	                 dee
         dedeej	                 deej        eej                 eeej                          f         fd            ZdS )DiffLlamaSdpaAttentiona   
    DiffLlama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `DiffLlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.
    rU   rV   rW   rX   NFr[   r\   r]   r^   r_   r`   ra   c                 f   |                                 \  }	}
}|                     |          }|                     |          }|                     |          }|                    |	|
| j        | j                                      dd          }|                    |	|
| j        | j                                      dd          }|                    |	|
| j        | j                                      dd          }|\  }}t          ||||          \  }}|&|||d}|
                    ||| j        |          \  }}t          || j                  }t          || j                  }t          j        t          j        |dd          d          }|                    dddd          }|}||d d d d d d d |j        d         f         }|j        j        dk    r>|<|                                }|                                }|                                }|d u o|
dk    }t          j        j                            ||||| j        r| j        nd|	          }t          j        |dd          \  }}t          j        t          j        | j        | j        z  dt          j        
                                         |j!                  }t          j        t          j        | j"        | j#        z  dt          j        
                                         |j!                  }||z
  | j$        z   }|||z  z
  }d| j$        z
  | %                    |          z  }|                    dd                                          }|                    |	|
d          }| &                    |          }|d fS )Nr   r   rc   rf   rh   ri   cudar   )	attn_mask	dropout_pr@   rj   )'r/   rC   rD   rE   rn   r:   r-   ro   r<   r   rp   r'   r   r=   rI   rq   rr   rs   rv   r   r   r}   r   rw   scaled_dot_product_attentionrm   r7   r&   r|   rL   rM   ry   rz   rk   rN   rO   rG   rR   rF   )rT   r[   r\   r]   r^   rV   r_   r`   r   r   r   r   r   r   r   re   rd   r   r   r@   r   r   r   r   r   r   s                             r"   r   zDiffLlamaSdpaAttention.forward<  s    &**,,UA{{=11[[//
{{=11#((eT^T]SS]]^_abcc__S%1I4=YYccdeghii
#((eT5Mt}]]gghiklmm&S#7jRUWZ#[#[ j&#&snUUL'6'='=j,X\Xfht'u'u$Jz4+DEE
 t/HIIy\1!!D!D!D"MMM#**1aA66$%%aaaAAA/E1A"1E/E&EFK #v--+2I'2244L#..00J'2244L  4'5EAI	h)FF!04Fd,,3 G 
 
 &+[aQ%G%G%G"l9UYt~'FBV[Vcdddeehh
 
 9UYt~'FBV[Vcdddeehh
 
 )D,<<"[<%??4++t~~k/J/JJ!++Aq11<<>>!&&sE266kk+..D  r!   r   )r   r   r   r   r   rI   r   r   r   r   r   r   r   r    r!   r"   r   r   4  s         _%0A6RRR
 2637+/59I! I!|I! #5<#=>I! !.	I!
 u/0I! "%I! I! !!12I! 
u|Xel3XeEL>Q5RR	SI! I! I! SRI! I! I!r!   r   )eagerflash_attention_2sdpac                   (     e Zd Zdedef fdZ xZS )DiffLlamaDecoderLayerr+   r'   c                     t                                          ||           t          |j                 ||          | _        d S )N)r+   r'   )r2   r3   DIFFLLAMA_ATTENTION_CLASSES_attn_implementation	self_attnrS   s      r"   r3   zDiffLlamaDecoderLayer.__init__  s?    +++4V5PQY_ktuuur!   )r   r   r   r   r   r3   r   r   s   @r"   r   r     sW        v v3 v v v v v v v v v vr!   r   c                       e Zd ZdZdZd ZdS )DiffLlamaPreTrainedModelFc                    t          j        | |           t          |t                    r|j        j                            d| j        j                   |j	        j                            d| j        j                   |j
        j                            d| j        j                   |j        j                            d| j        j                   d S d S )Nr   )r
   _init_weightsr   r*   rL   datanormal_r+   rK   rM   rN   rO   )rT   modules     r"   r   z&DiffLlamaPreTrainedModel._init_weights  s    %dF333f011 	I!))!T[-GHHH!))!T[-GHHH!))!T[-GHHH!))!T[-GHHHHH		I 	Ir!   N)r   r   r   _supports_flex_attn_supports_attention_backendr   r    r!   r"   r   r     s4        "'I I I I Ir!   r   c                       e Zd ZdS )DiffLlamaModelNr   r    r!   r"   r   r     r#   r!   r   c                       e Zd ZdS )DiffLlamaForCausalLMNr   r    r!   r"   r   r     r#   r!   r   c                       e Zd ZdS )"DiffLlamaForSequenceClassificationNr   r    r!   r"   r   r     r#   r!   r   c                       e Zd ZdS )DiffLlamaForQuestionAnsweringNr   r    r!   r"   r   r     r#   r!   r   c                       e Zd ZdS )DiffLlamaForTokenClassificationNr   r    r!   r"   r   r     r#   r!   r   )r   r   r   r   r   r   )4r%   typingr   rI   r   cache_utilsr   r   modeling_flash_attention_utilsr   r	   modeling_utilsr
   utilsr   utils.deprecationr   gemma.modeling_gemmar   llama.modeling_llamar   r   r   r   r   r   r   r   mistral.modeling_mistralr   configuration_diffllamar   
get_loggerr   r4   _CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCr   r(   Moduler*   r   r   r   r   r   r   r   r   r   r   __all__r    r!   r"   <module>r      s  $               - - - - - - - - i i i i i i i i - - - - - -       0 0 0 0 0 0 3 3 3 3 3 3	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 2 1 1 1 1 1 4 4 4 4 4 4 
	H	%	%5 #	 	 	 	 	: 	 	 	2 2 2b) b) b) b) b) b) b) b)JR! R! R! R! R!1 R! R! R!jR! R! R! R! R!/ R! R! R!l  1"  v v v v v- v v v
I 
I 
I 
I 
I3 
I 
I 
I	 	 	 	 	Z 	 	 		 	 	 	 	+ 	 	 		 	 	 	 	)G 	 	 		 	 	 	 	$= 	 	 		 	 	 	 	&A 	 	 	  r!   