
     `i                     "   d dl Z d dlmZmZmZ d dlZd dlmc mZ	 d dlmZ ddl
mZ ddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZmZ ddlmZm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1  e*            rd dl2m3Z3  ed           G d dej4                              Z5 G d dej4                  Z6d Z7dGdZ8dej9        de:dej9        fd Z;	 dHd"ej4        d#ej9        d$ej9        d%ej9        d&eej9                 d'e<d(e<d)e%e'         fd*Z=	 	 	 dId"ej4        d#ej9        d$ej9        d%ej9        d&eej9        d+f         d'ee<         d,ee<         d-eej9                 de>ej9        ej9        f         fd.Z? e"            Z@e?e@d/<    G d0 d1ej4                  ZA G d2 d3ej4                  ZB G d4 d5ej4                  ZC G d6 d7e          ZDe( G d8 d9e#                      ZEe( G d: d;eE                      ZF	 	 	 	 dJd=eej9        e>ej9                 df         d>ee:         d?ee:         d@e:d&eej9                 deej9        e:f         fdAZGe( G dB dCeEe                      ZH G dD dEeeE          ZIg dFZJdS )K    N)CallableOptionalUnion)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)compile_friendly_flex_attention)create_causal_mask!create_sliding_window_causal_mask) GenericForSequenceClassificationGradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)AttentionInterfacePreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torch_flex_attn_available)deprecate_kwarg)OutputRecordercheck_model_inputs   )
DogeConfig)	BlockMaskRMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )DogeRMSNormư>c                     t                                                       t          j        t	          j        |                    | _        || _        dS )z:
        DogeRMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/doge/modeling_doge.pyr)   zDogeRMSNorm.__init__6   sD     	l5:k#:#:;; #    c                    |j         }|                    t          j                  }|                    d                              dd          }|t          j        || j        z             z  }| j        |                    |          z  S )N   T)keepdim)	dtypetor+   float32powmeanrsqrtr.   r-   )r/   hidden_statesinput_dtypevariances       r3   forwardzDogeRMSNorm.forward>   s|    #)%((77 $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::r4   c                 H    t          | j        j                   d| j         S )Nz, eps=)tupler-   shaper.   )r/   s    r3   
extra_reprzDogeRMSNorm.extra_reprE   s&    )**II$2GIIIr4   )r&   )__name__
__module____qualname__r)   rB   rF   __classcell__r2   s   @r3   r%   r%   4   sb        $ $ $ $ $ $; ; ;J J J J J J Jr4   r%   c                   |     e Zd ZU ej        ed<   ddef fdZ ej                    e	d                         Z
 xZS )DogeRotaryEmbeddinginv_freqNconfigc                    t                                                       t          |d          rSt          |j        t
                    r9|j                            d|j                            d                    | _        nd| _        |j        | _	        |j        | _
        || _        t          | j                 | _        |                     | j        |          \  }| _        |                     d|d           | j        | _        d S )Nrope_scaling	rope_typetypedefaultrN   F)
persistent)r(   r)   hasattr
isinstancerQ   dictgetrR   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrO   r   rope_init_fnattention_scalingregister_bufferrN   original_inv_freq)r/   rO   devicerN   r2   s       r3   r)   zDogeRotaryEmbedding.__init__L   s    6>** 	'z&:Mt/T/T 	'#044[&BUBYBYZ`BaBabbDNN&DN"("@$*$B!/?+/+<+<T[&+Q+Q($(ZeDDD!%r4   c                 X   | j         d d d d f                                                             |j        d         dd                              |j                  }|d d d d d f                                         }t          |j        j        t                    r|j        j        dk    r|j        j        nd}t          j
        |d          5  |                                |                                z                      dd          }t          j        ||fd	          }|                                | j        z  }|                                | j        z  }	d d d            n# 1 swxY w Y   |                    |j        
          |	                    |j        
          fS )Nr   r7   r    mpscpuF)device_typeenabledr6   dim)r9   )rN   floatexpandrE   r:   ra   rW   rS   strr+   autocast	transposecatcosr^   sinr9   )
r/   xposition_idsinv_freq_expandedposition_ids_expandedre   freqsembro   rp   s
             r3   rB   zDogeRotaryEmbedding.forward]   s    !M$4-8>>@@GGHZ[\H]_acdeehhijiqrr ,QQQaaaZ 8 > > @ @'1!(-'E'Ek!(-[`J`J`ahmmfk^UCCC 	5 	5&,,..1F1L1L1N1NNYYZ[]^__E)UEN333C''))d44C''))d44C		5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 vvAGv$$cff17f&;&;;;s   BE++E/2E/N)rG   rH   rI   r+   Tensor__annotations__r!   r)   no_gradr   rB   rJ   rK   s   @r3   rM   rM   I   s         l/ /z / / / / / /" U]__< <  _< < < < <r4   rM   c                     | dd| j         d         dz  f         }| d| j         d         dz  df         }t          j        | |fd          S )z*Rotates half the hidden dims of the input..Nr7   r6   rg   )rE   r+   rn   )rq   x1x2s      r3   rotate_halfr~   m   s]    	
3"!'"+"""	#B	
3q """	#B9rc2YB''''r4   c                     |                     |          }|                     |          }| |z  t          |           |z  z   }||z  t          |          |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer~   )qkro   rp   rr   unsqueeze_dimq_embedk_embeds           r3   apply_rotary_pos_embr   t   sc    ( --
&
&C
--
&
&C3w;q>>C/0G3w;q>>C/0GGr4   r?   n_repreturnc                     | j         \  }}}}|dk    r| S | dddddddddf                             |||||          } |                     |||z  ||          S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r    N)rE   rj   reshape)r?   r   batchnum_key_value_headsslenhead_dims         r3   	repeat_kvr      s    
 2?1D.Ehzz!!!!QQQaaa"23::5BUW\^bdlmmM  (;e(CT8TTTr4           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 R   t          || j                  }t          || j                  }	t          j        ||                    dd                    |z  }
|$|d d d d d d d |j        d         f         }|
|z   }
t          j                            |
dt          j	                  
                    |j                  }
t          j                            |
|| j                  }
t          j        |
|	          }|                    dd                                          }||
fS )Nr6   r   r7   )rh   r9   ptrainingr    )r   num_key_value_groupsr+   matmulrm   rE   r   
functionalsoftmaxr;   r:   r9   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r3   eager_attention_forwardr      s    3 ;<<JUF$?@@L<z';';Aq'A'ABBWLL!$QQQ111.D
0@0D.D%DE#k1=((2U](SSVVW\WbccL=((6?([[L,|\::K''1--88::K$$r4   r"   softcap	head_maskc           
      T   d }	d t          |t                    r|}	n|d d d d d d d |j        d         f         fd}
t          ||||
|	d|d          \  }}|                    |j                  }|                    dd                                          }||fS )Nr   c                     t          j        | z            z  } | |         |         |         |         z   } | |         |         d         d         z   } | S )Nr   )r+   tanh)score	batch_idxhead_idxq_idxkv_idxr   r   r   s        r3   	score_modz)flex_attention_forward.<locals>.score_mod   sn    ej999E"K	28<UCFKKE Ii0:1=a@@Er4   T)r   
block_mask
enable_gqascale
return_lser    r6   )rW   r"   rE   r   r:   r9   rm   r   )r   r   r   r   r   r   r   r   r   r   r   r   attention_weightsr   s         ``     @r3   flex_attention_forwardr      s     JK.),, %#

$!!!!QQQ?SYr]?":;       &E & & &"K" *,,U[99''1--88::K)))r4   doge_flex_attentionc                       e Zd Zddedee         f fdZ eddd          	 	 	 dd	ej	        d
e
ej	        ej	        f         deej	                 dee         deej                 de
ej	        eej	                 ee
ej	                          f         fd            Z	 	 dd	ej	        dej	        dedeej	                 fdZ xZS )DogeAttentionNrO   	layer_idxc                    t                                                       || _        || _        t	          |d|j        |j        z            | _        |j        |j        z  | _	        | j        dz  | _
        |j        | _        |j        | _        t          j        |j        |j        | j        z  |j                  | _        t          j        |j        |j        | j        z  |j                  | _        t          j        |j        |j        | j        z  |j                  | _        t          j        t)          j        |j                            | _        t          j        |j        | j        z  |j        |j                  | _        t          j        |j        | j        z  |j        |j                  | _        t3          | j        |j                  | _        t3          | j        |j                  | _        d S )Nr   g      ࿩biasr1   )r(   r)   rO   r   getattrr0   num_attention_headsr   r   r   r   attention_dropoutkeep_window_sizer   Linearattention_biasq_projk_projv_projr*   r+   zerosAdt_projo_projr%   rms_norm_epsq_normk_normr/   rO   r   r2   s      r3   r)   zDogeAttention.__init__   s   "
F4F&Jd4dee$*$>&B\$\!}d*!'!9 & 7i :T] JQWQf
 
 
 i :T] JQWQf
 
 
 i :T] JQWQf
 
 
 ek&*DEEFFy&68RY_Yn
 
 
 i&68JQWQf
 
 
 "$-V5HIII!$-V5HIIIr4   past_key_valuepast_key_values4.58new_nameversionr?   position_embeddingsr   cache_positionr   c                 "   |j         d d         }g |d| j        R }|                     |                     |                              |                                        dd          }	|                     |                     |                              |                                        dd          }
|                     |                              |                              dd          }|\  }}t          |	|
||          \  }	}
|&|||d}|
                    |
|| j        |          \  }
}|                     |                    dd                              |j         d         |j         d         d                    }t          j        | j        t#          j        |          z                                dd          }|                     ||| j        |          }t+          || j                  }t.          }| j        j        dk    rt4          | j        j                 } || |	|
|f|| j        sd	n| j        | j        d
|\  }} |j        g |dR                                  }|                     |          }||fS )Nr7   r    r6   )rp   ro   r   r   r   )r?   	dt_statesr   r   eagerr   )r   r   r   ) rE   r   r   r   viewrm   r   r   r   r   updater   r   r   r+   expr   Fsoftplusprepare_dynamic_maskr   r   r   r   rO   _attn_implementationALL_ATTENTION_FUNCTIONSr   r   r   r   r   )r/   r?   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   ro   rp   cache_kwargsr   	attn_maskattention_interfacer   r   s                       r3   rB   zDogeAttention.forward	  s    $)#2#.88b8$-88{{4;;}#=#=#B#B<#P#PQQ[[\]_`aa[[]!;!;!@!@!N!NOOYYZ[]^__
{{=1166|DDNNqRSTT&S#7jRUWZ#[#[ j&#&snUUL'6'='=j,X\Xfht'u'u$J LL""1a((001CA1FHZ[]H^`bcc
 
	 Idfqz)'<'<<==GGBOO	--'!2)	 . 
 
	 i)BCC	(?;+w66"9$+:Z"[$7$7		%

 %#}HCC$2HL	%
 	%
 	%
 	%
!\ *k);;;;;;FFHHkk+..L((r4      r   r   c           	         t          j        |j                  j        }|j        }|dddddddf                             dd|j        d         d          }|t          |t                    s|j        t           j        k    r7|j        }t          j	        |t          j
        d|j        |          |          }|                    |ddddddd|j        d         f         dk    |          }|j        d         |k    rkt          j        |||j                  }t          j        ||ddd	
          j        }	|                    d|	d          }|                    |dk    |          }|S )a8  
        The core idea of DMA is to calculate the dynamic attention mask to mask the tokens that should be masked, so as to form sparse attention.

        Combine `dt_states` with `attention_mask` to generate the final `attn_mask`.

        Args:
            hidden_states (`torch.Tensor`): The input hidden_states, used to determine the minimum value of the current input precision.
            dt_states (`torch.Tensor`): dt_states of shape `(batch_size, num_heads, key_sequence_length)`.
            keep_window_size (`int`): The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
            attention_mask (`torch.Tensor`, *optional*): attention mask of shape `(batch_size, 1, query_sequence_length, key_sequence_length)`.
        Nr7   r    r   )ra   r9   r   r9   ra   TF)rh   largestsorted      ?)r+   finfor9   minrj   rE   rW   r"   boolwheretensorra   masked_fill
zeros_liketopkindicesscatter)
r/   r?   r   r   r   	min_dtyper9   r   active_masktopk_indicess
             r3   r   z"DogeAttention.prepare_dynamic_maskB  s   $ K 3448	#aaaD!!!m,33M'*B
 
	 %j.S.S%#uz11%+!&"EL^=RZ_$`$`$`bk" " "--nQQQ111F[	XZH[F[=[.\`a.aclmmI?2!111*9E)JZ[[[K :i1ArSW`efffnL%--b,DDK!--kS.@)LLIr4   rw   NNN)r   N)rG   rH   rI   r!   r   intr)   r   r+   rx   rD   r	   
LongTensorrB   r   rJ   rK   s   @r3   r   r      ss       J Jz Jhsm J J J J J J< _%0A6RRR
 26+/596) 6)|6) #5<#=>6) !.	6)
 "%6) !!126) 
u|Xel3XeEL>Q5RR	S6) 6) 6) SR6)x !%15# #|# <# 	#
 !.# # # # # # # #r4   r   c                   $     e Zd Z fdZd Z xZS )DogeMLPc                    t                                                       || _        |j        | _        |j        | _        t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _	        t          j        | j        | j        |j                  | _
        t          |j                 | _        d S )Nr   )r(   r)   rO   r0   intermediate_sizer   r   mlp_bias	gate_projup_proj	down_projr   
hidden_actact_fnr/   rO   r2   s     r3   r)   zDogeMLP.__init__i  s    !-!'!94#3T5KRXRabbby!143IPVP_```4#94;KRXRabbbV./r4   c                     |                      |                     |                     |                    |                     |          z            }|S rw   )r	  r  r  r  )r/   rq   r	  s      r3   rB   zDogeMLP.forwards  sA    NN4;;t~~a/@/@#A#ADLLQROO#STT	r4   )rG   rH   rI   r)   rB   rJ   rK   s   @r3   r  r  h  sG        0 0 0 0 0      r4   r  c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )	DogeCDMoErO   c                 2   t                                                       |j        | _        |j        | _        t          |j                 | _        |j        | _        t          j	        t          j
        | j                            | _        |j        | _        |j        | _        t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _        t          j        | j        | j        dz  d          | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        d S )Nr   r6   F)r(   r)   r0   r  r   r
  r  num_expertsmathfloorsqrtnum_keysnum_experts_per_toktop_knorm_topk_probr   r   r  r  r  r	  router_gate	Embedding
down_embedup_embedr  s     r3   r)   zDogeCDMoE.__init__y  sA   !-!'!9V./!-
49T-=#>#>??/
$3 4#3T5KRXRabbby!143IPVP_```4#94;KRXRabbb 9T%5t}q7HuUUU ,t'79IJJT%5t7GHHr4   r?   r   c                    |j         \  }}}|                     |                              d||z  d          }|                    | j        d          \  \  }}\  }	}
|                    d          |                    d          z   }|	                    d          | j        z  |
                    d          z   } |j        g |j         d d         dR  } |j        g |j         d d         dR  }|                    | j        d          \  }}|                    d|          }t          j	        |d          }| j
        r||                    dd          z  }|                     |          }|                     |          }t          j        ||                    ||z  dd                                        ||z  d          }|                     |          |z  }t          j        |                    ||z  dd          |                              ||d          }|                     |                     |                     |                    |                     |          z            }||z   }||fS )Nr6   r7   rg   r   T)rh   r8   r    )rE   r  r   r   r  r   r  gatherr   r   r  sumr  r  r+   r   r  r	  r  r  )r/   r?   r   bszseq_len_router_logitsscores_xscores_y	indices_x	indices_y
all_scoresall_indicesscoresposition_indicesr   routing_weightsr  r  experts_weightsexperts_statess                        r3   rB   zDogeCDMoE.forward  s   
 (-Wa ((77<<QgrRR 8E7I7I$-]_7I7`7`484y)''++h.@.@.D.DD
))"--=	@S@STV@W@WW$Z_@j&6ss&;@R@@@
&k&C(9#2#(>CCCC#-??4:2?#F#F  $$R)9::)F333 	I22r42HHHO __W--
==)),z=3E3EcGmUWYZ3[3[\\aabehoboqstt++o66Ho&:&:3=!R&P&PRZ[[``adfmoqrrt{{4>>-3P3P'Q'QTXT`T`anToTo'opp%6m++r4   )	rG   rH   rI   r!   r)   r+   rx   rB   rJ   rK   s   @r3   r  r  x  su        Iz I I I I I I.,|, 
	, , , , , , , ,r4   r  c                   t    e Zd Zddedee         f fdZ eddd          	 	 	 	 	 dd
ej	        de
ej	        ej	        f         deej	                 deej                 dee         dee         deej                 dee         de
ej        ee
ej        ej        f                  f         fd            Z xZS )DogeDecoderLayerNrO   r   c                     t                                                       |j        | _        t          |j        |j                  | _        t          ||          | _        t          j
        t          j        |j                            | _        t          |j        |j                  | _        |j        st!          |          nt#          |          | _        t          j
        t          j        |j                            | _        d S )Nr   )rO   r   )r(   r)   hidden_dropoutr%   r0   r   input_layernormr   	self_attnr   r*   r+   r,   input_residualpost_attention_layernormis_moer  r  mlppost_attention_residualr   s      r3   r)   zDogeDecoderLayer.__init__  s    $3*6+=6CVWWW&f	JJJ l5:f6H+I+IJJ(3F4FFL_(`(`(`%*0-N76???Yv=N=N')|EJv?Q4R4R'S'S$$$r4   r   r   r   r   Fr?   r   r   rr   	use_cacher   r   r   c                 t   |}	|                      |          } | j        d|||||||d|\  }}
t          j        || j        | j                  }| j        |	z  |z   }|}	|                     |          }|                     |          }t          j        || j        | j                  }| j	        |	z  |z   }|S )N)r?   r   r   rr   r   r:  r   r    )
r3  r4  r   r   r2  r   r5  r6  r8  r9  )r/   r?   r   r   rr   r   r:  r   r   residualself_attn_weightss              r3   rB   zDogeDecoderLayer.forward  s     !,,];;+94> 	,
' 3)%+)	,
 	,
 	,
 	,
(( 	-43FQUQ^___+h6F !55mDD//	-43FQUQ^___4x?-Or4   rw   )NNNFN)rG   rH   rI   r!   r   r   r)   r   r+   rx   rD   r  r	   r   r   r   FloatTensorrB   rJ   rK   s   @r3   r0  r0    sW       
T 
Tz 
Thsm 
T 
T 
T 
T 
T 
T _%0A6RRR
 2637+/$)59" "|" #5<#=>" !.	"
 u/0" "%" D>" !!12" +," 
u (51BEDU1U+V"WW	X" " " SR" " " " "r4   r0  c                   t     e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZdZ eed          eed	Z fd
Z xZS )DogePreTrainedModelrO   modelTr0  r   Fr    )index)r#  r?   
attentionsc                    t                                          |           t          |t                    r2t	          |d          r |j        j                                         dS dS t          |t                    r`t	          |d          r|j	        j        
                    d           t	          |d          r#|j        j        
                    d           dS dS dS )zInitialize the weightsr   r5  r   r9  N)r(   _init_weightsrW   r   rV   r   datazero_r0  r5  fill_r9  )r/   r   r2   s     r3   rF  z!DogePreTrainedModel._init_weights  s    f%%%fm,, 	?vs## &##%%%%%& & 011 	?v/00 6%*00555v899 ?.399#>>>>>		? 	?? ?r4   )rG   rH   rI   r!   ry   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   r  r0  r   _can_record_outputsrF  rJ   rK   s   @r3   rA  rA    s         &*#+,#4"5 N""&'	;;;)# 
? 
? 
? 
? 
? 
? 
? 
? 
?r4   rA  c                       e Zd Zdef fdZee	 	 	 	 	 	 	 ddeej	                 deej
                 deej	                 dee         deej                 d	ee         d
eej	                 dee         defd                        Z xZS )	DogeModelrO   c                    t                                                     j        | _        j        | _        t          j        j        j        | j                  | _        t          j	        fdt          j                  D                       | _        t          j        j                  | _        t!                    | _        d| _        |                                  d S )Nc                 0    g | ]}t          |          S r<  )r0  ).0r   rO   s     r3   
<listcomp>z&DogeModel.__init__.<locals>.<listcomp>  s$    bbbYfi00bbbr4   r   rO   F)r(   r)   pad_token_idpadding_idx
vocab_sizer   r  r0   embed_tokens
ModuleListrangenum_hidden_layerslayersr%   r   normrM   
rotary_embgradient_checkpointing	post_initr  s    `r3   r)   zDogeModel.__init__  s       !. +L):F<NPTP`aambbbb%H`BaBabbb
 
   28KLLL	-V<<<&+# 	r4   N	input_idsr   rr   r   inputs_embedsr:  r   r   r   c                 |   |d u |d uz  rt          d          |r|t          | j                  }||                     |          }|B||                                nd}	t          j        |	|	|j        d         z   |j                  }||	                    d          }| j        j
        t          nt          }
 |
| j        |||||          }|}|                     ||          }| j        d | j        j                 D ]} ||f||||||d|}|                     |          }t#          ||          S )	Nz:You must specify exactly one of input_ids or inputs_embedsrZ  r   r    )ra   )rO   input_embedsr   r   r   rr   )r   r   rr   r   r:  r   )last_hidden_stater   )
ValueErrorr
   rO   r^  get_seq_lengthr+   arangerE   ra   r   sliding_windowr   r   rd  rb  ra  rc  r   )r/   rg  r   rr   r   rh  r:  r   r   past_seen_tokensmask_functionr   r?   r   decoder_layers                  r3   rB   zDogeModel.forward  s    -t";< 	[YZZZ 	?0*$+>>>O  --i88M!CRC^==???de"\ "2]5H5K"KTaTh  N )33A66L.2k.H.P**Vw#m;&))+%
 
 
 & #oom\JJ![)H4;+H)HI 
	 
	M)M	$7*) /#-	 	 	 	MM 		-00%++
 
 
 	
r4   )NNNNNNN)rG   rH   rI   r!   r)   r   r   r   r+   r  rx   r	   r?  r   r   r   r   rB   rJ   rK   s   @r3   rU  rU    s       z         151537+/59$(59<
 <
E,-<
 !.<
 u/0	<

 "%<
   12<
 D><
 !!12<
 +,<
 
 <
 <
 <
 ^ <
 <
 <
 <
 <
r4   rU  r6   gate_logitsr  r  r  c                 T   | t          | t                    sdS | d         j        }| d         j        }g }g }| D ]9}	|	                    |          }	|	                    |d          \  \  }
}\  }}|
                    d          |                    d          z   }|                    d          |z  |                    d          z   } |j        g |j        dd         dR  } |j        g |j        dd         dR  }|                    |d          \  }}|	                    d|          }t          j        |d          }|                    |           |                    |           ;t          j        |d          }t          j        |d          }||                    d          }t          j        |||          }t          j        |||          }|                    d||          |j        d         z  }t          j        |d          }nk|j        \  }}t'          |           }|ddddddf                             ||||f                              d                              |          }|                    d          |                                         }t          j        |||          }t          j        |||          }|                    d||          t          j        |          z  }|ddddddf                             ||||f                              d|                              |          }t          j        ||z  d          t          j        |d          z  }t          j        ||z            }||z  S )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `router_gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [2, batch_size * sequence_length, num_keys].
        num_experts:
            Number of experts
        num_keys:
            Number of keys
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   r7   rg   r   r   )rW   rD   r9   ra   r:   r   r   r   rE   r  r   r   appendr+   rn   r   	ones_likescatter_add_r=   lenrj   r   r   r  )rs  r  r  r  r   compute_dtypecompute_deviceall_expert_indicesall_routing_weightslayer_gate_logitsr$  r%  r&  r'  r(  r)  r"  r+  expert_indicesr,  tokens_per_expertpadrouter_prob_per_expert
batch_sizesequence_lengthra  expert_attention_mask router_per_expert_attention_maskoverall_losss                                r3   load_balancing_loss_funcr  V  s   @ *[%"@"@qN(M ^*N( 4 4-00@@7H7M7Mh\^7M7_7_484y)''++h.@.@.D.DD
))"--89;N;Nr;R;RR$Z_@j&6ss&;@R@@@
&k&C(9#2#(>CCCC(ooeo<<$++B0@AA)JB777!!.111""?3333#51===)$7Q???/44R88!K=Q_```o0n]]]-::1>PRUVVYkYqrsYtt "',?Q!G!G!G&4&:#
O,, 4AAAt+,V&
OUKLLWR[[R	 	 044R889N9S9S9U9UV "K=Q_```o0n]]]-::1>PRUVVY^Yb!Z
 Z
 
 4AAAt+,V&
O[QRRWR%%R	 	) "'+>Aa+agh!i!i!ilqlu,!m
 m
 m
 "
 9.1GGHHL+%%r4   c                   x    e Zd ZdgZddiZddgdgfiZ fdZee	 	 	 	 	 	 	 	 	 	 dd	e	e
j                 d
e	e
j                 de	e
j                 de	e         de	e
j                 de	e
j                 de	e         de	e
j                 deee
j        f         de	e         dee         defd                        Z xZS )DogeForCausalLMzlm_head.weightlm_headcolwise_repr?   logitsc                 F   t                                          |           t          |          | _        |j        | _        t          j        |j        |j        d          | _        |j	        | _	        |j
        | _
        |j        | _        |                                  d S )NFr   )r(   r)   rU  rB  r]  r   r   r0   r  router_aux_loss_coefr  r  rf  r  s     r3   r)   zDogeForCausalLM.__init__  s       v&&
 +y!3V5FUSSS$*$?!!-#)#=  	r4   Nr   rg  r   rr   r   rh  labelsr:  r   logits_to_keepoutput_router_logitsr   r   c                 `   |
|
n| j         j        }
 | j        d|||||||d|}|j        }t	          |	t
                    rt          |	 d          n|	}|                     |dd|ddf                   }d}| | j        ||| j	        fi |}d}|
rrt          |j        | j        t          j        t          j        | j                            | j        |          }|%|| j        |                    |j                  z  z  }t)          ||||j        |j        |j        |j                  S )ah  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, DogeForCausalLM

        >>> model = DogeForCausalLM.from_pretrained("SmallDoge/Doge-320M")
        >>> tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)rg  r   rr   r   rh  r:  r   )lossaux_lossr  r   r?   rD  r#  r<  )rO   r  rB  rk  rW   r   slicer  loss_functionr]  r  r#  r  r  r  r  r  r  r:   ra   r   r   r?   rD  )r/   rg  r   rr   r   rh  r  r:  r   r  r  r   outputsr?   slice_indicesr  r  r  s                     r3   rB   zDogeForCausalLM.forward  s   N %9$D  $+Jj 	
 +5$* 	+
)%+')	+
 	+
 	+
 	+
  18B>SV8W8Wk~ot444]kmAAA}aaa,?@AA%4%ffdoPPPPD 		M/% 
49T%56677( H !1HKK4L4LLL(#3!/)!/
 
 
 	
r4   )
NNNNNNNNr   N)rG   rH   rI   _tied_weights_keys_tp_plan_pp_planr)   r   r   r   r+   r  rx   r	   r?  r   r   r   r   r   r   rB   rJ   rK   s   @r3   r  r    s       *+=)H_-z:;H
 
 
 
 
  151537+/59-1$(5934/3Q
 Q
E,-Q
 !.Q
 u/0	Q

 "%Q
   12Q
 )*Q
 D>Q
 !!12Q
 c5</0Q
 'tnQ
 +,Q
 
#Q
 Q
 Q
 ^ Q
 Q
 Q
 Q
 Q
r4   r  c                       e Zd ZdS )DogeForSequenceClassificationN)rG   rH   rI   r<  r4   r3   r  r  (  s        Dr4   r  )r  rU  rA  r  )Nr    )r   r   )NNr6   N)Kr  typingr   r   r   r+   torch.nn.functionalr   r   r   activationsr   cache_utilsr	   r
   
generationr   integrationsr   integrations.flex_attentionr   masking_utilsr   r   modeling_layersr   r   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr   r   configuration_doger!   !torch.nn.attention.flex_attentionr"   Moduler%   rM   r~   r   rx   r   r   ri   r   rD   r   r   r   r  r  r0  rA  rU  r  r  r  __all__r<  r4   r3   <module>r     s9  0  , , , , , , , , , ,                 ! ! ! ! ! ! . . . . . . . . ) ) ) ) ) ) 7 7 7 7 7 7 J J J J J J R R R R R R R R [ [ [ [ [ [ [ [ Q Q Q Q Q Q Q Q K K K K K K K K A A A A A A A A & & & & & & g g g g g g g g g g g g 0 0 0 0 0 0 ? ? ? ? ? ? ? ? * * * * * *  !! <;;;;;; Y''J J J J J") J J ('J(!< !< !< !< !<") !< !< !<H( ( (   6	UU\ 	U# 	U%, 	U 	U 	U 	U& % %I%<% 
% <	%
 U\*% % % '(% % % %@  $#(,.* .*I.*<.* 
.* <	.*
 %,34.* e_.* e_.* %.* 5<%&.* .* .* .*b -,.. 1G - .{ { { { {BI { { {|    bi    6, 6, 6, 6, 6,	 6, 6, 6,r0 0 0 0 01 0 0 0f ? ? ? ? ?/ ? ? ?< O
 O
 O
 O
 O
# O
 O
 O
h "&"-1g& g&u|U5<%8$>?g&#g& smg& 	g&
 U\*g& 5<g& g& g& g&T d
 d
 d
 d
 d
)? d
 d
 d
N	 	 	 	 	$DFY 	 	 	 c
b
br4   