
     `i                        d Z ddlZddlmZmZmZ ddlZddlmc m	Z
 ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZ ddlmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0m1Z1  e!            rddl2m3Z3  G d de          Z4 G d de*          Z5 G d de+          Z6	 	 	 d>dej7        dej8        dej8        dej8        d eej8        d!f         d"ee9         d#ee9         d$eej8                 d%e:ej8        ej8        f         fd&Z; e            Z<e;e<d'<    G d( d)ej7                  Z= G d* d+e(          Z> G d, d-ej7                  Z? G d. d/e          Z@ G d0 d1e)          ZA G d2 d3e1          ZB	 	 	 	 d?d4eej8        e:ej8                 df         d5eeC         d6eeC         d7eCd eej8                 d%eej8        eCf         fd8ZD G d9 d:e0          ZE G d; d<e'          ZFg d=ZGdS )@zPyTorch Doge model.    N)CallableOptionalUnion)nn   )ACT2FN)Cache)PretrainedConfig)compile_friendly_flex_attention)GradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)rope_config_validation)AttentionInterfacePreTrainedModel)Unpack)TransformersKwargsis_torch_flex_attn_available)deprecate_kwarg)OutputRecorder   )LlamaForSequenceClassificationLlamaMLPLlamaPreTrainedModelLlamaRMSNormLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward	repeat_kv)MixtralForCausalLMMixtralModel)	BlockMaskc                        e Zd ZdZdZdgZi dddddddd	d
d	dddddddddddddddd	ddddddZdgdgfddgdgfdgdgfdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d/ fd.	Z xZ	S )0
DogeConfiga   
    This is the configuration class to store the configuration of a [`DogeModel`]. It is used to instantiate an Doge
    model according to the specified arguments, defining the model architecture like [SmallDoge/Doge-320M](https://huggingface.co/SmallDoge/Doge-320M).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32768):
            Vocabulary size of the Doge2 model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`DogeModel`]
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for each sequence transformation and state transformation module.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings.
            NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value accordingly.
            Doge family of small models use `{ 'rope_type': 'dynamic', 'factor': 4.0, 'original_max_position_embeddings': 2048 }` as the default value.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings.
                    In most scaling types, a `factor` of x will enable the model to handle sequences of length x * original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'.
                    The original max position embeddings used during pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation.
                    If unspecified, it defaults to value recommended by the implementation, using the `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<`original_max_position_embeddings`).
                    Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<`original_max_position_embeddings`).
                    Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention.
            If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used.
            When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by meanpooling all the original heads within that group.
            For more details checkout [this paper](https://huggingface.co/papers/2305.13245).
            If it is not specified, will default to `num_attention_heads`.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If not specified, will default to `None`.
        keep_window_size (`int`, *optional*, defaults to 2048):
            The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
        is_moe (`bool`, *optional*, defaults to `False`):
            Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize.
        num_experts (`int`, *optional*, defaults to 16384):
            Number of routed experts in the model. This is only used when `is_moe=True`.
        num_experts_per_tok (`int`, *optional*, defaults to 64):
            Number of selected experts to route per-token.
        norm_topk_prob (`bool`, *optional*, defaults to `False`):
            Whether to normalize the topk probabilities.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.

    ```python
    >>> from transformers import DogeConfig, DogeModel

    >>> # Initializing a Doge-320M style configuration
    >>> configuration = DogeConfig()

    >>> # Initializing a model from the Doge-320M style configuration
    >>> model = DogeModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```dogepast_key_valueszlayers.*.self_attn.q_projcolwisezlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.dt_projrowwisezlayers.*.self_attn.o_projzlayers.*.input_layernorm.weightsequence_parallelzlayers.*.input_residual.weightz(layers.*.post_attention_layernorm.weightz'layers.*.post_attention_residual.weightznorm.weightzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_projzlayers.*.mlp.router_gatecolwise_repzlayers.*.mlp.down_embedrowwise_repzlayers.*.mlp.up_embed	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnorm                     silu{Gz?ư>TF     @N    @  @   MbP?c                    || _         || _        || _        || _        || _        || _        || _        || _        |	| _        || _	        || _
        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        | j        d| j        v r| j        d         | j        d<   t3          |            ||| _         t5                      j        dd|
i| d S )Ntype	rope_typetie_word_embeddings )
vocab_sizehidden_sizeintermediate_sizenum_hidden_layershidden_dropout
hidden_actinitializer_rangerms_norm_eps	use_cachemax_position_embeddings
rope_thetarope_scalingnum_attention_headsnum_key_value_headsattention_biasattention_dropoutmlp_biassliding_windowkeep_window_sizeis_moenum_expertsnum_experts_per_toknorm_topk_proboutput_router_logitsrouter_aux_loss_coefr   super__init__)selfrE   rF   rG   rH   rI   rJ   rK   rL   rM   rC   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   kwargs	__class__s                               y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/doge/modular_doge.pyr_   zDogeConfig.__init__   sL   < %&!2!2,$!2("'>$$(#6 #6 ,!2 , 0&#6 ,$8!$8! (Vt7H-H-H-1->v-FDk*t$$$ &':D$ 	
 	
 3	
	
 	
 	
 	
 	
    )r3   r4   r5   r6   r7   r8   r9   r:   TFr5   r;   Nr<   NFr7   FNr5   Fr=   r>   FFr?   )
__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planr_   __classcell__rb   s   @rc   r$   r$   7   s       n n` J#4"5#Y#Y 	$Y 	%i	
 	$Y 	*+> 	)*= 	34G 	23F 	* 	!) 		 	!) 	#M 	"=  	 !& &(9:#%568IJ!"_$56  ! $ ""7G
 G
 G
 G
 G
 G
 G
 G
 G
 G
rd   r$   c                       e Zd ZdS )DogeRMSNormNre   rf   rg   rD   rd   rc   rp   rp             Drd   rp   c                       e Zd ZdS )DogeRotaryEmbeddingNrq   rD   rd   rc   rt   rt     rr   rd   rt   modulequerykeyvaluer/   r"   scalingsoftcap	head_maskreturnc           
      T   d }	d t          |t                    r|}	n|d d d d d d d |j        d         f         fd}
t          ||||
|	d|d          \  }}|                    |j                  }|                    dd                                          }||fS )Nc                     t          j        | z            z  } | |         |         |         |         z   } | |         |         d         d         z   } | S )Nr   )torchtanh)score	batch_idxhead_idxq_idxkv_idxcausal_maskr{   rz   s        rc   	score_modz)flex_attention_forward.<locals>.score_mod*  sn    ej999E"K	28<UCFKKE Ii0:1=a@@Erd   T)r   
block_mask
enable_gqascale
return_lse   r   )
isinstancer"   shaper   todtype	transpose
contiguous)ru   rv   rw   rx   r/   ry   rz   r{   ra   r   r   attn_outputattention_weightsr   s         ``     @rc   flex_attention_forwardr     s     JK.),, %#

$!!!!QQQ?SYr]?":;       &E & & &"K" *,,U[99''1--88::K)))rd   doge_flex_attentionc                       e Zd Zddedee         f fdZ eddd          	 	 	 dd	ej	        d
e
ej	        ej	        f         deej	                 dee         deej                 de
ej	        eej	                 ee
ej	                          f         fd            Z	 	 dd	ej	        dej	        dedeej	                 fdZ xZS )DogeAttentionNconfig	layer_idxc                    t                                                       || _        || _        t	          |d|j        |j        z            | _        |j        |j        z  | _	        | j        dz  | _
        |j        | _        |j        | _        t          j        |j        |j        | j        z  |j                  | _        t          j        |j        |j        | j        z  |j                  | _        t          j        |j        |j        | j        z  |j                  | _        t          j        t)          j        |j                            | _        t          j        |j        | j        z  |j        |j                  | _        t          j        |j        | j        z  |j        |j                  | _        t3          | j        |j                  | _        t3          | j        |j                  | _        d S )Nhead_dimg      ࿩biaseps)r^   r_   r   r   getattrrF   rQ   r   rR   num_key_value_groupsry   rT   rW   r   LinearrS   q_projk_projv_proj	Parameterr   zerosAdt_projo_projrp   rL   q_normk_normr`   r   r   rb   s      rc   r_   zDogeAttention.__init__K  s   "
F4F&Jd4dee$*$>&B\$\!}d*!'!9 & 7i :T] JQWQf
 
 
 i :T] JQWQf
 
 
 i :T] JQWQf
 
 
 ek&*DEEFFy&68RY_Yn
 
 
 i&68JQWQf
 
 
 "$-V5HIII!$-V5HIIIrd   past_key_valuer&   4.58new_nameversionr.   position_embeddingsr/   cache_positionr|   c                 "   |j         d d         }g |d| j        R }|                     |                     |                              |                                        dd          }	|                     |                     |                              |                                        dd          }
|                     |                              |                              dd          }|\  }}t          |	|
||          \  }	}
|&|||d}|
                    |
|| j        |          \  }
}|                     |                    dd                              |j         d         |j         d         d                    }t          j        | j        t#          j        |          z                                dd          }|                     ||| j        |          }t+          || j                  }t.          }| j        j        dk    rt4          | j        j                 } || |	|
|f|| j        sd	n| j        | j        d
|\  }} |j        g |dR                                  }|                     |          }||fS )Nr   r   )sincosr   r   r~   )r.   	dt_statesrW   r/   eagerr7   )r/   dropoutry   ) r   r   r   r   viewr   r   r   r   r   updater   r   reshaper   expr   Fsoftplusprepare_dynamic_maskrW   r   r   r   r   _attn_implementationALL_ATTENTION_FUNCTIONStrainingrT   ry   r   r   )r`   r.   r   r/   r&   r   ra   input_shapehidden_shapequery_states
key_statesvalue_statesr   r   cache_kwargsr   	attn_maskattention_interfacer   attn_weightss                       rc   forwardzDogeAttention.forwardi  s    $)#2#.88b8$-88{{4;;}#=#=#B#B<#P#PQQ[[\]_`aa[[]!;!;!@!@!N!NOOYYZ[]^__
{{=1166|DDNNqRSTT&S#7jRUWZ#[#[ j&#&snUUL'6'='=j,X\Xfht'u'u$J LL""1a((001CA1FHZ[]H^`bcc
 
	 Idfqz)'<'<<==GGBOO	--'!2)	 . 
 
	 i)BCC	(?;+w66"9$+:Z"[$7$7		%

 %#}HCC$2HL	%
 	%
 	%
 	%
!\ *k);;;;;;FFHHkk+..L((rd   r5   r   rW   c           	         t          j        |j                  j        }|j        }|dddddddf                             dd|j        d         d          }|t          |t                    s|j        t           j        k    r7|j        }t          j	        |t          j
        d|j        |          |          }|                    |ddddddd|j        d         f         dk    |          }|j        d         |k    rkt          j        |||j                  }t          j        ||ddd	
          j        }	|                    d|	d          }|                    |dk    |          }|S )a8  
        The core idea of DMA is to calculate the dynamic attention mask to mask the tokens that should be masked, so as to form sparse attention.

        Combine `dt_states` with `attention_mask` to generate the final `attn_mask`.

        Args:
            hidden_states (`torch.Tensor`): The input hidden_states, used to determine the minimum value of the current input precision.
            dt_states (`torch.Tensor`): dt_states of shape `(batch_size, num_heads, key_sequence_length)`.
            keep_window_size (`int`): The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
            attention_mask (`torch.Tensor`, *optional*): attention mask of shape `(batch_size, 1, query_sequence_length, key_sequence_length)`.
        Nr   r   r7   )devicer   r   r   r   TF)dimlargestsorted      ?)r   finfor   minexpandr   r   r"   boolwheretensorr   masked_fill
zeros_liketopkindicesscatter)
r`   r.   r   rW   r/   	min_dtyper   r   active_masktopk_indicess
             rc   r   z"DogeAttention.prepare_dynamic_mask  s   $ K 3448	#aaaD!!!m,33M'*B
 
	 %j.S.S%#uz11%+!&"EL^=RZ_$`$`$`bk" " "--nQQQ111F[	XZH[F[=[.\`a.aclmmI?2!111*9E)JZ[[[K :i1ArSW`efffnL%--b,DDK!--kS.@)LLIrd   NNNN)r5   N)re   rf   rg   r$   r   intr_   r   r   Tensortupler	   
LongTensorr   r   rm   rn   s   @rc   r   r   J  ss       J Jz Jhsm J J J J J J< _%0A6RRR
 26+/596) 6)|6) #5<#=>6) !.	6)
 "%6) !!126) 
u|Xel3XeEL>Q5RR	S6) 6) 6) SR6)x !%15# #|# <# 	#
 !.# # # # # # # #rd   r   c                       e Zd ZdS )DogeMLPNrq   rD   rd   rc   r   r     rr   rd   r   c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )	DogeCDMoEr   c                 2   t                                                       |j        | _        |j        | _        t          |j                 | _        |j        | _        t          j	        t          j
        | j                            | _        |j        | _        |j        | _        t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _        t          j        | j        | j        dz  d          | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        d S )Nr   r   F)r^   r_   rF   rG   r   rJ   act_fnrY   mathfloorsqrtnum_keysrZ   top_kr[   r   r   rU   	gate_projup_proj	down_projrouter_gate	Embedding
down_embedup_embedr`   r   rb   s     rc   r_   zDogeCDMoE.__init__  sA   !-!'!9V./!-
49T-=#>#>??/
$3 4#3T5KRXRabbby!143IPVP_```4#94;KRXRabbb 9T%5t}q7HuUUU ,t'79IJJT%5t7GHHrd   r.   r|   c                    |j         \  }}}|                     |                              d||z  d          }|                    | j        d          \  \  }}\  }	}
|                    d          |                    d          z   }|	                    d          | j        z  |
                    d          z   } |j        g |j         d d         dR  } |j        g |j         d d         dR  }|                    | j        d          \  }}|                    d|          }t          j	        |d          }| j
        r||                    dd          z  }|                     |          }|                     |          }t          j        ||                    ||z  dd                                        ||z  d          }|                     |          |z  }t          j        |                    ||z  dd          |                              ||d          }|                     |                     |                     |                    |                     |          z            }||z   }||fS )Nr   r   r   r~   T)r   keepdimr   )r   r   r   r   r   	unsqueezer   gatherr   softmaxr[   sumr   r   r   matmulr   r   r   r   )r`   r.   ra   bszseq_len_router_logitsscores_xscores_y	indices_x	indices_y
all_scoresall_indicesscoresposition_indicesr   routing_weightsr   r   experts_weightsexperts_statess                        rc   r   zDogeCDMoE.forward  s   
 (-Wa ((77<<QgrRR 8E7I7I$-]_7I7`7`484y)''++h.@.@.D.DD
))"--=	@S@STV@W@WW$Z_@j&6ss&;@R@@@
&k&C(9#2#(>CCCC#-??4:2?#F#F  $$R)9::)F333 	I22r42HHHO __W--
==)),z=3E3EcGmUWYZ3[3[\\aabehoboqstt++o66Ho&:&:3=!R&P&PRZ[[``adfmoqrrt{{4>>-3P3P'Q'QTXT`T`anToTo'opp%6m++rd   )	re   rf   rg   r$   r_   r   r   r   rm   rn   s   @rc   r   r     su        Iz I I I I I I.,|, 
	, , , , , , , ,rd   r   c                   t    e Zd Zddedee         f fdZ eddd          	 	 	 	 	 dd
ej	        de
ej	        ej	        f         deej	                 deej                 dee         dee         deej                 dee         de
ej        ee
ej        ej        f                  f         fd            Z xZS )DogeDecoderLayerNr   r   c                     t                                                       |j        | _        t          |j        |j                  | _        t          ||          | _        t          j
        t          j        |j                            | _        t          |j        |j                  | _        |j        st!          |          nt#          |          | _        t          j
        t          j        |j                            | _        d S )Nr   )r   r   )r^   r_   rI   rp   rF   rL   input_layernormr   	self_attnr   r   r   onesinput_residualpost_attention_layernormrX   r   r   mlppost_attention_residualr   s      rc   r_   zDogeDecoderLayer.__init__  s    $3*6+=6CVWWW&f	JJJ l5:f6H+I+IJJ(3F4FFL_(`(`(`%*0-N76???Yv=N=N')|EJv?Q4R4R'S'S$$$rd   r   r&   r   r   Fr.   r   r/   position_idsrM   r   ra   r|   c                 t   |}	|                      |          } | j        d|||||||d|\  }}
t          j        || j        | j                  }| j        |	z  |z   }|}	|                     |          }|                     |          }t          j        || j        | j                  }| j	        |	z  |z   }|S )N)r.   r   r/   r!  r&   rM   r   )pr   rD   )
r  r  r   r   rI   r   r  r  r  r   )r`   r.   r   r/   r!  r&   rM   r   ra   residualself_attn_weightss              rc   r   zDogeDecoderLayer.forward  s     !,,];;+94> 	,
' 3)%+)	,
 	,
 	,
 	,
(( 	-43FQUQ^___+h6F !55mDD//	-43FQUQ^___4x?-Ord   r   )NNNFN)re   rf   rg   r$   r   r   r_   r   r   r   r   r   r	   r   r   r   FloatTensorr   rm   rn   s   @rc   r  r    sW       
T 
Tz 
Thsm 
T 
T 
T 
T 
T 
T _%0A6RRR
 2637+/$)59" "|" #5<#=>" !.	"
 u/0" "%" D>" !!12" +," 
u (51BEDU1U+V"WW	X" " " SR" " " " "rd   r  c                   >    e Zd ZdZdZ eed          eedZ	d Z
dS )DogePreTrainedModelFr   )index)r  r.   
attentionsc                    t          j        | |           t          |t                    r2t	          |d          r |j        j                                         dS dS t          |t                    r`t	          |d          r|j	        j        
                    d           t	          |d          r#|j        j        
                    d           dS dS dS )zInitialize the weightsr   r  r   r   N)r   _init_weightsr   r   hasattrr   datazero_r  r  fill_r   )r`   ru   s     rc   r,  z!DogePreTrainedModel._init_weightsA  s    %dF333fm,, 	?vs## &##%%%%%& & 011 	?v/00 6%*00555v899 ?.399#>>>>>		? 	?? ?rd   N)re   rf   rg   _supports_flash_attn_can_compile_fullgraphr   r   r  r   _can_record_outputsr,  rD   rd   rc   r(  r(  8  sT         "'	;;;)# 
? 
? 
? 
? 
?rd   r(  c                       e Zd ZdS )	DogeModelNrq   rD   rd   rc   r5  r5  N  rr   rd   r5  gate_logitsrY   r   r   c                 T   | t          | t                    sdS | d         j        }| d         j        }g }g }| D ]9}	|	                    |          }	|	                    |d          \  \  }
}\  }}|
                    d          |                    d          z   }|                    d          |z  |                    d          z   } |j        g |j        dd         dR  } |j        g |j        dd         dR  }|                    |d          \  }}|	                    d|          }t          j        |d          }|                    |           |                    |           ;t          j        |d          }t          j        |d          }||                    d          }t          j        |||          }t          j        |||          }|                    d||          |j        d         z  }t          j        |d          }nk|j        \  }}t'          |           }|ddddddf                             ||||f                              d                              |          }|                    d          |                                         }t          j        |||          }t          j        |||          }|                    d||          t          j        |          z  }|ddddddf                             ||||f                              d|                              |          }t          j        ||z  d          t          j        |d          z  }t          j        ||z            }||z  S )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `router_gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [2, batch_size * sequence_length, num_keys].
        num_experts:
            Number of experts
        num_keys:
            Number of keys
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   r   r  r~   r   )r   r   r   r   r   r   r  r   r   r  r   r  appendr   catr   	ones_likescatter_add_meanlenr   r   r   r  )r6  rY   r   r   r/   compute_dtypecompute_deviceall_expert_indicesall_routing_weightslayer_gate_logitsr  r  r  r  r  r  r
  r  expert_indicesr  tokens_per_expertpadrouter_prob_per_expert
batch_sizesequence_lengthrH   expert_attention_mask router_per_expert_attention_maskoverall_losss                                rc   load_balancing_loss_funcrL  R  s   @ *[%"@"@qN(M ^*N( 4 4-00@@7H7M7Mh\^7M7_7_484y)''++h.@.@.D.DD
))"--89;N;Nr;R;RR$Z_@j&6ss&;@R@@@
&k&C(9#2#(>CCCC(ooeo<<$++B0@AA)JB777!!.111""?3333#51===)$7Q???/44R88!K=Q_```o0n]]]-::1>PRUVVYkYqrsYtt "',?Q!G!G!G&4&:#
O,, 4AAAt+,V&
OUKLLWR[[R	 	 044R889N9S9S9U9UV "K=Q_```o0n]]]-::1>PRUVVY^Yb!Z
 Z
 
 4AAAt+,V&
O[QRRWR%%R	 	) "'+>Aa+agh!i!i!ilqlu,!m
 m
 m
 "
 9.1GGHHL+%%rd   c                   :    e Zd Z fdZ	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 dee         deej	                 d	eej                 d
ee
         deej                 deeej        f         dee
         dee         defdZ xZS )DogeForCausalLMc                     t                                          |           t          |          | _        |j        | _        d S r   )r^   r_   r5  modelrY   r   s     rc   r_   zDogeForCausalLM.__init__  s;       v&&
!-rd   Nr   r,   r/   r!  r&   r-   labelsrM   r   logits_to_keepr\   ra   r|   c                 `   |
|
n| j         j        }
 | j        d|||||||d|}|j        }t	          |	t
                    rt          |	 d          n|	}|                     |dd|ddf                   }d}| | j        ||| j	        fi |}d}|
rrt          |j        | j        t          j        t          j        | j                            | j        |          }|%|| j        |                    |j                  z  z  }t)          ||||j        |j        |j        |j                  S )ah  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, DogeForCausalLM

        >>> model = DogeForCausalLM.from_pretrained("SmallDoge/Doge-320M")
        >>> tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)r,   r/   r!  r&   r-   rM   r   )lossaux_losslogitsr&   r.   r*  r  rD   )r   r\   rP  last_hidden_stater   r   slicelm_headloss_functionrE   rL  r  rY   r   r   r   rZ   r]   r   r   r   r&   r.   r*  )r`   r,   r/   r!  r&   r-   rQ  rM   r   rR  r\   ra   outputsr.   slice_indicesrV  rT  rU  s                     rc   r   zDogeForCausalLM.forward  s   J %9$D  $+Jj 	
 +5$* 	+
)%+')	+
 	+
 	+
 	+
  18B>SV8W8Wk~ot444]kmAAA}aaa,?@AA%4%ffdoPPPPD 		M/% 
49T%56677( H !1HKK4L4LLL(#3!/)!/
 
 
 	
rd   )
NNNNNNNNr   N)re   rf   rg   r_   r   r   r   r   r	   r&  r   r   r   r   r   r   r   rm   rn   s   @rc   rN  rN    sI       . . . . . 151537+/59-1$(5934/3Q
 Q
E,-Q
 !.Q
 u/0	Q

 "%Q
   12Q
 )*Q
 D>Q
 !!12Q
 c5</0Q
 'tnQ
 +,Q
 
#Q
 Q
 Q
 Q
 Q
 Q
 Q
 Q
rd   rN  c                       e Zd ZdS )DogeForSequenceClassificationNrq   rD   rd   rc   r^  r^    rr   rd   r^  )r$   rN  r5  r(  r^  r   )NNr   N)Hrh   r   typingr   r   r   r   torch.nn.functionalr   
functionalr   activationsr   cache_utilsr	   configuration_utilsr
   integrations.flex_attentionr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   utils.deprecationr   utils.genericr   llama.modeling_llamar   r   r   r   r   r   r   r   mixtral.modeling_mixtralr    r!   !torch.nn.attention.flex_attentionr"   r$   rp   rt   Moduler   floatr   r   r   r   r   r   r  r(  r5  r   rL  rN  r^  __all__rD   rd   rc   <module>rt     sf  "    , , , , , , , , , ,                 ! ! ! ! ! !             3 3 3 3 3 3 J J J J J J 9 9 9 9 9 9 Q Q Q Q Q Q Q Q 9 9 9 9 9 9 A A A A A A A A & & & & & & E E E E E E E E 0 0 0 0 0 0 + + + + + +	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 H G G G G G G G  !! <;;;;;;S
 S
 S
 S
 S
! S
 S
 S
l	 	 	 	 	, 	 	 		 	 	 	 	. 	 	 	  $#(,.* .*I.*<.* 
.* <	.*
 %,34.* e_.* e_.* %.* 5<%&.* .* .* .*b -,.. 1G - .{ { { { {BI { { {|	 	 	 	 	h 	 	 	6, 6, 6, 6, 6,	 6, 6, 6,r0 0 0 0 01 0 0 0f? ? ? ? ?. ? ? ?,	 	 	 	 	 	 	 	 "&"-1g& g&u|U5<%8$>?g&#g& smg& 	g&
 U\*g& 5<g& g& g& g&TW
 W
 W
 W
 W
( W
 W
 W
t	 	 	 	 	$B 	 	 	  rd   