
     `ik                     l   d dl mZmZmZ d dlZd dlmc mZ d dlmZ ddl	m
Z
 ddlmZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZ ddlmZmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-  ed           G d dej.                              Z/ G d dej.                  Z0d Z1d;dZ2dej3        de4dej3        fdZ5	 d<d!ej.        d"ej3        d#ej3        d$ej3        d%eej3                 d&e6d'e6d(e#e%         fd)Z7 G d* d+ej.                  Z8 G d, d-ej.                  Z9 G d. d/ej.                  Z: G d0 d1ej.                  Z; G d2 d3e          Z<e& G d4 d5e!                      Z=e& G d6 d7e=                      Z>e& G d8 d9e=e                      Z?g d:Z@dS )=    )CallableOptionalUnionN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)deprecate_kwarg)check_model_inputs   )Dots1ConfigRMSNormc                   T     e Zd Zd	deddf fdZdej        dej        fdZd Z xZ	S )
Dots1RMSNormư>epsreturnNc                     t                                                       t          j        t	          j        |                    | _        || _        dS )z;
        Dots1RMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizer#   	__class__s      |/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/dots1/modeling_dots1.pyr'   zDots1RMSNorm.__init__.   sD     	l5:k#:#:;; #    hidden_statesc                    |j         }|                    t          j                  }|                    d                              dd          }|t          j        || j        z             z  }| j        |                    |          z  S )N   T)keepdim)	dtypetor)   float32powmeanrsqrtr,   r+   )r-   r2   input_dtypevariances       r0   forwardzDots1RMSNorm.forward6   s|    #)%((77 $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::r1   c                 H    t          | j        j                   d| j         S )Nz, eps=)tupler+   shaper,   )r-   s    r0   
extra_reprzDots1RMSNorm.extra_repr=   s&    )**II$2GIIIr1   )r"   )
__name__
__module____qualname__floatr'   r)   Tensorr?   rC   __classcell__r/   s   @r0   r!   r!   ,   s        $ $ $$ $ $ $ $ $ $;U\ ;el ; ; ; ;J J J J J J Jr1   r!   c                   |     e Zd ZU ej        ed<   ddef fdZ ej                    e	d                         Z
 xZS )Dots1RotaryEmbeddinginv_freqNconfigc                    t                                                       t          |d          rSt          |j        t
                    r9|j                            d|j                            d                    | _        nd| _        |j        | _	        |j        | _
        || _        t          | j                 | _        |                     | j        |          \  }| _        |                     d|d           | j        | _        d S )Nrope_scaling	rope_typetypedefaultrM   F)
persistent)r&   r'   hasattr
isinstancerP   dictgetrQ   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrN   r   rope_init_fnattention_scalingregister_bufferrM   original_inv_freq)r-   rN   devicerM   r/   s       r0   r'   zDots1RotaryEmbedding.__init__D   s    6>** 	'z&:Mt/T/T 	'#044[&BUBYBYZ`BaBabbDNN&DN"("@$*$B!/?+/+<+<T[&+Q+Q($(ZeDDD!%r1   c                 X   | j         d d d d f                                                             |j        d         dd                              |j                  }|d d d d d f                                         }t          |j        j        t                    r|j        j        dk    r|j        j        nd}t          j
        |d          5  |                                |                                z                      dd          }t          j        ||fd	          }|                                | j        z  }|                                | j        z  }	d d d            n# 1 swxY w Y   |                    |j        
          |	                    |j        
          fS )Nr   r5   r   mpscpuF)device_typeenabledr4   dimr7   )rM   rG   expandrB   r8   r`   rV   rR   strr)   autocast	transposecatcosr]   sinr7   )
r-   xposition_idsinv_freq_expandedposition_ids_expandedrd   freqsembrn   ro   s
             r0   r?   zDots1RotaryEmbedding.forwardU   s    !M$4-8>>@@GGHZ[\H]_acdeehhijiqrr ,QQQaaaZ 8 > > @ @'1!(-'E'Ek!(-[`J`J`ahmmfk^UCCC 	5 	5&,,..1F1L1L1N1NNYYZ[]^__E)UEN333C''))d44C''))d44C		5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 vvAGv$$cff17f&;&;;;s   BE++E/2E/N)rD   rE   rF   r)   rH   __annotations__r   r'   no_gradr   r?   rI   rJ   s   @r0   rL   rL   A   s         l/ /{ / / / / / /" U]__< <  _< < < < <r1   rL   c                     | dd| j         d         dz  f         }| d| j         d         dz  df         }t          j        | |fd          S )z*Rotates half the hidden dims of the input..Nr5   r4   rf   )rB   r)   rm   )rp   x1x2s      r0   rotate_halfr|   e   s]    	
3"!'"+"""	#B	
3q """	#B9rc2YB''''r1   c                     |                     |          }|                     |          }| |z  t          |           |z  z   }||z  t          |          |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer|   )qkrn   ro   rq   unsqueeze_dimq_embedk_embeds           r0   apply_rotary_pos_embr   l   sc    ( --
&
&C
--
&
&C3w;q>>C/0G3w;q>>C/0GGr1   r2   n_repr$   c                     | j         \  }}}}|dk    r| S | dddddddddf                             |||||          } |                     |||z  ||          S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rB   ri   reshape)r2   r   batchnum_key_value_headsslenhead_dims         r0   	repeat_kvr      s    
 2?1D.Ehzz!!!!QQQaaa"23::5BUW\^bdlmmM  (;e(CT8TTTr1           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 R   t          || j                  }t          || j                  }	t          j        ||                    dd                    |z  }
|$|d d d d d d d |j        d         f         }|
|z   }
t          j                            |
dt          j	                  
                    |j                  }
t          j                            |
|| j                  }
t          j        |
|	          }|                    dd                                          }||
fS )Nr4   r   r5   )rg   r7   )ptrainingr   )r   num_key_value_groupsr)   matmulrl   rB   r   
functionalsoftmaxr9   r8   r7   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r0   eager_attention_forwardr      s    3 ;<<JUF$?@@L<z';';Aq'A'ABBWLL!$QQQ111.D
0@0D.D%DE#k1=((2U](SSVVW\WbccL=((6?([[L,|\::K''1--88::K$$r1   c                        e Zd ZdZdedef fdZ eddd          	 	 dd
ej	        de
ej	        ej	        f         deej	                 dee         deej                 dee         de
ej	        eej	                 f         fd            Z xZS )Dots1Attentionz=Multi-headed attention from 'Attention Is All You Need' paperrN   	layer_idxc                 l   t                                                       || _        || _        t	          |d|j        |j        z            | _        |j        |j        z  | _	        | j        dz  | _
        |j        | _        d| _        t          j        |j        |j        | j        z  |j                  | _        t          j        |j        |j        | j        z  |j                  | _        t          j        |j        |j        | j        z  |j                  | _        t          j        |j        | j        z  |j        |j                  | _        t)          | j        |j                  | _        t)          | j        |j                  | _        |j        |         dk    r|j        nd | _        d S )Nr   g      Tbiasr#   sliding_attention)r&   r'   rN   r   getattrr.   num_attention_headsr   r   r   r   attention_dropout	is_causalr   Linearattention_biasq_projk_projv_projo_projr!   rms_norm_epsq_normk_normlayer_typessliding_windowr-   rN   r   r/   s      r0   r'   zDots1Attention.__init__   s   "
F4F&Jd4dee$*$>&B\$\!}d*!'!9i :T] JQWQf
 
 
 i :T] JQWQf
 
 
 i :T] JQWQf
 
 
 i&68JQWQf
 
 
 #4=f6IJJJ"4=f6IJJJ7=7I)7TXk7k7kf33qur1   past_key_valuepast_key_values4.58new_nameversionNr2   position_embeddingsr   cache_positionr   r$   c                    |j         d d         }g |d| j        R }|                     |                     |                              |                                        dd          }	|                     |                     |                              |                                        dd          }
|                     |                              |                              dd          }|\  }}t          |	|
||          \  }	}
|&|||d}|
                    |
|| j        |          \  }
}t          }| j        j        dk    rt          | j        j                 } || |	|
||f| j        sdn| j        | j        | j        d|\  }} |j        g |dR                                  }|                     |          }||fS )Nr5   r   r4   )ro   rn   r   eagerr   )r   r   r   )rB   r   r   r   viewrl   r   r   r   r   updater   r   rN   _attn_implementationr   r   r   r   r   r   r   r   )r-   r2   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   rn   ro   cache_kwargsattention_interfacer   r   s                     r0   r?   zDots1Attention.forward   s    $)#2#.88b8$-88{{4;;}#=#=#B#B<#P#PQQ[[\]_`aa[[]!;!;!@!@!N!NOOYYZ[]^__
{{=1166|DDNNqRSTT&S#7jRUWZ#[#[ j&#&snUUL'6'='=j,X\Xfht'u'u$J(?;+w66"9$+:Z"[$7$7
%
  $}HCC$2HL.
%
 
%
 
%
 
%
!\ *k);;;;;;FFHHkk+..L((r1   NN)rD   rE   rF   __doc__r   intr'   r   r)   rH   rA   r   r	   
LongTensorr   r   r?   rI   rJ   s   @r0   r   r      s       GGv{ vs v v v v v v4 _%0A6RRR ,059*) *)|*) #5<#=>*) !.	*)
 "%*) !!12*) -.*) 
u|Xel33	4*) *) *) SR*) *) *) *) *)r1   r   c                   &     e Zd Zd fd	Zd Z xZS )Dots1MLPNc                    t                                                       || _        ||j        n|| _        ||j        n|| _        t          j        | j        | j        d          | _        t          j        | j        | j        d          | _        t          j        | j        | j        d          | _	        t          |j                 | _        d S NFr   )r&   r'   rN   r.   intermediate_sizer   r   	gate_projup_proj	down_projr   
hidden_actact_fn)r-   rN   r.   r   r/   s       r0   r'   zDots1MLP.__init__   s    1<1D6--+=N=V!9!9\m4#3T5KRWXXXy!143IPUVVV4#94;KRWXXXV./r1   c                     |                      |                     |                     |                    |                     |          z            }|S rv   )r   r   r   r   )r-   rp   r   s      r0   r?   zDots1MLP.forward  sA    NN4;;t~~a/@/@#A#ADLLQROO#STT	r1   r   )rD   rE   rF   r'   r?   rI   rJ   s   @r0   r   r      sL        	0 	0 	0 	0 	0 	0      r1   r   c                   Z     e Zd ZdZ fdZdej        dej        dej        fdZd Z xZ	S )Dots1MoEz:
    A mixed expert module containing shared experts.
    c                 4   t                                                       | _        t          j        fdt          j                  D                       | _        t                    | _	        t          j        j        z            | _        d S )Nc                 <    g | ]}t          j                   S ))r   )r   moe_intermediate_size).0_rN   s     r0   
<listcomp>z%Dots1MoE.__init__.<locals>.<listcomp>  s*    vvvRSXf0LMMMvvvr1   )rN   r   )r&   r'   rN   r   
ModuleListrangen_routed_expertsexpertsDots1TopkRoutergater   r   n_shared_expertsshared_expertsr-   rN   r/   s    `r0   r'   zDots1MoE.__init__  s    }vvvvW\]c]tWuWuvvv
 
 $F++	&V-IFLc-c
 
 
r1   r2   topk_indicestopk_weightsc                 r   t          j        ||j                  }t           j        j                            |t          | j                            }|                    ddd          }t          t          | j                            D ]}| j        |         }||         }t          j
        |          \  }	}
|	                                dk    rL||	|
f         }||	         } ||          }||                    d          z  }|                    d|	|           |                    |j                  S )z
        CALL FOR CONTRIBUTION! I don't have time to optimise this right now, but expert weights need to be fused
        to not have to do a loop here (deepseek has 256 experts soooo yeah).
        rh   )num_classesr4   r   r   r5   )r)   
zeros_liker7   r   r   one_hotlenr   permuter   wherenumelr~   
index_add_rR   )r-   r2   r   r   final_hidden_statesexpert_mask
expert_idxexpertmasktoken_indicesweight_indicesexpert_weightsexpert_inputexpert_outputweighted_outputs                  r0   moezDots1MoE.moe  s3   
 $.}LDVWWWh)11,CPTP\L]L]1^^!))!Q22DL 1 122 
	R 
	RJ\*-Fz*D,1K,=,=)M>""$$q((!-m^.K!L,]; &| 4 4"/.2J2J22N2N"N#..q-QQQ
 #''(;<<<r1   c                     |}|j         }|                     |          \  }}|                    d|j         d                   } |                     |||          j        | }||                     |          z   }|S )Nr5   )rB   r   r   r  r   )r-   r2   	residuals
orig_shaper   r   s         r0   r?   zDots1MoE.forward3  s    !	"(
%)YY}%=%="l%**2}/B2/FGGPlKKPR\]%(;(;I(F(FFr1   )
rD   rE   rF   r   r'   r)   rH   r  r?   rI   rJ   s   @r0   r   r   	  s         	
 	
 	
 	
 	
= =U\ =Y^Ye = = = =4      r1   r   c                   T     e Zd Z fdZ ej                    d             Zd Z xZS )r   c                    t                                                       || _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        |j	        | _	        t          j        t          j        | j        |j        f                    | _        |                     dt          j        | j                             d S )Ne_score_correction_bias)r&   r'   rN   num_experts_per_toktop_kr   routed_scaling_factorn_group
topk_groupnorm_topk_probr   r(   r)   emptyr.   r+   r^   zerosr   s     r0   r'   zDots1TopkRouter.__init__>  s    /
 & 7%+%A"~ +$3l5;0EvGY/Z#[#[\\6DDY8Z8Z[[[[[r1   c                     |                     d| j                  | j                            d          z   }|                     d| j        | j        | j        z                                dd          d                             d          }t          j        || j        dd          d         }t          j	        |          }|
                    d|d           |                    d                              d| j        | j        | j        z                                d| j                  }|                    |                                 d          }t          j        || j        dd          d         }|S )	Nr5   r   r4   rf   F)r   rg   sortedr   r   )r   r   r  r~   r  topksumr)   r  r   scatter_ri   r   masked_fillboolr  )r-   scoresscores_for_choicegroup_scores	group_idx
group_mask
score_maskr   s           r0   get_topk_indicesz Dots1TopkRouter.get_topk_indicesK  sU   "KKD,ABBTEaEkEklmEnEnn""2t|T5Jdl5Z[[T!T__Q SRS[[ 	
 J|tBuUUUVWX	%l33
Ay!,,,  $$VBd&;t|&KLLWR.// 	
 .99:??;L;L:LcRRz"3tzrRWXXXYZ[r1   c                    |                     d| j        j                  }t          j        |                    t          j                  | j                            t          j                            }|	                                }| 
                    |          }|                    d|          }| j        r|                    dd          dz   }||z  }|| j        z  }||fS )Nr5   r   T)rg   r6   g#B;)r   rN   r.   FlinearrR   r)   r9   r+   sigmoidr!  gatherr  r  r  )r-   r2   router_logitsr  r   r   denominators          r0   r?   zDots1TopkRouter.forward_  s    %**2t{/FGG!3!3EM!B!BDKDTDTUZUbDcDcdd&&((,,V44}}Q55 	(&**r4*@@5HKK'L#d&@@\))r1   )	rD   rE   rF   r'   r)   rx   r!  r?   rI   rJ   s   @r0   r   r   =  so        \ \ \ \ \ U]__  _&
* 
* 
* 
* 
* 
* 
*r1   r   c                   4    e Zd Zdedef fdZ eddd          	 	 	 	 	 	 dd
ej        de	ej                 de	ej
                 de	e         de	e         de	ej
                 de	eej        ej        f                  dee         dej        fd            Z xZS )Dots1DecoderLayerrN   r   c                    t                                                       |j        | _        t          ||          | _        ||j        k    rt          |          | _        nt          |          | _        t          |j        |j
                  | _        t          |j        |j
                  | _        |j        |         | _        d S )N)rN   r   r   )r&   r'   r.   r   	self_attnfirst_k_dense_replacer   mlpr   r!   r   input_layernormpost_attention_layernormr   attention_typer   s      r0   r'   zDots1DecoderLayer.__init__m  s    !-'vKKK444''DHH''DH+F,>FDWXXX(4V5GVM`(a(a(a%$0;r1   r   r   r   r   NFr2   r   rq   	use_cacher   r   r   r$   c                     |}	|                      |          } | j        d|||||||d|\  }}
|	|z   }|}	|                     |          }|                     |          }|	|z   }|S )N)r2   r   rq   r   r2  r   r    )r/  r,  r0  r.  )r-   r2   r   rq   r   r2  r   r   r   residualr   s              r0   r?   zDots1DecoderLayer.forward|  s     !,,];;)4> 	
')%+) 3	
 	
 	
 	
q !=0 !55mDD// =0r1   )NNNFNN)rD   rE   rF   r   r   r'   r   r)   rH   r   r   r	   r  rA   r   r   r?   rI   rJ   s   @r0   r*  r*  l  s-       <{ <s < < < < < < _%0A6RRR 2637+/$)59KO | !. u/0	
 "% D> !!12 &eEL%,,F&GH +, 
   SR    r1   r*  c                   \     e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZdZeedZ fdZ xZS )	Dots1PreTrainedModelrN   modelTr*  r   F)r2   
attentionsc                     t                                          |           t          |t                    r-|j        j                            d| j        j                   d S d S )Nr   )r;   std)	r&   _init_weightsrV   r   r+   datanormal_rN   initializer_range)r-   r   r/   s     r0   r<  z"Dots1PreTrainedModel._init_weights  sc    f%%%fo.. 	TM&&CT[5R&SSSSS	T 	Tr1   )rD   rE   rF   r   rw   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr*  r   _can_record_outputsr<  rI   rJ   s   @r0   r7  r7    s         &*#,-#4"5N""&*$ 
T T T T T T T T Tr1   r7  c                       e Zd Zdef fdZee	 	 	 	 	 	 	 ddeej	                 deej
                 deej	                 dee         deej                 d	ee         d
eej	                 dee         defd                        Z xZS )
Dots1ModelrN   c                    t                                                     j        | _        j        | _        t          j        j        j        | j                  | _        t          j	        fdt          j                  D                       | _        t          j        j                  | _        t!                    | _        d| _        d| j        j        v | _        |                                  d S )Nc                 0    g | ]}t          |          S r4  )r*  )r   r   rN   s     r0   r   z'Dots1Model.__init__.<locals>.<listcomp>  s$    cccivy11cccr1   r   rN   Fr   )r&   r'   pad_token_idpadding_idx
vocab_sizer   	Embeddingr.   embed_tokensr   r   num_hidden_layerslayersr!   r   normrL   
rotary_embgradient_checkpointingrN   r   has_sliding_layers	post_initr   s    `r0   r'   zDots1Model.__init__  s       !. +L):F<NPTP`aamcccc5IaCbCbccc
 
 !!39LMMM	.f===&+#"59P"P 	r1   N	input_idsr   rq   r   inputs_embedsr2  r   r   r$   c                    |d u |d uz  rt          d          ||                     |          }|r|t          | j                  }|B||                                nd}	t          j        |	|	|j        d         z   |j                  }||	                    d          }t          |x}
t                    s2| j        |||||d}dt          di |i}
| j        rt          di ||
d<   |}|                     ||          }| j        d | j        j                 D ]} ||f|
|j                 |||||d	|}|                     |          }t)          ||r|nd 
          S )Nz:You must specify exactly one of input_ids or inputs_embedsrN  r   r   )r`   )rN   input_embedsr   r   r   rq   full_attentionr   )r   rq   r   r2  r   r   )last_hidden_stater   r4  )
ValueErrorrS  r
   rN   get_seq_lengthr)   arangerB   r`   r~   rV   rW   r   rY  r   rW  rU  rT  r1  rV  r   )r-   r[  r   rq   r   r\  r2  r   r   past_seen_tokenscausal_mask_mappingmask_kwargsr2   r   decoder_layers                  r0   r?   zDots1Model.forward  s    -t";< 	[YZZZ  --i88M 	?0*$+>>>O!CRC^==???de"\ "2]5H5K"KTaTh  N )33A66L ?-FF 	l + -"0"0#2 , K !"4"C"C{"C"C# & l;\;k;k_j;k;k#$78% #oom\JJ![)H4;+H)HI 
	 
	M)M	2=3OP) /#-$7	 	 	 	MM 		-00&+/8BOOd
 
 
 	
r1   )NNNNNNN)rD   rE   rF   r   r'   r   r   r   r)   r   rH   r	   FloatTensorr  r   r   r   r?   rI   rJ   s   @r0   rK  rK    s*       {      "  151537+/59$(59E
 E
E,-E
 !.E
 u/0	E

 "%E
   12E
 D>E
 !!12E
 +,E
 
!E
 E
 E
 ^ E
 E
 E
 E
 E
r1   rK  c                   f    e Zd ZdgZddiZddgdgfiZ fdZee	 	 	 	 	 	 	 	 	 dd	e	e
j                 d
e	e
j                 de	e
j                 de	e         de	e
j                 de	e
j                 de	e         de	e
j                 deee
j        f         dee         defd                        Z xZS )Dots1ForCausalLMzlm_head.weightlm_headcolwise_repr2   logitsc                     t                                          |           t          |          | _        |j        | _        t          j        |j        |j        d          | _        | 	                                 d S r   )
r&   r'   rK  r8  rQ  r   r   r.   rk  rZ  r   s     r0   r'   zDots1ForCausalLM.__init__  sj       ''
 +y!3V5FUSSS 	r1   Nr   r[  r   rq   r   r\  labelsr2  r   logits_to_keepr   r$   c
                 R    | j         d|||||||d|
}|j        }t          |	t                    rt	          |	 d          n|	}|                     |dd|ddf                   }d}| | j        d||| j        j        d|
}t          |||j
        |j        |j                  S )a~  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Dots1ForCausalLM

        >>> model = Dots1ForCausalLM.from_pretrained("rednote-hilab/dots1.llm1.inst")
        >>> tokenizer = AutoTokenizer.from_pretrained("rednote-hilab/dots1.llm1.inst")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)r[  r   rq   r   r\  r2  r   N)rm  ro  rQ  )lossrm  r   r2   r9  r4  )r8  r`  rV   r   slicerk  loss_functionrN   rQ  r   r   r2   r9  )r-   r[  r   rq   r   r\  ro  r2  r   rp  r   outputsr2   slice_indicesrm  rr  s                   r0   r?   zDots1ForCausalLM.forward"  s    J ,64: 	,
)%+')	,
 	,
 	,
 	,
  18B>SV8W8Wk~ot444]kmAAA}aaa,?@AA%4%pVFt{OeppioppD%#3!/)
 
 
 	
r1   )	NNNNNNNNr   )rD   rE   rF   _tied_weights_keys_tp_plan_pp_planr'   r   r   r   r)   r   rH   r	   rh  r  r   r   r   r   r   r?   rI   rJ   s   @r0   rj  rj    sa       *+=)H_-z:;H      151537+/59-1$(5934=
 =
E,-=
 !.=
 u/0	=

 "%=
   12=
 )*=
 D>=
 !!12=
 c5</0=
 +,=
 
 =
 =
 =
 ^ =
 =
 =
 =
 =
r1   rj  )r7  rK  rj  )Nr   )r   )Atypingr   r   r   r)   torch.nn.functionalr   r   r#  activationsr   cache_utilsr	   r
   
generationr   integrationsr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.deprecationr   utils.genericr   configuration_dots1r   Moduler!   rL   r|   r   rH   r   r   rG   r   r   r   r   r   r*  r7  rK  rj  __all__r4  r1   r0   <module>r     s  * - , , , , , , , , ,                 ! ! ! ! ! ! . . . . . . . . ) ) ) ) ) ) 7 7 7 7 7 7 R R R R R R R R B B B B B B 9 9 9 9 9 9 O O O O O O O O K K K K K K K K F F F F F F F F & & & & & & I I I I I I I I I I 0 0 0 0 0 0 / / / / / / , , , , , , Y''J J J J J29 J J ('J(!< !< !< !< !<29 !< !< !<H( ( (   6	UU\ 	U# 	U%, 	U 	U 	U 	U& % %I%<% 
% <	%
 U\*% % % '(% % % %4H) H) H) H) H)RY H) H) H)V    ry   "1 1 1 1 1ry 1 1 1h,* ,* ,* ,* ,*bi ,* ,* ,*^0 0 0 0 02 0 0 0f T T T T T? T T T, Y
 Y
 Y
 Y
 Y
% Y
 Y
 Y
x M
 M
 M
 M
 M
+_ M
 M
 M
` E
D
Dr1   