
     `i;                        d dl mZmZmZ d dlZd dlmc mZ d dlmZ ddl	m
Z
 ddlmZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZmZmZ ddlmZmZ ddlm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1  ed           G d dej2                              Z3 G d de          Z4 G d dej2                  Z5d Z6dGdZ7dej8        de9d ej8        fd!Z:	 dHd#ej2        d$ej8        d%ej8        d&ej8        d'eej8                 d(e;d)e;d*e&e(         fd+Z< G d, d-ej2                  Z= G d. d/ej2                  Z> G d0 d1ej2                  Z? G d2 d3e          Z@e) G d4 d5e$                      ZA G d6 d7ej2                  ZBe) G d8 d9eA                      ZC	 	 	 dId;eej8        eDej8                 df         d<ee9         d'eej8                 d eej8        e9f         fd=ZEe) G d> d?eAe                      ZF G d@ dAeeA          ZG G dB dCeeA          ZH G dD dEeeA          ZIg dFZJdS )J    )CallableOptionalUnionN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GenericForQuestionAnswering GenericForSequenceClassificationGenericForTokenClassificationGradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)deprecate_kwarg)OutputRecordercheck_model_inputs   )MiniMaxConfigRMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )MiniMaxRMSNormư>c                     t                                                       t          j        t	          j        |                    | _        || _        dS )z=
        MiniMaxRMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/minimax/modeling_minimax.pyr)   zMiniMaxRMSNorm.__init__5   sD     	l5:k#:#:;; #    c                    |j         }|                    t          j                  }|                    d                              dd          }|t          j        || j        z             z  }| j        |                    |          z  S )N   T)keepdim)	dtypetor+   float32powmeanrsqrtr.   r-   )r/   hidden_statesinput_dtypevariances       r3   forwardzMiniMaxRMSNorm.forward=   s|    #)%((77 $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::r4   c                 H    t          | j        j                   d| j         S )Nz, eps=)tupler-   shaper.   )r/   s    r3   
extra_reprzMiniMaxRMSNorm.extra_reprD   s&    )**II$2GIIIr4   )r&   )__name__
__module____qualname__r)   rB   rF   __classcell__r2   s   @r3   r%   r%   3   sb        $ $ $ $ $ $; ; ;J J J J J J Jr4   r%   c                   ~     e Zd Z fdZd ZdefdZ fdZdef fdZd Z	defd	Z
d
ej        fdZdefdZ xZS )MiniMaxCachec                 V    t                                                       g | _        d S N)r(   r)   linear_cacher/   r2   s    r3   r)   zMiniMaxCache.__init__I   s'    02r4   c                     t          t          | j                  |dz             D ]}| j                            g            || j        |<   d S Nr!   )rangelenrP   append)r/   	layer_idxrP   _s       r3   set_linear_cachezMiniMaxCache.set_linear_cacheM   sW    s4,--y1}== 	) 	)A$$R(((('3)$$$r4   rW   c                 F    |t          |           k     r| j        |         S d S rO   )rU   rP   r/   rW   s     r3   get_linear_cachezMiniMaxCache.get_linear_cacheS   s&    s4yy  $Y//tr4   c                     t          t                                                      t          | j                            S rO   )maxr(   __len__rU   rP   rQ   s    r3   r_   zMiniMaxCache.__len__X   s,    577??$$c$*;&<&<===r4   c                     |t          | j                  k     r| j        |         g k    r| j        |         fS t                                          |          S rO   )rU   rP   r(   __getitem__)r/   rW   r2   s     r3   ra   zMiniMaxCache.__getitem__[   sU    s4,----$2CI2NRT2T2T%i022ww""9---r4   c              #   \   K   t          t          |                     D ]}| |         V  d S rO   )rT   rU   r[   s     r3   __iter__zMiniMaxCache.__iter__`   s@      s4yy)) 	" 	"Iy/!!!!	" 	"r4   repeatsc                     t          t          |                     D ]^}| j        |         g k    r+| j        |                             |d          | j        |<   >| j        |                             |           _d S )Nr   dim)rT   rU   rP   repeat_interleavelayersbatch_repeat_interleave)r/   rd   rW   s      r3   rj   z$MiniMaxCache.batch_repeat_interleaved   s    s4yy)) 	H 	HI +r11/3/@/K/]/]^ekl/]/m/m!),,I&>>wGGGG		H 	Hr4   indicesc                     t          t          |                     D ]Q}| j        |         g k    r| j        |         |df         | j        |<   1| j        |                             |           Rd S )N.)rT   rU   rP   ri   batch_select_indices)r/   rk   rW   s      r3   rm   z!MiniMaxCache.batch_select_indicesk   s    s4yy)) 	E 	EI +r11/3/@/KGUXL/Y!),,I&;;GDDDD		E 	Er4   
max_lengthc                      t          d          )Nz*MiniMaxCache doesnot support `crop` method)RuntimeError)r/   rn   s     r3   cropzMiniMaxCache.cropr   s    GHHHr4   )rG   rH   rI   r)   rY   intr\   r_   ra   rc   rj   r+   Tensorrm   rq   rJ   rK   s   @r3   rM   rM   H   s       3 3 3 3 34 4 4#    
> > > > >.S . . . . . .
" " "Hs H H H HEEL E E E EIs I I I I I I I Ir4   rM   c                   L    e Zd Zdedef fdZd Zd Z eddd	          	 	 dde	j
        dee	j
        e	j
        f         dee	j
                 dee         dee	j                 dee         dee	j
        ee	j
                 eee	j
                          f         fd            Z xZS )MiniMaxLightningAttentionconfigrW   c                 |   t                                                       || _        t          |dd           p|j        |j        z  | _        |j        | _        |j        | _        |j        | _        t          |j
                 | _        t          | j        | j        z            | _        t          j        |j        | j        | j        z  dz  d          | _        t          j        | j        | j        z  |j        d          | _        t          j        |j        | j        | j        z  d          | _        |                                 }|                     |          \  }}}|                     d|           |                     d|           |                     d|           |                     d|           d S )	Nhead_dimr   Fbias
slope_ratequery_decay	key_decaydiagonal_decay)r(   r)   rW   getattrr0   num_attention_headsrx   num_hidden_layers
block_sizer   
hidden_actact_fnr%   normr   Linearqkv_projout_projoutput_gateget_slope_ratedecay_factorsregister_buffer)r/   rv   rW   r{   r|   r}   r~   r2   s          r3   r)   z"MiniMaxLightningAttention.__init__w   s   "
D99mV=OSYSm=m#)#= !'!9 +V./"4=43K#KLL		&"4d6NQUQ^6^ab6binooo	$":T]"JFL^ejkkk9V%79QTXTa9ahmnnn((**
151C1CJ1O1O.Y\:666]K888[)444-~>>>>>r4   c                     ddd| j         z  z  z  }t          j        | j                   dz   }d| j        | j        dz
  dz   z  z
  dz   }||z  }||z  }|d d d d f         }|S )Nr!   r6      gh㈵>)r   r+   arangerW   r   )r/   baseexponentfactorrates        r3   r   z(MiniMaxLightningAttention.get_slope_rate   s~    A!d6678< 899A=T^t'='AD'HIIDPX~f}AAAtTM"r4   c                    t          j        | j                  dz   }t          j        | |d d d f         z            }t          j        | | j        |d d d f         z
  z            }|d d d f         |d d d f         z
  }|d d d d d d f         }||z  }t          j        |dk    | t          d                    }t          j        |          }|||fS )Nr!   r   z-inf)r+   r   r   expwherefloat)r/   r{   block_size_ranger|   r}   r~   s         r3   r   z'MiniMaxLightningAttention.decay_factors   s     <881<i.>qqq$w.G GHHIzkT_?OPQPQPQSWPW?X-XYZZ	)!!!T'25EdAAAg5NN'dAAAqqq(89#n4^q%8>/5QW==YY>22I~55r4   past_key_valuepast_key_values4.58new_nameversionNr?   position_embeddingsattention_maskcache_positionkwargsreturnc                 	   |j         \  }}}	|| j        z   dz
  | j        z  }
|                     |                     |                    }|                    ||| j        d| j        z            }t          j        || j        d          \  }}}|	                    dd          }|	                    dd          }|	                    dd          }d }||
                    | j                  }|t          j        || j        | j        | j                                      |          }|]|                    t          j                  }|                    |                    d                              d           d          }g }t#          |
          D ]a}|| j        z  }t%          || j        z   |          }||z
  }|d d d d ||f         }|d d d d ||f         }|d d d d ||f         }| j        d d d |f         }| j        d d | d f         }| j        d d d d d |d |f         }t          j        | j         |z            }t          j        ||	                    dd                    }t          j        ||z  |          }t          j        ||z  |          }||z   }|                    |           t          j        ||z  	                    dd          |          } ||z  | z   }cnt          j        | j                   }!g }t#          |          D ]}|d d d d ||dz   f         }|d d d d ||dz   f         }|d d d d ||dz   f         }t          j        |	                    dd          |          }"|!|z  |"z   }t          j        ||          }|                    |           t          j        |d          }|	                    dd          }|                    ||| j        | j        z            }|                     |          }t9          j        |                     |                    |z  }|                     |          }||                     | j        |           ||fS )	Nr!   r   rf   r6   r9   r7   r   )!rE   r   r   r   reshaper   rx   r+   split	transposer\   rW   zerosr:   boolmasked_fill	unsqueezerT   minr|   r}   r~   r   r{   matmulrV   catr   Fsigmoidr   r   rY   )#r/   r?   r   r   r   r   r   
batch_sizeseq_lenr0   
num_blocks
qkv_statesquery_states
key_statesvalue_statesattn_weights_interattn_outputi	start_idxend_idxcurrent_block_sizecurrent_query_statescurrent_key_statescurrent_value_statescurrent_query_decaycurrent_key_decaycurrent_diagonal_decayblock_decayattn_weights_intraattn_output_intraattn_output_intercurrent_attn_outputnext_attn_weights_interratiocurrent_attn_weights_inters#                                      r3   rB   z!MiniMaxLightningAttention.forward   s5    ,9+>(
G[/!3G
[[}!=!=>>
''
GT=UWX[_[hWhii
16Z\]1^1^1^.j,#--a33))!Q//
#--a33 "&!0!A!A$.!Q!Q%!&Z9QSWS`bfbo!p!p!s!s" "
 )!/!2!2!2!D!D+779Q9QRS9T9T9^9^_a9b9b8bdeffK:&& ` `/	i$/97CC%,y%8"'3AAAqqq)G:K4K'L$%/111i6G0G%H"'3AAAqqq)G:K4K'L$&*&6qqq:M;M:M7M&N#$(N1117I6I6J6J3J$K!)-)<QQQCVDVCVXkYkXk=k)l&#i(8;M(MNN &+\2FHZHdHdegikHlHl%m%m"$)L1CF\1\^r$s$s! %*L1EH[1[]o$p$p! '8:K&K#""#6777 +0,'*;;FFr2NNPd+ +' &8+%EH_%_"";`@ It.//EK7^^ 	8 	8'3AAAqqq!a!e)O'D$%/111a!a%i%@"'3AAAqqq!a!e)O'D$-2\:L:V:VWY[]:^:^`t-u-u*%*-?%?B\%\"&+l3GI[&\&\#""#67777 i444 "++Aq11!))*gt?WZ^Zg?ghhii,,i 0 0 ? ?@@;NmmK00 &,,T^=OPPP...r4   NN)rG   rH   rI   r"   rr   r)   r   r   r   r+   rs   rD   r   r	   
LongTensorr   r   rB   rJ   rK   s   @r3   ru   ru   v   sI       ?} ? ? ? ? ? ? ?,	 	 	6 6 6 _%0A6RRR ,059`/ `/|`/ #5<#=>`/ !.	`/
 "%`/ !!12`/ -.`/ 
u|Xel3XeEL>Q5RR	S`/ `/ `/ SR`/ `/ `/ `/ `/r4   ru   c                     | dd| j         d         dz  f         }| d| j         d         dz  df         }t          j        | |fd          S )z*Rotates half the hidden dims of the input..Nr7   r6   rf   )rE   r+   r   )xx1x2s      r3   rotate_halfr   
  s]    	
3"!'"+"""	#B	
3q """	#B9rc2YB''''r4   c                     |                     |          }|                     |          }| |z  t          |           |z  z   }||z  t          |          |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )r   r   )qkcossinposition_idsunsqueeze_dimq_embedk_embeds           r3   apply_rotary_pos_embr     sc    ( --
&
&C
--
&
&C3w;q>>C/0G3w;q>>C/0GGr4   r?   n_repr   c                     | j         \  }}}}|dk    r| S | dddddddddf                             |||||          } |                     |||z  ||          S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r!   N)rE   expandr   )r?   r   batchnum_key_value_headsslenrx   s         r3   	repeat_kvr   ,  s    
 2?1D.Ehzz!!!!QQQaaa"23::5BUW\^bdlmmM  (;e(CT8TTTr4           modulequerykeyvaluer   scalingdropoutr   c                 R   t          || j                  }t          || j                  }	t          j        ||                    dd                    |z  }
|$|d d d d d d d |j        d         f         }|
|z   }
t          j                            |
dt          j	                  
                    |j                  }
t          j                            |
|| j                  }
t          j        |
|	          }|                    dd                                          }||
fS )Nr6   r   r   r7   rg   r9   )ptrainingr!   )r   num_key_value_groupsr+   r   r   rE   r   
functionalsoftmaxr;   r:   r9   r   r   
contiguous)r   r   r   r   r   r   r   r   r   r   attn_weightscausal_maskr   s                r3   eager_attention_forwardr   8  s    3 ;<<JUF$?@@L<z';';Aq'A'ABBWLL!$QQQ111.D
0@0D.D%DE#k1=((2U](SSVVW\WbccL=((6?([[L,|\::K''1--88::K$$r4   c                        e Zd ZdZdedef fdZ eddd          	 	 dd
ej	        de
ej	        ej	        f         deej	                 dee         deej                 dee         de
ej	        eej	                 f         fd            Z xZS )MiniMaxAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrv   rW   c                    t                                                       || _        || _        t	          |dd           p|j        |j        z  | _        |j        |j        z  | _	        | j        dz  | _
        |j        | _        d| _        t          j        |j        |j        | j        z  d          | _        t          j        |j        |j        | j        z  d          | _        t          j        |j        |j        | j        z  d          | _        t          j        |j        | j        z  |j        d          | _        d S )Nrx   g      TFry   )r(   r)   rv   rW   r   r0   r   rx   r   r   r   attention_dropout	is_causalr   r   q_projk_projv_projo_projr/   rv   rW   r2   s      r3   r)   zMiniMaxAttention.__init__U  s    "
D99mV=OSYSm=m$*$>&B\$\!}d*!'!9i 2F4NQUQ^4^ejkkki 2F4NQUQ^4^ejkkki 2F4NQUQ^4^ejkkki :T] JFL^ejkkkr4   r   r   r   r   Nr?   r   r   r   r   r   c           
      n   |j         d d         }g |d| j        R }|                     |                              |                              dd          }	|                     |                              |                              dd          }
|                     |                              |                              dd          }|\  }}t          |	|
||          \  }	}
|&|||d}|                    |
|| j	        |          \  }
}t          }| j        j        dk    rt          | j        j                 } || |	|
||f| j        sdn| j        | j        t#          | j        dd           d|\  }} |j        g |dR                                  }|                     |          }||fS )	Nr7   r!   r6   )r   r   r   eagerr   sliding_window)r   r   r  )rE   rx   r   viewr   r   r   r   updaterW   r   rv   _attn_implementationr   r   r   r   r   r   r   r   )r/   r?   r   r   r   r   r   input_shapehidden_shaper   r   r   r   r   cache_kwargsattention_interfacer   r   s                     r3   rB   zMiniMaxAttention.forwardc  s    $)#2#.88b8$-88{{=1166|DDNNqRSTT[[//44\BBLLQPQRR
{{=1166|DDNNqRSTT&S#7jRUWZ#[#[ j&#&snUUL'6'='=j,X\Xfht'u'u$J(?;+w66"9$+:Z"[$7$7
%
  $}HCC$2HL"4;0@$GG
%
 
%
 
%
 
%
!\ *k);;;;;;FFHHkk+..L((r4   r   )rG   rH   rI   __doc__r"   rr   r)   r   r+   rs   rD   r   r	   r   r   r   rB   rJ   rK   s   @r3   r   r   R  s       GGl} l l l l l l l _%0A6RRR ,059*) *)|*) #5<#=>*) !.	*)
 "%*) !!12*) -.*) 
u|Xel33	4*) *) *) SR*) *) *) *) *)r4   r   c                   *     e Zd Zdef fdZd Z xZS )MiniMaxBlockSparseTop2MLPrv   c                    t                                                       |j        | _        |j        | _        t          j        | j        | j        d          | _        t          j        | j        | j        d          | _	        t          j        | j        | j        d          | _
        t          |j                 | _        d S NFry   )r(   r)   intermediate_sizeffn_dimr0   
hidden_dimr   r   w1w2w3r   r   r   r/   rv   r2   s     r3   r)   z"MiniMaxBlockSparseTop2MLP.__init__  s    / ,)DOT\FFF)DL$/FFF)DOT\FFFV./r4   c                     |                      |                     |                    |                     |          z  }|                     |          }|S rO   )r   r  r  r  )r/   r?   current_hidden_statess      r3   rB   z!MiniMaxBlockSparseTop2MLP.forward  sJ     $DGGM,B,B C CdggmF\F\ \ $(= > >$$r4   )rG   rH   rI   r"   r)   rB   rJ   rK   s   @r3   r  r    sS        	0} 	0 	0 	0 	0 	0 	0% % % % % % %r4   r  c                   F     e Zd ZdZ fdZdej        dej        fdZ xZS )MiniMaxSparseMoeBlocka  
    This implementation is
    strictly equivalent to standard MoE with full capacity (no
    dropped tokens). It's faster since it formulates MoE operations
    in terms of block-sparse operations to accommodate imbalanced
    assignments of tokens to experts, whereas standard MoE either
    (1) drop tokens at the cost of reduced performance or (2) set
    capacity factor to number of experts and thus waste computation
    and memory on padding.
    c                 |   t                                                       j        | _        j        | _        j        | _        j        | _	        t          j        | j        | j        d          | _        t          j        fdt          | j                  D                       | _        j        | _        d S )NFry   c                 .    g | ]}t                    S  )r  ).0rX   rv   s     r3   
<listcomp>z2MiniMaxSparseMoeBlock.__init__.<locals>.<listcomp>  s"    %i%i%iA&?&G&G%i%i%ir4   )r(   r)   r0   r  r  r  num_local_expertsnum_expertsnum_experts_per_toktop_kr   r   gate
ModuleListrT   expertsrouter_jitter_noisejitter_noiser  s    `r3   r)   zMiniMaxSparseMoeBlock.__init__  s     ,/!3/
 Idot/?eLLL	}%i%i%i%iQVW[WgQhQh%i%i%ijj #6r4   r?   r   c                    |j         \  }}}| j        rF| j        dk    r;|t          j        |                              d| j        z
  d| j        z             z  }|                    d|          }|                     |          }t          j	        |dt          j
                  }t          j        || j        d          \  }}||                    dd          z  }|                    |j                  }t          j        ||z  |f|j        |j        	          }t          j        j                            || j        
                              ddd          }	t          j        |	                    d          d                                          }
|
D ]}| j        |         }t          j        |	|                             d                    \  }}|d|f                             d|          } ||          |||df         z  }|                    d||                    |j                             |                    |||          }||fS ) r   g      ?r7   r!   r   rf   T)rg   r8   )r9   device)num_classesr6   )r7   r   N)rE   r   r)  r+   
empty_likeuniform_r  r%  r   r   r   topkr$  sumr:   r9   r   r,  r   r   one_hotr"  permutegreaternonzeror'  r   squeezer   
index_add_)r/   r?   r   sequence_lengthr  router_logitsrouting_weightsselected_expertsfinal_hidden_statesexpert_mask
expert_hit
expert_idxexpert_layeridxtop_xcurrent_stater  s                    r3   rB   zMiniMaxSparseMoeBlock.forward  s]   2?2E/
OZ= 	xT.22U-m<<EEcDL]F]_beiev_vwwwM%**2z::		-00)MqLLL,1J
XZ,[,[,[))?..2t.DDD),,]-@AA#k/):6m>QZgZn
 
 
 h)112BPTP`1aaiijkmnpqrr];??x?#@#@!DDLLNN
$ 	d 	dJ<
3L[%<%D%DQ%G%GHHJC *$+6>>r:NNM$0L$?$?/RWY\^bRbBc$c!  **1e5J5M5MmNa5b5bcccc199*oWabb"M11r4   )	rG   rH   rI   r  r)   r+   rs   rB   rJ   rK   s   @r3   r  r    sh        	 	7 7 7 7 7%2U\ %2el %2 %2 %2 %2 %2 %2 %2 %2r4   r  c                       e Zd Zdedef fdZ eddd          	 	 	 	 	 	 	 dd
ej        de	ej        ej        f         de
ej                 de
ej                 de
e         de
e         de
e         de
e         de
ej                 dee         de	ej        e
e	ej        ej        f                  f         fd            Z xZS )MiniMaxDecoderLayerrv   rW   c                 b   t                                                       |j        | _        t          ||          | _        t          |          | _        t          |j        |j                  | _	        t          |j        |j                  | _
        || _        |j        |         | _        |j        | _        |j        | _        | j        dk    r/t!          ||          | _        |j        | _        |j        | _        d S t          ||          | _        |j        | _        |j        | _        d S )Nr1   linear_attention)r(   r)   r0   r   	self_attnr  block_sparse_moer%   rms_norm_epsinput_layernormpost_attention_layernormrW   layer_types
layer_typemlp_alpha_factormlp_beta_factorru   linear_attn_alpha_factorattn_alpha_factorlinear_attn_beta_factorattn_beta_factorfull_attn_alpha_factorfull_attn_beta_factorr  s      r3   r)   zMiniMaxDecoderLayer.__init__  s   !-)&)<< 5f = =-f.@fFYZZZ(6v7IvOb(c(c(c%" ,Y7 & 7%5?0006vyIIDN%+%DD"$*$BD!!!-fi@@DN%+%BD"$*$@D!!!r4   r   r   r   r   NFr?   r   r   r   output_attentionsoutput_router_logits	use_cacher   r   r   c
                    |                      |          }|} | j        d||||||||	d|
\  }}|| j        z  || j        z  z   }|                     |          }|}|                     |          \  }}|| j        z  || j        z  z   }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            attention_mask (`torch.Tensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        )r?   r   r   r   r   rX  rZ  r   r  )rL  rI  rS  rU  rM  rJ  rP  rQ  )r/   r?   r   r   r   r   rX  rY  rZ  r   r   residualrX   s                r3   rB   zMiniMaxDecoderLayer.forward  s    N ,,];;  *4> 

' 3)%+/)

 

 

 

q !4#99MDLa<aa 55mDD 00??q 4#88=4K_;__r4   )NNNFFFN)rG   rH   rI   r"   rr   r)   r   r+   rs   rD   r   r   r	   r   r   r   FloatTensorrB   rJ   rK   s   @r3   rE  rE    so       A} A A A A A A A0 _%0A6RRR
 2637+/,1/4$)59= =|= #5<#=>= !.	=
 u/0= "%= $D>= 'tn= D>= !!12= -.= 
u (51BEDU1U+V"WW	X= = = SR= = = = =r4   rE  c                   h    e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZdZ eed          eeegd	Zd
S )MiniMaxPreTrainedModelrv   modelTrE  r   Fr!   )index)r9  r?   
attentionsN)rG   rH   rI   r"   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   r  rE  r   ru   _can_record_outputsr  r4   r3   r_  r_  @  s         &*#./#4"5N""&'(=QGGG,')BC r4   r_  c                   |     e Zd ZU ej        ed<   ddef fdZ ej                    e	d                         Z
 xZS )MiniMaxRotaryEmbeddinginv_freqNrv   c                    t                                                       t          |d          rSt          |j        t
                    r9|j                            d|j                            d                    | _        nd| _        |j        | _	        |j        | _
        || _        t          | j                 | _        |                     | j        |          \  }| _        |                     d|d           | j        | _        d S )Nrope_scaling	rope_typetypedefaultrp  F)
persistent)r(   r)   hasattr
isinstancerr  dictgetrs  max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrv   r   rope_init_fnattention_scalingr   rp  original_inv_freq)r/   rv   r,  rp  r2   s       r3   r)   zMiniMaxRotaryEmbedding.__init__V  s    6>** 	'z&:Mt/T/T 	'#044[&BUBYBYZ`BaBabbDNN&DN"("@$*$B!/?+/+<+<T[&+Q+Q($(ZeDDD!%r4   c                 X   | j         d d d d f                                                             |j        d         dd                              |j                  }|d d d d d f                                         }t          |j        j        t                    r|j        j        dk    r|j        j        nd}t          j
        |d          5  |                                |                                z                      dd          }t          j        ||fd	          }|                                | j        z  }|                                | j        z  }	d d d            n# 1 swxY w Y   |                    |j        
          |	                    |j        
          fS )Nr   r7   r!   mpscpuF)device_typeenabledr6   rf   r   )rp  r   r   rE   r:   r,  rx  rt  strr+   autocastr   r   r   r  r   r9   )
r/   r   r   inv_freq_expandedposition_ids_expandedr  freqsembr   r   s
             r3   rB   zMiniMaxRotaryEmbedding.forwardg  s    !M$4-8>>@@GGHZ[\H]_acdeehhijiqrr ,QQQaaaZ 8 > > @ @'1!(-'E'Ek!(-[`J`J`ahmmfk^UCCC 	5 	5&,,..1F1L1L1N1NNYYZ[]^__E)UEN333C''))d44C''))d44C		5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 vvAGv$$cff17f&;&;;;s   BE++E/2E/rO   )rG   rH   rI   r+   rs   rc  r"   r)   no_gradr   rB   rJ   rK   s   @r3   ro  ro  S  s         l/ /} / / / / / /" U]__< <  _< < < < <r4   ro  c                       e Zd Zdef fdZe	 	 	 	 	 	 	 	 ddeej                 deej	                 deej                 dee
         deej                 d	ee         d
ee         deej                 dee         defd            Z xZS )MiniMaxModelrv   c                    t                                                     j        | _        j        | _        t          j        j        j        | j                  | _        t          j	        fdt          j                  D                       | _        t          j        j                  | _        t!                    | _        d| _        |                                  d S )Nc                 0    g | ]}t          |          S r  )rE  )r  rW   rv   s     r3   r   z)MiniMaxModel.__init__.<locals>.<listcomp>  s$    eee	 33eeer4   rG  )rv   F)r(   r)   pad_token_idpadding_idx
vocab_sizer   	Embeddingr0   embed_tokensr&  rT   r   ri   r%   rK  r   ro  
rotary_embgradient_checkpointing	post_initr  s    `r3   r)   zMiniMaxModel.__init__y  s       !. +L):F<NPTP`aameeeeU6KcEdEdeee
 
 #6#56;NOOO	0???&+# 	r4   N	input_idsr   r   r   inputs_embedsrZ  rX  r   r   r   c	                    |d u |d uz  rt          d          |r|t                      }n7|r5t          |t                    s t          dt          |           d          ||                     |          }|B||                                nd}
t          j        |
|
|j        d         z   |j	                  }||
                    d          }| j        j        t          nt          } || j        |||||          }|}|                     ||          }| j        D ]"}|j        dk    r|}n|} ||f||||||d	|	}#|                     |          }t'          ||
          S )Nz:You must specify exactly one of input_ids or inputs_embedszSMiniMax uses cache of its own and is not compatible with `past_key_values` of type .r   r!   )r,  )rv   input_embedsr   r   r   r   full_attention)r   r   r   r   rZ  r   )last_hidden_stater   )
ValueErrorrM   rx  rt  r  get_seq_lengthr+   r   rE   r,  r   rv   r  r   r   r  ri   rO  r   r   )r/   r  r   r   r   r  rZ  rX  r   r   past_seen_tokensmask_functionr   r?   r   decoder_layerinput_attention_masks                    r3   rB   zMiniMaxModel.forward  s    -t";< 	[YZZZ 	0*nnOO 	z/<HH 	~fjkzf{f{~~~     --i88M!CRC^==???de"\ "2]5H5K"KTaTh  N )33A66L.2k.H.P**Vw#m;&))+%
 
 
 & #oom\JJ![ 	 	M'+;;;'2$$ (6$)M	$73) /#-	 	 	 	MM 		-00%++
 
 
 	
r4   )NNNNNNNN)rG   rH   rI   r"   r)   r    r   r+   r   rs   rM   r]  r   r   r   r   rB   rJ   rK   s   @r3   r  r  w  s3       }         1515372659$(,059G
 G
E,-G
 !.G
 u/0	G

 ",/G
   12G
 D>G
 $D>G
 !!12G
 +,G
 
 G
 G
 G
 G
 G
 G
 G
 G
r4   r  r6   gate_logitsr"  c                    | t          | t                    sdS t          | t                    r/| d         j        t          j        fd| D             d          }t          j        j                            |d          }t          j        ||d          \  }}t          j        j        	                    ||          }|@t          j
        |                                d          }	t          j
        |d          }
n.|j        \  }}|j        d         ||z  z  }|dddddddf                             |||||f                              d||                                        }t          j        |                                |z  d          t          j        |d          z  }	|ddddddf                             ||||f                              d|                                        }t          j        ||z  d          t          j        |d          z  }
t          j        |	|
                    d          z            }||z  S )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   c                 :    g | ]}|                               S r  )r:   )r  
layer_gatecompute_devices     r3   r   z,load_balancing_loss_func.<locals>.<listcomp>  s&    -j-j-jPZjmmN.K.K-j-j-jr4   rf   r7   )rx  rD   r,  r+   r   r   r   r   r0  r2  r=   r   rE   r   r   r:   r1  r   )r  r"  r$  r   concatenated_gate_logitsr:  rX   r;  r=  tokens_per_expertrouter_prob_per_expertr   r8  r   expert_attention_mask router_per_expert_attention_maskoverall_lossr  s                    @r3   load_balancing_loss_funcr    s   : *[%"@"@q+u%% s$Q.#(9-j-j-j-j^i-j-j-jpq#r#r#r h)112JPR1SSO*_eDDDA(%--.>LLK!J{'8'8':':BBB "'O!C!C!C&4&:#
O4:1=*B^_ 4AAAtT12V&
OUKXYYWR,,R	 	 "Ik&7&7&9&9<Q&QWXYYY\a\e!q]
 ]
 ]
 
 4AAAt+,V&
O[QRRWR%%R	 	) "'?=]+]cd!e!e!ehmhq,!i
 i
 i
 "
 9.1G1Q1QRS1T1TTUUL+%%r4   c                   x    e Zd ZdgZddiZddgdgfiZ fdZee	 	 	 	 	 	 	 	 	 	 dd	e	e
j                 d
e	e
j                 de	e
j                 de	e         de	e
j                 de	e
j                 de	e         de	e         de	e
j                 deee
j        f         dee         defd                        Z xZS )MiniMaxForCausalLMzlm_head.weightlm_headcolwise_repr?   logitsc                 F   t                                          |           t          |          | _        |j        | _        t          j        |j        |j        d          | _        |j	        | _	        |j
        | _        |j        | _        |                                  d S r  )r(   r)   r  r`  r  r   r   r0   r  router_aux_loss_coefr!  r"  r#  r  r  s     r3   r)   zMiniMaxForCausalLM.__init__,  s       !&))
 +y!3V5FUSSS$*$?!!3#)#=  	r4   Nr   r  r   r   r   r  labelsrZ  rY  r   logits_to_keepr   r   c                    ||n| j         j        } | j        d||||||||	d|}|j        }t	          |
t
                    rt          |
 d          n|
}|                     |dd|ddf                   }d}| | j        ||| j	        fi |}d}|rHt          |j        | j        | j        |          }|%|| j        |                    |j                  z  z  }t#          ||||j        |j        |j        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MiniMaxForCausalLM

        >>> model = MiniMaxForCausalLM.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)r  r   r   r   r  rZ  rY  r   )lossaux_lossr  r   r?   rb  r9  r  )rv   rY  r`  r  rx  rr   slicer  loss_functionr  r  r9  r"  r#  r  r:   r,  r   r   r?   rb  )r/   r  r   r   r   r  r  rZ  rY  r   r  r   outputsr?   slice_indicesr  r  r  s                     r3   rB   zMiniMaxForCausalLM.forward8  sq   P %9$D  $+Jj 	
 +5$* 
+
)%+'!5)
+
 
+
 
+
 
+
  18B>SV8W8Wk~ot444]kmAAA}aaa,?@AA%4%ffdoPPPPD 	M/% (	 H !1HKK4L4LLL(#3!/)!/
 
 
 	
r4   )
NNNNNNNNNr   )rG   rH   rI   _tied_weights_keys_tp_plan_pp_planr)   r   r   r   r+   r   rs   r	   r]  r   r   rr   r   r   r   rB   rJ   rK   s   @r3   r  r  &  s       *+=)H_-z:;H
 
 
 
 
  151537+/59-1$(/35934R
 R
E,-R
 !.R
 u/0	R

 "%R
   12R
 )*R
 D>R
 'tnR
 !!12R
 c5</0R
 +,R
 
#R
 R
 R
 ^ R
 R
 R
 R
 R
r4   r  c                       e Zd ZdS ) MiniMaxForSequenceClassificationNrG   rH   rI   r  r4   r3   r  r            Dr4   r  c                       e Zd ZdS )MiniMaxForTokenClassificationNr  r  r4   r3   r  r    r  r4   r  c                       e Zd ZdS )MiniMaxForQuestionAnsweringNr  r  r4   r3   r  r    r  r4   r  )r_  r  r  r  r  r  rS   )r   )Nr6   N)Ktypingr   r   r   r+   torch.nn.functionalr   r   r   activationsr   cache_utilsr	   r
   
generationr   integrationsr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   r   r   r   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.deprecationr   utils.genericr   r    configuration_minimaxr"   Moduler%   rM   ru   r   r   rs   rr   r   r   r   r   r  r  rE  r_  ro  r  rD   r  r  r  r  r  __all__r  r4   r3   <module>r     s  . - , , , , , , , , ,                 ! ! ! ! ! ! . . . . . . . . ) ) ) ) ) ) 7 7 7 7 7 7 R R R R R R R R B B B B B B            R Q Q Q Q Q Q Q K K K K K K K K F F F F F F F F & & & & & & I I I I I I I I I I 0 0 0 0 0 0 ? ? ? ? ? ? ? ? 0 0 0 0 0 0 Y''J J J J JRY J J ('J(+I +I +I +I +I< +I +I +I\Q/ Q/ Q/ Q/ Q/	 Q/ Q/ Q/h( ( (   6	UU\ 	U# 	U%, 	U 	U 	U 	U& % %I%<% 
% <	%
 U\*% % % '(% % % %4<) <) <) <) <)ry <) <) <)~% % % % %	 % % %$@2 @2 @2 @2 @2BI @2 @2 @2FW W W W W4 W W Wt     _   $!< !< !< !< !<RY !< !< !<H Y
 Y
 Y
 Y
 Y
) Y
 Y
 Y
| "&
-1	O& O&u|U5<%8$>?O&#O& U\*	O&
 5<O& O& O& O&d e
 e
 e
 e
 e
/ e
 e
 e
P	 	 	 	 	'GI_ 	 	 		 	 	 	 	$ACY 	 	 		 	 	 	 	"=?U 	 	 	  r4   