
     `iI                    \   d dl mZmZmZmZmZ d dlZd dlmc m	Z
 d dlmZ d dlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z* ddl+m,Z,  e*            rd dl-m.Z. d dl/m0Z0m1Z1 ndZ. e)            r	d dl2m3Z3m4Z4 nd\  Z4Z3 e$            rd dl5m6Z6 ddl7m8Z8  e%j9        e:          Z;d Z<dSdZ=dej>        de?dej>        fdZ@	 dTd ejA        d!ej>        d"ej>        d#ej>        d$eej>                 d%eBd&eBfd'ZC G d( d)ejA                  ZD G d* d+          ZEd,ej>        d-e?fd.ZFd/ ZGd0 ZH eIe.e3e4f          ZJd1 ZK G d2 d3ejA                  ZL G d4 d5ej        jA                  ZM G d6 d7ejA                  ZN G d8 d9ed:;          ZO G d< d=ejA                  ZP G d> d?ejA                  ZQ G d@ dAejA                  ZR G dB dCejA                  ZS G dD dEe          ZTe" G dF dGe                      ZU G dH dIejA                  ZVe" G dJ dKeU                      ZW	 	 	 dUdMeej>        eXej>                 df         dNee?         d$eej>                 deej>        e?f         fdOZY G dP dQeUe          ZZg dRZ[dS )V    )AnyCallableOptional	TypedDictUnionN)nn)ACT2FN   )Cache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutputWithPastMoeCausalLMOutputWithPastMoeModelOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tupleis_torch_flex_attn_availablelogging)deprecate_kwarg)is_causal_conv1d_availableis_mamba_2_ssm_available   )GraniteMoeHybridConfig)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combined)causal_conv1d_fncausal_conv1d_updateNN)	BlockMask)make_flex_block_causal_maskc                     | dd| j         d         dz  f         }| d| j         d         dz  df         }t          j        | |fd          S )z*Rotates half the hidden dims of the input..N   dim)shapetorchcat)xx1x2s      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/granitemoehybrid/modeling_granitemoehybrid.pyrotate_halfr4   A   s]    	
3"!'"+"""	#B	
3q """	#B9rc2YB''''    c                     |                     |          }|                     |          }| |z  t          |           |z  z   }||z  t          |          |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer4   )qkcossinposition_idsunsqueeze_dimq_embedk_embeds           r3   apply_rotary_pos_embr@   H   sc    ( --
&
&C
--
&
&C3w;q>>C/0G3w;q>>C/0GGr5   hidden_statesn_repreturnc                     | j         \  }}}}|dk    r| S | dddddddddf                             |||||          } |                     |||z  ||          S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r-   expandreshape)rA   rB   batchnum_key_value_headsslenhead_dims         r3   	repeat_kvrK   c   s    
 2?1D.Ehzz!!!!QQQaaa"23::5BUW\^bdlmmM  (;e(CT8TTTr5           modulequerykeyvalueattention_maskscalingdropoutc                 R   t          || j                  }t          || j                  }	t          j        ||                    dd                    |z  }
|$|d d d d d d d |j        d         f         }|
|z   }
t          j                            |
dt          j	                  
                    |j                  }
t          j                            |
|| j                  }
t          j        |
|	          }|                    dd                                          }||
fS )Nr*   r
   r)   )r,   dtype)ptrainingr   )rK   num_key_value_groupsr.   matmul	transposer-   r   
functionalsoftmaxfloat32torV   rS   rX   
contiguous)rM   rN   rO   rP   rQ   rR   rS   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r3   eager_attention_forwardrg   o   s    3 ;<<JUF$?@@L<z';';Aq'A'ABBWLL!$QQQ111.D
0@0D.D%DE#k1 =((2U](SSVVW\WbccL=((6?([[L,|\::K''1--88::K$$r5   c                   f    e Zd ZdZdedef fdZ eddd          	 	 	 	 	 	 ddej	        de
ej	                 de
ej                 de
e         dede
ej                 de
eej	        ej	        f                  deej	        e
ej	                 e
eej	                          f         fd            Z xZS )GraniteMoeHybridAttentionz=Multi-headed attention from 'Attention Is All You Need' paperconfig	layer_idxc                    t                                                       || _        || _        |(t                              d| j        j         d           |j        | _        |j	        | _	        |j
        | _        | j	        | j        z  | _        |j        | _        | j        | j        z  | _        d| _        |j        | _        | j        | j        z  | j	        k    r t%          d| j	         d| j         d          t'          j        | j	        | j        | j        z  |j                  | _        t'          j        | j	        | j        | j        z  |j                  | _        t'          j        | j	        | j        | j        z  |j                  | _        t'          j        | j	        | j	        |j                  | _        d S )NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.Tz?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).bias)super__init__rj   rk   loggerwarning_once	__class____name__attention_dropouthidden_sizenum_attention_heads	num_headsrJ   rH   rY   	is_causalattention_multiplierrR   
ValueErrorr   Linearattention_biasq_projk_projv_projo_projselfrj   rk   rs   s      r3   rp   z"GraniteMoeHybridAttention.__init__   s   ",!8 , , ,   "(!9!-3(DN:#)#= $(Nd6N$N!2MDN*t/???8RVRb 8 8%)^8 8 8  
 i 0$.4=2PW]Wlmmmi 0$2JT]2Zagavwwwi 0$2JT]2Zagavwwwi 0$2BI^___r5   past_key_valuepast_key_values4.58new_nameversionNFrA   rQ   r<   	use_cachecache_positionposition_embeddingsrC   c                    |                                 \  }	}
}|                     |          }|                     |          }|                     |          }|                    |	|
| j        | j                                      dd          }|                    |	|
| j        | j                                      dd          }|                    |	|
| j        | j                                      dd          }||nd\  }}|t          ||||          \  }}|&|||d}|
                    ||| j        |          \  }}t          }| j        j        dk    rt          | j        j                 } || ||||f| j        sdn| j        | j        d|\  }}|                    |	|
d          }|                     |          }||fS )	Nr   r*   r%   )r;   r:   r   eagerrL   )rS   rR   r)   )sizer~   r   r   viewrx   rJ   r[   rH   r@   updaterk   rg   rj   _attn_implementationr   rX   ru   rR   r   )r   rA   rQ   r<   r   r   r   r   ra   bszq_len_query_statesrb   rc   r:   r;   cache_kwargsattention_interfacerf   rd   s                        r3   forwardz!GraniteMoeHybridAttention.forward   s    &**,,UA{{=11[[//
{{=11#((eT^T]SS]]^_abcc__S%1I4=YYccdeghii
#((eT5Mt}]]gghiklmm*=*I&&|S*';L*VY[^'_'_$L*&#&snUUL'6'='=j,X\Xfht'u'u$J(?;+w66"9$+:Z"[$7$7	%
  $}HCC$2HL	%
 	%
 	%
 	%
!\ "&&sE266kk+..L((r5   )NNNFNN)rt   
__module____qualname____doc__r   intrp   r   r.   Tensorr   
LongTensorr   booltupler   __classcell__rs   s   @r3   ri   ri      sH       GG`5 `# ` ` ` ` ` `@ _%0A6RRR 2637+/59KO0) 0)|0) !.0) u/0	0)
 "%0) 0) !!120) &eEL%,,F&GH0) 
u|Xel3XeEL>Q5RR	S0) 0) 0) SR0) 0) 0) 0) 0)r5   ri   c                       e Zd ZdZdZej        dfdefdZ	 ddej	        dej	        de
d	eeeef                  d
eej	        ej	        f         f
dZdej        fdZddee
         d
e
fdZdS ) HybridMambaAttentionDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    FNrj   c                 $   |j         | _         d| _        |j        }|j        }g | _        g | _        g | _        t          |j                  D ]}| j         |         dk    rw| xj        t          j
        |j        |j        z  d|j        z  |z  z   ||          gz  c_        | xj        t          j
        |j        |j        ||          gz  c_        | xj        t          j        g gz            gz  c_        | xj        t          j        g gz            gz  c_        | j                            |           fdt          |j                  D             | _        fdt          |j                  D             | _        d S )NFmambar*   devicerV   r   c                 D    g | ]}t          j        g gz             S r   r.   tensor.0r   
batch_sizer   s     r3   
<listcomp>z=HybridMambaAttentionDynamicCache.__init__.<locals>.<listcomp>  s/    rrrQ%,tj'8HHHrrrr5   c                 D    g | ]}t          j        g gz             S r   r   r   s     r3   r   z=HybridMambaAttentionDynamicCache.__init__.<locals>.<listcomp>  s/    tttqEL"
):6JJJtttr5   )layers_block_typehas_previous_statemamba_d_convmamba_d_stateconv_states
ssm_statestransformer_layersrangenum_hidden_layersr.   zerosmamba_expandrv   mamba_n_groupsmamba_n_headsmamba_d_headr   append	key_cachevalue_cache)r   rj   r   rV   r   conv_kernel_sizessm_state_sizeis     ` `   r3   rp   z)HybridMambaAttentionDynamicCache.__init__   s   !'!9"'!.-"$v/00 	2 	2A%a(G33  K",v/AAAH]D]`nDnn(%#  %    K",+&%#  	$ 	   U\2$2CF%S%S%S$TT  EL"
1B6$R$R$R#SS'..q1111rrrrrRWX^XpRqRqrrrtttttTYZ`ZrTsTstttr5   rb   rc   rk   r   rC   c                 D   | j         |         j        d         dk    r|| j         |<   || j        |<   nVt          j        | j         |         |gd          | j         |<   t          j        | j        |         |gd          | j        |<   | j         |         | j        |         fS )Nr)   r   r*   r+   )r   r-   r   r.   r/   )r   rb   rc   rk   r   s        r3   r   z'HybridMambaAttentionDynamicCache.update  s     >)$*2.!33(2DN9%*6DY''(-	4>)3Lj2Y_`(a(a(aDN9%*/)T5Ei5PR^4_ef*g*g*gDY'~i($*:9*EEEr5   beam_idxc                    t          t          | j                            D ];}| j        |         j        }| j        |                             d|                    |                    | j        |<   | j        |         j        }| j        |                             d|                    |                    | j        |<   | j        |         j        }| j        |                             d|                    |                    | j        |<   | j        |         j        }| j        |                             d|                    |                    | j        |<   =dS )zDReorders the cache for beam search, given the selected beam indices.r   N)	r   lenr   r   index_selectr_   r   r   r   )r   r   rk   r   s       r3   reorder_cachez.HybridMambaAttentionDynamicCache.reorder_cache*  sE   s4>2233 		i 		iI^I.5F(,y(A(N(NqRZR]R]^dReRe(f(fDN9%%i07F*.*:9*E*R*RSTV^VaVabhViVi*j*jDY'%i07F*.*:9*E*R*RSTV^VaVabhViVi*j*jDY'_Y/6F)-)C)P)PQRT\T_T_`fTgTg)h)hDOI&&		i 		ir5   r   c                     || j         vr| j         d         n|}t          | j                  |k    rdS | j        |         j        d         S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   rU   )r   r   r   r-   )r   rk   s     r3   get_seq_lengthz/HybridMambaAttentionDynamicCache.get_seq_length7  sS     3<4CZ2Z2ZD+A..`i	t~)++1~i(.r22r5   N)r   )rt   r   r   r   is_compileabler.   float16r   rp   r   r   r   dictstrr   r   r   r   r   r    r5   r3   r   r      s         NIN_c $u $u5 $u $u $u $uV 26F FLF lF 	F
 tCH~.F 
u|U\)	*F F F F"ie&6 i i i i3 3 3c 3 3 3 3 3 3r5   r   input_tensorpad_sizec                     t          | j                  dk    r
ddddd|ddfnddd|ddf}t          j        j                            | |dd          S )z
    Padding x tensor with `pad_size` on the seq_len dim (dim=1)

    Assumes that we only have tensors of either size 4 or 3
       r   constant)moderP   )r   r-   r.   r   r\   pad)r   r   	pad_shapes      r3   pad_tensor_by_sizer   C  sj     47|7I3J3Ja3O3OAq!Q!Q//VWYZ\]_gijlmUnI8""<ST"UUUr5   c                 "   t          | |          } t          | j                  dk    r.|                     | j        d         d|| j        d                   S |                     | j        d         d|| j        d         | j        d                   S )z
    Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
    simultaneously splitting it into chunk sequences.

    Assumes that we only have tensors of either size 4 or 3
    r
   r   r)   r*   )r   r   r-   rF   )r   r   
chunk_sizes      r3   reshape_into_chunksr   N  s     &lH==L
<!####L$6q$92z<K]^_K`aaa ##q!2z<3Ea3H,J\]^J_
 
 	
r5   c                    |                      d          } | d         j        g |                                  |R  } t          j        t          j        ||| j        t          j                  d          }|                     | d          } t          j        | d          }t          j        t          j        ||| j        t          j                  d          }|                    | t          j	                   }|S )zo
    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
    r)   .Nr   diagonalr   rU   r+   )
r   rE   r.   trilonesr   r   masked_fillcumsuminf)r   r   masktensor_segsums       r3   segment_sumr   b  s     ""2&&J 2<	*1S<3D3D3F3FS
SSSL:ejZ@S[`[efffqstttD++TE155LL2666M :ejZ@S[`[efffqrsssD!--teeiZ@@Mr5   c                     |N|j         d         dk    r=|j         d         dk    r,| j        }| |dddddf         z                      |          } | S )zm
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    Nr   r   )r-   rV   r_   )rA   rQ   rV   s      r3   apply_mask_to_padding_statesr   y  si     !n&:1&=&A&AnFZ[\F]`aFaFa#&111d
)CCGGNNr5   c                   n    e Zd ZdZdedef fdZ	 	 	 	 ddej        de	e
         de	ej                 d	e	ej                 d
e	ej                 f
dZ	 	 	 dde	e
         de	ej                 d	e	ej                 fdZ	 	 	 	 dde	e
         de	ej                 d	e	ej                 d
e	ej                 fdZ xZS )GraniteMoeHybridMambaLayeruP  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    The are a few differences between this and Mamba2Mixer:
    - The variable use_precomputed_states is slightly different due to the hybrid cache structure
    - There's a few non-obvious bugs fixed with batching in the slow path that exist in main
    - Some extra variables that our layer doesn't need have been removed
    - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged
    rj   rk   c           	         t                                                       |j        | _        |j        | _        |j        | _        |j        | _        t          |j
        | j        z            | _        || _        |j        | _        |j        | _        t"          |j                 | _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        dt;          d          f| _        d| _        d| _         | j        d| j        z  | j        z  z   | _!        tE          j#        | j!        | j!        |j        | j        | j!        | j        dz
            | _$        | j        | j!        z   | j        z   }tE          j%        | j        || j                  | _&        tE          j'        tQ          j)        | j                            | _*        tQ          j+        d| j        dz             }tE          j'        tQ          j,        |                    | _-        t]          | j        | j        	          | _/        tE          j'        tQ          j)        | j                            | _0        tE          j%        | j        | j        | j                  | _1        td          stf          4                    d
           d S tf          4                    d           d S )NrL   r   gMbP?g?r*   r   )in_channelsout_channelsrn   kernel_sizegroupspaddingrm   epsa  The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzOThe fast path for GraniteMoeHybrid will be used when running the model on a GPU)5ro   rp   r   rx   rv   r   r   r   r   r   r   intermediate_sizerk   mamba_conv_biasuse_conv_bias
hidden_act
activationr	   actmamba_proj_biasuse_biasrms_norm_epslayer_norm_epsilonr   n_groupsr   rJ   mamba_chunk_sizer   floattime_step_limittime_step_mintime_step_maxconv_dimr   Conv1dconv1dr|   in_proj	Parameterr.   r   dt_biasarangelogA_logGraniteMoeHybridRMSNormGatednormDout_projis_fast_path_availablerq   rr   )r   rj   rk   projection_sizeArs   s        r3   rp   z#GraniteMoeHybridMambaLayer.__init__  st   -!-$2 & 3!$V%84;K%K!L!L"#3 +&+,."("5-+ 1 !$U5\\2" .T]1BTEX1XXi'-=)A-
 
 
 04=@4>Qy
 
 
 |EJt~$>$>?? LDNQ.//\%)A,,//
01GTMdeee	ej8899	$"8$:JQUQ^___% 	s>      qrrrrrr5   NrA   cache_paramsr   rQ   seq_idxc                    t          ||          }|                     |          }|j        \  }}}	| j        | j        z  }
|d uob|j        o[|dk    oU|j        | j                 j        d         |j        | j                 j        d         cxk    o|k    nc o|d uo|d         dk    }|r|	                    d          
                    | j        | j        | j        gd          \  }}}t          ||j        | j                 | j        j        	                    d          | j        j        | j                  }t'          j
        || j        |
|
gd          \  }}}t'          j        | j                                                   }|d d d df         d d d d d f                             d| j        | j                                      t&          j                  }|d d d d d f                             dd| j                  }| j        d d d df                             d| j                  }| j        d d d df                             d| j                  }|                    || j        |j        d         | j        z            }|                    || j        |j        d         | j        z            }|                    || j        | j                  }t=          |j        | j                 ||||||d |d
  
        }|                    || j        | j        z            }|                     ||          }|                      |          d d d df         }nlt'          j        | j                                                   }| j!        d	t-          d
          fk    ri nd| j!        i}| j"        r|tG          || j        j        	                    d          | j        j        | j        |f| j        | j$        || j        | j        j        | j        j%        | j         j        | j         j        | j        | j        ddd|}n|
                    | j        | j        | j        gd          \  }}}|p|&                    dd          }tN          j(        )                    || j*        |j        d         z
  df          }|j        | j                 +                    |           | j        dvr[| ,                    |                     |&                    dd                    dd |f         &                    dd                    }nht[          |&                    dd          | j        j        	                    d          | j        j        | j        |          &                    dd          }t          ||          }t'          j
        || j        |
|
gd          \  }}}t]          |                    ||d| j                  |||                    ||| j        d          |                    ||| j        d          f| j$        | j        d |d| j        dd|\  }}|'|%|j        | j                 +                    |           |                    ||d          }|                     ||          }|                      |          }|S )Nr   r   r)   r+   .rV   T)zr  dt_softplusrL   r   dt_limitF)r  r   r  r   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr*   )siluswish)r0   weightrn   r   r  )r   r  r  r  r(  r  r  )/r   r  r-   r  r   r   r   rk   r   squeezesplitr   r
  rx   r$   r  r+  rn   r   r.   expr  r  rE   rJ   r_   r^   r  r  r   r    r  r  r  rX   r"   r   variance_epsilonr[   r   r\   r   r   copy_r   r#   r!   )r   rA   r  r   rQ   r  projected_statesr   seq_lenr   groups_time_state_sizeuse_precomputed_statesgatehidden_states_B_CdtBCr  r  r  hidden_states_reshapedoutdt_limit_kwargshidden_states_B_C_transposedr   scan_output	ssm_states                              r3   cuda_kernels_forwardz/GraniteMoeHybridMambaLayer.cuda_kernels_forward  sJ    5]NSS<<66 "/!4
GQ!%1D!D $ &/&1& (8>qA&t~6<Q?      & d*& q!A% 	 " J	1*:*B*B1*E*E*K*K'GR +L + +'D#R
 !5!(8"**1-- ! ! #(+!')?AWX# # #M1a 4:++--...A!!!T3,111d
+222t}dFYZZ]]didq]rrAAAAqqq$J&&r2t}==Bl111dC<077DMJJGqqq$|$++B>>Az4=!'!*2MNNAz4=!'!*2MNNA%2%7%7
DNTXTa%b%b"2'7&   M *..z4>DM;YZZM IImT::M --..qqq$|<CC 4:++--...A$($8S%,,<O$O$ObbV`bfbvUwO } V1!56$K&..q11K$L f####'9#3 $	 :#'=#7!%!3 M M%*(-# $ &% , /?.D.D+T]DNKQS /E / /+'  + 4E3N3NqRS3T3T0"$-"3"34.1M1STV1WWYZ[# #K !,T^<BB;OOO?*;;;(,$5$?$?1$E$EFFsHWH}U__`acdee) )%% )9+55a;;#{199!<<![-#'? ') ) )  i1oo & %AARTb$c$c!&+k%+-CE[\' ' '#q! *C!&&z7BNNFF:wrBBFF:wrBB*  $f#(, L $* * &* *&Y" (\-E +DN;AA)LLL)..z7BGG"iiT:: mmK00
r5   c                    3 |j         \  }}}|j        }t          ||          }                     |          }	|	                     j         j         j        gd          \  }
}}|d uob|j        o[|dk    oU|j	         j
                 j         d         |j         j
                 j         d         cxk    o|k    nc o|d uo|d         dk    }|r|j	         j
                                     dd          |j	         j
        <   |d d dd d f                             |j	         j
                 j                  |j	         j
                 d d d d df<   |j	         j
                                      j        j        j                  }t#          j        | j        j                            d          z  d          } j        r| j        j        z   }                     |          }n|p|                    dd          }t0          j                            | j        |j         d         z
  df          }|j	         j
                                     |                                                     |                    dd                    dd |f                             dd                    }t          ||          }t#          j        | j         j         j        z   j         j        z  gd          \  }}}t#          j         j         !                                           }|ra|j         j
                 j        }|d d dd d f         d d d df         }|                    dd          "                    ||j         d          j#                  } j$        d	         "                     j$        j         d          j#                  }t"          j        j        %                    ||                    |j                  z             }t#          j&        | j'        d          j'        d                   }|d
         "                     j         j#         j                                      t"          j(                  }t#          j        |d	         |z                                |          }|)                    | j        d          dd d d f         }|"                    | j         j         j        z  |j         d                   *                                }|)                    |d|j         d                   }|d	         |dd d d f         z  }|)                    |d j#                  }||d	         z                      |          }|j         j
                                     |j         j
                 |z  |z              |)                    | j        d          dd d d f         }|"                    | j         j         j        z  |j         d                   *                                }|)                    |d|j         d                   }|j         j
                                     |j        |j                  }|+                    | j        z   j#         j                  }|+                    | j        z   j        d          }t#          j,        ||          }|+                    | j         j#                  } j-        d	         "                     j-        j         d          j#                  }|||z  z                       |j                  }|)                    |d          d d d df         }n+t0          j        %                    | j$        z             }t#          j&        | j'        d          j'        d                   }|)                    ||d j#                  !                                }|)                    ||d j                  !                                }|)                    ||d j                  !                                }|.                     j         j        z  d j                  }|.                     j         j        z  d j                  } j/        | j/        z  z
   j/        z  3 j-        d	         ta          |3          z  }||d	         z  }|                    |j                  |z  }3 fd||||fD             \  }}}}|1                    dddd          }t#          j2        |d          }t#          j        tg          |                    } |d d d d d d d d d d d f         |d d d d d d d d d d d f         z  }!|!                    d          }"|"d	         | 1                    ddddd          d	         z  }#|#                    d          }$|$d	         |d d d d d f         z                      d          }%t#          j        |d d d d d d dd f         |z
            }&||&1                    dddd          d	         z  }'|'dd d d f         |d	         z                      d          }(|r7|j         j
                 d d d df                             |(j                  })n t#          j4        |(d d d df                   })t#          j5        |)|(gd          }(t#          j        tg          t0          j                            |d d d d d d df         d                              }*|*                    dd          }*|*d
         |(d d d d d df         z                      d          }+|+d d d df         |+d d df         },}(t#          j        |          }-|dd d d f         |(d d d d d df         z  }.|-1                    dddd          }/|.                    d          |/d	         z  }0|%|0z   }|)                    |d j         j#                  }||z   }3dk    r|d d d |d d d d f         }|)                    ||d          }|,'|%|j         j
                                     |,            6                    ||
          }1 7                    |1                    |                    }2|2S )Nr)   r+   r   r   )shiftsdimsr   r*   .r   ).NNr  r   )r,   output_sizec                 <    g | ]}t          |j                  S r   )r   r   )r   tr   r   s     r3   r   z<GraniteMoeHybridMambaLayer.torch_forward.<locals>.<listcomp>  s)    %z%z%z\]&9!Xt&W&W%z%z%zr5   r
   r   rU   )r   r   )8r-   rV   r   r  r-  r   r
  rx   r   r   rk   r   rollr_   r   r  r+  r.   sumr,  r   rn   r   r[   r   r\   r   r   r0  r  r   r.  r  r  rE   rJ   r  softplusclampr  r^   rF   r`   r   bmmr  repeat_interleaver   r   permuter   r   
zeros_liker/   r  r  )4r   input_statesr  r   rQ   r   r2  r   rV   r1  r5  r6  r7  r4  r   r=  rA   r8  r9  r  cache_devicer  dAdBdBxr   ssm_states_reshaped
C_reshapedyr  
D_residualA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decaystatesprevious_statesdecay_chunk
new_statesr?  state_decay_outC_times_statesstate_decay_out_permutedY_offr>  contextualized_statesr   s4   `                                                  @r3   torch_forwardz(GraniteMoeHybridMambaLayer.torch_forward~  s(    ".!3
GQ" 4L.QQ<<55&6&<&<'GR '= '
 '
#
 $ &/&1& (8>qA&t~6<Q?      & d*& q!A% 	 " 	x7C7OPTP^7_7d7dlnuw7d7x7xL$T^4ARSTSTSTVWYZYZYZSZA[A^A^_k_wx|  yG  `H  `O  BP  BPL$T^4QQQ2X> '24>BEET[M_MfEggK %	dk088;;;! ! ! ! I$58H$H! $): ; ; '/@/J/J1a/P/P, m//043HKgKmnpKq3qst2u  (8>>{KKK $5F5P5PQRTU5V5V)W)WX[]e^e]eXe)f)p)pqrtu)v)v w w89JN[[#k#T]T5H%H$-Z^ZmJmn
 
 
q! Ytz''))***! F	I'24>BIL AAAq!!!GQQQc\*Ba##**:rx|T]SSBl9-44T\5G5JDMZZG$--b7::bh3G3G.GHHBR!5a!8$:Nq:QRRB/"))$.$-I\]]``glgt`uuA)ByMA-..22,2GGB
 		*dmR88dAAAFAT]DNdm4SUVU\]_U`aallnnA		*b!'"+66AI3aaa<0B *11*b$-PPMi0044L4IIC #DN399'7"<sB   		*dmR88dAAAFAT]DNdm4SUVU\]_U`aallnnA		*b!'"+66A &0@CC18[\[bCccJ",//*t~2Mt}^b^q"r"r
T^ ;T=PRSTTJ	-z::Az4>4=AAA y!((a$-HHA]Q&&**1733A 		*b))!!!T3,7AA ''T\(9::BR!5a!8$:Nq:QRRB)11*gr4=YY__aaM		*gr43FGGMMOOA		*gr43FGGMMOOA##DNdm$CX\Xf#ggA##DNdm$CX\Xf#ggA'DO*CCtVH	*-?x-X-XXJ *ByM9M]())B.A &{%z%z%z%zboqrtuwxay%z%z%z"M1a 		!Q1%%A|A2...H 	+a..))A qqq!!!QQQaaa23a111dAAAqqq!!!8K6LLN""r"**A y\AIIaAq!,D,DY,OON""r"**A 	l]111aaa:%>>CCCJJF !9XaaaAAArssl%;h%FGGL,..q"b!<<YGGGc4l+mI.FFKKPQKRRF & B"."9$."I!!!TSV,"W"Z"Zbhbo"Z"p"p"'"26!!!RaR%="A"AY8a@@@F)K0A0A(111aaaQRQRQRTV;BWY_0`0`$a$abbK%//155K%o6111dC9PPUUZ[U\\J *111crc6 2Jqqq"u4EIF $i11OT111oqqq!!!T30GGN'6'>'>q!Q'J'J$#''++.Fy.QQE A		*b$.$-HHAJA!||aaa'111aaa'(		*gr22A $)A'7==iHHHii4((
 !%knnU.C.C D D$$r5   c                 d   t           r1d| j        j        j        j        v r|                     |||||          S |t          d          |j        }|G|j        d         dk    r6|j        d         dk    r%||d d d d d f         z  	                    |          }| 
                    ||||          S )Ncudaz\`seq_idx` support requires fast path support. Please install `mamba_ssm` and `causal_conv1d`r   r   )r  r  r+  r   typer@  NotImplementedErrorrV   r-   r_   rj  )r   rA   r  r   rQ   r  ra   rV   s           r3   r   z"GraniteMoeHybridMambaLayer.forwardM  s     " 	sf0C0J0O&O&O,,]L.Zhjqrrr%n   #%.*>q*AA*E*E.J^_`JadeJeJe*^AAAqqq$J-GGKKERRM!!-~~^^^r5   )NNNN)NNN)rt   r   r   r   r   r   rp   r.   r   r   r   r   	IntTensorr@  rj  r   r   r   s   @r3   r   r     s        ?s5 ?s# ?s ?s ?s ?s ?s ?sH DH5915-1g g|g ?@g !!12	g
 !.g %/*g g g gZ DH5915L% L% ?@L% !!12	L%
 !.L% L% L% L%d DH5915-1_ _ ?@_ !!12	_
 !._ %/*_ _ _ _ _ _ _ _r5   r   c                   (     e Zd Zd fd	ZddZ xZS )r  ư>c                     t                                                       t          j        t	          j        |                    | _        || _        d S r   ro   rp   r   r  r.   r   r+  r/  r   rv   r   rs   s      r3   rp   z%GraniteMoeHybridRMSNormGated.__init__e  sB    l5:k#:#:;; #r5   Nc                    |j         }|                    t          j                  }|?|t          j                            |                    t          j                            z  }|                    d                              dd          }|t          j	        || j
        z             z  }| j        |                    |          z  S Nr*   r)   T)keepdim)rV   r_   r.   r^   r   r\   r)  powmeanrsqrtr/  r+  )r   rA   r5  input_dtypevariances        r3   r   z$GraniteMoeHybridRMSNormGated.forwardj  s    #)%((77)BM,>,>twwu}?U?U,V,VVM $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::r5   rq  r   )rt   r   r   rp   r   r   r   s   @r3   r  r  d  sQ        $ $ $ $ $ $
	; 	; 	; 	; 	; 	; 	; 	;r5   r  c                   L     e Zd ZdZdef fdZdej        dej        fdZ xZ	S )GraniteMoeHybridMLPz~
    MLP layer for shared experts

    Args:
        config:
            Configuration object with model hyperparameters.
    rj   c                 D   t                                                       |j        | _        |j        | _        t
          |j                 | _        t          j	        | j        | j        dz  d          | _
        t          j	        | j        | j        d          | _        d S )Nr*   Frm   )ro   rp   rv   
input_sizeshared_intermediate_sizer	   r   r   r   r|   input_linearoutput_linearr   rj   rs   s     r3   rp   zGraniteMoeHybridMLP.__init__  s     ,!: !23Idot7G!7KRWXXXYt'7uUUUr5   rA   rC   c                     |                      |          }|                    dd          }|                     |d                   |d         z  }|                     |          }|S )Nr*   r)   r+   r   r   )r  chunkr   r  )r   rA   chunked_hidden_statess      r3   r   zGraniteMoeHybridMLP.forward  sj    ))-88 - 3 3A2 3 > >(=a(@AADYZ[D\\**=99r5   )
rt   r   r   r   r   rp   r.   r   r   r   r   s   @r3   r  r  v  s|         V5 V V V V V VU\ el        r5   r  c                   d    e Zd ZU dZej        ed<   ej        ed<   eed<   eed<   ej        ed<   dS )GraniteFlashAttentionKwargsa  
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    Attributes:
        cu_seq_lens_q (`torch.LongTensor`)
            Gets cumulative sequence length for query state.
        cu_seq_lens_k (`torch.LongTensor`)
            Gets cumulative sequence length for key state.
        max_length_q (`int`):
            Maximum sequence length for query state.
        max_length_k (`int`):
            Maximum sequence length for key state.
        seq_idx (`torch.IntTensor):
            Index of each packed sequence.
    cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kr  N)	rt   r   r   r   r.   r   __annotations__r   ro  r   r5   r3   r  r    sb          " ########_r5   r  F)totalc                   ,     e Zd Zd fd	Zd Zd Z xZS )GraniteMoeHybridRMSNormrq  c                     t                                                       t          j        t	          j        |                    | _        || _        dS )zF
        GraniteMoeHybridRMSNorm is equivalent to T5LayerNorm
        Nrs  rt  s      r3   rp   z GraniteMoeHybridRMSNorm.__init__  sD     	l5:k#:#:;; #r5   c                    |j         }|                    t          j                  }|                    d                              dd          }|t          j        || j        z             z  }| j        |                    |          z  S rv  )	rV   r_   r.   r^   rx  ry  rz  r/  r+  )r   rA   r{  r|  s       r3   r   zGraniteMoeHybridRMSNorm.forward  s|    #)%((77 $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::r5   c                 H    t          | j        j                   d| j         S )Nz, eps=)r   r+  r-   r/  )r   s    r3   
extra_reprz"GraniteMoeHybridRMSNorm.extra_repr  s&    )**II$2GIIIr5   r}  )rt   r   r   rp   r   r  r   r   s   @r3   r  r    sb        $ $ $ $ $ $; ; ;J J J J J J Jr5   r  c                   6     e Zd Zdedededdf fdZd Z xZS )GraniteMoeHybridParallelExpertsnum_expertsr  rD  rC   Nc                     t                                                       t          j        t	          j        |||                    | _        || _        || _        || _	        dS )a  
        Initialize the GraniteMoeHybridParallelExperts module.
        The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's compatible with
        many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and
        [ScatterMoE](https://github.com/shawntan/scattermoe), as well as the
        [MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py)
        used in vllm.

        Args:
            num_experts (int):
                Number of experts.
            input_size (int):
                Size of the input.
            output_size (int):
                Size of the output.
        N)
ro   rp   r   r  r.   emptyr+  r  r  rD  )r   r  r  rD  rs   s       r3   rp   z(GraniteMoeHybridParallelExperts.__init__  sW    " 	l5;{K#T#TUU&$&r5   c                    |                     |d          }g }t          | j                  D ];}|                    t	          j        ||         | j        |                              <t          j        |d          }|S )a  
        Forward pass of the GraniteMoeHybridParallelExperts module.

        Args:
            inputs (Tensor):
                Input tensor.
            expert_size:
                Expert size information.

        Returns:
            Tensor: Output tensor.
        r   r+   )	r-  r   r  r   Flinearr+  r.   r/   )r   inputsexpert_size
input_listoutput_listr   resultss          r3   r   z'GraniteMoeHybridParallelExperts.forward  s     \\+1\55
t'(( 	H 	HAqx
1t{1~FFGGGG)KQ///r5   rt   r   r   r   rp   r   r   r   s   @r3   r  r    sh        'C 'S 's 't ' ' ' ' ' '.      r5   r  c                   2     e Zd Zdededef fdZd Z xZS )GraniteMoeHybridTopKGatingr  r  top_kc                     t                                                       || _        || _        || _        t          j        ||d          | _        dS )a  
        Initialize the top-k gating mechanism.
        Args:
            input_size (`int`):
                Size of the input.
            num_experts (`int`):
                Number of experts.
            top_k (`int`):
                Number of top experts to select.
        Frm   N)ro   rp   r  r  r  r   r|   layer)r   r  r  r  rs   s       r3   rp   z#GraniteMoeHybridTopKGating.__init__  sM     	&$
Yz;UCCC


r5   c                    |                      |                                          }|                    | j        d          \  }}t	          j        |d                              |          }t	          j        |                    d          | j	        g|j
        |j                  }|                    d|d          }|                                                    d          }|                                }|                                }	|	                    d          \  }
}|                    | j        d          }|                                }||         }|||||fS )Nr   r+   r   rV   r   trunc)rounding_mode)r  r  topkr  r.   r]   type_asr   r   r  rV   r   scatterlongrH  tolistflattensortdiv)r   rA   logitstop_k_logitstop_k_indicestop_k_gatesr   gatesr  top_k_expertsr   index_sorted_expertsbatch_indexbatch_gatess                 r3   r   z"GraniteMoeHybridTopKGating.forward  sS   M**0022&,kk$*!k&D&D#mmLa888@@OO a  $"23;;LU`Ug
 
 
 a22jjll&&q)) "((** &--//"/"4"4Q"7"7*..tz.QQ "))++!"67#[+{FRRr5   r  r   s   @r3   r  r    sq        D3 DS D D D D D D D&S S S S S S Sr5   r  c                   .     e Zd ZdZdef fdZd Z xZS )GraniteMoeHybridMoEz
    A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.

    Args:
        config:
            Configuration object with model hyperparameters.
    rj   c                    t                                                       |j        | _        |j        | _        t
          |j                 | _        t          |j	        | j        | j        dz            | _
        t          |j	        | j        | j                  | _        t          | j        |j	        |j                  | _        d S )Nr*   )r  r  r  )ro   rp   rv   r  r   r	   r   r   r  num_local_expertsr  r  r  num_experts_per_tokrouterr  s     r3   rp   zGraniteMoeHybridMoE.__init__#  s     ,!3 !23;$dot7G!7K
 
 =$d&6
 
 10,
 
 
r5   c                 T   |                                 \  }}}|                    d|          }|                     |          \  }}}}}	||         }
|                     |
|          }|                    dd          }|                     |d                   |d         z  }|                     ||          }||dddf         z  }t          j        ||z  | j	        f|j
        |j                  }|                    d||          }|                    ||| j	                  }||	fS )a  
        Forward pass of the mixture of experts layer.

        Args:
            layer_input (Tensor):
                Input tensor.

        Returns:
            Tensor:
                Output tensor.
            Tensor:
                Router logits.
        r)   r*   r+   r   r   Nr  )r   rF   r  r  r  r   r  r.   r   r  rV   r   	index_addr   )r   layer_inputr   lengthemb_sizer   r  r  r  router_logitsexpert_inputsrA   r  expert_outputsr   layer_outputs                   r3   r   zGraniteMoeHybridMoE.forward6  s5    !, 0 0 2 2VX!))"h77BF++kBZBZ?;[-#K0))-EE - 3 3A2 3 > >(=a(@AADYZ[D\\++M;GG'+aaag*>>S6\4?;>CW`n`uvvvq+~FF#((fdoFF]**r5   )rt   r   r   r   r   rp   r   r   r   s   @r3   r  r    s^         
5 
 
 
 
 
 
&+ + + + + + +r5   r  c                   |    e Zd Zdedef fdZ eddd          	 	 	 	 	 	 	 dd
ej        de	ej                 de	e
         de	e         de	e         de	ej                 de	e         de	eej        ej        f                  dee         deej        e	eej        ej        f                  f         fd            Z xZS )GraniteMoeHybridDecoderLayerrj   rk   c                 J   t                                                       |j        | _        d | _        |j        dk    rt          |          | _        t          |j        |j                  | _	        t          |j        |j                  | _
        |j        | _        t          |          | _        d | _        |j        |         dk    rt!          ||          | _        nt#          ||          | _        |j        |         | _        t'          |dd          dk    | _        d S )Nr   r   r   r  )ro   rp   rv   	self_attnr  r  block_sparse_moer  r  input_layernormpost_attention_layernormresidual_multiplierr  
shared_mlpr   r   r   ri   
layer_typegetattrhas_expertsr   s      r3   rp   z%GraniteMoeHybridDecoderLayer.__init__W  s	   !-#a''$7$?$?D!6v7IvObccc(?@RX^Xk(l(l(l%#)#= -f55
#I.'993FIFFDJJ6vyIIDN 29= #6+>BBQFr5   r   r   r   r   NFrA   rQ   output_attentionsr   r   output_router_logitsr   ra   rC   c	                    |}
|                      |          }| j         | j        d||||d|	}d}n | j        d|||||||d|	\  }}|
|| j        z  z   }|}
|                     |          }| j        r1|                     |          \  }}||                     |          z   }n|                     |          }d}|
|| j        z  z   }|f}|r||fz  }|r||fz  }|S )a0  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs.Can be used to provide `GraniteFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
        N)rA   r   r  rQ   )rA   rQ   r   r  r   r   r   r   )r  r   r  r  r  r  r  r  )r   rA   rQ   r   r  r   r   r  r   ra   residualself_attn_weightsmoe_hidden_statesr  outputss                  r3   r   z$GraniteMoeHybridDecoderLayer.forwardn  so   L !,,];;:!&DJ +-,-	 
  M !%/=t~ 	0+- /"3#-$7	0 	0 	0 	0,M, !=43K#KK !55mDD 	!/3/D/D]/S/S,}-0N0NNMM OOM::M M =43K#KK " 	,)++G 	(''Gr5   )NNFFNFN)rt   r   r   r   r   rp   r   r.   r   r   r   r   r   r   r   r  FloatTensorr   r   r   s   @r3   r  r  V  sv       G5 G# G G G G G G. _%0A6RRR 26+/,1$)59/4KOU U|U !.U "%	U
 $D>U D>U !!12U 'tnU &eEL%,,F&GHU 45U 
u (51BEDU1U+V"WW	XU U U SRU U U U Ur5   r  c                   N     e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZ fdZ xZS )GraniteMoeHybridPreTrainedModelrj   modelTr  r   Fc                 R   t                                          |           t          |t                    r+|j        j                            d| j        j                   t          |t                    ry|j
        j                            d           t          j        t          j        d|j        dz                       |j        _        |j        j                            d           d S t          |t$                    r!|j        j                            d           d S d S )NrL   )ry  stdg      ?r   )ro   _init_weights
isinstancer  r+  datanormal_rj   initializer_ranger   r  fill_r.   r  r  rx   r  r  r  )r   rM   rs   s     r3   r  z-GraniteMoeHybridPreTrainedModel._init_weights  s    f%%%f=>> 	TM&&CT[5R&SSSf899 	*N%%c*** %	%,q&:JQ:N*O*O P PFLHM$$$$$ <== 	*M$$S)))))	* 	*r5   )rt   r   r   r   r  base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph_is_statefulr  r   r   s   @r3   r  r    sz         """"&*#78#4"5N"L	* 	* 	* 	* 	* 	* 	* 	* 	*r5   r  c                   |     e Zd ZU ej        ed<   ddef fdZ ej                    e	d                         Z
 xZS )GraniteMoeHybridRotaryEmbeddinginv_freqNrj   c                    t                                                       t          |d          rSt          |j        t
                    r9|j                            d|j                            d                    | _        nd| _        |j        | _	        |j        | _
        || _        t          | j                 | _        |                     | j        |          \  }| _        |                     d|d           | j        | _        d S )Nrope_scaling	rope_typerm  defaultr  F)
persistent)ro   rp   hasattrr  r  r   getr  max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrj   r   rope_init_fnattention_scalingregister_bufferr  original_inv_freq)r   rj   r   r  rs   s       r3   rp   z(GraniteMoeHybridRotaryEmbedding.__init__  s    6>** 	'z&:Mt/T/T 	'#044[&BUBYBYZ`BaBabbDNN&DN"("@$*$B!/?+/+<+<T[&+Q+Q($(ZeDDD!%r5   c                 X   | j         d d d d f                                                             |j        d         dd                              |j                  }|d d d d d f                                         }t          |j        j        t                    r|j        j        dk    r|j        j        nd}t          j
        |d          5  |                                |                                z                      dd          }t          j        ||fd	          }|                                | j        z  }|                                | j        z  }	d d d            n# 1 swxY w Y   |                    |j        
          |	                    |j        
          fS )Nr   r)   r   mpscpuF)device_typeenabledr*   r+   r  )r  r  rE   r-   r_   r   r  rm  r   r.   autocastr[   r/   r:   r  r;   rV   )
r   r0   r<   inv_freq_expandedposition_ids_expandedr	  freqsembr:   r;   s
             r3   r   z'GraniteMoeHybridRotaryEmbedding.forward  s    !M$4-8>>@@GGHZ[\H]_acdeehhijiqrr ,QQQaaaZ 8 > > @ @'1!(-'E'Ek!(-[`J`J`ahmmfk^UCCC 	5 	5&,,..1F1L1L1N1NNYYZ[]^__E)UEN333C''))d44C''))d44C		5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 vvAGv$$cff17f&;&;;;s   BE++E/2E/r   )rt   r   r   r.   r   r  r   rp   no_gradr   r   r   r   s   @r3   r  r    s         l/ /5 / / / / / /" U]__< <  _< < < < <r5   r  c                   2    e Zd Zdef fdZee	 	 	 	 	 	 	 	 	 	 	 ddeej	                 deej
                 deej	                 deeeeej                 f                  deej                 d	ee         d
ee         dee         dee         dee         deej	                 dee         deeef         fd                        Z	 ddeej
        df         dej
        dej
        ded
ef
dZedej
        dededej        dej
        defd            Zd Z xZS )GraniteMoeHybridModelrj   c                    t                                                     j        | _        j        | _        t          j        j        j        | j                  | _        t          j	        fdt          j                  D                       | _        t          j        j                  | _        d| _        j        | _        j        | _        j        | _        | j        | j        z  | _        j        | _        j        | _        j        | _        | j        dk    rt1                    nd | _        |                                  d S )Nc                 0    g | ]}t          |          S r   )r  )r   rk   rj   s     r3   r   z2GraniteMoeHybridModel.__init__.<locals>.<listcomp>  s$    nnn)&)<<nnnr5   r   Frope)ro   rp   pad_token_idpadding_idx
vocab_sizer   	Embeddingrv   embed_tokens
ModuleListr   r   layersr  r  r  gradient_checkpointingembedding_multiplierrw   rx   rJ   r  
rope_thetaposition_embedding_typer  
rotary_emb	post_initr  s    `r3   rp   zGraniteMoeHybridModel.__init__  s8      !. +L):F<NPTP`aamnnnneTZTlNmNmnnn
 
 ,F,>FDWXXX	&+#$*$?!!-3(DN:'-'E$ +'-'E$EIEaekEkEk9&AAAqu 	r5   N	input_idsrQ   r<   r   inputs_embedsr   r  output_hidden_statesr  return_dictr   ra   rC   c                 P   ||n| j         j        }||n| j         j        }||n| j         j        }|
|
n| j         j        }
|d u |d uz  rt          d          | j        r%| j        r|rt          	                    d           d}|| 
                    |          }|| j        z  }|r|t          	                    d           |B||                                nd}t          j        |||j        d         z   |j                  }||                    d          }|                     |||||          }|                     ||          }|}d }| j        |                     ||          }|rdnd }|rdnd }|	rdnd }| j        D ]^}|j        d	k    r|n|}|r||fz  } ||f||||||	|d
|}|d         }|r|d         ||d         fz  }|	r|d         ||d         fz  }_|                     |          }|r||fz  }|r|j        sd|_        t3          |||||          S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzGraniteMoeHybrid requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. Because one was not provided, no cache will be returned.r   r   r   r   r   )rQ   r   r  r   r   r  r   r)   T)last_hidden_stater   rA   
attentionsr  )rj   r  r%  r   use_return_dictr{   r  rX   rq   rr   r  r  r   r.   r  r-   r   r7   _update_causal_mask_update_mamba_maskr!  r  r  r  r   r   )r   r#  rQ   r<   r   r$  r   r  r%  r  r&  r   ra   past_seen_tokensre   
mamba_maskrA   r   all_hidden_statesall_self_attnsall_router_logitsdecoder_layer
layer_masklayer_outputss                           r3   r   zGraniteMoeHybridModel.forward  sN   " 2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	%0%<kk$+B]-t";< 	[YZZZ& 	4= 	Y 	j   I  --i88M%(AA  	0K  
 !CRC^==???de"\ "2]5H5K"KTaTh  N )33A66L..M>?L]
 
 ,,^^LL
 &"?&"&//-"N"N #7@BBD0:d"6@BBD![ 	> 	>M'4'?7'J'JP[J# 6!m%55!)M
) /"3#-%9$7
 
 
 
M *!,M  : #/"}Q'7&99N# > $0%-*;)==%		-00   	2-!11 	6?#E 	615O.%+++%+
 
 
 	
r5   Fr&   r   c           	      $   | j         j        dk    r||dk                                    r|S d S | j         j        dk    r+t          |t          j                  rt          |          }|S ||                                nd}||j        nd}| j         j        dk    r#|s!|st          j
        |||| j                  rd S |j        }|j        d         }	|r|                                }
n/t          |t          j                  r|j        d	         n||	z   dz   }
|                     ||	|
|||j        d         
          }| j         j        dk    r@|>|j        j        dv r0|s.t	          j        |          j        }t          j        ||          }|S )Nflash_attention_2rL   flex_attentionr   Fsdpa)r$  past_key_values_lengthis_trainingr   r)   )sequence_lengthtarget_lengthrV   r   r   )rl  xpunpu)rj   r   anyr  r.   r   r'   r   r   r   _ignore_causal_mask_sdparX   rV   r-   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   rm  finfomin_unmask_unattended)r   rQ   r   r   r   r  r-  using_compilable_cacherV   r;  r<  re   	min_dtypes                r3   r+  z)GraniteMoeHybridModel._update_causal_mask  s    ;+/BBB)~/D.I.I.K.K)%%4;+/???.%,77 M!<^!L!L!!
 @O?Z?99;;;`aCRC^!?!?di ;+v55>T5]n5%>*'7 M	    t"&,Q/! 	+??AAMM nel;;<$R((%7!;  PP+')#)!, Q 
 
 K,66*%*.DDD% E E**.I0CKQZ[[Kr5   r;  r<  rV   r   c                    | |                                  dk    r| }nMt          j        |          j        }t          j        ||f|||j                  }|dk    rt          j        |d          }|t          j        ||j                  |                    dd          k    z  }|ddddddf         	                    |ddd          }| |
                                }| j        d         }	|ddddddd|	f         | ddddddf                             |j                  z   }
|
dk    }
|ddddddd|	f                             |
|          |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nr   )
fill_valuerV   r   r   r   r   r)   r   )r,   r.   rC  rD  fullr   triur  rF   rE   cloner-   r_   r   )rQ   r;  r<  rV   r   r   ra   re   rG  mask_lengthpadding_masks              r3   rB  zKGraniteMoeHybridModel._prepare_4d_causal_attention_mask_with_cache_position  s   < %.*<*<*>*>!*C*C(KKE**.I* -0Ye\j\q  K !###jqAAA5<n>STTTWeWmWmnprsWtWtttK%dD!!!QQQ&67>>z1bRTUUK))//11,226*111aaaL[L+@ANSTSTSTVZ\`bcbcbcScDdDgDg&E E    ,q05@AAAqqq,;,AV5W5c5c )6 6AAAqqq!!!\k\12 r5   c                 Z    |}|d         dk    s|t          j        |dk              rd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr   )r.   all)r   rQ   r   r.  s       r3   r,  z(GraniteMoeHybridModel._update_mamba_mask  s>     $
!q  ^%?EIn`aNaDbDb%?Jr5   )NNNNNNNNNNN)F)rt   r   r   r   rp   r   r   r   r.   r   r   r   r   listr  r   r   r  r   r   r   r+  staticmethodr   rV   rB  r,  r   r   s   @r3   r  r    se       5      2  151537KO59$(,0/3/3&*59s
 s
E,-s
 !.s
 u/0	s

 "%tE4E/F(F"GHs
   12s
 D>s
 $D>s
 'tns
 'tns
 d^s
 !!12s
 45s
 
u--	.s
 s
 s
 ^ s
v #(B BelK78B lB 	B
 B  B B B BH 444 4 {	4
 4 4 4 4 \4l	 	 	 	 	 	 	r5   r  r*   gate_logitsr  c                    | t          | t                    sdS t          | t                    r/| d         j        t          j        fd| D             d          }t          j        j                            |d          }t          j        ||d          \  }}t          j        j        	                    ||          }|@t          j
        |                                d          }	t          j
        |d          }
nD|j        \  }}|j        d         ||z  z  }|dddddddf                             |||||f                              d||                                        }t          j        |                                |z  d          t          j        |d          z  }	|ddddddf                             ||||j        d         f                              d|j        d                                                 }t          j        ||z  d          t          j        |d          z  }
|j        j        |j        j        nd}|j        d         t%          |          z  }t          j        |	dd|||j        d         z   f         |
                    d          z            }||z  S )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   c                 :    g | ]}|                               S r   )r_   )r   
layer_gatecompute_devices     r3   r   z,load_balancing_loss_func.<locals>.<listcomp>?  s&    -j-j-jPZjmmN.K.K-j-j-jr5   r+   r)   r   )r  r   r   r.   r/   r   r\   r]   r  one_hotry  r  r-   rE   rF   r_   rH  indexr   r7   )rS  r  r  rQ   concatenated_gate_logitsrouting_weightsr   selected_expertsexpert_masktokens_per_expertrouter_prob_per_expertr   r;  r   expert_attention_mask router_per_expert_attention_maskdevice_indexrankoverall_lossrW  s                      @r3   load_balancing_loss_funcre    s   : *[%"@"@q+u%% s$Q.#(9-j-j-j-j^i-j-j-jpq#r#r#r h)112JPR1SSO*_eDDDA(%--.>LLK!J{'8'8':':BBB "'O!C!C!C&4&:#
O4:1=*B^_ 4AAAtT12V&
OUKXYYWR,,R	 	 "Ik&7&7&9&9<Q&QWXYYY\a\e!q]
 ]
 ]
 
 4AAAt+,V&
O_EZ[\E]^__WR.q122R	 	) "'?=]+]cd!e!e!ehmhq,!i
 i
 i
 "
 4C3I3O3[?)//abL #c,&7&77D9!!!TD?+@+C$CCCDG]GgGghiGjGjj L +%%r5   c                        e Zd ZdgZdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej	                 deej
                 deej	                 d	eeeeej                 f                  d
eej                 deej	                 dee         dee         dee         dee         dee         deej	                 deeej
        f         deeef         fd            Z	 	 	 	 	 	 ddZ xZS )GraniteMoeHybridForCausalLMzlm_head.weightrj   c                 F   t                                          |           t          |          | _        |j        | _        t          j        |j        |j        d          | _        |j	        | _	        |j
        | _        |j        | _        |                                  d S )NFrm   )ro   rp   r  r  r  r   r|   rv   lm_headrouter_aux_loss_coefr  r  r  r"  r  s     r3   rp   z$GraniteMoeHybridForCausalLM.__init__v  s       *622
 +y!3V5FUSSS$*$?!!3#)#=  	r5   Nr   r#  rQ   r<   r   r$  labelsr   r  r%  r  r&  r   logits_to_keeprC   c                     ||n| j         j        }|
|
n| j         j        }
|	|	n| j         j        }	||n| j         j        } | j        d||||||||	|
||d|}|d         }t          |t                    rt          | d          n|}| 	                    |dd|ddf                   }|| j         j
        z  }d}|/|                                } | j        ||fd| j         j        i|}d}|
rRt          |r|j        n|d         | j        | j        |          }|%|| j        |                    |j                  z  z  }|s |f|dd         z   }|
r|f|z   }||f|z   n|S t+          ||||j        |j        |j        |j                  S )	ax  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, GraniteMoeHybridForCausalLM

        >>> model = GraniteMoeHybridForCausalLM.from_pretrained("ibm/PowerMoE-3b")
        >>> tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)r#  rQ   r<   r   r$  r   r  r%  r  r&  r   r   r  r)   r   )lossaux_lossr  r   rA   r)  r  r   )rj   r  r  r%  r*  r  r  r   sliceri  logits_scalingr  loss_functionr  re  r  r  r  rj  r_   r   r   r   rA   r)  )r   r#  rQ   r<   r   r$  rk  r   r  r%  r  r&  r   rl  ra   r  rA   slice_indicesr  rn  ro  outputs                         r3   r   z#GraniteMoeHybridForCausalLM.forward  sO   P 2C1N--TXT_Tq$8$D  $+Jj 	 %9$D  $+Jj 	 &1%<kk$+B] $* 
)%+'/!5!5#)
 
 
 
   
8B>SV8W8Wk~ot444]kmAAA}aaa,?@AA$+44\\^^F%4%   ;1 	 D  	M/)4E%%'"+ (	 H !1HKK4L4LLL 	DY,F# ."v-'+'7D7V##VC(#3!/)!/
 
 
 	
r5   Tc                    |d u }	|	s]||d         |j         d         k    r|d d |j         d          d f         }nX|j         d         |j         d         k    r|d d |f         }n/|r-t          | j        |j         d         | j        | j                  }|b|`|                                                    d          dz
  }|                    |dk    d           |	s|d d |j         d          d f         }||	rd|i}
nd|                                i}
|
	                    |||||d           |
                                D ]\  }}||
vr||
|<   |
S )Nr)   r   r   r   r$  r#  )r<   r   r   rQ   r   )r-   r   rj   rV   r   r  r   masked_fill_r`   r   items)r   r#  r   rQ   r$  r   r<   r   ra   empty_past_kvmodel_inputsrO   rP   s                r3   prepare_inputs_for_generationz9GraniteMoeHybridForCausalLM.prepare_inputs_for_generation  s    (4/  	)!"%);;;%aaa.*>q*A)A)C)C&CD		#~';A'>>>%aaa&78	 	>Y_Q/DK  O %,*>)..0077;;a?L%%n&91===  F+AAA	0B/B/D/D,DE $$+];LL')=)=)?)?@L ,#2&"0"0 	
 	
 	
 !,,.. 	* 	*JC,&&$)S!r5   )NNNNNNNNNNNNr   )NNNNNT)rt   r   r   _tied_weights_keysr   rp   r   r   r.   r   r   r   r   rQ  r  r   r   r   r   r   rz  r   r   s   @r3   rg  rg  s  s       *+5        151537KO59-1$(,0/3/3&*5934k
 k
E,-k
 !.k
 u/0	k

 "%tE4E/F(F"GHk
   12k
 )*k
 D>k
 $D>k
 'tnk
 'tnk
 d^k
 !!12k
 c5</0k
  
u//	0!k
 k
 k
 ^k
` = = = = = = = =r5   rg  )rg  r  r  )Nr   )rL   )Nr*   N)\typingr   r   r   r   r   r.   torch.nn.functionalr   r\   r  transformers.activationsr	   cache_utilsr   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.import_utilsr   r   configuration_granitemoehybridr   +mamba_ssm.ops.triton.selective_state_updater    !mamba_ssm.ops.triton.ssd_combinedr!   r"   causal_conv1dr#   r$   !torch.nn.attention.flex_attentionr&   integrations.flex_attentionr'   
get_loggerrt   rq   r4   r@   r   r   rK   Moduler  rg   ri   r   r   r   r   rP  r  r   r   r  r  r  r  r  r  r  r  r  r  r  r   re  rg  __all__r   r5   r3   <module>r     s  , = < < < < < < < < < < < < <                 + + + + + +             ) ) ) ) ) ) > > > > > > 9 9 9 9 9 9 j j j j j j j j j j K K K K K K K K F F F F F F F F & & & & & & \ \ \ \ \ \ \ \ \ \ \ \ 0 0 0 0 0 0 V V V V V V V V B B B B B B  "RRRRRRmmmmmmmmm! 8DDDDDDDDD-7**  !! K;;;;;;JJJJJJ 
	H	%	%( ( (   6	UU\ 	U# 	U%, 	U 	U 	U 	U& % %I%<% 
% <	%
 U\*% % % % % %:T) T) T) T) T)	 T) T) T)nZ3 Z3 Z3 Z3 Z3 Z3 Z3 Z3@VU\ VS V V V V
 
 
(  ( 46FH\]^^   \_ \_ \_ \_ \_ \_ \_ \_~; ; ; ; ;58? ; ; ;$    ")   4    )5    2J J J J Jbi J J J(* * * * *bi * * *Z-S -S -S -S -S -S -S -S`9+ 9+ 9+ 9+ 9+") 9+ 9+ 9+xn n n n n#= n n nb * * * * *o * * *0!< !< !< !< !<bi !< !< !<H U U U U U; U U Ut "&
-1	S& S&u|U5<%8$>?S&#S& U\*	S&
 5<S& S& S& S&l{ { { { {"A? { { {| f
e
er5   