
     `i                    h   d dl mZmZmZmZmZ d dlZd dlmZ d dlm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZmZ ddlmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl)m*Z*  e(            rd dl+m,Z, d dl-m.Z.m/Z/ ndZ, e'            r	d dl0m1Z1m2Z2 nd\  Z2Z1 e#j3        e4          Z5 G d ded          Z6 G d d          Z7 G d dej8                  Z9d  Z:d!ej;        d"e<d#ej;        fd$Z=	 dJd&ej8        d'ej;        d(ej;        d)ej;        d*eej;                 d+e>d,e>d-ee          fd.Z?dKd/Z@ G d0 d1ej8                  ZA G d2 d3ej        j8                  ZBd4ej;        d5e<fd6ZCd7 ZDd8 ZE eFe,e1e2f          ZGd9 ZH G d: d;ej8                  ZI G d< d=ej8                  ZJ ed>           G d? d@ej8                              ZK G dA dBe          ZLe! G dC dDe                      ZMe! G dE dFeM                      ZNe! G dG dHeMe                      ZOg dIZPdS )L    )AnyCallableOptional	TypedDictUnionN)nn)ACT2FN   )Cache)GenerationMixin)use_kernel_forward_from_hub)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)is_causal_conv1d_availableis_mamba_2_ssm_available   )BambaConfig)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combined)causal_conv1d_fncausal_conv1d_updateNNc                   d    e Zd ZU dZej        ed<   ej        ed<   eed<   eed<   ej        ed<   dS )BambaFlashAttentionKwargsa  
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    Attributes:
        cu_seq_lens_q (`torch.LongTensor`)
            Gets cumulative sequence length for query state.
        cu_seq_lens_k (`torch.LongTensor`)
            Gets cumulative sequence length for key state.
        max_length_q (`int`):
            Maximum sequence length for query state.
        max_length_k (`int`):
            Maximum sequence length for key state.
        seq_idx (`torch.IntTensor):
            Index of each packed sequence.
    cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kseq_idxN)	__name__
__module____qualname____doc__torch
LongTensor__annotations__int	IntTensor     |/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/bamba/modeling_bamba.pyr'   r'   @   sb          " ########_r7   r'   F)totalc                       e Zd ZdZdZej        dfdefdZ	 ddej	        dej	        de
d	eeeef                  d
eej	        ej	        f         f
dZdej        fdZddee
         d
e
fdZdS ) HybridMambaAttentionDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    FNconfigc                 $   |j         | _         d| _        |j        }|j        }g | _        g | _        g | _        t          |j                  D ]}| j         |         dk    rw| xj        t          j
        |j        |j        z  d|j        z  |z  z   ||          gz  c_        | xj        t          j
        |j        |j        ||          gz  c_        | xj        t          j        g gz            gz  c_        | xj        t          j        g gz            gz  c_        | j                            |           fdt          |j                  D             | _        fdt          |j                  D             | _        d S )NFmamba   devicedtyperA   c                 D    g | ]}t          j        g gz             S rC   r1   tensor.0_
batch_sizerA   s     r8   
<listcomp>z=HybridMambaAttentionDynamicCache.__init__.<locals>.<listcomp>   s/    rrrQ%,tj'8HHHrrrr7   c                 D    g | ]}t          j        g gz             S rE   rF   rH   s     r8   rL   z=HybridMambaAttentionDynamicCache.__init__.<locals>.<listcomp>   s/    tttqEL"
):6JJJtttr7   )layers_block_typehas_previous_statemamba_d_convmamba_d_stateconv_states
ssm_statestransformer_layersrangenum_hidden_layersr1   zerosmamba_expandhidden_sizemamba_n_groupsmamba_n_headsmamba_d_headrG   append	key_cachevalue_cache)selfr<   rK   rB   rA   conv_kernel_sizessm_state_sizeis     ` `   r8   __init__z)HybridMambaAttentionDynamicCache.__init__i   s   !'!9"'!.-"$v/00 	2 	2A%a(G33  K",v/AAAH]D]`nDnn(%#  %    K",+&%#  	$ 	   U\2$2CF%S%S%S$TT  EL"
1B6$R$R$R#SS'..q1111rrrrrRWX^XpRqRqrrrtttttTYZ`ZrTsTstttr7   
key_statesvalue_states	layer_idxcache_kwargsreturnc                 D   | j         |         j        d         dk    r|| j         |<   || j        |<   nVt          j        | j         |         |gd          | j         |<   t          j        | j        |         |gd          | j        |<   | j         |         | j        |         fS )Nr   r?   dim)r^   shaper_   r1   cat)r`   re   rf   rg   rh   s        r8   updatez'HybridMambaAttentionDynamicCache.update   s     >)$*2.!33(2DN9%*6DY''(-	4>)3Lj2Y_`(a(a(aDN9%*/)T5Ei5PR^4_ef*g*g*gDY'~i($*:9*EEEr7   beam_idxc                    t          t          | j                            D ];}| j        |         j        }| j        |                             d|                    |                    | j        |<   | j        |         j        }| j        |                             d|                    |                    | j        |<   | j        |         j        }| j        |                             d|                    |                    | j        |<   | j        |         j        }| j        |                             d|                    |                    | j        |<   =dS )zDReorders the cache for beam search, given the selected beam indices.r   N)	rU   lenr^   rA   index_selecttor_   rR   rS   )r`   rq   rg   rA   s       r8   reorder_cachez.HybridMambaAttentionDynamicCache.reorder_cache   sE   s4>2233 		i 		iI^I.5F(,y(A(N(NqRZR]R]^dReRe(f(fDN9%%i07F*.*:9*E*R*RSTV^VaVabhViVi*j*jDY'%i07F*.*:9*E*R*RSTV^VaVabhViVi*j*jDY'_Y/6F)-)C)P)PQRT\T_T_`fTgTg)h)hDOI&&		i 		ir7   r   c                     || j         vr| j         d         n|}t          | j                  |k    rdS | j        |         j        d         S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   )rT   rs   r^   rn   )r`   rg   s     r8   get_seq_lengthz/HybridMambaAttentionDynamicCache.get_seq_length   sS     3<4CZ2Z2ZD+A..`i	t~)++1~i(.r22r7   N)r   )r-   r.   r/   r0   is_compileabler1   float16r   rd   Tensorr4   r   dictstrr   tuplerp   r2   rv   ry   r6   r7   r8   r;   r;   Y   s         N>CmTX $u $u{ $u $u $u $uV 26F FLF lF 	F
 tCH~.F 
u|U\)	*F F F F"ie&6 i i i i3 3 3c 3 3 3 3 3 3r7   r;   c                   |     e Zd ZU ej        ed<   ddef fdZ ej                    e	d                         Z
 xZS )BambaRotaryEmbeddinginv_freqNr<   c                    t                                                       t          |d          rSt          |j        t
                    r9|j                            d|j                            d                    | _        nd| _        |j        | _	        |j        | _
        || _        t          | j                 | _        |                     | j        |          \  }| _        |                     d|d           | j        | _        d S )Nrope_scaling	rope_typetypedefaultr   F)
persistent)superrd   hasattr
isinstancer   r~   getr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr<   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r`   r<   rA   r   	__class__s       r8   rd   zBambaRotaryEmbedding.__init__   s    6>** 	'z&:Mt/T/T 	'#044[&BUBYBYZ`BaBabbDNN&DN"("@$*$B!/?+/+<+<T[&+Q+Q($(ZeDDD!%r7   c                 X   | j         d d d d f                                                             |j        d         dd                              |j                  }|d d d d d f                                         }t          |j        j        t                    r|j        j        dk    r|j        j        nd}t          j
        |d          5  |                                |                                z                      dd          }t          j        ||fd	          }|                                | j        z  }|                                | j        z  }	d d d            n# 1 swxY w Y   |                    |j        
          |	                    |j        
          fS )Nr   rk   r   mpscpuF)device_typeenabledr?   rl   rB   )r   floatexpandrn   ru   rA   r   r   r   r1   autocast	transposero   cosr   sinrB   )
r`   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r8   forwardzBambaRotaryEmbedding.forward   s    !M$4-8>>@@GGHZ[\H]_acdeehhijiqrr ,QQQaaaZ 8 > > @ @'1!(-'E'Ek!(-[`J`J`ahmmfk^UCCC 	5 	5&,,..1F1L1L1N1NNYYZ[]^__E)UEN333C''))d44C''))d44C		5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 vvAGv$$cff17f&;&;;;s   BE++E/2E/rz   )r-   r.   r/   r1   r}   r3   r   rd   no_gradr   r   __classcell__r   s   @r8   r   r      s         l/ /{ / / / / / /" U]__< <  _< < < < <r7   r   c                     | dd| j         d         dz  f         }| d| j         d         dz  df         }t          j        | |fd          S )z*Rotates half the hidden dims of the input..Nrk   r?   rl   )rn   r1   ro   )r   x1x2s      r8   rotate_halfr      s]    	
3"!'"+"""	#B	
3q """	#B9rc2YB''''r7   hidden_statesn_repri   c                     | j         \  }}}}|dk    r| S | dddddddddf                             |||||          } |                     |||z  ||          S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rn   r   reshape)r   r   batchnum_key_value_headsslenhead_dims         r8   	repeat_kvr      s    
 2?1D.Ehzz!!!!QQQaaa"23::5BUW\^bdlmmM  (;e(CT8TTTr7           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 R   t          || j                  }t          || j                  }	t          j        ||                    dd                    |z  }
|$|d d d d d d d |j        d         f         }|
|z   }
t          j                            |
dt          j	                  
                    |j                  }
t          j                            |
|| j                  }
t          j        |
|	          }|                    dd                                          }||
fS )Nr?   r
   rx   rk   )rm   rB   )ptrainingr   )r   num_key_value_groupsr1   matmulr   rn   r   
functionalsoftmaxfloat32ru   rB   r   r   
contiguous)r   r   r   r   r   r   r   r   re   rf   attn_weightscausal_maskattn_outputs                r8   eager_attention_forwardr      s    3 ;<<JUF$?@@L<z';';Aq'A'ABBWLL!$QQQ111.D
0@0D.D%DE#k1=((2U](SSVVW\WbccL=((6?([[L,|\::K''1--88::K$$r7   c                    |                     |          }|                     |          }|j        d         }| dd|f         | d|df         }}|dd|f         |d|df         }
}	||z  t          |          |z  z   }|	|z  t          |	          |z  z   }t          j        ||gd          }t          j        ||
gd          }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Removes the interleaving of cos and sin from GLM

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    rk   .Nrl   )	unsqueezern   r   r1   ro   )qkr   r   r   unsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds                r8   apply_rotary_pos_embr     s    , --
&
&C
--
&
&C 2Jc;J;&'3
+;)<6Ec;J;&'3
+;)<6E s{{511C78Gs{{511C78G i&)r222Gi&)r222GGr7   c                       e Zd ZdZdedef fdZ eddd          	 	 dd
ej	        de
ej	        ej	        f         deej	                 dee         deej                 dee         de
ej	        ej	        f         fd            Z xZS )BambaAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr<   rg   c                    t                                                       || _        || _        t	          |d|j        |j        z            | _        |j        |j        z  | _	        | j        dz  | _
        |j        | _        d| _        t          j        |j        |j        | j        z  |j                  | _        t          j        |j        |j        | j        z  |j                  | _        t          j        |j        |j        | j        z  |j                  | _        t          j        |j        | j        z  |j        |j                  | _        d S )Nr   g      Tbias)r   rd   r<   rg   getattrrY   num_attention_headsr   r   r   r   attention_dropout	is_causalr   Linearattention_biasq_projk_projv_projo_proj)r`   r<   rg   r   s      r8   rd   zBambaAttention.__init__3  sB   "
F4F&Jd4dee$*$>&B\$\!}d*!'!9i :T] JQWQf
 
 
 i :T] JQWQf
 
 
 i :T] JQWQf
 
 
 i&68JQWQf
 
 
r7   past_key_valuepast_key_values4.58new_nameversionNr   position_embeddingsr   cache_positionr   ri   c                 D   |j         d d         }g |d| j        R }|                     |                              |                              dd          }	|                     |                              |                              dd          }
|                     |                              |                              dd          }|\  }}t          |	|
||          \  }	}
|&|||d}|                    |
|| j	        |          \  }
}t          }| j        j        dk    rt          | j        j                 } || |	|
||f| j        sdn| j        | j        d|\  }} |j        g |dR                                  }|                     |          }||fS )Nrk   r   r?   )r   r   r   eagerr   )r   r   )rn   r   r   viewr   r   r   r   rp   rg   r   r<   _attn_implementationr   r   r   r   r   r   r   )r`   r   r   r   r   r   r   input_shapehidden_shapequery_statesre   rf   r   r   rh   attention_interfacer   r   s                     r8   r   zBambaAttention.forwardJ  s    $)#2#.88b8$-88{{=1166|DDNNqRSTT[[//44\BBLLQPQRR
{{=1166|DDNNqRSTT&S#7jRUWZ#[#[ j&#&snUUL'6'='=j,X\Xfht'u'u$J(?;+w66"9$+:Z"[$7$7	%
  $}HCC$2HL	%
 	%
 	%
 	%
!\ *k);;;;;;FFHHkk+..L((r7   r%   )r-   r.   r/   r0   r   r4   rd   r   r1   r}   r   r   r   r2   r   r   r   r   r   s   @r8   r   r   0  s       GG
{ 
s 
 
 
 
 
 
. _%0A6RRR ,059)) ))|)) #5<#=>)) !.	))
 "%)) !!12)) +,)) 
u|U\)	*)) )) )) SR)) )) )) )) ))r7   r   c                   (     e Zd Zd fd	ZddZ xZS )BambaRMSNormGatedư>c                     t                                                       t          j        t	          j        |                    | _        || _        d S rz   r   rd   r   	Parameterr1   onesweightvariance_epsilonr`   rY   epsr   s      r8   rd   zBambaRMSNormGated.__init__x  sB    l5:k#:#:;; #r7   Nc                    |j         }|                    t          j                  }|?|t          j                            |                    t          j                            z  }|                    d                              dd          }|t          j	        || j
        z             z  }| j        |                    |          z  S Nr?   rk   T)keepdim)rB   ru   r1   r   r   r   silupowmeanrsqrtr  r  )r`   r   gateinput_dtypevariances        r8   r   zBambaRMSNormGated.forward}  s    #)%((77)BM,>,>twwu}?U?U,V,VVM $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::r7   r   rz   r-   r.   r/   rd   r   r   r   s   @r8   r   r   w  sQ        $ $ $ $ $ $
	; 	; 	; 	; 	; 	; 	; 	;r7   r   input_tensorpad_sizec                     t          | j                  dk    r
ddddd|ddfnddd|ddf}t          j        j                            | |dd          S )z
    Padding x tensor with `pad_size` on the seq_len dim (dim=1)

    Assumes that we only have tensors of either size 4 or 3
       r   constant)moder   )rs   rn   r1   r   r   pad)r  r  	pad_shapes      r8   pad_tensor_by_sizer    sj     47|7I3J3Ja3O3OAq!Q!Q//VWYZ\]_gijlmUnI8""<ST"UUUr7   c                 "   t          | |          } t          | j                  dk    r.|                     | j        d         d|| j        d                   S |                     | j        d         d|| j        d         | j        d                   S )z
    Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
    simultaneously splitting it into chunk sequences.

    Assumes that we only have tensors of either size 4 or 3
    r
   r   rk   r?   )r  rs   rn   r   )r  r  
chunk_sizes      r8   reshape_into_chunksr    s     &lH==L
<!####L$6q$92z<K]^_K`aaa ##q!2z<3Ea3H,J\]^J_
 
 	
r7   c                    |                      d          } | d         j        g |                                  |R  } t          j        t          j        ||| j        t          j                  d          }|                     | d          } t          j        | d          }t          j        t          j        ||| j        t          j                  d          }|                    | t          j	                   }|S )zo
    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
    rk   .Nr@   diagonalr   rx   rl   )
sizer   r1   trilr  rA   boolmasked_fillcumsuminf)r  r  masktensor_segsums       r8   segment_sumr*    s     ""2&&J 2<	*1S<3D3D3F3FS
SSSL:ejZ@S[`[efffqstttD++TE155LL2666M :ejZ@S[`[efffqrsssD!--teeiZ@@Mr7   c                     |N|j         d         dk    r=|j         d         dk    r,| j        }| |dddddf         z                      |          } | S )zm
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    Nr   r   )rn   rB   ru   )r   r   rB   s      r8   apply_mask_to_padding_statesr,    si     !n&:1&=&A&AnFZ[\F]`aFaFa#&111d
)CCGGNNr7   c                   n    e Zd ZdZdedef fdZ	 	 	 	 ddej        de	e
         de	ej                 d	e	ej                 d
e	ej                 f
dZ	 	 	 dde	e
         de	ej                 d	e	ej                 fdZ	 	 	 	 dde	e
         de	ej                 d	e	ej                 d
e	ej                 fdZ xZS )
BambaMixeruP  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    The are a few differences between this and Mamba2Mixer:
    - The variable use_precomputed_states is slightly different due to the hybrid cache structure
    - There's a few non-obvious bugs fixed with batching in the slow path that exist in main
    - Some extra variables that our layer doesn't need have been removed
    - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged
    r<   rg   c           	         t                                                       |j        | _        |j        | _        |j        | _        |j        | _        t          |j
        | j        z            | _        || _        |j        | _        |j        | _        t"          |j                 | _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        dt;          d          f| _        d| _        d| _         | j        d| j        z  | j        z  z   | _!        tE          j#        | j!        | j!        |j        | j        | j!        | j        dz
            | _$        | j        | j!        z   | j        z   }tE          j%        | j        || j                  | _&        tE          j'        tQ          j)        | j                            | _*        tQ          j+        d| j        dz             }tE          j'        tQ          j,        |                    | _-        t]          | j        | j        	          | _/        tE          j'        tQ          j)        | j                            | _0        tE          j%        | j        | j        | j                  | _1        td          stf          4                    d
           d S tf          4                    d           d S )Nr   r'  gMbP?g?r?   r   )in_channelsout_channelsr   kernel_sizegroupspaddingr   r  a  The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzDThe fast path for Bamba will be used when running the model on a GPU)5r   rd   r[   	num_headsrY   rQ   rb   rP   ra   r4   rX   intermediate_sizerg   mamba_conv_biasuse_conv_bias
hidden_act
activationr	   actmamba_proj_biasuse_biasrms_norm_epslayer_norm_epsilonrZ   n_groupsr\   r   mamba_chunk_sizer  r   time_step_limittime_step_mintime_step_maxconv_dimr   Conv1dconv1dr   in_projr   r1   r  dt_biasarangelogA_logr   normDout_projis_fast_path_availableloggerwarning_once)r`   r<   rg   projection_sizeAr   s        r8   rd   zBambaMixer.__init__  st   -!-$2 & 3!$V%84;K%K!L!L"#3 +&+,."("5-+ 1 !$U5\\2" .T]1BTEX1XXi'-=)A-
 
 
 04=@4>Qy
 
 
 |EJt~$>$>?? LDNQ.//\%)A,,//
%d&<$BYZZZ	ej8899	$"8$:JQUQ^___% 	h>      fgggggr7   Nr   cache_paramsr   r   r,   c                    t          ||          }|                     |          }|j        \  }}}	| j        | j        z  }
|d uob|j        o[|dk    oU|j        | j                 j        d         |j        | j                 j        d         cxk    o|k    nc o|d uo|d         dk    }|r|	                    d          
                    | j        | j        | j        gd          \  }}}t          ||j        | j                 | j        j        	                    d          | j        j        | j                  }t'          j
        || j        |
|
gd          \  }}}t'          j        | j                                                   }|d d d df         d d d d d f                             d| j        | j                                      t&          j                  }|d d d d d f                             dd| j                  }| j        d d d df                             d| j                  }| j        d d d df                             d| j                  }|                    || j        |j        d         | j        z            }|                    || j        |j        d         | j        z            }|                    || j        | j                  }t=          |j        | j                 ||||||d |d
  
        }|                    || j        | j        z            }|                     ||          }|                      |          d d d df         }nlt'          j        | j                                                   }| j!        d	t-          d
          fk    ri nd| j!        i}| j"        r|tG          || j        j        	                    d          | j        j        | j        |f| j        | j$        || j        | j        j        | j        j%        | j         j        | j         j        | j        | j        ddd|}n|
                    | j        | j        | j        gd          \  }}}|p|&                    dd          }tN          j(        )                    || j*        |j        d         z
  df          }|j        | j                 +                    |           | j        dvr[| ,                    |                     |&                    dd                    dd |f         &                    dd                    }nht[          |&                    dd          | j        j        	                    d          | j        j        | j        |          &                    dd          }t          ||          }t'          j
        || j        |
|
gd          \  }}}t]          |                    ||d| j                  |||                    ||| j        d          |                    ||| j        d          f| j$        | j        d |d| j        dd|\  }}|'|%|j        | j                 +                    |           |                    ||d          }|                     ||          }|                      |          }|S )Nr   r   rk   rl   .r   T)zrJ  dt_softplusr   r'  dt_limitF)rO  r  r,   r;  rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr?   )r	  swish)r   r  r   r;  r,   )r  rO  rX  r,   rb  rJ  rY  )/r,  rI  rn   rA  rb   rO   rR   rg   rS   squeezesplitr7  rF  r6  r$   rH  r  r   r;  r1   exprM  r   r   r   ru   r   rJ  rO  r   r    rN  rP  rC  r   r"   r  r  r   r   r   r  ra   copy_r<  r#   r!   )r`   r   rV  r   r   r,   projected_statesrK   seq_lenrJ   groups_time_state_sizeuse_precomputed_statesr  hidden_states_B_CdtBCrU  rJ  rO  hidden_states_reshapedoutdt_limit_kwargshidden_states_B_C_transposedrR   scan_output	ssm_states                              r8   cuda_kernels_forwardzBambaMixer.cuda_kernels_forward  sJ    5]NSS<<66 "/!4
GQ!%1D!D $ &/&1& (8>qA&t~6<Q?      & d*& q!A% 	 " J	1*:*B*B1*E*E*K*K'GR +L + +'D#R
 !5!(8"**1-- ! ! #(+!')?AWX# # #M1a 4:++--...A!!!T3,111d
+222t}dFYZZ]]didq]rrAAAAqqq$J&&r2t}==Bl111dC<077DMJJGqqq$|$++B>>Az4=!'!*2MNNAz4=!'!*2MNNA%2%7%7
DNTXTa%b%b"2'7&   M *..z4>DM;YZZM IImT::M --..qqq$|<CC 4:++--...A$($8S%,,<O$O$ObbV`bfbvUwO } V1!56$K&..q11K$L f####'9#3 $	 :#'=#7!%!3 M M%*(-# $ &% , /?.D.D+T]DNKQS /E / /+'  + 4E3N3NqRS3T3T0"$-"3"34.1M1STV1WWYZ[# #K !,T^<BB;OOO?*;;;(,$5$?$?1$E$EFFsHWH}U__`acdee) )%% )9+55a;;#{199!<<![-#'? ') ) )  i1oo & %AARTb$c$c!&+k%+-CE[\' ' '#q! *C!&&z7BNNFF:wrBBFF:wrBB*  $f#(, L $* * &* *&Y" (\-E +DN;AA)LLL)..z7BGG"iiT:: mmK00
r7   c                    3 |j         \  }}}|j        }t          ||          }                     |          }	|	                     j         j         j        gd          \  }
}}|d uob|j        o[|dk    oU|j	         j
                 j         d         |j         j
                 j         d         cxk    o|k    nc o|d uo|d         dk    }|r|j	         j
                                     dd          |j	         j
        <   |d d dd d f                             |j	         j
                 j                  |j	         j
                 d d d d df<   |j	         j
                                      j        j        j                  }t#          j        | j        j                            d          z  d          } j        r| j        j        z   }                     |          }n|p|                    dd          }t0          j                            | j        |j         d         z
  df          }|j	         j
                                     |                                                     |                    dd                    dd |f                             dd                    }t          ||          }t#          j        | j         j         j        z   j         j        z  gd          \  }}}t#          j         j         !                                           }|ra|j         j
                 j        }|d d dd d f         d d d df         }|                    dd          "                    ||j         d          j#                  } j$        d	         "                     j$        j         d          j#                  }t"          j        j        %                    ||                    |j                  z             }t#          j&        | j'        d          j'        d                   }|d
         "                     j         j#         j                                      t"          j(                  }t#          j        |d	         |z                                |          }|)                    | j        d          dd d d f         }|"                    | j         j         j        z  |j         d                   *                                }|)                    |d|j         d                   }|d	         |dd d d f         z  }|)                    |d j#                  }||d	         z                      |          }|j         j
                                     |j         j
                 |z  |z              |)                    | j        d          dd d d f         }|"                    | j         j         j        z  |j         d                   *                                }|)                    |d|j         d                   }|j         j
                                     |j        |j                  }|+                    | j        z   j#         j                  }|+                    | j        z   j        d          }t#          j,        ||          }|+                    | j         j#                  } j-        d	         "                     j-        j         d          j#                  }|||z  z                       |j                  }|)                    |d          d d d df         }n+t0          j        %                    | j$        z             }t#          j&        | j'        d          j'        d                   }|)                    ||d j#                  !                                }|)                    ||d j                  !                                }|)                    ||d j                  !                                }|.                     j         j        z  d j                  }|.                     j         j        z  d j                  } j/        | j/        z  z
   j/        z  3 j-        d	         ta          |3          z  }||d	         z  }|                    |j                  |z  }3 fd||||fD             \  }}}}|1                    dddd          }t#          j2        |d          }t#          j        tg          |                    } |d d d d d d d d d d d f         |d d d d d d d d d d d f         z  }!|!                    d          }"|"d	         | 1                    ddddd          d	         z  }#|#                    d          }$|$d	         |d d d d d f         z                      d          }%t#          j        |d d d d d d dd f         |z
            }&||&1                    dddd          d	         z  }'|'dd d d f         |d	         z                      d          }(|r7|j         j
                 d d d df                             |(j                  })n t#          j4        |(d d d df                   })t#          j5        |)|(gd          }(t#          j        tg          t0          j                            |d d d d d d df         d                              }*|*                    dd          }*|*d
         |(d d d d d df         z                      d          }+|+d d d df         |+d d df         },}(t#          j        |          }-|dd d d f         |(d d d d d df         z  }.|-1                    dddd          }/|.                    d          |/d	         z  }0|%|0z   }|)                    |d j         j#                  }||z   }3dk    r|d d d |d d d d f         }|)                    ||d          }|,'|%|j         j
                                     |,            6                    ||
          }1 7                    |1                    |                    }2|2S )Nrk   rl   r   r   )shiftsdimsrC   r?   .r  ).NNr   r@   )rm   output_sizec                 <    g | ]}t          |j                  S r6   )r  r  )rI   tr  r`   s     r8   rL   z,BambaMixer.torch_forward.<locals>.<listcomp>P  s)    %z%z%z\]&9!Xt&W&W%z%z%zr7   r
   r  rx   )r   r   )8rn   rB   r,  rI  re  r7  rF  r6  rO   rR   rg   rS   rollru   rA   rH  r  r1   sumrd  r9  r   r<  r   r   r   r  ra   rg  rA  rb   rf  rM  r   r   r   rJ  softplusclamprC  r   r   r   r   bmmrO  repeat_interleaver  r  permuter&  r*  
zeros_likero   rN  rP  )4r`   input_statesrV  r   r   rK   ri  rJ   rB   rh  r  rl  rm  rk  rR   rs  r   rn  ro  rU  cache_devicerJ  dAdBdBxrS   ssm_states_reshaped
C_reshapedyrO  
D_residualA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decaystatesprevious_statesdecay_chunk
new_statesru  state_decay_outC_times_statesstate_decay_out_permutedY_offrt  contextualized_statesr  s4   `                                                  @r8   torch_forwardzBambaMixer.torch_forward  s(    ".!3
GQ" 4L.QQ<<55&6&<&<'GR '= '
 '
#
 $ &/&1& (8>qA&t~6<Q?      & d*& q!A% 	 " 	x7C7OPTP^7_7d7dlnuw7d7x7xL$T^4ARSTSTSTVWYZYZYZSZA[A^A^_k_wx|  yG  `H  `O  BP  BPL$T^4QQQ2X> '24>BEET[M_MfEggK %	dk088;;;! ! ! ! I$58H$H! $): ; ; '/@/J/J1a/P/P, m//043HKgKmnpKq3qst2u  (8>>{KKK $5F5P5PQRTU5V5V)W)WX[]e^e]eXe)f)p)pqrtu)v)v w w89JN[[#k#T]T5H%H$-Z^ZmJmn
 
 
q! Ytz''))***! F	I'24>BIL AAAq!!!GQQQc\*Ba##**:rx|T]SSBl9-44T\5G5JDMZZG$--b7::bh3G3G.GHHBR!5a!8$:Nq:QRRB/"))$.$-I\]]``glgt`uuA)ByMA-..22,2GGB
 		*dmR88dAAAFAT]DNdm4SUVU\]_U`aallnnA		*b!'"+66AI3aaa<0B *11*b$-PPMi0044L4IIC #DN399'7"<sB   		*dmR88dAAAFAT]DNdm4SUVU\]_U`aallnnA		*b!'"+66A &0@CC18[\[bCccJ",//*t~2Mt}^b^q"r"r
T^ ;T=PRSTTJ	-z::Az4>4=AAA y!((a$-HHA]Q&&**1733A 		*b))!!!T3,7AA ''T\(9::BR!5a!8$:Nq:QRRB)11*gr4=YY__aaM		*gr43FGGMMOOA		*gr43FGGMMOOA##DNdm$CX\Xf#ggA##DNdm$CX\Xf#ggA'DO*CCtVH	*-?x-X-XXJ *ByM9M]())B.A &{%z%z%z%zboqrtuwxay%z%z%z"M1a 		!Q1%%A|A2...H 	+a..))A qqq!!!QQQaaa23a111dAAAqqq!!!8K6LLN""r"**A y\AIIaAq!,D,DY,OON""r"**A 	l]111aaa:%>>CCCJJF !9XaaaAAArssl%;h%FGGL,..q"b!<<YGGGc4l+mI.FFKKPQKRRF & B"."9$."I!!!TSV,"W"Z"Zbhbo"Z"p"p"'"26!!!RaR%="A"AY8a@@@F)K0A0A(111aaaQRQRQRTV;BWY_0`0`$a$abbK%//155K%o6111dC9PPUUZ[U\\J *111crc6 2Jqqq"u4EIF $i11OT111oqqq!!!T30GGN'6'>'>q!Q'J'J$#''++.Fy.QQE A		*b$.$-HHAJA!||aaa'111aaa'(		*gr22A $)A'7==iHHHii4((
 !%knnU.C.C D D$$r7   c                 d   t           r1d| j        j        j        j        v r|                     |||||          S |t          d          |j        }|G|j        d         dk    r6|j        d         dk    r%||d d d d d f         z  	                    |          }| 
                    ||||          S )Ncudaz\`seq_idx` support requires fast path support. Please install `mamba_ssm` and `causal_conv1d`r   r   )rQ  rI  r  rA   r   rv  NotImplementedErrorrB   rn   ru   r  )r`   r   rV  r   r   r,   r   rB   s           r8   r   zBambaMixer.forward  s     " 	sf0C0J0O&O&O,,]L.Zhjqrrr%n   #%.*>q*AA*E*E.J^_`JadeJeJe*^AAAqqq$J-GGKKERRM!!-~~^^^r7   )NNNN)NNN)r-   r.   r/   r0   r   r4   rd   r1   r}   r   r;   r2   r5   rv  r  r   r   r   s   @r8   r.  r.    s        ?h{ ?hs ?h ?h ?h ?h ?h ?hH DH5915-1g g|g ?@g !!12	g
 !.g %/*g g g gZ DH5915L% L% ?@L% !!12	L%
 !.L% L% L% L%d DH5915-1_ _ ?@_ !!12	_
 !._ %/*_ _ _ _ _ _ _ _r7   r.  c                   $     e Zd Z fdZd Z xZS )BambaMLPc                    t                                                       || _        |j        | _        |j        | _        t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _	        t          j        | j        | j        |j                  | _
        t          |j                 | _        d S )Nr   )r   rd   r<   rY   r7  r   r   mlp_bias	gate_projup_proj	down_projr	   r:  act_fnr`   r<   r   s     r8   rd   zBambaMLP.__init__  s    !-!'!94#3T5KRXRabbby!143IPVP_```4#94;KRXRabbbV./r7   c                     |                      |                     |                     |                    |                     |          z            }|S rz   )r  r  r  r  )r`   r   r  s      r8   r   zBambaMLP.forward  sA    NN4;;t~~a/@/@#A#ADLLQROO#STT	r7   r  r   s   @r8   r  r    sG        0 0 0 0 0      r7   r  RMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )BambaRMSNormr   c                     t                                                       t          j        t	          j        |                    | _        || _        dS )z;
        BambaRMSNorm is equivalent to T5LayerNorm
        Nr   r  s      r8   rd   zBambaRMSNorm.__init__  sD     	l5:k#:#:;; #r7   c                    |j         }|                    t          j                  }|                    d                              dd          }|t          j        || j        z             z  }| j        |                    |          z  S r  )	rB   ru   r1   r   r
  r  r  r  r  )r`   r   r  r  s       r8   r   zBambaRMSNorm.forward  s|    #)%((77 $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::r7   c                 H    t          | j        j                   d| j         S )Nz, eps=)r   r  rn   r  )r`   s    r8   
extra_reprzBambaRMSNorm.extra_repr  s&    )**II$2GIIIr7   r  )r-   r.   r/   rd   r   r  r   r   s   @r8   r  r    sb        $ $ $ $ $ $; ; ;J J J J J J Jr7   r  c                       e Zd Zddededef fdZ eddd	          	 	 	 	 	 	 	 ddej	        de
ej	                 de
ej                 de
e         de
e         de
e         de
ej                 de
eej	        ej	        f                  dee         deej        e
eej        ej        f                  f         fd            Z xZS )BambaDecoderLayerr>   r<   rg   
layer_typec                    t                                                       d}|dk    rt          nd } ||          | _        t	          |j        |j                  | _        t	          |j        |j                  | _        || _	        |dk    rt          ||          | _        d S |dk    rt          ||          | _        d S t          d          )Nr   r5  r>   )r<   rg   	attentionzInvalid layer_type)r   rd   r  feed_forwardr  rY   r?  input_layernormpre_ff_layernormr  r.  r>   r   	self_attn
ValueError)r`   r<   rg   r  num_expertsffn_layer_classr   s         r8   rd   zBambaDecoderLayer.__init__  s    &1Q&6&6((D+OF33+F,>FDWXXX ,V-?VEX Y Y Y$  #6YGGGDJJJ;&&+FI>>DNNN1222r7   r   r   r   r   NFr   r   r   output_attentions	use_cacher   r   r   ri   c	                 >   |}
|                      |          }| j        dk    r | j        d||||d|	}d}n$| j        dk    r | j        d||||||||d|	\  }}|
|z   }|}
|                     |          }|                     |          }|
|z   }|f}|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs. Can be used to provide `BambaFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
        r>   )r   rV  r   r   Nr  )r   r   r   r   r  r  r   r   r6   )r  r  r>   r  r  r  )r`   r   r   r   r   r  r  r   r   r   residualself_attn_weightsoutputss                r8   r   zBambaDecoderLayer.forward  s"   F !,,];; ?g%%&DJ +,--	 
  M !%_++/=t~ 
0+-) /"3#-$7
0 
0 
0 
0,M, !=0 !--m<<))-88 =0 " 	,)++Gr7   )r>   )NNNFFNN)r-   r.   r/   r   r4   r   rd   r   r1   r}   r   r2   r;   r$  r   r   r'   FloatTensorr   r   r   s   @r8   r  r    s       3 3{ 3s 3 3 3 3 3 3 3" _%0A6RRR 2637FJ,1$)59KOK K|K !.K u/0	K
 ""BCK $D>K D>K !!12K &eEL%,,F&GHK 23K 
u (51BEDU1U+V"WW	XK K K SRK K K K Kr7   r  c                   H     e Zd ZU eed<   dZdZdgZdZdZ	dZ
dZ fdZ xZS )BambaPreTrainedModelr<   modelTr  r   c                 f   t                                          |           t          |t                    ry|j        j                            d           t          j        t          j	        d|j
        dz                       |j        _        |j        j                            d           d S d S )Ng      ?r   )r   _init_weightsr   r.  rJ  datafill_r1   rL  rK  r6  rM  rO  )r`   r   r   s     r8   r  z"BambaPreTrainedModel._init_weights?  s    f%%%fj)) 	%N%%c*** %	%,q&:JQ:N*O*O P PFLHM$$$$$	% 	%r7   )r-   r.   r/   r   r3   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulr  r   r   s   @r8   r  r  3  sq         &*#,-"3NL% % % % % % % % %r7   r  c                       e Zd Zdef fdZee	 	 	 	 	 	 	 	 	 ddeej	                 deej
                 deej	                 dee         deej                 d	ee         d
ee         dee         deej	                 dee         defd                        Zdej
        dej
        dej
        ded
ef
dZedej
        dededej        dej
        defd            Zd Z xZS )
BambaModelr<   c           	      J   t                                          |           |j        | _        |j        | _        t          j        |j        |j        | j                  | _        g }t          |j
                  D ]2}|                    t          |||j        |                              3t          j        |          | _        |j        | _        t#          |j        |j                  | _        t)          |          | _        d| _        |                                  d S )N)rg   r  r5  )r<   F)r   rd   pad_token_idpadding_idx
vocab_sizer   	EmbeddingrY   embed_tokensrU   rV   r]   r  rN   
ModuleListlayersr   r  r?  final_layernormr   
rotary_embgradient_checkpointing	post_init)r`   r<   decoder_layersrc   r   s       r8   rd   zBambaModel.__init__I  s	      !. +L):F<NPTP`aav/00 	r 	rA!!"3FaTZTlmnTo"p"p"pqqqqmN33$*$?!+F,>FDWXXX.f===&+#r7   N	input_idsr   r   r   inputs_embedsr  r  output_hidden_statesr   r   ri   c
                    ||n| j         j        }||n| j         j        }||n| j         j        }|d u |d uz  rt	          d          | j        r%| j        r|rt                              d           d}|| 	                    |          }|}|r|t                              d           |	&t          j        |j        d         |j                  }	||	                    d          }|                     |||	||          }|                     ||	          }|                     ||          }|rdnd }|rdnd }| j        D ]H}|j        d	k    r|n|}|r||fz  } ||f||||||	|d
|
}|d         }|r|d         ||d         fz  }I|                     |          }|r||fz  }|r|j        sd|_        |sd n|}t-          ||||          S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzBamba requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. None was provided, so no cache will be returned.r   rC   r   r6   r>   )r   r   r   r  r  r   r   T)last_hidden_stater   r   
attentions)r<   r  r  r  r  r  r   rR  rS  r  r1   rK  rn   rA   r   _update_causal_mask_update_mamba_maskr  r  r  r  rO   r   )r`   r  r   r   r   r  r  r  r  r   r   r   r   
mamba_maskr   all_hidden_statesall_self_attnsdecoder_layer
layer_masklayer_outputs
next_caches                        r8   r   zBambaModel.forward\  s    2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	-t";< 	[YZZZ& 	4= 	Y 	j   I  --i88M% 	0:  
 !"\-*=a*@I]^^^N)33A66L..M>?L]
 
 ,,^^LL
 #oom\JJ"6@BBD0:d![ 	: 	:M'4'?7'J'JP[J# 6!m%55!)M
)) /"3#-$7
 
 
 
M *!,M  : #/"}Q'7&99N,,];;   	2-!11 	6?#E 	615O.!*?TT
&+&+%	
 
 
 	
r7   r  c           	      >   | j         j        dk    r
|d|v r|S d S ||                                nd}| j         j        dk    r!|st          j        |||| j                  rd S |j        }|j        d         }t          |t          j
                  r|j        d         n||z   dz   }	|                     |||	|||j        d                   }
| j         j        dk    r@|>|j        j        d	v r0|s.t          j        |          j        }t          j        |
|          }
|
S )
Nflash_attention_2r   r   sdpa)r  past_key_values_lengthis_trainingr   rk   )sequence_lengthtarget_lengthrB   r   rK   )r  xpunpu)r<   r   ry   r   _ignore_causal_mask_sdpar   rB   rn   r   r1   r}   5_prepare_4d_causal_attention_mask_with_cache_positionrA   r   finfomin_unmask_unattended)r`   r   r  r   r   r  past_seen_tokensrB   r  r  r   	min_dtypes               r8   r  zBambaModel._update_causal_mask  sy    ;+/BBB)c^.C.C%%4
 @O?Z?99;;;`a ;+v55>O5%>*'7 M	    t"&,Q/ .%,778N $$!O3a7 	 PP+')#)!, Q 
 
 K,66*%*.DDD% E E**.I0CKQZ[[Kr7   r  r  rB   rK   c                     | |                                  dk    r| }not          j        |          j        }t          j        ||f|||j                  }|dk    rt          j        |d          }|t          j        ||j                  |                    dd          k    z  }|ddddddf         	                    |ddd          }| |
                                }| j        d         }	| ddddddf         | ddddddf         k    dddd| dddf                             |          }
|ddddddd|	f         |
z   }|dk    }|ddddddd|	f                             ||          |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nr  )
fill_valuerB   rA   r   r   rC   rk   r   )rm   r1   r  r   fullrA   triurK  r   r   clonern   ru   r%  )r   r  r  rB   r   rK   r   r   r  mask_lengthpadding_attention_maskpadding_masks               r8   r  z@BambaModel._prepare_4d_causal_attention_mask_with_cache_position  s   < %.*<*<*>*>!*C*C(KKE**.I* -0Ye\j\q  K !###jqAAA5<n>STTTWeWmWmnprsWtWtttK%dD!!!QQQ&67>>z1bRTUUK))//11,226*8D$9I*Jn]^]^]^`dfgfgfgim]mNn*nAAqqq?*++QQQ.*"U)) '  +111aaaL[L+@ADZZ+q05@AAAqqq,;,AV5W5c5c )6 6AAAqqq!!!\k\12 r7   c                 Z    |}|d         dk    s|t          j        |dk              rd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr   )r1   all)r`   r   r   r  s       r8   r  zBambaModel._update_mamba_mask4  s>     $
!q  ^%?EIn`aNaDbDb%?Jr7   )	NNNNNNNNN)r-   r.   r/   r   rd   r   r   r   r1   r2   r}   r;   r  r$  r   r'   r   r   r  staticmethodr4   rB   r  r  r   r   s   @r8   r  r  G  s       {      &  151537FJ59$(,0/359`
 `
E,-`
 !.`
 u/0	`

 ""BC`
   12`
 D>`
 $D>`
 'tn`
 !!12`
 23`
 
!`
 `
 `
 ^ `
D:: l: 	:
 ::  : : : :x 555 5 {	5
 5 5 5 5 \5n	 	 	 	 	 	 	r7   r  c                       e Zd ZdgZddiZddgdgfiZ fdZee	 	 	 	 	 	 	 	 	 	 	 dd	e	e
j                 d
e	e
j                 de	e
j                 de	e         de	e
j                 de	e
j                 de	e         de	e         de	e         de	e
j                 deee
j        f         defd                        Z	 	 	 	 	 	 ddZ xZS )BambaForCausalLMzlm_head.weightlm_headcolwise_repr   logitsc                    t                                          |           t          |          | _        |j        | _        t          j        |j        |j        d          | _        |j	        | _	        | 
                                 d S )NFr   )r   rd   r  r  r  r   r   rY   r  z_loss_coefficientr  r  s     r8   rd   zBambaForCausalLM.__init__F  su       ''
 +y!3V5FUSSS"("; 	r7   Nr   r  r   r   r   r  labelsr  r  r  r   logits_to_keepri   c                 n   ||n| j         j        }|	|	n| j         j        }	 | j        d
||||||||	|
d	|}|j        }t          |t                    rt          | d          n|}|                     |dd|ddf                   }d}| | j	        d
||| j         j
        d|}| j        dk    ra|                    d                              |j                                      d                                          }|| j        |z  z   }t#          |||j        |j        |j        	          S )aJ  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BambaForCausalLM

        >>> model = BambaForCausalLM.from_pretrained("...")
        >>> tokenizer = AutoTokenizer.from_pretrained("...")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)	r  r   r   r   r  r  r  r  r   )r  r  r  r   rk   rl   r   r?   )lossr  r   r   r  r6   )r<   r  r  r  r  r   r4   slicer  loss_functionr  r  	logsumexpru   rB   r
  r  r   r   r   r  )r`   r  r   r   r   r  r  r  r  r  r   r  r   r  r   slice_indicesr  r  z_losss                      r8   r   zBambaForCausalLM.forwardP  s   N 2C1N--TXT_Tq$8$D  $+Jj 	
 ,64: ,
)%+'/!5),
 ,
 ,
 ,
  18B>SV8W8Wk~ot444]kmAAA}aaa,?@AA%4%pVFt{OeppioppD&**))b)11444:4FFJJ1MMRRTTd5>>%#3!/)
 
 
 	
r7   Tc           	         |d u }	|	s]||d         |j         d         k    r|d d |j         d          d f         }nV|j         d         |j         d         k    r|d d |f         }n-t          | j        |j         d         | j        | j                  }|b|`|                                                    d          dz
  }|                    |dk    d           |	s|d d |j         d          d f         }||	rd|i}
nd|                                i}
|
	                    ||||| j        j
        |d           |                                D ]\  }}||
vr||
|<   |
S )Nrk   r   r   rC   r  r  )r   r   r  r   r  r   )rn   r;   r<   rB   rA   longr&  masked_fill_r   rp   num_logits_to_keepitems)r`   r  r   r   r  r   r   r  r   empty_past_kvmodel_inputsr   r   s                r8   prepare_inputs_for_generationz.BambaForCausalLM.prepare_inputs_for_generation  s    (4/  	)!"%);;;%aaa.*>q*A)A)C)C&CD		#~';A'>>>%aaa&78	>Y_Q/DK  O %,*>)..0077;;a?L%%n&91===  F+AAA	0B/B/D/D,DE $$+];LL')=)=)?)?@L ,#2&"0"&+"@"0 		
 		
 		
 !,,.. 	* 	*JC,&&$)S!r7   )NNNNNNNNNNr   )NNNNNT)r-   r.   r/   _tied_weights_keys_tp_plan_pp_planrd   r   r   r   r1   r2   r}   r;   r  r$  r   r4   r   r   r&  r   r   s   @r8   r  r  @  s       *+=)H_-z:;H      151537FJ59-1$(,0/35934K
 K
E,-K
 !.K
 u/0	K

 ""BCK
   12K
 )*K
 D>K
 $D>K
 'tnK
 !!12K
 c5</0K
 
 K
 K
 K
 ^ K
` > > > > > > > >r7   r  )r  r  r  )r   )Nr   )Qtypingr   r   r   r   r   r1   r   transformers.activationsr	   cache_utilsr   
generationr   integrationsr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.import_utilsr   r   configuration_bambar   +mamba_ssm.ops.triton.selective_state_updater    !mamba_ssm.ops.triton.ssd_combinedr!   r"   causal_conv1dr#   r$   
get_loggerr-   rR  r'   r;   Moduler   r   r}   r4   r   r   r   r   r   r   r  r  r*  r  rQ  r,  r.  r  r  r  r  r  r  __all__r6   r7   r8   <module>r?     s  6 = < < < < < < < < < < < < <        + + + + + +             ) ) ) ) ) ) 7 7 7 7 7 7 > > > > > > 9 9 9 9 9 9 O O O O O O O O K K K K K K K K F F F F F F F F & & & & & & R R R R R R R R R R R R 0 0 0 0 0 0 V V V V V V V V , , , , , ,  "RRRRRRmmmmmmmmm! 8DDDDDDDDD-7** 
	H	%	%    	    2Z3 Z3 Z3 Z3 Z3 Z3 Z3 Z3z!< !< !< !< !<29 !< !< !<H( ( (	UU\ 	U# 	U%, 	U 	U 	U 	U& % %I%<% 
% <	%
 U\*% % % '(% % % %6% % % %PD) D) D) D) D)RY D) D) D)N; ; ; ; ; ; ; ;*VU\ VS V V V V
 
 
(  ( 46FH\]^^   \_ \_ \_ \_ \_ \_ \_ \_~    ry     Y''J J J J J29 J J ('J(^ ^ ^ ^ ^2 ^ ^ ^B % % % % %? % % %& u u u u u% u u up \ \ \ \ \+_ \ \ \~ E
D
Dr7   