
     `if                     ,   d Z ddlmZmZmZ ddlZddlmZ ddlmZ ddl	m
Z
mZ ddlmZmZmZmZmZmZ ddlmZmZmZmZ d	d
lmZ d	dlmZmZ d	dlmZ d	dlm Z  d	dl!m"Z"m#Z#m$Z$ d	dl%m&Z& d	dl'm(Z(m)Z) ddl*m+Z+  e)            rddl,m-Z- ddl.m/Z/m0Z0 ndZ- e(            r	ddl1m2Z2m3Z3 nd\  Z3Z2 e4e-e2e3f          Z5 e$j6        e7          Z8 G d ded          Z9 G d de
          Z
 G d de          Z:d4dZ; G d  d!e          Z< G d" d#e          Z=d$ Z> G d% d&ej?                  Z@ G d' d(e          ZA G d) d*e          ZB G d+ d,e          ZCe" G d- d.e                      ZDe" G d/ d0eD                      ZE G d1 d2e          ZFg d3ZGdS )5zPyTorch Bamba model.    )Optional	TypedDictUnionN)nn)ACT2FN) HybridMambaAttentionDynamicCacheJambaAttentionDecoderLayer)LlamaAttentionLlamaForCausalLMLlamaMLPLlamaRMSNormLlamaRotaryEmbeddingrotate_half)MambaRMSNormGatedpad_tensor_by_sizereshape_into_chunkssegment_sum   )AttentionMaskConverter)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)Unpack)auto_docstringcan_return_tuplelogging)deprecate_kwarg)is_causal_conv1d_availableis_mamba_2_ssm_available   )BambaConfig)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combined)causal_conv1d_fncausal_conv1d_update)NNc                   d    e Zd ZU dZej        ed<   ej        ed<   eed<   eed<   ej        ed<   dS )BambaFlashAttentionKwargsa  
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    Attributes:
        cu_seq_lens_q (`torch.LongTensor`)
            Gets cumulative sequence length for query state.
        cu_seq_lens_k (`torch.LongTensor`)
            Gets cumulative sequence length for key state.
        max_length_q (`int`):
            Maximum sequence length for query state.
        max_length_k (`int`):
            Maximum sequence length for key state.
        seq_idx (`torch.IntTensor):
            Index of each packed sequence.
    cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kseq_idxN)	__name__
__module____qualname____doc__torch
LongTensor__annotations__int	IntTensor     {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/bamba/modular_bamba.pyr(   r(   K   sb          " ########_r8   r(   F)totalc                   .    e Zd ZdZej        dfdefdZdS )r   a  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    Nconfigc                 $   |j         | _         d| _        |j        }|j        }g | _        g | _        g | _        t          |j                  D ]}| j         |         dk    rw| xj        t          j
        |j        |j        z  d|j        z  |z  z   ||          gz  c_        | xj        t          j
        |j        |j        ||          gz  c_        | xj        t          j        g gz            gz  c_        | xj        t          j        g gz            gz  c_        | j                            |           fdt          |j                  D             | _        fdt          |j                  D             | _        d S )NFmamba   devicedtyperA   c                 D    g | ]}t          j        g gz             S rC   r2   tensor.0_
batch_sizerA   s     r9   
<listcomp>z=HybridMambaAttentionDynamicCache.__init__.<locals>.<listcomp>   s/    rrrQ%,tj'8HHHrrrr8   c                 D    g | ]}t          j        g gz             S rE   rF   rH   s     r9   rL   z=HybridMambaAttentionDynamicCache.__init__.<locals>.<listcomp>   s/    tttqEL"
):6JJJtttr8   )layers_block_typehas_previous_statemamba_d_convmamba_d_stateconv_states
ssm_statestransformer_layersrangenum_hidden_layersr2   zerosmamba_expandhidden_sizemamba_n_groupsmamba_n_headsmamba_d_headrG   append	key_cachevalue_cache)selfr<   rK   rB   rA   conv_kernel_sizessm_state_sizeis     ` `   r9   __init__z)HybridMambaAttentionDynamicCache.__init__s   s   !'!9"'!.-"$v/00 	2 	2A%a(G33  K",v/AAAH]D]`nDnn(%#  %    K",+&%#  	$ 	   U\2$2CF%S%S%S$TT  EL"
1B6$R$R$R#SS'..q1111rrrrrRWX^XpRqRqrrrtttttTYZ`ZrTsTstttr8   )r.   r/   r0   r1   r2   float16r!   rd   r7   r8   r9   r   r   e   sQ          ?DmTX $u $u{ $u $u $u $u $u $ur8   r   c                       e Zd ZdS )BambaRotaryEmbeddingNr.   r/   r0   r7   r8   r9   rg   rg              Dr8   rg   c                    |                     |          }|                     |          }|j        d         }| dd|f         | d|df         }}|dd|f         |d|df         }
}	||z  t          |          |z  z   }|	|z  t          |	          |z  z   }t          j        ||gd          }t          j        ||
gd          }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Removes the interleaving of cos and sin from GLM

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    .Ndim)	unsqueezeshaper   r2   cat)qkcossinposition_idsunsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds                r9   apply_rotary_pos_embr~      s    , --
&
&C
--
&
&C 2Jc;J;&'3
+;)<6Ec;J;&'3
+;)<6E s{{511C78Gs{{511C78G i&)r222Gi&)r222GGr8   c                       e Zd ZdS )BambaAttentionNrh   r7   r8   r9   r   r      ri   r8   r   c                       e Zd ZdS )BambaRMSNormGatedNrh   r7   r8   r9   r   r      ri   r8   r   c                     |N|j         d         dk    r=|j         d         dk    r,| j        }| |dddddf         z                      |          } | S )zm
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    Nr    r   )ro   rB   to)hidden_statesattention_maskrB   s      r9   apply_mask_to_padding_statesr      si     !n&:1&=&A&AnFZ[\F]`aFaFa#&111d
)CCGGNNr8   c                   n    e Zd ZdZdedef fdZ	 	 	 	 ddej        de	e
         de	ej                 d	e	ej                 d
e	ej                 f
dZ	 	 	 dde	e
         de	ej                 d	e	ej                 fdZ	 	 	 	 dde	e
         de	ej                 d	e	ej                 d
e	ej                 fdZ xZS )
BambaMixeruP  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    The are a few differences between this and Mamba2Mixer:
    - The variable use_precomputed_states is slightly different due to the hybrid cache structure
    - There's a few non-obvious bugs fixed with batching in the slow path that exist in main
    - Some extra variables that our layer doesn't need have been removed
    - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged
    r<   	layer_idxc           	         t                                                       |j        | _        |j        | _        |j        | _        |j        | _        t          |j
        | j        z            | _        || _        |j        | _        |j        | _        t"          |j                 | _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        dt;          d          f| _        d| _        d| _         | j        d| j        z  | j        z  z   | _!        tE          j#        | j!        | j!        |j        | j        | j!        | j        dz
            | _$        | j        | j!        z   | j        z   }tE          j%        | j        || j                  | _&        tE          j'        tQ          j)        | j                            | _*        tQ          j+        d| j        dz             }tE          j'        tQ          j,        |                    | _-        t]          | j        | j        	          | _/        tE          j'        tQ          j)        | j                            | _0        tE          j%        | j        | j        | j                  | _1        td          stf          4                    d
           d S tf          4                    d           d S )N        infgMbP?g?r?   r    )in_channelsout_channelsbiaskernel_sizegroupspadding)r   epsa  The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzDThe fast path for Bamba will be used when running the model on a GPU)5superrd   r[   	num_headsrY   rQ   rb   rP   ra   r5   rX   intermediate_sizer   mamba_conv_biasuse_conv_bias
hidden_act
activationr   actmamba_proj_biasuse_biasrms_norm_epslayer_norm_epsilonrZ   n_groupsr\   head_dimmamba_chunk_size
chunk_sizefloattime_step_limittime_step_mintime_step_maxconv_dimr   Conv1dconv1dLinearin_proj	Parameterr2   onesdt_biasarangelogA_logr   normDout_projis_fast_path_availableloggerwarning_once)r`   r<   r   projection_sizeA	__class__s        r9   rd   zBambaMixer.__init__   st   -!-$2 & 3!$V%84;K%K!L!L"#3 +&+,."("5-+ 1 !$U5\\2" .T]1BTEX1XXi'-=)A-
 
 
 04=@4>Qy
 
 
 |EJt~$>$>?? LDNQ.//\%)A,,//
%d&<$BYZZZ	ej8899	$"8$:JQUQ^___% 	h>      fgggggr8   Nr   cache_paramscache_positionr   r-   c                    t          ||          }|                     |          }|j        \  }}}	| j        | j        z  }
|d uob|j        o[|dk    oU|j        | j                 j        d         |j        | j                 j        d         cxk    o|k    nc o|d uo|d         dk    }|r|	                    d          
                    | j        | j        | j        gd          \  }}}t          ||j        | j                 | j        j        	                    d          | j        j        | j                  }t'          j
        || j        |
|
gd          \  }}}t'          j        | j                                                   }|d d d df         d d d d d f                             d| j        | j                                      t&          j                  }|d d d d d f                             dd| j                  }| j        d d d df                             d| j                  }| j        d d d df                             d| j                  }|                    || j        |j        d         | j        z            }|                    || j        |j        d         | j        z            }|                    || j        | j                  }t=          |j        | j                 ||||||d |d
  
        }|                    || j        | j        z            }|                     ||          }|                      |          d d d df         }nlt'          j        | j                                                   }| j!        d	t-          d
          fk    ri nd| j!        i}| j"        r|tG          || j        j        	                    d          | j        j        | j        |f| j        | j$        || j        | j        j        | j        j%        | j         j        | j         j        | j        | j        ddd|}n|
                    | j        | j        | j        gd          \  }}}|p|&                    dd          }tN          j(        )                    || j*        |j        d         z
  df          }|j        | j                 +                    |           | j        dvr[| ,                    |                     |&                    dd                    dd |f         &                    dd                    }nht[          |&                    dd          | j        j        	                    d          | j        j        | j        |          &                    dd          }t          ||          }t'          j
        || j        |
|
gd          \  }}}t]          |                    ||d| j                  |||                    ||| j        d          |                    ||| j        d          f| j$        | j        d |d| j        dd|\  }}|'|%|j        | j                 +                    |           |                    ||d          }|                     ||          }|                      |          }|S )Nr    r   rk   rl   .rB   T)zr   dt_softplusr   r   dt_limitF)r   r   r-   r   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr?   )siluswish)xweightr   r   r-   )r   r   r   r-   r   r   r   )/r   r   ro   r   rb   rO   rR   r   rS   squeezesplitr   r   r   r&   r   r   r   r   r2   expr   r   expandr   r   float32r   r   viewr"   r   r   r   trainingr$   r   variance_epsilon	transposer   
functionalpadra   copy_r   r%   r#   )r`   r   r   r   r   r-   projected_statesrK   seq_lenrJ   groups_time_state_sizeuse_precomputed_statesgatehidden_states_B_CdtBCr   r   r   hidden_states_reshapedoutdt_limit_kwargshidden_states_B_C_transposedrR   scan_output	ssm_states                              r9   cuda_kernels_forwardzBambaMixer.cuda_kernels_forward*  sJ    5]NSS<<66 "/!4
GQ!%1D!D $ &/&1& (8>qA&t~6<Q?      & d*& q!A% 	 " J	1*:*B*B1*E*E*K*K'GR +L + +'D#R
 !5!(8"**1-- ! ! #(+!')?AWX# # #M1a 4:++--...A!!!T3,111d
+222t}dFYZZ]]didq]rrAAAAqqq$J&&r2t}==Bl111dC<077DMJJGqqq$|$++B>>Az4=!'!*2MNNAz4=!'!*2MNNA%2%7%7
DNTXTa%b%b"2'7&   M *..z4>DM;YZZM IImT::M --..qqq$|<CC 4:++--...A$($8S%,,<O$O$ObbV`bfbvUwO } V1!56$K&..q11K$L f####'9#3 $	 :#'=#7!%!3 M M%*(-# $ &% , /?.D.D+T]DNKQS /E / /+'  + 4E3N3NqRS3T3T0"$-"3"34.1M1STV1WWYZ[# #K !,T^<BB;OOO?*;;;(,$5$?$?1$E$EFFsHWH}U__`acdee) )%% )9+55a;;#{199!<<![-#'? ') ) )  i1oo & %AARTb$c$c!&+k%+-CE[\' ' '#q! *C!&&z7BNNFF:wrBBFF:wrBB*  $f#(, L $* * &* *&Y" (\-E +DN;AA)LLL)..z7BGG"iiT:: mmK00
r8   c                    3 |j         \  }}}|j        }t          ||          }                     |          }	|	                     j         j         j        gd          \  }
}}|d uob|j        o[|dk    oU|j	         j
                 j         d         |j         j
                 j         d         cxk    o|k    nc o|d uo|d         dk    }|r|j	         j
                                     dd          |j	         j
        <   |d d dd d f                             |j	         j
                 j                  |j	         j
                 d d d d df<   |j	         j
                                      j        j        j                  }t#          j        | j        j                            d          z  d          } j        r| j        j        z   }                     |          }n|p|                    dd          }t0          j                            | j        |j         d         z
  df          }|j	         j
                                     |                                                     |                    dd                    dd |f                             dd                    }t          ||          }t#          j        | j         j         j        z   j         j        z  gd          \  }}}t#          j         j         !                                           }|ra|j         j
                 j        }|d d dd d f         d d d df         }|                    dd          "                    ||j         d          j#                  } j$        d	         "                     j$        j         d          j#                  }t"          j        j        %                    ||                    |j                  z             }t#          j&        | j'        d          j'        d                   }|d
         "                     j         j#         j                                      t"          j(                  }t#          j        |d	         |z                                |          }|)                    | j        d          dd d d f         }|"                    | j         j         j        z  |j         d                   *                                }|)                    |d|j         d                   }|d	         |dd d d f         z  }|)                    |d j#                  }||d	         z                      |          }|j         j
                                     |j         j
                 |z  |z              |)                    | j        d          dd d d f         }|"                    | j         j         j        z  |j         d                   *                                }|)                    |d|j         d                   }|j         j
                                     |j        |j                  }|+                    | j        z   j#         j                  }|+                    | j        z   j        d          }t#          j,        ||          }|+                    | j         j#                  } j-        d	         "                     j-        j         d          j#                  }|||z  z                       |j                  }|)                    |d          d d d df         }n+t0          j        %                    | j$        z             }t#          j&        | j'        d          j'        d                   }|)                    ||d j#                  !                                }|)                    ||d j                  !                                }|)                    ||d j                  !                                }|.                     j         j        z  d j                  }|.                     j         j        z  d j                  } j/        | j/        z  z
   j/        z  3 j-        d	         ta          |3          z  }||d	         z  }|                    |j                  |z  }3 fd||||fD             \  }}}}|1                    dddd          }t#          j2        |d          }t#          j        tg          |                    } |d d d d d d d d d d d f         |d d d d d d d d d d d f         z  }!|!                    d          }"|"d	         | 1                    ddddd          d	         z  }#|#                    d          }$|$d	         |d d d d d f         z                      d          }%t#          j        |d d d d d d dd f         |z
            }&||&1                    dddd          d	         z  }'|'dd d d f         |d	         z                      d          }(|r7|j         j
                 d d d df                             |(j                  })n t#          j4        |(d d d df                   })t#          j5        |)|(gd          }(t#          j        tg          t0          j                            |d d d d d d df         d                              }*|*                    dd          }*|*d
         |(d d d d d df         z                      d          }+|+d d d df         |+d d df         },}(t#          j        |          }-|dd d d f         |(d d d d d df         z  }.|-1                    dddd          }/|.                    d          |/d	         z  }0|%|0z   }|)                    |d j         j#                  }||z   }3dk    r|d d d |d d d d f         }|)                    ||d          }|,'|%|j         j
                                     |,            6                    ||
          }1 7                    |1                    |                    }2|2S )Nrk   rl   r    r   )shiftsdimsrC   r?   .).N).NNr   r@   )rm   output_sizec                 <    g | ]}t          |j                  S r7   )r   r   )rI   tpad_sizer`   s     r9   rL   z,BambaMixer.torch_forward.<locals>.<listcomp>]  s)    %z%z%z\]&9!Xt&W&W%z%z%zr8   r      )r    r   )8ro   rB   r   r   r   r   r   r   rO   rR   r   rS   rollr   rA   r   r   r2   sumr   r   r   r   r   r   r   r   ra   r   r   rb   r   r   r   r   r   r   softplusclampr   r   reshape
contiguousr   bmmr   repeat_interleaver   r   permutecumsumr   
zeros_likerp   r   r   )4r`   input_statesr   r   r   rK   r   rJ   rB   r   r   r   r   r   rR   r   r   r   r   r   cache_devicer   dAdBdBxrS   ssm_states_reshaped
C_reshapedyr   
D_residualA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decaystatesprevious_statesdecay_chunk
new_statesr   state_decay_outC_times_statesstate_decay_out_permutedY_offr   contextualized_statesr   s4   `                                                  @r9   torch_forwardzBambaMixer.torch_forward  s(    ".!3
GQ" 4L.QQ<<55&6&<&<'GR '= '
 '
#
 $ &/&1& (8>qA&t~6<Q?      & d*& q!A% 	 " 	x7C7OPTP^7_7d7dlnuw7d7x7xL$T^4ARSTSTSTVWYZYZYZSZA[A^A^_k_wx|  yG  `H  `O  BP  BPL$T^4QQQ2X> '24>BEET[M_MfEggK %	dk088;;;! ! ! ! I$58H$H! $): ; ; '/@/J/J1a/P/P, m//043HKgKmnpKq3qst2u  (8>>{KKK $5F5P5PQRTU5V5V)W)WX[]e^e]eXe)f)p)pqrtu)v)v w w89JN[[#k#T]T5H%H$-Z^ZmJmn
 
 
q! Ytz''))***! F	I'24>BIL AAAq!!!GQQQc\*Ba##**:rx|T]SSBl9-44T\5G5JDMZZG$--b7::bh3G3G.GHHBR!5a!8$:Nq:QRRB/"))$.$-I\]]``glgt`uuA)ByMA-..22,2GGB
 		*dmR88dAAAFAT]DNdm4SUVU\]_U`aallnnA		*b!'"+66AI3aaa<0B *11*b$-PPMi0044L4IIC #DN399'7"<sB   		*dmR88dAAAFAT]DNdm4SUVU\]_U`aallnnA		*b!'"+66A &0@CC18[\[bCccJ",//*t~2Mt}^b^q"r"r
T^ ;T=PRSTTJ	-z::Az4>4=AAA y!((a$-HHA]Q&&**1733A 		*b))!!!T3,7AA ''T\(9::BR!5a!8$:Nq:QRRB)11*gr4=YY__aaM		*gr43FGGMMOOA		*gr43FGGMMOOA##DNdm$CX\Xf#ggA##DNdm$CX\Xf#ggA'DO*CCtVH	*-?x-X-XXJ *ByM9M]())B.A &{%z%z%z%zboqrtuwxay%z%z%z"M1a 		!Q1%%A|A2...H 	+a..))A qqq!!!QQQaaa23a111dAAAqqq!!!8K6LLN""r"**A y\AIIaAq!,D,DY,OON""r"**A 	l]111aaa:%>>CCCJJF !9XaaaAAArssl%;h%FGGL,..q"b!<<YGGGc4l+mI.FFKKPQKRRF & B"."9$."I!!!TSV,"W"Z"Zbhbo"Z"p"p"'"26!!!RaR%="A"AY8a@@@F)K0A0A(111aaaQRQRQRTV;BWY_0`0`$a$abbK%//155K%o6111dC9PPUUZ[U\\J *111crc6 2Jqqq"u4EIF $i11OT111oqqq!!!T30GGN'6'>'>q!Q'J'J$#''++.Fy.QQE A		*b$.$-HHAJA!||aaa'111aaa'(		*gr22A $)A'7==iHHHii4((
 !%knnU.C.C D D$$r8   c                 d   t           r1d| j        j        j        j        v r|                     |||||          S |t          d          |j        }|G|j        d         dk    r6|j        d         dk    r%||d d d d d f         z  	                    |          }| 
                    ||||          S )Ncudaz\`seq_idx` support requires fast path support. Please install `mamba_ssm` and `causal_conv1d`r    r   )r   r   r   rA   typer   NotImplementedErrorrB   ro   r   r  )r`   r   r   r   r   r-   kwargsrB   s           r9   forwardzBambaMixer.forward  s     " 	sf0C0J0O&O&O,,]L.Zhjqrrr%n   #%.*>q*AA*E*E.J^_`JadeJeJe*^AAAqqq$J-GGKKERRM!!-~~^^^r8   )NNNN)NNN)r.   r/   r0   r1   r!   r5   rd   r2   Tensorr   r   r3   r6   r   r  r"  __classcell__r   s   @r9   r   r      s        ?h{ ?hs ?h ?h ?h ?h ?h ?hH DH5915-1g g|g ?@g !!12	g
 !.g %/*g g g gZ DH5915L% L% ?@L% !!12	L%
 !.L% L% L% L%d DH5915-1_ _ ?@_ !!12	_
 !._ %/*_ _ _ _ _ _ _ _r8   r   c                       e Zd ZdS )BambaMLPNrh   r7   r8   r9   r'  r'    ri   r8   r'  c                       e Zd ZdS )BambaRMSNormNrh   r7   r8   r9   r)  r)    ri   r8   r)  c                       e Zd Zddededef fdZ eddd	          	 	 	 	 	 	 	 ddej	        de
ej	                 de
ej                 de
e         de
e         de
e         de
ej                 de
eej	        ej	        f                  dee         deej        e
eej        ej        f                  f         fd            Z xZS )BambaDecoderLayerr>   r<   r   
layer_typec                 0   t                                          ||           | `d}|dk    rt          nd } ||          | _        || _        |dk    rt          ||          | _        d S |dk    rt          ||          | _        d S t          d          )Nr    r>   )r<   r   	attentionzInvalid layer_type)
r   rd   	self_attnr'  feed_forwardr,  r   r>   r   
ValueError)r`   r<   r   r,  num_expertsffn_layer_classr   s         r9   rd   zBambaDecoderLayer.__init__  s    +++N&1Q&6&6((D+OF33$  #6YGGGDJJJ;&&+FI>>DNNN1222r8   past_key_valuepast_key_valuesz4.58)new_nameversionNFr   r   ru   output_attentions	use_cacher   position_embeddingsr!  returnc	                 >   |}
|                      |          }| j        dk    r | j        d||||d|	}d}n$| j        dk    r | j        d||||||||d|	\  }}|
|z   }|}
|                     |          }|                     |          }|
|z   }|f}|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs. Can be used to provide `BambaFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
        r>   )r   r   r   r   Nr.  )r   r   ru   r5  r8  r9  r   r:  r7   )input_layernormr,  r>   r/  pre_ff_layernormr0  )r`   r   r   ru   r5  r8  r9  r   r:  r!  residualself_attn_weightsoutputss                r9   r"  zBambaDecoderLayer.forward  s"   F !,,];; ?g%%&DJ +,--	 
  M !%_++/=t~ 
0+-) /"3#-$7
0 
0 
0 
0,M, !=0 !--m<<))-88 =0 " 	,)++Gr8   )r>   )NNNFFNN)r.   r/   r0   r!   r5   strrd   r   r2   r#  r   r3   r   booltupler   r(   FloatTensorr"  r$  r%  s   @r9   r+  r+    s       3 3{ 3s 3 3 3 3 3 3 3" _%0A6RRR 2637FJ,1$)59KOK K|K !.K u/0	K
 ""BCK $D>K D>K !!12K &eEL%,,F&GHK 23K 
u (51BEDU1U+V"WW	XK K K SRK K K K Kr8   r+  c                   H     e Zd ZU eed<   dZdZdgZdZdZ	dZ
dZ fdZ xZS )BambaPreTrainedModelr<   modelTr+  r5  c                 f   t                                          |           t          |t                    ry|j        j                            d           t          j        t          j	        d|j
        dz                       |j        _        |j        j                            d           d S d S )Ng      ?r    )r   _init_weights
isinstancer   r   datafill_r2   r   r   r   r   r   )r`   moduler   s     r9   rJ  z"BambaPreTrainedModel._init_weights/  s    f%%%fj)) 	%N%%c*** %	%,q&:JQ:N*O*O P PFLHM$$$$$	% 	%r8   )r.   r/   r0   r!   r4   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulrJ  r$  r%  s   @r9   rG  rG  #  sq         &*#,-"3NL% % % % % % % % %r8   rG  c                       e Zd Zdef fdZee	 	 	 	 	 	 	 	 	 ddeej	                 deej
                 deej	                 dee         deej                 d	ee         d
ee         dee         deej	                 dee         defd                        Zdej
        dej
        dej
        ded
ef
dZedej
        dededej        dej
        defd            Zd Z xZS )
BambaModelr<   c           	      J   t                                          |           |j        | _        |j        | _        t          j        |j        |j        | j                  | _        g }t          |j
                  D ]2}|                    t          |||j        |                              3t          j        |          | _        |j        | _        t#          |j        |j                  | _        t)          |          | _        d| _        |                                  d S )N)r   r,  r   )r<   F)r   rd   pad_token_idpadding_idx
vocab_sizer   	EmbeddingrY   embed_tokensrU   rV   r]   r+  rN   
ModuleListlayers_attn_implementationr)  r   final_layernormrg   
rotary_embgradient_checkpointing	post_init)r`   r<   decoder_layersrc   r   s       r9   rd   zBambaModel.__init__9  s	      !. +L):F<NPTP`aav/00 	r 	rA!!"3FaTZTlmnTo"p"p"pqqqqmN33$*$?!+F,>FDWXXX.f===&+#r8   N	input_idsr   ru   r5  inputs_embedsr9  r8  output_hidden_statesr   r!  r;  c
                    ||n| j         j        }||n| j         j        }||n| j         j        }|d u |d uz  rt	          d          | j        r%| j        r|rt                              d           d}|| 	                    |          }|}|r|t                              d           |	&t          j        |j        d         |j                  }	||	                    d          }|                     |||	||          }|                     ||	          }|                     ||          }|rdnd }|rdnd }| j        D ]H}|j        d	k    r|n|}|r||fz  } ||f||||||	|d
|
}|d         }|r|d         ||d         fz  }I|                     |          }|r||fz  }|r|j        sd|_        |sd n|}t-          ||||          S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzBamba requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. None was provided, so no cache will be returned.r    rC   r   r7   r>   )r   ru   r5  r8  r9  r   r:  T)last_hidden_stater5  r   
attentions)r<   r8  rh  r9  r1  rc  r   r   r   r]  r2   r   ro   rA   rn   _update_causal_mask_update_mamba_maskrb  r_  r,  ra  rO   r   )r`   rf  r   ru   r5  rg  r9  r8  rh  r   r!  r   causal_mask
mamba_maskr:  all_hidden_statesall_self_attnsdecoder_layer
layer_masklayer_outputs
next_caches                        r9   r"  zBambaModel.forwardL  s    2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	-t";< 	[YZZZ& 	4= 	Y 	j   I  --i88M% 	0:  
 !"\-*=a*@I]^^^N)33A66L..M>?L]
 
 ,,^^LL
 #oom\JJ"6@BBD0:d![ 	: 	:M'4'?7'J'JP[J# 6!m%55!)M
)) /"3#-$7
 
 
 
M *!,M  : #/"}Q'7&99N,,];;   	2-!11 	6?#E 	615O.!*?TT
&+&+%	
 
 
 	
r8   input_tensorc           	      >   | j         j        dk    r
|d|v r|S d S ||                                nd}| j         j        dk    r!|st          j        |||| j                  rd S |j        }|j        d         }t          |t          j
                  r|j        d         n||z   dz   }	|                     |||	|||j        d                   }
| j         j        dk    r@|>|j        j        d	v r0|s.t          j        |          j        }t          j        |
|          }
|
S )
Nflash_attention_2r   r   sdpa)rg  past_key_values_lengthis_trainingr    rk   )sequence_lengthtarget_lengthrB   r   rK   )r  xpunpu)r<   r`  get_seq_lengthr   _ignore_causal_mask_sdpar   rB   ro   rK  r2   r#  5_prepare_4d_causal_attention_mask_with_cache_positionrA   r  finfomin_unmask_unattended)r`   r   rv  r   r5  r8  past_seen_tokensrB   r|  r}  rn  	min_dtypes               r9   rl  zBambaModel._update_causal_mask  sy    ;+/BBB)c^.C.C%%4
 @O?Z?99;;;`a ;+v55>O5%>*'7 M	    t"&,Q/ .%,778N $$!O3a7 	 PP+')#)!, Q 
 
 K,66*%*.DDD% E E**.I0CKQZ[[Kr8   r|  r}  rB   rK   c                     | |                                  dk    r| }not          j        |          j        }t          j        ||f|||j                  }|dk    rt          j        |d          }|t          j        ||j                  |                    dd          k    z  }|ddddddf         	                    |ddd          }| |
                                }| j        d         }	| ddddddf         | ddddddf         k    dddd| dddf                             |          }
|ddddddd|	f         |
z   }|dk    }|ddddddd|	f                             ||          |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nr   )
fill_valuerB   rA   r    )diagonalrC   rk   r   )rm   r2   r  r  fullrA   triur   r   r   clonero   r   masked_fill)r   r|  r}  rB   r   rK   r!  rn  r  mask_lengthpadding_attention_maskpadding_masks               r9   r  z@BambaModel._prepare_4d_causal_attention_mask_with_cache_position  s   < %.*<*<*>*>!*C*C(KKE**.I* -0Ye\j\q  K !###jqAAA5<n>STTTWeWmWmnprsWtWtttK%dD!!!QQQ&67>>z1bRTUUK))//11,226*8D$9I*Jn]^]^]^`dfgfgfgim]mNn*nAAqqq?*++QQQ.*"U)) '  +111aaaL[L+@ADZZ+q05@AAAqqq,;,AV5W5c5c )6 6AAAqqq!!!\k\12 r8   c                 Z    |}|d         dk    s|t          j        |dk              rd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr    )r2   all)r`   r   r   ro  s       r9   rm  zBambaModel._update_mamba_mask$  s>     $
!q  ^%?EIn`aNaDbDb%?Jr8   )	NNNNNNNNN)r.   r/   r0   r!   rd   r   r   r   r2   r3   r#  r   rE  rC  r   r(   r   r"  rl  staticmethodr5   rB   r  rm  r$  r%  s   @r9   rW  rW  7  s       {      &  151537FJ59$(,0/359`
 `
E,-`
 !.`
 u/0	`

 ""BC`
   12`
 D>`
 $D>`
 'tn`
 !!12`
 23`
 
!`
 `
 `
 ^ `
D:: l: 	:
 ::  : : : :x 555 5 {	5
 5 5 5 5 \5n	 	 	 	 	 	 	r8   rW  c                   P    e Zd Z fdZ	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 dee         deej	                 d	eej                 d
ee
         dee
         dee
         deej                 deeej        f         defdZ	 	 	 	 	 	 ddZ xZS )BambaForCausalLMc                     t                                          |           |j        | _        |                                  d S )N)r   rd   z_loss_coefficientrd  )r`   r<   r   s     r9   rd   zBambaForCausalLM.__init__1  s>       "("; 	r8   Nr   rf  r   ru   r5  rg  labelsr9  r8  rh  r   logits_to_keepr;  c                 n   ||n| j         j        }|	|	n| j         j        }	 | j        d
||||||||	|
d	|}|j        }t          |t                    rt          | d          n|}|                     |dd|ddf                   }d}| | j	        d
||| j         j
        d|}| j        dk    ra|                    d                              |j                                      d                                          }|| j        |z  z   }t#          |||j        |j        |j        	          S )aJ  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BambaForCausalLM

        >>> model = BambaForCausalLM.from_pretrained("...")
        >>> tokenizer = AutoTokenizer.from_pretrained("...")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)	rf  r   ru   r5  rg  r9  r8  rh  r   )logitsr  r[  r   rk   rl   r   r?   )lossr  r5  r   rk  r7   )r<   r8  rh  rH  rj  rK  r5   slicelm_headloss_functionr[  r  	logsumexpr   rB   powmeanr   r5  r   rk  )r`   rf  r   ru   r5  rg  r  r9  r8  rh  r   r  r!  rA  r   slice_indicesr  r  z_losss                      r9   r"  zBambaForCausalLM.forward8  s   J 2C1N--TXT_Tq$8$D  $+Jj 	
 ,64: ,
)%+'/!5),
 ,
 ,
 ,
  18B>SV8W8Wk~ot444]kmAAA}aaa,?@AA%4%pVFt{OeppioppD&**))b)11444:4FFJJ1MMRRTTd5>>%#3!/)
 
 
 	
r8   Tc           	         |d u }	|	s]||d         |j         d         k    r|d d |j         d          d f         }nV|j         d         |j         d         k    r|d d |f         }n-t          | j        |j         d         | j        | j                  }|b|`|                                                    d          dz
  }|                    |dk    d           |	s|d d |j         d          d f         }||	rd|i}
nd|                                i}
|
	                    ||||| j        j
        |d           |                                D ]\  }}||
vr||
|<   |
S )Nrk   r    r   rC   rg  rf  )ru   r5  r9  r   r  r   )ro   r   r<   rB   rA   longr   masked_fill_r   updatenum_logits_to_keepitems)r`   rf  r5  r   rg  r   ru   r9  r!  empty_past_kvmodel_inputskeyvalues                r9   prepare_inputs_for_generationz.BambaForCausalLM.prepare_inputs_for_generation  s    (4/  	)!"%);;;%aaa.*>q*A)A)C)C&CD		#~';A'>>>%aaa&78	>Y_Q/DK  O %,*>)..0077;;a?L%%n&91===  F+AAA	0B/B/D/D,DE $$+];LL')=)=)?)?@L ,#2&"0"&+"@"0 		
 		
 		
 !,,.. 	* 	*JC,&&$)S!r8   )NNNNNNNNNNr   )NNNNNT)r.   r/   r0   rd   r   r2   r3   r#  r   rE  rC  r   r5   r   r"  r  r$  r%  s   @r9   r  r  0  sq            151537FJ59-1$(,0/35934K
 K
E,-K
 !.K
 u/0	K

 ""BCK
   12K
 )*K
 D>K
 $D>K
 'tnK
 !!12K
 c5</0K
 
 K
 K
 K
 K
` > > > > > > > >r8   r  )rW  r  rG  )Nr    )Hr1   typingr   r   r   r2   r   transformers.activationsr   (transformers.models.jamba.modeling_jambar   r	   (transformers.models.llama.modeling_llamar
   r   r   r   r   r   *transformers.models.mamba2.modeling_mamba2r   r   r   r   modeling_attn_mask_utilsr   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.deprecationr   utils.import_utilsr   r   configuration_bambar!   +mamba_ssm.ops.triton.selective_state_updater"   !mamba_ssm.ops.triton.ssd_combinedr#   r$   causal_conv1dr%   r&   r  r   
get_loggerr.   r   r(   rg   r~   r   r   r   Moduler   r'  r)  r+  rG  rW  r  __all__r7   r8   r9   <module>r     s  (   - - - - - - - - - -        + + + + + + q q q q q q q q                           ? > > > > > O O O O O O O O - - - - - - & & & & & &         
 1 0 0 0 0 0 V V V V V V V V , , , , , ,  "RRRRRRmmmmmmmmm! 8DDDDDDDDD-7**46FH\]^^  
	H	%	%    	    42u 2u 2u 2u 2u'G 2u 2u 2uj	 	 	 	 	/ 	 	 	
% % % %P	 	 	 	 	^ 	 	 		 	 	 	 	) 	 	 	  \_ \_ \_ \_ \_ \_ \_ \_~	 	 	 	 	x 	 	 		 	 	 	 	< 	 	 	^ ^ ^ ^ ^2 ^ ^ ^B % % % % %? % % %& u u u u u% u u upS S S S S' S S Sl E
D
Dr8   