
     `id                     p   d Z ddlZddlmZ ddlmZmZ ddlZddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlmZ  ej        e          Z e            rddlmZ ddlm Z m!Z! nd\  Z Z!Z e            r	ddl"m#Z#m$Z$ nd\  Z$Z# e%ee e!e#e$f          Z&dej'        de(fdZ)d Z*d Z+d Z, G d d          Z- G d dej        j.                  Z/ G d dej.                  Z0 G d  d!ej.                  Z1 G d" d#e          Z2e G d$ d%e                      Z3e ed&'           G d( d)e                                  Z4e ed*'           G d+ d,e                                  Z5e G d- d.e3                      Z6 ed/'           G d0 d1e3e                      Z7g d2Z8dS )3zPyTorch MAMBA2 model.    N)	dataclass)OptionalUnion)nn   )ACT2FN)GenerationMixin)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringlogging)is_causal_conv1d_availableis_mamba_2_ssm_available   )Mamba2Config)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combinedNNN)causal_conv1d_fncausal_conv1d_update)NNinput_tensorpad_sizec                     t          | j                  dk    r
ddddd|ddfnddd|ddf}t          j        j                            | |dd          S )z
    Padding x tensor with `pad_size` on the seq_len dim (dim=1)

    Assumes that we only have tensors of either size 4 or 3
       r   constant)modevalue)lenshapetorchr   
functionalpad)r   r   	pad_shapes      ~/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/mamba2/modeling_mamba2.pypad_tensor_by_sizer'   A   sj     47|7I3J3Ja3O3OAq!Q!Q//VWYZ\]_gijlmUnI8""<ST"UUU    c                 "   t          | |          } t          | j                  dk    r.|                     | j        d         d|| j        d                   S |                     | j        d         d|| j        d         | j        d                   S )z
    Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
    simultaneously splitting it into chunk sequences.

    Assumes that we only have tensors of either size 4 or 3
    r   r      )r'   r    r!   reshape)r   r   
chunk_sizes      r&   reshape_into_chunksr.   L   s     &lH==L
<!####L$6q$92z<K]^_K`aaa ##q!2z<3Ea3H,J\]^J_
 
 	
r(   c                    |                      d          } | d         j        g |                                  |R  } t          j        t          j        ||| j        t          j                  d          }|                     | d          } t          j        | d          }t          j        t          j        ||| j        t          j                  d          }|                    | t          j	                   }|S )zo
    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
    r*   .Ndevicedtype)diagonalr   dim)
sizeexpandr"   trilonesr2   boolmasked_fillcumsuminf)r   r-   masktensor_segsums       r&   segment_sumrB   `   s     ""2&&J 2<	*1S<3D3D3F3FS
SSSL:ejZ@S[`[efffqstttD++TE155LL2666M :ejZ@S[`[efffqrsssD!--teeiZ@@Mr(   c                     |N|j         d         dk    r=|j         d         dk    r,| j        }| |dddddf         z                      |          } | S )zm
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    Nr   r   )r!   r3   to)hidden_statesattention_maskr3   s      r&   apply_mask_to_padding_statesrG   t   si     !n&:1&=&A&AnFZ[\F]`aFaFa#&111d
)CCGGNNr(   c            
           e Zd ZdZej        dfdededej        de	e
         fdZ	 dd	ed
ej        dedej        fdZd	edej        fdZd ZdS )Mamba2Cachea  
    Arguments:
        config: Mamba2Config
        batch_size: int
        dtype: torch.dtype
        device: torch.device

    Attributes:
        dtype: (`torch.dtype`):
            The default `dtype` used to initializing the cache.
        conv_kernel_size: (`int`):
            Model's convolution kernel size taken from config.
        n_groups: (`int`):
            Model's number of groups taken from the config - similar to tensor parallel in Transformer.
        state_size: (`int`):
            Model's SSM state size taken from config.
        num_heads: (`int`):
            The number of heads used in the linear attention / SSM.
        head_dim: (`int`):
            The respective dimension of the heads used in the linear attention / SSM.
        intermediate_size: (`int`):
            Model's intermediate_size based on (expand * hidden_dim) from config.
        conv_states: (`torch.Tensor`):
            A tensor of shape `[num_layers, batch_size, conv_kernel_size, intermediate_size + 2 * n_groups * state_size]` that holds convolutional states.
        ssm_states: (`torch.Tensor`):
            A tensor of shape `[num_layers, batch_size, num_heads, head_dim, state_size]` that holds ssm states.
    Nconfig
batch_sizer3   r2   c           	         || _         |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        t          |j        |j	        z            | _
        t          j        |j        || j
        d| j        z  | j        z  z   | j        ||          | _        t          j        |j        || j        | j        | j        ||          | _        d S )Nr+   r1   )r3   conv_kernelconv_kernel_sizen_groups
state_size	num_headshead_dimintr9   hidden_sizeintermediate_sizer"   zerosnum_hidden_layersconv_states
ssm_states)selfrJ   rK   r3   r2   s        r&   __init__zMamba2Cache.__init__   s     
 & 2 +)!$V]V5G%G!H!H ;$"Q%6%HH!
 
 
  +$NMO
 
 
r(   F	layer_idxnew_conv_state
cache_initreturnc                 F   |r(|                     | j        j                  | j        |<   nk| j        |                             dd          | j        |<   |d d dd d f                              | j        j                  | j        |         d d d d df<   | j        |         S )Nr*   )shiftsdimsr   )rD   rX   r2   roll)rZ   r\   r]   r^   s       r&   update_conv_statezMamba2Cache.update_conv_state   s      	h*8*;*;D<L<S*T*TDY''*.*:9*E*J*JRT[]*J*^*^DY'4B111a74K4N4NtO_Of4g4gDY'111b1	**r(   new_ssm_statec                 j    |                     | j        j                  | j        |<   | j        |         S N)rD   rY   r2   )rZ   r\   re   s      r&   update_ssm_statezMamba2Cache.update_ssm_state   s.    %2%5%5do6L%M%M	"y))r(   c                 j    | j                                          | j                                         d S rg   )rX   zero_rY   rZ   s    r&   resetzMamba2Cache.reset   s1       r(   )F)__name__
__module____qualname____doc__r"   float16r   rS   r3   r   strr[   Tensorr<   rd   rh   rl    r(   r&   rI   rI      s         : KP-qu
 
"
03
<AK
aijman
 
 
 
< PU+ ++.3l+HL+	+ + + +*# *el * * * *         r(   rI   c                   (     e Zd Zd fd	ZddZ xZS )MambaRMSNormGatedư>c                     t                                                       t          j        t	          j        |                    | _        || _        d S rg   superr[   r   	Parameterr"   r;   weightvariance_epsilonrZ   rT   eps	__class__s      r&   r[   zMambaRMSNormGated.__init__   sB    l5:k#:#:;; #r(   Nc                    |j         }|                    t          j                  }|?|t          j                            |                    t          j                            z  }|                    d                              dd          }|t          j	        || j
        z             z  }| j        |                    |          z  S Nr+   r*   T)keepdim)r3   rD   r"   float32r   r#   silupowmeanrsqrtr}   r|   )rZ   rE   gateinput_dtypevariances        r&   forwardzMambaRMSNormGated.forward   s    #)%((77)BM,>,>twwu}?U?U,V,VVM $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::r(   rw   rg   rm   rn   ro   r[   r   __classcell__r   s   @r&   rv   rv      sQ        $ $ $ $ $ $
	; 	; 	; 	; 	; 	; 	; 	;r(   rv   c            
       D    e Zd ZdZdedef fdZ	 	 	 ddej        de	e
         de	ej                 d	e	ej                 fd
Z	 	 	 ddej        de	e
         de	ej                 d	e	ej                 fdZ	 	 	 dde	e
         de	ej                 d	e	ej                 fdZ xZS )Mamba2Mixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    rJ   r\   c           	         t                                                       |j        | _        |j        | _        |j        | _        |j        | _        t          |j	        | j        z            | _
        t          |j                  | _        || _        |j        | _        |j        | _        t           |j                 | _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        | j
        d| j        z  | j        z  z   | _        t7          j        | j        | j        |j        |j        | j        |j        dz
            | _        | j
        | j        z   | j        z   }t7          j        | j        ||j                  | _         t7          j!        tE          j#        | j                            | _$        tE          j%        d| j        dz             }t7          j!        tE          j&        |                    | _'        tQ          | j
        | j                  | _)        t7          j!        tE          j#        | j                            | _*        t7          j        | j
        | j        |j                  | _+        |j        | _        tX          stZ          .                    d           d S d S )Nr+   r   )in_channelsout_channelsbiaskernel_sizegroupspaddingr   r   a  The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d)/rz   r[   rQ   rT   rP   ssm_state_sizerM   rN   rS   r9   rU   time_step_rankr\   use_conv_bias
hidden_act
activationr   actlayer_norm_epsilonrms_normrO   rR   r-   time_step_limittime_step_mintime_step_maxconv_dimr   Conv1dconv1dLinearuse_biasin_projr{   r"   r;   dt_biasarangelogA_logrv   normDout_projis_fast_path_availableloggerwarning_once)rZ   rJ   r\   projection_sizeAr   s        r&   r[   zMamba2Mixer.__init__   st   )!-$/ & 2!$V]T5E%E!F!F!&"788"#1 +&+,"("; +%5#1#1.T]1BTEX1XXi%*=&*
 
 
 04=@4>Qy
 
 
 |EJt~$>$>?? LDNQ.//\%)A,,//
%d&<$BYZZZ	ej8899	$"8$:JQWQ`aaa% 	>    	 	r(   NrE   cache_paramscache_positionrF   c                 H   t          ||          }|                     |          }|j        \  }}}| j        | j        z  }	|j        d         d| j        z  z
  d| j        z  | j        z  z
  | j        z
  dz  }
|||d         dk    r|                    d                              |
|
| j        | j	        | j        gd          \  }}}}}t          ||j        | j                 | j        j                            d          | j        j        | j                  }t#          j        || j        |	|	gd          \  }}}t#          j        | j                                                   }|d d d df         d d d d d f                             d| j        | j                                      t"          j                  }|d d d d d f                             dd| j                  }| j        d d d df                             d| j                  }| j        d d d df                             d| j                  }|                    || j        |j        d         | j        z            }|                    || j        |j        d         | j        z            }|                    || j        | j                  }t9          |j        | j                 ||||||d |d	
  
        }|                    || j        | j        z            }|                     ||          }|                     |          d d d df         }n^t#          j        | j                                                   }| j         d
t)          d          fk    ri nd| j         i}| j!        r|tE          || j        j                            d          | j        j        | j        |f| j        | j#        d | j        | j        j        | j        j$        | j        j        | j        j        | j        | j        ddd|}nw|                    |
|
| j        | j	        | j        gd          \  }}}}}|h|%                    dd          }tL          j'        (                    ||j)        |j        d         z
  df          }|*                    | j        |d           | j        dvr[| +                    |                     |%                    dd                    dd |f         %                    dd                    }ngtY          |%                    dd          | j        j                            d          | j        j        | j                  %                    dd          }t          ||          }t#          j        || j        |	|	gd          \  }}}t[          |                    ||d| j                  |||                    ||| j        d          |                    ||| j        d          f| j#        | j        d d d| j        dd|\  }}|||.                    | j        |           |                    ||d          }|                     ||          }|                     |          }|S )Nr*   r+   r   r   r6   .r3   T)zr   dt_softplusg        r?   dt_limitF)r   r-   seq_idxr   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr\   r]   r^   )r   swish)xr|   r   r   )r-   r   r   r   r   r   r   r\   re   )/rG   r   r!   rO   r   rU   rQ   squeezesplitr   r   rX   r\   r   r|   r   r   r"   expr   floatr9   rR   rD   r   r   r   viewr   rY   r   r   r   trainingr   r-   r}   	transposer   r#   r$   rN   rd   r   r   r   rh   )rZ   rE   r   r   rF   projected_statesrK   seq_len_groups_time_state_sized_mlpr   hidden_states_B_CdtBCr   r   r   hidden_states_reshapedoutdt_limit_kwargshidden_states_B_C_transposedrX   scan_output	ssm_states                             r&   cuda_kernels_forwardz Mamba2Mixer.cuda_kernels_forward&  s    5]NSS<<66 "/!4
GQ!%1D!D"2&$(()$-$"556 n  #(B~VWGX[\G\G\0@0H0H0K0K0Q0Qt5t}dnU[] 1R 1 1-Aq$)2
 !5!(8"**1-- ! ! #(+!')?AWX# # #M1a 4:++--...A!!!T3,111d
+222t}dFYZZ]]didq]rrAAAAqqq$J&&r2t}==Bl111dC<077DMJJGqqq$|$++B>>Az4=!'!*2MNNAz4=!'!*2MNNA%2%7%7
DNTXTa%b%b"2'7&   M *..z4>DM;YZZM IImT::M --..qqq$|<CC 4:++--...A$($8S%,,<O$O$ObbV`bfbvUwO } T1!56$K&..q11K$L f# ##'9#3 $	 :#'=#7!%!3 M M%*(-# $ &% , 5E4J4JE4#94=$.Y_a 5K 5 511d-r  +3D3N3NqRS3T3T0"$-"3"34%69U9[\^9__abc# #K !22"&.Y] 3    ?*;;;(,$5$?$?1$E$EFFsHWH}U__`acdee) )%% )9+55a;;#{199!<<![-#'?	) ) )
  i1oo & %AARTb$c$c!&+k%+-CE[\' ' '#q! *C!&&z7BNNFF:wrBBFF:wrBB*  $f (, L $* * &* *&Y" (\-E 11DNZc1ddd)..z7BGG"iiT:: mmK00
r(   c                    2 |j         \  }}}|j        }t          ||          }                     |          }	|	j         d         d j        z  z
  d j        z   j        z  z
   j        z
  dz  }
|	                    |
|
 j         j	         j        gd          \  }}}}}|||d         dk    r|
                     j        |d           |j         j                                      j        j        j                  }t#          j        | j        j                            d          z  d          } j        r| j        j        z   }                     |          }n|h|                    dd          }t0          j                            ||j        |j         d         z
  df          }|
                     j        |d	                                                     |                    dd                    d
d |f                             dd                    }t          ||          }t#          j        | j         j         j        z   j         j        z  gd          \  }}}t#          j         j                                                   }|]|Z|d         dk    rM|j        j        }|d d dd d f         d d d d
f         }|                    dd                               ||j         d          j!                  } j"        d                               j"        j         d          j!                  }t"          j        j        #                    ||                    |j                  z             }t#          j$        | j%        d          j%        d                   }|d                               j         j!         j                                      t"          j&                  }t#          j        |d         |z                                |          }|'                    | j        d          d
d d d f         }|                     | j         j         j        z  |j         d                   (                                }|'                    |d|j         d                   }|d         |d
d d d f         z  }|'                    |d j!                  }||d         z                      |          }|)                     j        |j         j                 |z  |z              |'                    | j        d          d
d d d f         }|                     | j         j         j        z  |j         d                   (                                }|'                    |d|j         d                   }|j         j                                     |j        |j                  }|*                    | j        z   j!         j                  }|*                    | j        z   j        d          }t#          j+        ||          }|*                    | j         j!                  } j,        d                               j,        j         d          j!                  }|||z  z                       |j                  }|'                    |d          d d d d
f         }n0t0          j        #                    | j"        z             }t#          j$        | j%        d          j%        d                   }|'                    ||d j!                                                  }|'                    ||d j                                                  }|'                    ||d j                                                  }|-                     j         j        z  d j                  }|-                     j         j        z  d j                  } j.        | j.        z  z
   j.        z  2 j,        d         t_          |2          z  }||d         z  }|                    |j                  |z  }2 fd||||fD             \  }}}}|0                    dddd          }t#          j1        |d          }t#          j        te          |                    }|d d d d d d d d d d d f         |d d d d d d d d d d d f         z  } |                     d          }!|!d         |0                    ddddd          d         z  }"|"                    d          }#|#d         |d d d d d f         z                      d          }$t#          j        |d d d d d d dd f         |z
            }%||%0                    dddd          d         z  }&|&d
d d d f         |d         z                      d          }'|E|C|d         dk    r7|j         j                 d d d d
f                             |'j                  }(n t#          j3        |'d d d df                   }(t#          j4        |(|'gd          }'t#          j        te          t0          j                            |d d d d d d df         d                              })|)                    dd          })|)d         |'d d d d d d
f         z                      d          }*|*d d d df         |*d d df         }+}'t#          j        |          },|d
d d d f         |'d d d d d d
f         z  }-|,0                    dddd          }.|-                    d          |.d         z  }/|$|/z   }|'                    |d j         j!                  }||z   }2dk    r|d d d |d d d d f         }|'                    ||d          }|+||)                     j        |+            5                    ||          }0 6                    |0                    |                    }1|1S )Nr*   r+   r6   r   Fr   r2   r   T.r0   ).NNr   r   r1   )r7   output_sizec                 <    g | ]}t          |j                  S rt   )r.   r-   ).0tr   rZ   s     r&   
<listcomp>z-Mamba2Mixer.torch_forward.<locals>.<listcomp>H  s)    %z%z%z\]&9!Xt&W&W%z%z%zr(   r   r   r5   )r   r   )7r!   r3   rG   r   rU   rO   r   rQ   r   r   rd   r\   rX   rD   r   r|   r2   r"   sumr   r   r   r   r   r   r#   r$   rN   r   r   r   rY   r9   rR   r   softplusclampr   r   r,   
contiguousrh   r   bmmr   repeat_interleaver-   r'   permuter>   rB   
zeros_likecatr   r   )3rZ   rE   r   r   rF   rK   r   r   r3   r   r   r   r   r   rX   r   r   r   r   cache_devicer   dAdBdBxrY   ssm_states_reshaped
C_reshapedyr   
D_residualA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decaystatesprevious_statesdecay_chunk
new_statesr   state_decay_outC_times_statesstate_decay_out_permutedY_offr   contextualized_statesr   s3   `                                                 @r&   torch_forwardzMamba2Mixer.torch_forward  s    "/!4
GQ# 5]NSS<<66!'+a$2H.HH1t}K\_c_rKrrsw  tB  B  GH  H,<,B,Bt5t~V\^ -C -
 -
)1d%r
 #(B~VWGX[\G\G\**T^Terw*xxx '24>BEET[M_MfEggK %	dk088;;;! ! ! ! I$58H$H! $): ; ; '/@/J/J1a/P/P, m//0<3PSoSuvxSy3y{|2}  ..Xcpt.uuu $5F5P5PQRTU5V5V)W)WX[]e^e]eXe)f)p)pqrtu)v)v w w89JN[[#k#T]T5H%H$-Z^ZmJmn
 
 
q! Ytz''))***#(B~VWGX[\G\G\'29L AAAq!!!GQQQc\*Ba##**:rx|T]SSBl9-44T\5G5JDMZZG$--b7::bh3G3G.GHHBR!5a!8$:Nq:QRRB/"))$.$-I\]]``glgt`uuA)ByMA-..22,2GGB
 		*dmR88dAAAFAT]DNdm4SUVU\]_U`aallnnA		*b!'"+66AI3aaa<0B *11*b$-PPMi0044L4IIC )).*5dnEJSP *    		*dmR88dAAAFAT]DNdm4SUVU\]_U`aallnnA		*b!'"+66A &0@CC18[\[bCccJ",//*t~2Mt}^b^q"r"r
T^ ;T=PRSTTJ	-z::Az4>4=AAA y!((a$-HHA]Q&&**1733A 		*b))!!!T3,7AA ''T\(9::BR!5a!8$:Nq:QRRB)11*gr4=YY__aaM		*gr43FGGMMOOA		*gr43FGGMMOOA##DNdm$CX\Xf#ggA##DNdm$CX\Xf#ggA'DO*CCtVH	*-?x-X-XXJ *ByM9M]())B.A &{%z%z%z%zboqrtuwxay%z%z%z"M1a 		!Q1%%A|A2...H 	+a..))A qqq!!!QQQaaa23a111dAAAqqq!!!8K6LLN""r"**A y\AIIaAq!,D,DY,OON""r"**A 	l]111aaa:%>>CCCJJF !9XaaaAAArssl%;h%FGGL,..q"b!<<YGGGc4l+mI.FFKKPQKRRF 'N,F>Z[K\_`K`K`"."9$."I!!!TSV,"W"Z"Zbhbo"Z"p"p"'"26!!!RaR%="A"AY8a@@@F)K0A0A(111aaaQRQRQRTV;BWY_0`0`$a$abbK%//155K%o6111dC9PPUUZ[U\\J *111crc6 2Jqqq"u4EIF $i11OT111oqqq!!!T30GGN'6'>'>q!Q'J'J$#''++.Fy.QQE A		*b$.$-HHAJA!||aaa'111aaa'(		*gr22A $)A--V_-```ii4((
 !%knnU.C.C D D$$r(   c                     t           r0d| j        j        j        j        v r|                     ||||          S |                     ||||          S )Ncuda)r   r   r|   r2   typer   r  )rZ   rE   r   r   rF   s        r&   r   zMamba2Mixer.forward  sW     " 	jf0C0J0O&O&O,,]L.Zhiii!!-~~^^^r(   r   )rm   rn   ro   rp   r   rS   r[   r"   rs   r   rI   
LongTensorr   r  r   r   r   s   @r&   r   r      s        >| > > > > > > >F /35915` `|` {+` !!12	`
 !.` ` ` `L -126/3B% B%|B% {+B%   01	B%
 !.B% B% B% B%P /35915	_ 	_ {+	_ !!12		_
 !.	_ 	_ 	_ 	_ 	_ 	_ 	_ 	_r(   r   c                   &     e Zd Zd fd	Zd Z xZS )Mamba2RMSNormrw   c                     t                                                       t          j        t	          j        |                    | _        || _        dS )zM
        Mamba2RMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
        Nry   r~   s      r&   r[   zMamba2RMSNorm.__init__  sD     	l5:k#:#:;; #r(   c                    |j         }|                    t          j                  }|                    d                              dd          }|t          j        || j        z             z  }| j        |                    |          z  S r   )	r3   rD   r"   r   r   r   r   r}   r|   )rZ   rE   r   r   s       r&   r   zMamba2RMSNorm.forward  s|    #)%((77 $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::r(   r   r   r   s   @r&   r  r    sL        $ $ $ $ $ $; ; ; ; ; ; ;r(   r  c                   r     e Zd Z fdZ	 	 	 ddee         deej                 deej                 fdZ	 xZ
S )Mamba2Blockc                     t                                                       || _        || _        |j        | _        t          |j        |j                  | _        t          ||          | _
        d S )Nr   r\   )rz   r[   rJ   r\   residual_in_fp32r  rT   r   r   r   mixer)rZ   rJ   r\   r   s      r&   r[   zMamba2Block.__init__  sd    " & 7!&"4&:STTT	 9===


r(   Nr   r   rF   c                    |}|                      |                    | j         j        j                            }| j        r|                    t
          j                  }|                     ||||          }||z   }|S )Nr   r   r   rF   )r   rD   r|   r3   r  r"   r   r  )rZ   rE   r   r   rF   residuals         r&   r   zMamba2Block.forward  s     !		-"2"29I9O"2"P"PQQ  	2{{5=11H

^dr # 
 
 !=0r(   r   )rm   rn   ro   r[   r   rI   r"   r  rs   r   r   r   s   @r&   r  r    s        > > > > > /35915  {+ !!12	
 !.       r(   r  c                   2    e Zd ZU eed<   dZdgZdZdZd Z	dS )Mamba2PreTrainedModelrJ   backboner  Tc                 Z   | j         j        }t          |t                    rlt	          j        d| j         j        dz             }|j                            t	          j	        |                     |j
        j                            d           t	          j        t	          j        | j         j                  t          j	        | j         j                  t          j	        | j         j                  z
  z  t          j	        | j         j                  z                                 | j         j                  }|t	          j	        t	          j        |                      z   }|j                            |           d|j        _        t.          j                            |j        j        t          j        d                     |j        j        Dt=          |j        j        dd	          s)t.          j                            |j        j                   t.          j                            |j         j        t          j        d                     | j         j!        r-|j         j        }|t          j        | j         j"                  z  }t          |t.          j#                  rt=          |j        dd	          s&t.          j        $                    |j        |
           |j        <t=          |j        dd	          s(t.          j                            |j                   dS dS dS t          |tJ          tL          f          r!|j        j                            d           dS t          |t.          j'                  r(t.          j        $                    |j        |
           dS dS )zInitialize the weights.r   g      ?)minT   )aN
_no_reinitF)std)(rJ   initializer_range
isinstancer   r"   r   rQ   r   copy_r   r   datafill_r   randmathr   r   r   time_step_floorexpm1r   r'  r   initkaiming_uniform_r   r|   sqrtr   getattrzeros_r   rescale_prenorm_residualrW   r   normal_r  rv   	Embedding)rZ   moduler(  r   r   inv_dtps          r&   _init_weightsz#Mamba2PreTrainedModel._init_weights  s   k+fk** $	> Q 5 9::ALuy||,,,HM$$$
4;0118DK566$+B[9\9\\^(4;4556  e3e44	  %)U["%5%5$5666FN  ((((,FN%G$$V]%9TYq\\$JJJ}!-v}1<GG 7GNN6=#5666G$$V_%;ty||$LLL{3 > O*TYt{<===fbi(( 		46=,>> 83777{&v{L%@@ 0GNN6;///// '&0 00A BCC 	4M$$S)))))-- 	4GOOFMsO33333	4 	4r(   N)
rm   rn   ro   r   __annotations__base_model_prefix_no_split_modulessupports_gradient_checkpointing_is_statefulr=  rt   r(   r&   r!  r!    sJ         "&&*#L24 24 24 24 24r(   r!  z-
    Class for the MAMBA2 model outputs.
    )custom_introc                       e Zd ZU dZdZeej                 ed<   dZ	ee
         ed<   dZeeej                          ed<   dS )Mamba2Outputa:  
    cache_params (`Mamba2Cache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlast_hidden_stater   rE   )rm   rn   ro   rp   rF  r   r"   FloatTensorr>  r   rI   rE   tuplert   r(   r&   rE  rE    si           6:x 12999*.L(;'...8<M8E%"345<<<<<r(   rE  zK
    Base class for causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
ee         ed<   dZeeej                          ed<   dS )Mamba2CausalLMOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    cache_params (`Mamba2Cache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlosslogitsr   rE   )rm   rn   ro   rp   rK  r   r"   rG  r>  rL  r   rI   rE   rH  rt   r(   r&   rJ  rJ    s         
 
 )-D(5$
%,,,*.FHU&'...*.L(;'...8<M8E%"345<<<<<r(   rJ  c                       e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 	 ddee	j
                 dee	j
                 dee         d	ee         d
ee         dee         dee	j
                 dee	j                 deeef         fd            Z xZS )Mamba2Modelc                    t                                                     t          j        j        j                  | _        t          j        fdt          j	                  D                       | _
        d| _        t          j        j                  | _        |                     | j                   |                                  d S )Nc                 2    g | ]}t          |           S )r  )r  )r   idxrJ   s     r&   r   z(Mamba2Model.__init__.<locals>.<listcomp>9  s&    $s$s$sC[3%G%G%G$s$s$sr(   Fr   )rz   r[   r   r9  
vocab_sizerT   
embeddings
ModuleListrangerW   layersgradient_checkpointingr  r   norm_f"_register_load_state_dict_pre_hook	load_hook	post_initrZ   rJ   r   s    `r&   r[   zMamba2Model.__init__5  s       ,v'8&:LMMm$s$s$s$sSXY_YqSrSr$s$s$stt&+##F$6F<UVVV//???r(   c                 v    |D ]5}d|v r/|                     |          ||                    dd          <    d S 6d S )Nz
embedding.zembeddings.)popreplace)rZ   
state_dictprefixargsks        r&   rZ  zMamba2Model.load_hookA  sW     	 	Aq  EO^^TUEVEV
199\=AAB !	 	r(   c                     | j         S rg   rS  rk   s    r&   get_input_embeddingsz Mamba2Model.get_input_embeddingsG  s
    r(   c                     || _         d S rg   re  rZ   new_embeddingss     r&   set_input_embeddingsz Mamba2Model.set_input_embeddingsJ  s    (r(   N	input_idsinputs_embedsr   	use_cacheoutput_hidden_statesreturn_dictr   rF   r_   c	                    ||n| j         j        }||n| j        s| j         j        nd}||n| j         j        }|du |duz  rt          d          ||                     |          }| j        r| j        r|rd}|rp|\t          | j         |	                    d          |j
        |j                  }t          j        d| j         j        |j
                  }n|t          d          nd}|}
|rdnd}| j        D ]} ||
|||	          }
|r||
fz   }|                     |
          }
|r||
fz   }|st#          d
 |
||fD                       S t%          |
|r|nd|          S )a  
        cache_params (`Mamba2Cache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        cache_position (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            The position of the current input in the cache. This is used to ensure that the cache is correctly updated.
            If `cache_params` is passed, `cache_position` should also be passed.
        NFz:You must specify exactly one of input_ids or inputs_embedsr   r1   r   zYou have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will be initialized for you automaticallyrt   r  c              3      K   | ]}||V  	d S rg   rt   )r   vs     r&   	<genexpr>z&Mamba2Model.forward.<locals>.<genexpr>  s(      ffqXYXeXeXeXeXeffr(   )rF  r   rE   )rJ   rn  r   rm  use_return_dict
ValueErrorrS  rW  rI   r8   r2   r3   r"   r   rM   rV  rX  rH  rE  )rZ   rk  rl  r   rm  rn  ro  r   rF   kwargsrE   all_hidden_statesmixer_blocks                r&   r   zMamba2Model.forwardM  s#   0 %9$D  $+Jj 	 "+!6IIZ^Zg=rT[=R=Rmr	%0%<kk$+B]-t";< 	[YZZZ  OOI66M& 	4= 	Y 	I 	 #*K!3!3A!6!6}?S[h[n      "'a1HQ^Qe!f!f!f' !;  	 (  L%"6@BBD; 		I 		IK'K)--	  M $ I$58H$H!M22 	E 1]4D D 	gff]LBS$Tffffff+)2<+
 
 
 	
r(   )NNNNNNNN)rm   rn   ro   r[   rZ  rf  rj  r   r   r"   r  rI   r<   rs   r   rH  rE  r   r   r   s   @r&   rN  rN  3  sH       
 
 
 
 
    ) ) )  1548.2$(/3&*5915P
 P
E,-P
   01P
 {+	P

 D>P
 'tnP
 d^P
 !!12P
 !.P
 
ul"	#P
 P
 P
 ^P
 P
 P
 P
 P
r(   rN  z
    The MAMBA2 Model transformer with a language modeling head on top (linear layer with weights not tied to the input
    embeddings).
    c                       e Zd Zg Z fdZd Zd Z	 	 	 	 	 ddee         dee	j
                 dee	j                 fdZe	 	 	 	 	 	 	 	 	 dd	ee	j
                 d
ee	j                 dee         dee	j
                 dee         dee         dee         dee	j                 dee	j                 deeef         fd            Z xZS )Mamba2ForCausalLMc                     t                                          |           t          |          | _        t	          j        |j        |j        d          | _        | 	                                 d S )NFr   )
rz   r[   rN  r"  r   r   rT   rR  lm_headr[  r\  s     r&   r[   zMamba2ForCausalLM.__init__  s^       #F++y!3V5FUSSSr(   c                 4    | j                                         S rg   )r"  rf  rk   s    r&   rf  z&Mamba2ForCausalLM.get_input_embeddings  s    }11333r(   c                 6    | j                             |          S rg   )r"  rj  rh  s     r&   rj  z&Mamba2ForCausalLM.set_input_embeddings  s    }11.AAAr(   Nr   r   rF   c                 R   d|                                 i}|r|t          j        d| j        j        j        |j                  }|d|i}|                    d          }	n|                    d          }	t          | j        j        |	| j        | j	                  }|rB|d         dk    r6|d d df         
                    d                                           |d<   d }|s|d|i}|                    ||||d           |                                D ]\  }
}|
|vr|||
<   |S )Nrk  r   r   rl  r1   r*   )r   rm  r   rF   )r   r"   r   r"  rJ   rM   r2   r8   rI   r3   	unsqueezeupdateitems)rZ   rk  rl  rm  r   r   rF   rv  model_inputsmax_batch_sizekeyr   s               r&   prepare_inputs_for_generationz/Mamba2ForCausalLM.prepare_inputs_for_generation  sp    $Y%9%9%;%;< 	s-
 #\!T]-A-MV_VfgggN( /?!.!3!3A!6!6!*!2!2&t}';^TXT_gkgqrrrL 	"*Q..(1!!!R%(8(B(B2(F(F(Q(Q(S(SL%!N 	<]6+];L ,&"0"0	 	
 	
 	
 !,,.. 	* 	*JC,&&$)S!r(   rk  rl  labelsrn  ro  rm  r_   c
           
         ||n| j         j        }|                     ||||||||	          }|d         }|                     |                    | j        j        j                                                            }d}| | j        d||| j         j	        d|
}|s|f|dd         z   }||f|z   n|S t          |||j        |j                  S )ao  
        cache_params (`Mamba2Cache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        cache_position (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            The position of the current input in the cache. This is used to ensure that the cache is correctly updated.
            If `cache_params` is passed, `cache_position` should also be passed.
        N)r   rl  rn  ro  rm  r   rF   r   )rL  r  rR  r   )rK  rL  r   rE   rt   )rJ   rt  r"  r|  rD   r|   r3   r   loss_functionrR  rJ  r   rE   )rZ   rk  rl  r   r  rn  ro  rm  r   rF   rv  mamba2_outputsrE   rL  rK  outputs                   r&   r   zMamba2ForCausalLM.forward  s   8 &1%<kk$+B]%'!5#)) ' 	
 	
 'q)m..t|/B/HIIJJPPRR%4%pVFt{OeppioppD 	FY!33F)-)9TGf$$vE#'4(6	
 
 
 	
r(   )NNNNN)	NNNNNNNNN)rm   rn   ro   _tied_weights_keysr[   rf  rj  r   rI   r"   r  rs   r  r   rG  r<   r   rH  rJ  r   r   r   s   @r&   rz  rz    s            4 4 4B B B .25915. .
 {+. !!12. !.. . . .`  1559.2-1/3&*$(15158
 8
E,-8
   128
 {+	8

 )*8
 'tn8
 d^8
 D>8
 !.8
 !.8
 
u**	+8
 8
 8
 ^8
 8
 8
 8
 8
r(   rz  )rz  rN  r!  )9rp   r/  dataclassesr   typingr   r   r"   r   activationsr   
generationr	   modeling_layersr
   modeling_utilsr   utilsr   r   r   utils.import_utilsr   r   configuration_mamba2r   
get_loggerrm   r   +mamba_ssm.ops.triton.selective_state_updater   !mamba_ssm.ops.triton.ssd_combinedr   r   causal_conv1dr   r   allr   rs   rS   r'   r.   rB   rG   rI   Modulerv   r   r  r  r!  rE  rJ  rN  rz  __all__rt   r(   r&   <module>r     s      ! ! ! ! ! ! " " " " " " " "        ! ! ! ! ! ! ) ) ) ) ) ) 9 9 9 9 9 9 - - - - - -         
 W V V V V V V V . . . . . . 
	H	%	%  kRRRRRRmmmmmmmmmZjW?AW 8DDDDDDDDD-7**!(  VU\ VS V V V V
 
 
(  (  J  J  J  J  J  J  J  J Z; ; ; ; ; ; ; ;$y_ y_ y_ y_ y_") y_ y_ y_x; ; ; ; ;BI ; ; ;"    ,   8 94 94 94 94 94O 94 94 94x   = = = = =; = =  =   = = = = =; = =  =& j
 j
 j
 j
 j
' j
 j
 j
Z   y
 y
 y
 y
 y
- y
 y
 y
x H
G
Gr(   