
     `iÜ                        d Z ddlZddlmZ ddlmZmZmZ ddlZddlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZmZmZmZ ddlm Z   ej!        e"          Z# e            rddl$m%Z% ndZ% e            rddl&m'Z'm(Z( ddl)m*Z* nd\  Z*Z(Z'da+d Z, G d d          Z- G d de	j.                  Z/ G d de	j.                  Z0 G d de          Z1e G d de                      Z2e ed !           G d" d#e                                  Z3e ed$!           G d% d&e                                  Z4e G d' d(e2                      Z5 ed)!           G d* d+e2e                      Z6g d,Z7dS )-zPyTorch MAMBA model.    N)	dataclass)AnyOptionalUnion)nn)CrossEntropyLoss   )ACT2FN)PretrainedConfig)GenerationMixin)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringlogging)is_causal_conv1d_availableis_kernels_availableis_mamba_ssm_availableis_mambapy_available   )MambaConfig)pscan)mamba_inner_fnselective_scan_fn)selective_state_updateNNNc                      t           t           S t                      r ddlm}   | d          }|j        |j        fa nt                      rddlm}m} ||fa nda t           S )Nr   )
get_kernelzkernels-community/causal-conv1d)causal_conv1d_fncausal_conv1d_update)NN)_causal_conv1d_cacher   kernelsr   r    r   r   causal_conv1d)r   _causal_conv1d_kernelr   r    s       |/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/mamba/modeling_mamba.py_lazy_load_causal_conv1dr&   <   s    '## 
,&&&&&& *
+L M M 5 JLaLrs	#	%	% ,HHHHHHHH 46FG+    c                       e Zd ZdZdZej        dfdededej	        de
ej        edf         fdZd	ed
ej        dej        dej        fdZd	edej        fdZd ZdS )
MambaCachea.  
    Cache for mamba model which does not have attention mechanism and key value states.

    Arguments:
        config (`PretrainedConfig):
            The configuration file defining the shape-related attributes required to initialize the static cache.
        max_batch_size (`int`):
            The maximum batch size with which the model will be used. Note that a new instance must be instantiated if
            a smaller batch size is used.
        dtype (`torch.dtype`, *optional*, defaults to `torch.float16`):
            The default `dtype` to use when initializing the layer.
        device (`torch.device` or `str`, *optional*):
            The device on which the cache should be initialized. Should be the same as the layer.

    Example:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MambaForCausalLM, MambaCache

        >>> model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")

        >>> inputs = tokenizer(text="My name is Mamba", return_tensors="pt")

        >>> # Prepare a cache class and pass it to model's forward
        >>> cache_params = MambaCache(config=model.config, max_batch_size=1, device=model.device, dtype=model.dtype)
        >>> cache_position = torch.arange(len(inputs["input_ids"][0]), device=model.device)  # sequence length
        >>> outputs = model(**inputs, cache_params=cache_params, cache_position=cache_position, use_cache=True)
        >>> outputs.cache_params
        ```
    TNconfigmax_batch_sizedtypedevicec                 |   || _         || _        |j        | _        |j        | _        |j        | _        g | _        g | _        |t          j
        |          nd }t          |j                  D ]}t          j        | j         | j        | j        || j                  }t          j        | j         | j        | j        || j                  }t          j                            |           t          j                            |           | j                            |           | j                            |           d S )Nr-   r,   )r+   _dtypeintermediate_size
state_sizessm_state_sizeconv_kernelconv_kernel_sizeconv_states
ssm_statestorchr-   rangenum_hidden_layerszeros_dynamomark_static_addressappend)selfr*   r+   r,   r-   _
conv_state	ssm_states           r%   __init__zMambaCache.__init__t   s@    -!'!9$/ & 2/1.0)/);f%%%v/00 	. 	.A',{#&%k( ( (J ',k#&#k' ' 'I M--j999M--i888##J///O""9----'	. 	.r'   	layer_idxnew_conv_statecache_positionreturnc                    | j         |         j        |j        k    r-| j         |                             |j                  | j         |<   | j         |         }|                    d| j        dz
            }|                    dd          }|                    |j        |j                  |d d d d |f<   | j         |                                          | j         |xx         |z  cc<   | j         |         S )Nr   r   )shiftsdimsr/   )r6   r-   toclampr5   rollr,   zero_)r?   rD   rE   rF   rA   s        r%   update_conv_statezMambaCache.update_conv_state   s    
 I&-1FFF*.*:9*E*H*HI^*_*_DY'%i0
'--a1F1JKK__BR_88
+9+<+<JDU]g]m+<+n+n
111aaa'(#))+++###z1###	**r'   new_ssm_statec                     | j         |                                          | j         |xx         |                    | j         |         j                  z  cc<   | j         |         S N)r7   rO   rL   r-   )r?   rD   rQ   s      r%   update_ssm_statezMambaCache.update_ssm_state   s^    	"((***	"""m&6&6ty7Q7X&Y&YY"""y))r'   c                     t          t          | j                            D ]@}| j        |                                          | j        |                                          Ad S rS   )r9   lenr6   rO   r7   )r?   rD   s     r%   resetzMambaCache.reset   sd    s4#34455 	/ 	/IY'--///OI&,,....	/ 	/r'   )__name__
__module____qualname____doc__is_compileabler8   float16r   intr,   r   r-   strrC   Tensor
LongTensorrP   rT   rW    r'   r%   r)   r)   O   s         B N #]15#. #. #. #. {	#.
 elC-.#. #. #. #.J++.3l+LQL\+	+ + + +"*# *el * * * *
/ / / / /r'   r)   c            
       6    e Zd ZdZdedef fdZd Z	 	 	 ddej	        de
e         d	e
ej                 d
e
ej                 fdZdde
e         d	e
ej                 d
e
ej                 fdZ	 	 	 dde
e         d	e
ej                 d
e
ej                 fdZ xZS )
MambaMixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    r*   rD   c           	         t                                                       || _        |j        | _        |j        | _        |j        | _        |j        | _        t          |j
                  | _
        || _        |j        | _        t          j        | j        | j        |j        |j        | j        |j        dz
            | _        |j        | _        t$          |j                 | _        |j        | _        t          j        | j        | j        dz  |j                  | _        t          j        | j        | j
        | j        dz  z   d          | _        t          j        | j
        | j        d          | _        t5          j        d| j        dz   t4          j                  d d d f         }|                    | j        d                                          }t          j        t5          j         |                    | _!        t          j        t5          j"        | j                            | _#        t          j        | j        | j        |j                  | _$        |j        | _        | %                                 d S )	Nr   )in_channelsout_channelsbiaskernel_sizegroupspadding   rh   FTr,   rI   )&superrC   r*   hidden_sizer2   r3   r4   r5   r1   r^   time_step_rankrD   use_conv_biasr   Conv1dconv1d
hidden_act
activationr
   actuse_mambapyLinearuse_biasin_projx_projdt_projr8   arangefloat32expand
contiguous	ParameterlogA_logonesDout_projwarn_slow_implementation)r?   r*   rD   A	__class__s       r%   rC   zMambaMixer.__init__   s   !-$/ & 2!'!9!&"788"#1i./%*)&*
 
 
 !+&+,!- y!143IA3MTZTcdddi 68KdNadeNe8elqrrry!4d6LSWXXX LD/!35=III$PQPQPQ'RHHT+R00;;==\%)A,,//
ej)?@@AA	$"8$:JQWQ`aaa%%'''''r'   c                 *   t                      \  }}t          t          t          ||t          f          }|s\| j        r9t                      rt                              d           d S t          d          t                              d           d S d S )Na  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the mamba.py backend. To install follow https://github.com/state-spaces/mamba/#installation for mamba-ssm and install the kernels library using `pip install kernels` or https://github.com/Dao-AILab/causal-conv1d for causal-conv1dzuse_mambapy is set to True but the mambapy package is not installed. To install it follow https://github.com/alxndrTL/mamba.py.a  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the sequential implementation of Mamba, as use_mambapy is set to False. To install follow https://github.com/state-spaces/mamba/#installation for mamba-ssm and install the kernels library using `pip install kernels` or https://github.com/Dao-AILab/causal-conv1d for causal-conv1d. For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py.)
r&   allr   r   r   rx   r   loggerwarning_onceImportError)r?   r    r   is_fast_path_availables       r%   r   z#MambaMixer.warn_slow_implementation   s    1I1K1K..!$#%68HJ^`no"
 "
 & 	 ')) 	''S     & Z   ##W    	 	r'   Nhidden_statescache_paramsrF   attention_maskc                 $	   |                      |                              dd          }| j        r|t          || j        j        | j        r| j        j        nd | j        j        | j	        j        | j
        j        | j        r| j
        j                                        nd t          j        | j                                                   d d | j                                        | j	        j                                        d          }nt#                      \  }}|                    dd          \  }}	|||                    d          z  }| j        j                            | j        j                            d          | j        j                            d                    }
|c|d         dk    rW ||                    d          |j        | j                 |
| j        j        | j                  }|                    d          }nq|Qt4          j                            || j        |j        d         z
  df          }|                    | j        ||            |||
| j        j        | j                  }|||                    d          z  }|                     |                    dd                    }t          j         || j!        | j"        | j"        gd          \  }}}| j	        j        |                    dd          z  }t          j        | j                                                   }tG          | j	        d	          r| j	        j                                        nd }|t|d         dk    rhtI          |j%        | j                 |d
         |d
         ||d d df         |d d df         | j        |	d
         |d
  
                            d          }nztM          ||||                    dd          |                    dd          | j                                        |	|dd
  
        \  }}|||'                    | j        |           | 
                    |                    dd                    }|S )Nr   rl   T)
delta_biasdelta_softplusdimr   rI   )rv   rh   ).r   )dt_softplus)r   return_last_state)(r{   	transposetrainingr   rt   weightrr   rh   r|   r}   r   rz   floatr8   expr   r   r&   chunk	unsqueezeviewsizesqueezer6   rD   rv   r   
functionalpadr5   shaperP   splitrq   r3   hasattrr   r7   r   rT   )r?   r   r   rF   r   projected_statescontextualized_statesr    r   gateconv_weightsr6   ssm_parameters	time_stepBCdiscrete_time_stepr   time_proj_biasscan_outputsrB   s                        r%   cuda_kernels_forwardzMambaMixer.cuda_kernels_forward  s     <<66@@AFF= Y	P\1$2 "$($6@  D"#$.2mE"((***4:++--...<,2244#% % %!!" 6N5O5O2 "2"2"8"8"8"B"BM4) -0H0H0K0K K  ;-224;3E3J3J13M3Mt{OaOfOfghOiOijjL'N1,=,A,A 4 4!))"-- ,T^< K$O! ! !. 7 7 ; ;+"$-"3"3%(=@STV@W(WYZ'[# #K !224>;P^___ 0 0!<1Ado! ! ! ) -0H0H0K0K K "[[)@)@A)F)FGGN#k!4d6I4K^ _eg  OIq! "&!4y7J7J1a7P7P!P4:++--...A:A$,PV:W:WaT\.44666]aN'N1,=,A,A5 +DN;!&)&v.aaadGaaadGFL" $      )B--  +<!&KK1%%KK1%%FLLNN"#'&*+ + +'i (\-E 11$.)LLL %)MM,2H2HA2N2N$O$O!$$r'   c           	         |j         \  }}}|j        }|                     |                              dd          }	|	                    dd          \  }
}||
|                    d          z  }
||j        | j                                                 }|	                    |
j
                  }|j         d         | j        k    rt          j                            |
| j        |
j         d         z
  df          }|                    | j        ||           |                     |                     |
          dd |f                   }
n|                    | j        |
|          }|	                    | j        j        j
                  }t'          j        || j        j        d d dd d f         z  d          }
| j        r|
| j        j        z  }
|                     |
          	                    |                              d          }
n[t'          j        || j        | j        f|
j
        |          }|                     |                     |
          dd |f                   }
||
|                    d          z  }
|                     |
                    dd                    }t'          j        || j        | j        | j        gd          \  }}}|                     |          }t          j                            |                              dd          }t'          j        | j         !                                           }t'          j        |d d d d d d f         |d d d d d d d f         z            }|d d d d d d d f         |d d d d d d d f         !                                z  }||
d d d d d d d f         !                                z  }| j"        r| j#        r|tI          |                    dd          |                    dd                    }||                    d          z  %                    d                              dd          }||
| j&        d d d d f         z  z   }||                     |          z  }ng }tO          |          D ]}|d d d d |d d f         |z  |d d d d |d d f         z   }t'          j(        |	                    |          |d d |d d f                             d                    }|)                    |d d d d df                    t'          j*        |d          }||
| j&        d d d d f         z  z   }||                     |          z  }|%|j        | j                 +                    |           | ,                    |                    dd                    }|S )	Nr   rl   r   r   rI   .r/   r	   )-r   r,   r{   r   r   r   r7   rD   clonerL   r-   r5   r   r   r   rP   rw   rt   r   r8   sumrr   rh   r;   r1   r3   r|   r   rq   r}   softplusr   r   r   rx   r   r   r   r   r9   matmulr>   stackcopy_r   )r?   input_statesr   rF   r   
batch_sizeseq_lenr@   r,   r   r   r   rB   rA   r   r   r   r   r   r   
discrete_A
discrete_BdeltaB_uhsscan_outputr   ir   s                               r%   slow_forwardzMambaMixer.slow_forwardh  s   !-!3
GQ"<<55??1EE.44QA4>>t%)N,D,DQ,G,GGM #$/?EEGGI!]%9::I #A&$*???]..!*]-@-DDaH 

 ..t~z>ZZZ $])C)CC'M)R S S);;DNM[ijj
']]4;+=+DEE
 %	*t{7I!!!QPQPQPQ'7R*RXZ [ [ [% 6!T[%55M $ 7 7 : :5 A A K KB O OT3T5HI$+5  I !HHT[[%?%?XgX%NOOM%)N,D,DQ,G,GGM ]%<%<Q%B%BCC+T0$2EtGZ[ac
 
 
	1a "\\)44]334FGGQQRSUVWW Ytz''))***Yqqqq$!125G111aaaQU5VVWW
'111aaa6111dAAAqqq=9I9O9O9Q9QQ
aaaAAAtm < B B D DD  	I 	I,2Fz++Aq1183E3Ea3K3KLLBB/88;;EEaKKK%tQQQ}8M(MMK%6KKL7^^ : :&qqq!!!Qz2Y>!!!QQQPQSTSTST*AUU	#l9<<+>+>!!!Q'
@T@TUW@X@XYY##K111a$89999+l;;;K%aaa9N)NOK&$7K''7==iHHH !%k.C.CAq.I.I J J$$r'   c                 :   t                      \  }}t          t          t          ||t          f          }|rNd| j        j        j        j        v r6t          j
                                        s|                     ||||          S |                     ||||          S )Ncuda)r&   r   r   r   r   r|   r   r-   typer8   r<   is_compilingr   r   )r?   r   r   rF   r   r    r   r   s           r%   forwardzMambaMixer.forward  s     2J1K1K..!$#%68HJ^`no"
 "
 " 	jf0B0I0N&N&NW\WdWqWqWsWs&N,,]L.Zhiii  nn]]]r'   r   )rX   rY   rZ   r[   r   r^   rC   r   r8   r`   r   r)   ra   r   r   r   __classcell__r   s   @r%   rd   rd      s        )({ )(s )( )( )( )( )( )(V  6 .25959d% d%|d% z*d% !!12	d%
 !!12d% d% d% d%NO% O%x
7K O%aijojza{ O%  S[  \a  \l  Sm O% O% O% O%j .25959^ ^ z*^ !!12	^
 !!12^ ^ ^ ^ ^ ^ ^ ^r'   rd   c                   ,     e Zd Zd fd	Zd Zd Z xZS )MambaRMSNormư>c                     t                                                       t          j        t	          j        |                    | _        || _        dS )zL
        MambaRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
        N)ro   rC   r   r   r8   r   r   variance_epsilon)r?   rp   epsr   s      r%   rC   zMambaRMSNorm.__init__  sD     	l5:k#:#:;; #r'   c                    |j         }|                    t          j                  }|                    d                              dd          }|t          j        || j        z             z  }| j        |                    |          z  S )Nrl   rI   T)keepdim)	r,   rL   r8   r   powmeanrsqrtr   r   )r?   r   input_dtypevariances       r%   r   zMambaRMSNorm.forward  s|    #)%((77 $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::r'   c                 :    | j         j        d          d| j         S )Nr   z, eps=)r   r   r   r?   s    r%   
extra_reprzMambaRMSNorm.extra_repr  s#    +#A&EEd.CEEEr'   )r   )rX   rY   rZ   rC   r   r   r   r   s   @r%   r   r     sb        $ $ $ $ $ $; ; ;F F F F F F Fr'   r   c                   r     e Zd Z fdZ	 	 	 ddee         deej                 deej                 fdZ xZ	S )
MambaBlockc                     t                                                       || _        || _        |j        | _        t          |j        |j                  | _        t          ||          | _
        d S )Nr   rD   )ro   rC   r*   rD   residual_in_fp32r   rp   layer_norm_epsilonnormrd   mixer)r?   r*   rD   r   s      r%   rC   zMambaBlock.__init__  sd    " & 7 !39RSSS	)<<<


r'   Nr   rF   r   c                    |}|                      |                    | j         j        j                            }| j        r|                    t
          j                  }|                     ||||          }||z   }|S )Nrn   r   rF   r   )r   rL   r   r,   r   r8   r   r   )r?   r   r   rF   r   residuals         r%   r   zMambaBlock.forward  s     !		-"2"29I9O"2"P"PQQ  	2{{5=11H

^dr # 
 
 !=0r'   r   )
rX   rY   rZ   rC   r   r)   r8   ra   r   r   r   s   @r%   r   r     s        = = = = = .25959  z* !!12	
 !!12       r'   r   c                   4    e Zd ZU eed<   dZddgZdZdZd Z	dS )MambaPreTrainedModelr*   backboner   rd   Tc                 	   | j         j        }t          |t                    rGt	          j        d|j        dz   t          j                  dddf         }|                    |j	        d          
                                }|j                            t	          j        |                     |j        j                            d           | j         j        dz  | j         j        z  }| j         j        dk    r+t(          j                            |j        j        |           n<| j         j        dk    r,t(          j                            |j        j        | |           t	          j        t	          j        | j         j	                  t9          j        | j         j                  t9          j        | j         j                  z
  z  t9          j        | j         j                  z                                 | j         j         	          }|t	          j        t	          j!        |                      z   }|j        j"                            |           d
|j        j"        _#        t(          j        $                    |j%        j        t9          j&        d                     |j%        j"        DtO          |j%        j"        dd          s)t(          j        (                    |j%        j"                   t(          j        $                    |j)        j        t9          j&        d                     | j         j*        r-|j)        j        }|t9          j&        | j         j+                  z  }t          |t(          j,                  rtO          |j        dd          s&t(          j        -                    |j        |           |j"        <tO          |j"        dd          s(t(          j        (                    |j"                   dS dS dS t          |t\                    r!|j        j                            d           dS t          |t(          j/                  r(t(          j        -                    |j        |           dS dS )zInitialize the weights.r   rn   NrI   g      ?g      constantrandom)minT   )a
_no_reinitF)std)0r*   initializer_range
isinstancerd   r8   r~   r3   r   r   r1   r   r   r   r   r   datafill_rq   time_step_scaletime_step_init_schemer   init	constant_r}   r   uniform_r   randmathtime_step_maxtime_step_minrM   time_step_floorexpm1rh   r   kaiming_uniform_rt   sqrtgetattrzeros_r   rescale_prenorm_residualr:   ry   normal_r   	Embedding)r?   moduler   r   dt_init_stddtinv_dtps           r%   _init_weightsz"MambaPreTrainedModel._init_weights  s   k+fj)) *	> Q 5 9OOOPTVWVWVWPWXA1266AACCALuy||,,,HM$$$+4d:T[=XXK{0J>>!!&."7EEEE2h>>  !6kRRR
4;8998DK566$+B[9\9\\^(4;4556  e3e44	  %)U["%5%5$5666FN%%f----1FN*G$$V]%9TYq\\$JJJ}!-v}1<GG 7GNN6=#5666G$$V_%;ty||$LLL{3 > O*TYt{<===fbi(( 		46=,>> 83777{&v{L%@@ 0GNN6;///// '&0 0-- 	4M$$S)))))-- 	4GOOFMsO33333	4 	4r'   N)
rX   rY   rZ   r   __annotations__base_model_prefix_no_split_modulessupports_gradient_checkpointing_is_statefulr  rb   r'   r%   r   r     sM         "%|4&*#L84 84 84 84 84r'   r   z,
    Class for the MAMBA model outputs.
    )custom_introc                       e Zd ZU dZdZeej                 ed<   dZ	ee
         ed<   dZeeej                          ed<   dS )MambaOutputa9  
    cache_params (`MambaCache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlast_hidden_stater   r   )rX   rY   rZ   r[   r  r   r8   FloatTensorr  r   r)   r   tuplerb   r'   r%   r  r  =  si           6:x 12999)-L(:&---8<M8E%"345<<<<<r'   r  zK
    Base class for causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
ee         ed<   dZeeej                          ed<   dS )MambaCausalLMOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    cache_params (`MambaCache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlosslogitsr   r   )rX   rY   rZ   r[   r  r   r8   r  r  r  r   r)   r   r  rb   r'   r%   r  r  Q  s         
 
 )-D(5$
%,,,*.FHU&'...)-L(:&---8<M8E%"345<<<<<r'   r  c                       e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 	 ddee	j
                 dee	j
                 dee         d	ee         d
ee         dee         dee	j
                 dee	j
                 deeef         fd            Z xZS )
MambaModelc                    t                                                     t          j        j        j                  | _        t          j        fdt          j	                  D                       | _
        d| _        t          j        j                  | _        |                     | j                   |                                  d S )Nc                 2    g | ]}t          |           S )r   )r   ).0idxr*   s     r%   
<listcomp>z'MambaModel.__init__.<locals>.<listcomp>p  s&    $r$r$r3Z#%F%F%F$r$r$rr'   Fr   )ro   rC   r   r
  
vocab_sizerp   
embeddings
ModuleListr9   r:   layersgradient_checkpointingr   r   norm_f"_register_load_state_dict_pre_hook	load_hook	post_initr?   r*   r   s    `r%   rC   zMambaModel.__init__l  s       ,v'8&:LMMm$r$r$r$rRWX^XpRqRq$r$r$rss&+#"6#56;TUUU//???r'   c                 v    |D ]5}d|v r/|                     |          ||                    dd          <    d S 6d S )Nz
embedding.zembeddings.)popreplace)r?   
state_dictprefixargsks        r%   r.  zMambaModel.load_hookx  sW     	 	Aq  EO^^TUEVEV
199\=AAB !	 	r'   c                     | j         S rS   r(  r   s    r%   get_input_embeddingszMambaModel.get_input_embeddings~  s
    r'   c                     || _         d S rS   r9  r?   new_embeddingss     r%   set_input_embeddingszMambaModel.set_input_embeddings  s    (r'   N	input_idsinputs_embedsr   	use_cacheoutput_hidden_statesreturn_dictrF   r   rG   c	                    ||n| j         j        }||n| j        s| j         j        nd}||n| j         j        }|du |duz  rt          d          ||                     |          }| j        r| j        r|rd}|rp|\t          | j         |	                    d          |j
        |j                  }t          j        d| j         j        |j
                  }n|t          d          nd}|}	|rdnd}
| j        D ]} ||	|||	          }	|r|
|	fz   }
|                     |	          }	|r|
|	fz   }
|st#          d
 |	||
fD                       S t%          |	|r|nd|
          S )a  
        cache_params (`MambaCache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        NFz:You must specify exactly one of input_ids or inputs_embedsr   r/   r-   zYou have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will be initialized for you automaticallyrb   r   c              3      K   | ]}||V  	d S rS   rb   )r$  vs     r%   	<genexpr>z%MambaModel.forward.<locals>.<genexpr>  s(      ffqXYXeXeXeXeXeffr'   )r  r   r   )r*   rB  r   rA  use_return_dict
ValueErrorr(  r+  r)   r   r-   r,   r8   r~   r4   r*  r,  r  r  )r?   r?  r@  r   rA  rB  rC  rF   r   r   all_hidden_statesmixer_blocks               r%   r   zMambaModel.forward  s#   ( %9$D  $+Jj 	 "+!6IIZ^Zg=rT[=R=Rmr	%0%<kk$+B]-t";< 	[YZZZ  OOI66M& 	4= 	Y 	I 	 #)K!3!3A!6!6}?S[h[n      "'a1HQ^Qe!f!f!f' !;  	 (  L%"6@BBD; 		I 		IK'K)--	  M $ I$58H$H!M22 	E 1]4D D 	gff]LBS$Tffffff+)2<+
 
 
 	
r'   )NNNNNNNN)rX   rY   rZ   rC   r.  r:  r>  r   r   r8   ra   r)   boolr   r  r  r   r   r   s   @r%   r!  r!  j  sI       
 
 
 
 
    ) ) )  1548-1$(/3&*5959L
 L
E,-L
   01L
 z*	L

 D>L
 'tnL
 d^L
 !!12L
 !!12L
 
uk!	"L
 L
 L
 ^L
 L
 L
 L
 L
r'   r!  z
    The MAMBA Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                       e Zd ZdgZ fdZd Zd Z	 ddedee	e
f         ded	ee	e
f         fd
Z	 	 	 	 	 ddee         deej                 deej                 fdZe	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 dee         deej                 dee         dee         dee         deej                 d	eeef         fd            Z xZS )MambaForCausalLMzlm_head.weightc                     t                                          |           t          |          | _        t	          j        |j        |j        d          | _        | 	                                 d S )NFrm   )
ro   rC   r!  r   r   ry   rp   r'  lm_headr/  r0  s     r%   rC   zMambaForCausalLM.__init__  s^       "6**y!3V5FUSSSr'   c                 4    | j                                         S rS   )r   r:  r   s    r%   r:  z%MambaForCausalLM.get_input_embeddings  s    }11333r'   c                 6    | j                             |          S rS   )r   r>  r<  s     r%   r>  z%MambaForCausalLM.set_input_embeddings  s    }11.AAAr'   r   outputsmodel_kwargsnum_new_tokensrG   c                 6   |                     dd           |d<   |                     dd          r"d|v r|d         |d         dd          |z   |d<   d|v rC|d         }t          j        ||                    |j        d         df          gd	          |d<   |S )
Nr   rA  TrF   rI   r   r   r   r   )getr8   catnew_onesr   )r?   rT  rU  rV  kwargsr   s         r%   #_update_model_kwargs_for_generationz4MambaForCausalLM._update_model_kwargs_for_generation  s     (/{{>4'H'H^$[$//	b L00-.:-9:J-KBCC-PSa-aL)*|++)*:;N-2Y!8!8.:Nq:QST9U!V!VW]_. . .L)* r'   Nr   rF   r   c                 R   d|                                 i}|r|t          j        d| j        j        j        |j                  }|d|i}|                    d          }	n|                    d          }	t          | j        j        |	| j        | j	                  }|rB|d         dk    r6|d d df         
                    d                                           |d<   d }|s|d|i}|                    ||||d           |                                D ]\  }
}|
|vr|||
<   |S )Nr?  r   rE  r@  r/   rI   )r   rA  rF   r   )r   r8   r~   r   r*   r4   r-   r   r)   r,   r   updateitems)r?   r?  r@  rA  r   rF   r   r[  model_inputsr+   keyvalues               r%   prepare_inputs_for_generationz.MambaForCausalLM.prepare_inputs_for_generation  sp    $Y%9%9%;%;< 	r-
 #\!T]-A-MV_VfgggN( /?!.!3!3A!6!6!*!2!2%dm&:NSWS^fjfpqqqL 	"*Q..(1!!!R%(8(B(B2(F(F(Q(Q(S(SL%!N 	<]6+];L ,&"0"0	 	
 	
 	
 !,,.. 	* 	*JC,&&$)S!r'   r?  r@  labelsrB  rC  rA  c
           
         ||n| j         j        }|                     |||||||	|          }|d         }|                     |                    | j        j        j                                                            }d}||                    |j                  }|dddddf         	                                }|dddf         	                                }t                      } ||                    d|                    d                    |                    d                    }|s|f|dd         z   }||f|z   n|S t          |||j        |j                  S )aS  
        cache_params (`MambaCache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        N)r   r@  rB  rC  rA  rF   r   r   .rI   r   )r  r  r   r   )r*   rI  r   rQ  rL   r   r,   r   r-   r   r   r   r   r  r   r   )r?   r?  r   r@  r   rd  rB  rC  rA  rF   r[  mamba_outputsr   r  r  shift_logitsshift_labelsloss_fctoutputs                      r%   r   zMambaForCausalLM.forward-  s   2 &1%<kk$+B]%'!5#)) & 	
 	
 &a(m..t|/B/HIIJJPPRRYYv}--F!#ssAAA+.99;;L!#qrr'?5577L'))H8L--b,2C2CB2G2GHH,J[J[\^J_J_``D 	FYqrr!22F)-)9TGf$$vE"&3'5	
 
 
 	
r'   )r   )NNNNN)	NNNNNNNNN)rX   rY   rZ   _tied_weights_keysrC   r:  r>  r   dictr_   r   r^   r\  r   r)   r8   ra   rc  r   r  rM  r`   r   r  r  r   r   r   s   @r%   rO  rO    s        ++    4 4 4B B B YZ "26sCx.RU	c3h   , -15959. .
 z*. !!12. !!12. . . .`  155959-1-1/3&*$(15<
 <
E,-<
 !!12<
   12	<

 z*<
 )*<
 'tn<
 d^<
 D><
 !.<
 
u))	*<
 <
 <
 ^<
 <
 <
 <
 <
r'   rO  )rO  r!  r   r)   )8r[   r   dataclassesr   typingr   r   r   r8   r   torch.nnr   activationsr
   configuration_utilsr   
generationr   modeling_layersr   modeling_utilsr   utilsr   r   r   utils.import_utilsr   r   r   r   configuration_mambar   
get_loggerrX   r   mambapy.pscanr   &mamba_ssm.ops.selective_scan_interfacer   r   +mamba_ssm.ops.triton.selective_state_updater   r!   r&   r)   Modulerd   r   r   r   r  r  r!  rO  __all__rb   r'   r%   <module>r~     sS      ! ! ! ! ! ! ' ' ' ' ' ' ' ' ' '        % % % % % % ! ! ! ! ! ! 3 3 3 3 3 3 ) ) ) ) ) ) 9 9 9 9 9 9 - - - - - -         
            - , , , , , 
	H	%	% #######E QXXXXXXXXRRRRRRR@P=-~      &d/ d/ d/ d/ d/ d/ d/ d/NQ^ Q^ Q^ Q^ Q^ Q^ Q^ Q^hF F F F F29 F F F(    +   8 ?4 ?4 ?4 ?4 ?4? ?4 ?4 ?4D   
= = = = =+ = =  =   
= = = = =+ = =  =& f
 f
 f
 f
 f
% f
 f
 f
R   P
 P
 P
 P
 P
+_ P
 P
 P
f S
R
Rr'   