
     `i                    v   d Z ddlZddlmZmZmZ ddlZddlmc m	Z
 ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)  e            rddlm*Z*  e'            rddl+m,Z,m-Z- ddl.m/Z/ nd\  Z/Z-Z, e&            r	ddl0m1Z1m2Z2 nd\  Z2Z1 e3e/e-e1e2e,f          Z4 e"j5        e6          Z7	 	 	 dAdeej8        e9ej8                 df         dee:         deej8                 deej8        e:f         fdZ; G d d ej<                  Z=d!ej8        d"e:dej8        fd#Z> G d$ d%          Z? G d& d'ej<                  Z@ G d( d)e@          ZA G d* d+e@          ZBe@eAeBd,ZC G d- d.ej<                  ZD G d/ d0ej<                  ZE G d1 d2ej<                  ZF G d3 d4e          ZG G d5 d6e          ZHe  G d7 d8e                      ZIeGeHd9ZJe  G d: d;eI                      ZK G d< d=eIe          ZL G d> d?eeI          ZMg d@ZNdS )BzPyTorch Jamba model.    N)AnyOptionalUnion)nn   )ACT2FN)GenerationMixin)AttentionMaskConverter)!flash_attn_supports_top_left_maskis_flash_attn_available) GenericForSequenceClassificationGradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)is_causal_conv1d_availableis_mamba_ssm_available   )JambaConfig)_flash_attention_forward)mamba_inner_fnselective_scan_fn)selective_state_update)NNN)causal_conv1d_fncausal_conv1d_updateNN   router_logitsnum_expertsattention_maskreturnc                    | t          | t                    sdS t          | t                    r/| d         j        t          j        fd| D             d          }t          j        j                            |d          }t          j        ||d          \  }}t          j        j        	                    ||          }|@t          j
        |                                d          }	t          j
        |d          }
nD|j        \  }}|j        d         ||z  z  }|dddddddf                             |||||f                              d||                                        }t          j        |                                |z  d          t          j        |d          z  }	|ddddddf                             ||||j        d         f                              d|j        d                                                 }t          j        ||z  d          t          j        |d          z  }
|j        j        |j        j        nd}|j        d         t%          |          z  }t          j        |	dd|||j        d         z   f         |
                    d          z            }||z  S )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        router_logits:
            Logits from the `router`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   c                 :    g | ]}|                               S  )to).0layer_routercompute_devices     |/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/jamba/modeling_jamba.py
<listcomp>z,load_balancing_loss_func.<locals>.<listcomp>i   s%    OOO\__^,,OOO    dimr   )
isinstancetupledevicetorchcatr   
functionalsoftmaxtopkone_hotmeanfloatshapeexpandreshaper+   sumindexint	unsqueeze)r$   r%   top_kr&   concatenated_router_logitsrouting_weights_selected_expertsexpert_masktokens_per_expertrouter_prob_per_expert
batch_sizesequence_lengthnum_hidden_layersexpert_attention_mask router_per_expert_attention_maskdevice_indexrankoverall_lossr.   s                      @r/   load_balancing_loss_funcrW   F   s   : J}e$D$Dq-'' 
&q)0%*YOOOOOOOUV&
 &
 &
" h)112LRT1UUO*_eDDDA(%--.>LLK!J{'8'8':':BBB "'O!C!C!C&4&:#
O6<Q?JQ`D`a 4AAAtT12V&
OUKXYYWR,,R	 	 "Ik&7&7&9&9<Q&QWXYYY\a\e!q]
 ]
 ]
 
 4AAAt+,V&
O_EZ[\E]^__WR.q122R	 	) "'?=]+]cd!e!e!ehmhq,!i
 i
 i
 "
 4C3I3O3[?)//abL #c,&7&77D9!!!TD?+@+C$CCCDG]GgGghiGjGjj L +%%r1   c                   ,     e Zd Zd fd	Zd Zd Z xZS )JambaRMSNormư>c                     t                                                       t          j        t	          j        |                    | _        || _        dS )z;
        JambaRMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parameterr8   onesweightvariance_epsilon)selfhidden_sizeeps	__class__s      r/   r]   zJambaRMSNorm.__init__   sD     	l5:k#:#:;; #r1   c                    |j         }|                    t          j                  }|                    d                              dd          }|t          j        || j        z             z  }| j        |                    |          z  S )Nr#   r4   T)keepdim)	dtyper+   r8   float32powr>   rsqrtra   r`   )rb   hidden_statesinput_dtypevariances       r/   forwardzJambaRMSNorm.forward   s|    #)%((77 $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::r1   c                 H    t          | j        j                   d| j         S )Nz, eps=)r6   r`   r@   ra   )rb   s    r/   
extra_reprzJambaRMSNorm.extra_repr   s&    )**II$2GIIIr1   )rZ   )__name__
__module____qualname__r]   ro   rq   __classcell__re   s   @r/   rY   rY      sb        $ $ $ $ $ $; ; ;J J J J J J Jr1   rY   rl   n_repc                     | j         \  }}}}|dk    r| S | dddddddddf                             |||||          } |                     |||z  ||          S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r@   rA   rB   )rl   rw   batchnum_key_value_headsslenhead_dims         r/   	repeat_kvr}      s    
 2?1D.Ehzz!!!!QQQaaa"23::5BUW\^bdlmmM  (;e(CT8TTTr1   c                       e Zd ZdZdZej        dfdZ	 ddej        dej        de	de
eeef                  d	eej        ej        f         f
d
Zdej        fdZdde
e	         d	e	fdZdS ) HybridMambaAttentionDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    FNc           
         || _         |j        | _        d| _        |j        |j        z  }|j        }|j        }g | _        g | _        g | _	        t          |j                  D ]}| j        |         dk    rQ| xj        t          j        |||          gz  c_        | xj        t          j        |||          gz  c_        d| xj        t          j        g gz            gz  c_        | xj        t          j        g gz            gz  c_        | j	                            |           шfdt          |j                  D             | _        fdt          |j                  D             | _        d S )NFmambar7   rh   r7   c                 D    g | ]}t          j        g gz             S r   r8   tensorr,   rJ   rO   r7   s     r/   r0   z=HybridMambaAttentionDynamicCache.__init__.<locals>.<listcomp>   s/    rrrQ%,tj'8HHHrrrr1   c                 D    g | ]}t          j        g gz             S r   r   r   s     r/   r0   z=HybridMambaAttentionDynamicCache.__init__.<locals>.<listcomp>   s/    tttqEL"
):6JJJtttr1   )rh   layers_block_typehas_previous_statemamba_expandrc   mamba_d_statemamba_d_convconv_states
ssm_statestransformer_layersrangerQ   r8   zerosr   append	key_cachevalue_cache)	rb   configrO   rh   r7   intermediate_sizessm_state_sizeconv_kernel_sizeis	     ` `    r/   r]   z)HybridMambaAttentionDynamicCache.__init__   s   
!'!9"'"/&2DD-!."$v/00 	2 	2A%a(G33  K
,=?OX^fklll%    K
,=~V\dijjj$    U\2$2CF%S%S%S$TT  EL"
1B6$R$R$R#SS'..q1111rrrrrRWX^XpRqRqrrrtttttTYZ`ZrTsTstttr1   
key_statesvalue_states	layer_idxcache_kwargsr'   c                 D   | j         |         j        d         dk    r|| j         |<   || j        |<   nVt          j        | j         |         |gd          | j         |<   t          j        | j        |         |gd          | j        |<   | j         |         | j        |         fS )Nr4   r   r#   r2   )r   r@   r   r8   r9   )rb   r   r   r   r   s        r/   updatez'HybridMambaAttentionDynamicCache.update   s     >)$*2.!33(2DN9%*6DY''(-	4>)3Lj2Y_`(a(a(aDN9%*/)T5Ei5PR^4_ef*g*g*gDY'~i($*:9*EEEr1   beam_idxc                    t          t          | j                            D ];}| j        |         j        }| j        |                             d|                    |                    | j        |<   | j        |         j        }| j        |                             d|                    |                    | j        |<   | j        |         j        }| j        |                             d|                    |                    | j        |<   | j        |         j        }| j        |                             d|                    |                    | j        |<   =dS )zDReorders the cache for beam search, given the selected beam indices.r   N)	r   lenr   r7   index_selectr+   r   r   r   )rb   r   r   r7   s       r/   reorder_cachez.HybridMambaAttentionDynamicCache.reorder_cache   sE   s4>2233 		i 		iI^I.5F(,y(A(N(NqRZR]R]^dReRe(f(fDN9%%i07F*.*:9*E*R*RSTV^VaVabhViVi*j*jDY'%i07F*.*:9*E*R*RSTV^VaVabhViVi*j*jDY'_Y/6F)-)C)P)PQRT\T_T_`fTgTg)h)hDOI&&		i 		ir1   r   c                     || j         vr| j         d         n|}t          | j                  |k    rdS | j        |         j        d         S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   )r   r   r   r@   )rb   r   s     r/   get_seq_lengthz/HybridMambaAttentionDynamicCache.get_seq_length  sS     3<4CZ2Z2ZD+A..`i	t~)++1~i(.r22r1   N)r   )rr   rs   rt   __doc__is_compileabler8   float16r]   TensorrE   r   dictstrr   r6   r   
LongTensorr   r   r*   r1   r/   r   r      s	         N16t u u u u> 26F FLF lF 	F
 tCH~.F 
u|U\)	*F F F F"ie&6 i i i i3 3 3c 3 3 3 3 3 3r1   r   c                   D    e Zd ZdZddedee         f fdZ eddd	          	 	 	 	 	 	 dde	j
        dee	j
                 dee	j                 dee         dededee	j                 dee	j
        ee	j
                 eee	j
                          f         fd            Z xZS )JambaAttentionz
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".
    Nr   r   c                 V   t                                                       || _        || _        |(t                              d| j        j         d           |j        | _        |j	        | _
        | j        | j
        z  | _        |j        | _        | j
        | j        z  | _        d| _        |j        | _        | j        | j
        z  | j        k    r t!          d| j         d| j
         d          t#          j        | j        | j
        | j        z  d          | _        t#          j        | j        | j        | j        z  d          | _        t#          j        | j        | j        | j        z  d          | _        t#          j        | j
        | j        z  | j        d          | _        d S )	NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.Tz?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).Fbias)r\   r]   r   r   loggerwarning_oncere   rr   rc   num_attention_heads	num_headsr|   rz   num_key_value_groups	is_causalattention_dropout
ValueErrorr   Linearq_projk_projv_projo_proj)rb   r   r   re   s      r/   r]   zJambaAttention.__init__  s   ",!8 , , ,   "-3(DN:#)#= $(Nd6N$N!!'!9MDN*t/???8RVRb 8 8%)^8 8 8   i 0$.4=2PW\]]]i 0$2JT]2Zafgggi 0$2JT]2Zafgggi >@PW\]]]r1   past_key_valuepast_key_values4.58new_nameversionFrl   r&   position_idsoutput_attentions	use_cachecache_positionr'   c                    |                                 \  }}	}
|                     |          }|                     |          }|                     |          }|                    ||	| j        | j                                      dd          }|                    ||	| j        | j                                      dd          }|                    ||	| j        | j                                      dd          }||	                    ||| j
                  \  }}t          || j                  }t          || j                  }t          j        ||                    dd                    t          j        | j                  z  }|$|d d d d d d d |j        d         f         }||z   }t$          j                            |dt          j                                      |j                  }t$          j                            || j        | j                  }t          j        ||          }|                                 || j        |	| j        fk    r5t7          d|| j        |	| j        f d	|                                            |                    dd                                          }|                    ||	| j                  }|                     |          }|sd }|||fS )
Nr   r#   r   r   r4   r3   rh   )ptrainingz `attn_output` should be of size z	, but is ) sizer   r   r   viewr   r|   	transposerz   r   r   r}   r   r8   matmulmathsqrtr@   r   r:   r;   ri   r+   rh   dropoutr   r   r   
contiguousrB   rc   r   )rb   rl   r&   r   r   r   r   r   bszq_lenrJ   query_statesr   r   attn_weightscausal_maskattn_outputs                    r/   ro   zJambaAttention.forward5  s    &**,,UA{{=11[[//
{{=11#((eT^T]SS]]^_abcc__S%1I4=YYccdeghii
#((eT5Mt}]]gghiklmm&'6'='=j,X\Xf'g'g$J z4+DEE
 t/HII|L*2F2Fq!2L2LMMPTPYZ^ZgPhPhh%(AAAqqq2HJ4DR4H2H)HIK'+5L },,\r,WWZZ[g[mnn},,\T=S^b^k,lll<>>#t~udm!LLL)CPTP]3^ ) )$$&&) )  
 "++Aq11<<>>!))#ud6FGGkk+..  	 LL/99r1   r   NNNFFN)rr   rs   rt   r   r   r   rE   r]   r   r8   r   r   r   boolr6   ro   ru   rv   s   @r/   r   r     sC        
^ ^{ ^x} ^ ^ ^ ^ ^ ^: _%0A6RRR 2637FJ"'594: 4:|4: !.4: u/0	4:
 ""BC4:  4: 4: !!124: 
u|Xel3XeEL>Q5RR	S4: 4: 4: SR4: 4: 4: 4: 4:r1   r   c                        e Zd ZdZ fdZ	 	 	 	 	 	 ddej        deej                 deej                 dee	         d	e
d
e
deej                 fdZ xZS )JambaFlashAttention2aF  
    Jamba flash attention module. This module inherits from `JambaAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    c                 `     t                      j        |i | t                      | _        d S r   )r\   r]   r   _flash_attn_uses_top_left_mask)rb   argskwargsre   s      r/   r]   zJambaFlashAttention2.__init__u  s6    $)&)))
 /P.Q.Q+++r1   NFrl   r&   r   r   r   r   r   c                    |                                 \  }	}
}|                     |          }|                     |          }|                     |          }|                    |	|
| j        | j                  }|                    |	|
| j        | j                                      dd          }|                    |	|
| j        | j                                      dd          }||	                    ||| j
                  \  }}t          || j                  }t          || j                  }| j        sdn| j        }|j        }|j        j        dk    r|j        j        nd}|t$          j        k    rt%          j                    r=t+          t$          d          rt%          j        |          nt%          j                    }n3t+          | j        d          r| j        j        }n| j        j        j        }t6                              d| d	           |                    |          }|                    |          }|                    |          }|                    dd          }|                    dd          }t=          |||||
|t?          | j        d
d           | j         | j!        	  	        }|"                    |	|
| j#                  $                                }| %                    |          }|sd }|||fS )Nr   r#           mpscpuget_autocast_dtype_pre_quantization_dtypezThe input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in .sliding_window)r   r   r   use_top_left_mask)&r   r   r   r   r   r   r|   rz   r   r   r   r}   r   r   r   rh   r7   typer8   ri   is_autocast_enabledhasattrr   get_autocast_gpu_dtyper   r   r`   r   r   r+   r   getattrr   r   rB   rc   r   r   )rb   rl   r&   r   r   r   r   r   r   r   r   rJ   r   r   r   dropout_raterm   device_typetarget_dtyper   r   s                        r/   ro   zJambaFlashAttention2.forward}  s    &**,,UA{{=11[[//
{{=11
 $((eT^T]SS__S%1I4=YYccdeghii
#((eT5Mt}]]gghiklmm&'6'='=j,X\Xf'g'g$J z4+DEE
 t/HII"&-KssT5K
 #(2>2E2Je2S2Sl)..Y^%-''(** 
8 u&:;;8E,[999577  &?@@ 8#{B#{17$ $ $ $   (??<88L#|44J'??<88L  ))!Q//
#--a33. "4;0@$GGn"A

 

 

 "))#ud6FGGRRTTkk+..  	 LL/99r1   r   )rr   rs   rt   r   r]   r8   r   r   r   r   r   ro   ru   rv   s   @r/   r   r   n  s         R R R R R 2637FJ"'59R: R:|R: !.R: u/0	R:
 ""BCR:  R: R: !!12R: R: R: R: R: R: R: R:r1   r   c                   &    e Zd ZdZ eddd          	 	 	 	 	 	 ddej        d	eej                 d
eej                 dee	         de
de
deej                 deej        eej                 eeej                          f         f fd            Z xZS )JambaSdpaAttentionz
    Jamba attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `JambaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.
    r   r   r   r   NFrl   r&   r   r   r   r   r'   c                 B   |rAt                               d           t                                          ||||||          S |                                \  }}	}
|                     |          }|                     |          }|                     |          }|                    ||	| j	        | j
                                      dd          }|                    ||	| j        | j
                                      dd          }|                    ||	| j        | j
                                      dd          }||                    ||| j                  \  }}t          || j                  }t          || j                  }|}||d d d d d d d |j        d         f         }|j        j        dk    r>|<|                                }|                                }|                                }| j        o	|d u o|	dk    }t,          j        j                            ||||| j        r| j        nd|          }|                    dd                                          }|                    ||	| j                  }|                     |          }|d |fS )	Na  JambaModel is using JambaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.)rl   r&   r   r   r   r   r   r#   r   cudar   )	attn_mask	dropout_pr   )r   r   r\   ro   r   r   r   r   r   r   r|   r   rz   r   r   r}   r   r@   r7   r   r   r   r8   r   r:   scaled_dot_product_attentionr   r   rc   r   )rb   rl   r&   r   r   r   r   r   r   r   rJ   r   r   r   r   r   r   re   s                    r/   ro   zJambaSdpaAttention.forward  s     	[   77??+-) /"3# #    &**,,UA{{=11[[//
{{=11#((eT^T]SS]]^_abcc__S%1I4=YYccdeghii
#((eT5Mt}]]gghiklmm&'6'='=j,X\Xf'g'g$Jz4+DEE
 t/HII$%%aaaAAA/E1A"1E/E&EFK #v--.2L'2244L#..00J'2244L
 NH{d':Huqy	h)FF!04Fd,,3 G 
 
 "++Aq11<<>>!&&sE43CDDkk+..D/11r1   r   )rr   rs   rt   r   r   r8   r   r   r   r   r   r6   ro   ru   rv   s   @r/   r   r     s"         _%0A6RRR 2637FJ"'59G2 G2|G2 !.G2 u/0	G2
 ""BCG2  G2 G2 !!12G2 
u|Xel3XeEL>Q5RR	SG2 G2 G2 G2 G2 SRG2 G2 G2 G2 G2r1   r   )eagerflash_attention_2sdpac                        e Zd ZdZdef fdZ	 	 ddej        dee	         deej
                 fdZddee	         deej
                 fd	Z	 	 ddee	         deej
                 fd
Z xZS )JambaMambaMixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    r   c           	         t                                                       || _        || _        |j        | _        |j        | _        |j        | _        |j	        |j        z  | _
        |j        | _        |j        | _        |j        | _        t#          j        | j
        | j
        | j        | j        | j
        | j        dz
            | _        |j        | _        t,          |j                 | _        |j        | _        t#          j        | j        | j
        dz  | j                  | _        t#          j        | j
        | j        | j        dz  z   d          | _        t#          j        | j        | j
        d          | _        t=          j        d| j        dz             d d d f         }|                     | j
        d          !                                }t#          j"        t=          j#        |                    | _$        t#          j"        t=          j%        | j
                            | _&        t#          j        | j
        | j        | j                  | _'        tQ          | j        |j)                  | _*        tQ          | j        |j)                  | _+        tQ          | j        |j)                  | _,        tZ          st\          /                    d	           d S d S )
Nr   )in_channelsout_channelsr   kernel_sizegroupspaddingr#   r   FTr4   rd   aq  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d. If you want to use the naive implementation, set `use_mamba_kernels=False` in the model config)0r\   r]   r   r   rc   r   r   r   r   r   r   mamba_dt_ranktime_step_rankmamba_conv_biasuse_conv_biasmamba_proj_biasuse_biasr   Conv1dconv1d
hidden_act
activationr   actuse_mamba_kernelsuse_fast_kernelsr   in_projx_projdt_projr8   arangerA   r   r^   logA_logr_   Dout_projrY   rms_norm_epsdt_layernormb_layernormc_layernormis_fast_path_availabler   r   )rb   r   r   Are   s       r/   r]   zJambaMambaMixer.__init__6  s   "!-$2 & 3!'!4v7I!I$2#3.i./#-))A-
 
 
 !+&+, & 8 y!143IA3MTXTabbbi 68KdNadeNe8elqrrry!4d6LSWXXX LD/!344T111W=HHT+R00;;==\%)A,,//
ej)?@@AA	$"8$:JQUQ^___()<&BUVVV'(;ATUUU'(;ATUUU% 	^    	 	r1   Nrl   cache_paramsr&   c                 	   |j         \  }}}|d uoR|j        oK|dk    oE|j        | j                 j         d         |j        | j                 j         d         cxk    o|k    nc }|                     |                              dd          }|                    dd          \  }}	|||                    d          z  }| j	        j
                            | j	        j
                            d          | j	        j
                            d                    }
|r[t          |                    d          |j        | j                 |
| j	        j        | j                  }|                    d          }n~|Zt"          j                            || j        |j         d         z
  df          }|j        | j                                     |           t-          ||
| j	        j        | j                  }|||                    d          z  }|                     |                    dd                    }t1          j        || j        | j        | j        gd          \  }}}|                     |          }|                     |          }|                     |          }| j        j        j         }t1          j!                    5  t1          j"        | j        j        j                   | j        j        _         d d d            n# 1 swxY w Y   |                     |                              dd          }t1          j!                    5  || j        j        _         d d d            n# 1 swxY w Y   t1          j#        | j$        %                                           }||%                                nd }|rhtM          |j        | j                 |d         |d         ||d d df         |d d df         | j'        |	d         |d	
  
                            d          }ntQ          ||||                    dd          |                    dd          | j'        %                                |	|dd

  
        \  }}|'|%|j        | j                                     |           | )                    |                    dd                    }|S )Nr   r   r#   r2   r4   )r  ).r   T)dt_softplus)delta_softplusreturn_last_state)*r@   r   r   r   r   r  r   chunkrF   r  r`   r   r   r!   squeezer   r  r   r:   padr   copy_r    r  r8   splitr  r   r"  r#  r$  r  datano_grad
zeros_likeexpr  r?   r   r  r   r   )rb   rl   r'  r&   rO   seq_lenrJ   use_precomputed_statesprojected_statesgateconv_weightsr   ssm_parameters	time_stepBCtime_proj_biasdiscrete_time_stepr&  scan_outputs	ssm_statecontextualized_statess                         r/   cuda_kernels_forwardz$JambaMambaMixer.cuda_kernels_forwardj  sX    "/!4
GQ$ /1 (8>qA&t~6<Q?      	 	  <<66@@AFF /44QA4>>t%)N,D,DQ,G,GGM {)..t{/A/F/Fq/I/I4;K]KbKbcdKeKeff! 	x0%%b))(8  M *33B77MM' m//@UXeXklnXo@oqr?stt(8>>{KKK,]L$+JZgkgvwwwM%)N,D,DQ,G,GGM ]%<%<Q%B%BCC+T0$2EtGZ[ac
 
 
	1a %%i00	QQ */]__ 	N 	N%*%5dl6G6L%M%MDL"	N 	N 	N 	N 	N 	N 	N 	N 	N 	N 	N 	N 	N 	N 	N!\\)44>>q!DD]__ 	4 	4%3DL"	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 Ytz''))***3A3M--///SW! 	I1'7f%"6*!!!Q$!!!Q$V    imm L '8"Aq!!Aq!!#"&' ' '#L) $)A'7==iHHH !%l.D.DQ.J.J K K$$s$   3LLLM,,M03M0c           	      f   |j         \  }}}|j        }|                     |                              dd          }|                    dd          \  }	}
||	|                    d          z  }	t          |t                    }|r|j        | j	                 j         d         |k    r| j
        r%|j        | j	                                                 }n|j        | j	                 }|                    |	j                  }|j        r|dk    r|j        | j	                 j         d         |k    r|j        | j	                 }t!          j        |dd          }|	d d d d df         |d d d d df<   ||j        | j	        <   t!          j        || j        j        d d dd d f         z  d          }	| j        r|	| j        j        z  }	|                     |	                              |                              d          }	nt0          j                            |	| j        |	j         d         z
  df          }||j        | j	        <   |                     |                     |	          dd |f                   }	n[t!          j        || j        | j        f|	j        |          }|                     |                     |	          dd |f                   }	||	|                    d          z  }	|                     |	                    dd                    }t!          j         || j!        | j        | j        gd          \  }}}| "                    |          }| #                    |          }| $                    |          }| %                    |          }t0          j        &                    |                              dd          }t!          j'        | j(        )                                           }t!          j'        |d d d d d d f         |d d d d d d d f         z            }|d d d d d d d f         |d d d d d d d f         )                                z  }||	d d d d d d d f         )                                z  }g }tU          |          D ]}|d d d d |d d f         |z  |d d d d |d d f         z   }t!          j+        |                    |          |d d |d d f                             d                    }|,                    |d d d d df                    t!          j-        |d          }||	| j.        d d d d f         z  z   }||                     |
          z  }|r||j        | j	        <   | /                    |                    dd                    }|S )	Nr   r#   r2   r   r4   )shiftsdims.r   )0r@   rh   r  r   r,  rF   r5   r   r   r   r   cloner+   r7   r   r   r8   rollrC   r  r`   r  r   r  r   r:   r.  r   r   r   r   r  r0  r  r"  r#  r$  r  softplusr4  r  r?   r   r   r   stackr  r   )rb   input_statesr'  r&   rO   r5  rJ   rh   r7  rl   r8  r   rA  
conv_stater:  r;  r<  r=  r?  r&  
discrete_A
discrete_BdeltaB_ur@  r   scan_outputrB  s                              r/   slow_forwardzJambaMambaMixer.slow_forward  s   !-!3
GQ"<<55??1EE.44QA4>>t%)N,D,DQ,G,GGM|-MNN	 	P0@FqIZWW} D(3DNCIIKK		(3DNC	!]%9::I. T7a<< ,T^<B1ESS)5dnE
"Z
2BGGG
'4QQQ1W'=
111aaa8$;E(8 %	*t{7I!!!QPQPQPQ'7R*RXZ [ [ [% 6!T[%55M $ 7 7 : :5 A A K KB O O]..!*]-@-DDaH 
 <F(8 $])C)CC'M)R S ST3T5HI$+5  I !HHT[[%?%?XgX%NOOM%)N,D,DQ,G,GGM ]%<%<Q%B%BCC+T0$2EtGZ[ac
 
 
	1a %%i00	QQ!\\)44]334FGGQQRSUVWW Ytz''))***Yqqqq$!125G111aaaQU5VVWW
'111aaa6111dAAAqqq=9I9O9O9Q9QQ
aaaAAAtm < B B D DDw 	6 	6A"111aaaAAA:.:XaaaAqqqj=QQI,y||E':':AaaaAAAgJ<P<PQS<T<TUUKAAAqqq!G 45555k,B777!]TVD!!!TM5J%JK"TXXd^^3 	@6?L#DN3 !%k.C.CAq.I.I J J$$r1   c                     | j         rEt          rd| j        j        j        j        vrt          d          |                     |||          S |                     |||          S )Nr   zsFast Mamba kernels are not available. Make sure to they are installed and that the mamba module is on a CUDA device)	r  r%  r  r`   r7   r   r   rC  rQ  )rb   rl   r'  r&   s       r/   ro   zJambaMambaMixer.forward*  sw       	Z) V4;;M;T;Y-Y-Y  J   ,,]L.YYY  nMMMr1   r"   )rr   rs   rt   r   r   r]   r8   r   r   r   r   rC  rQ  ro   ru   rv   s   @r/   r  r  .  sF        2{ 2 2 2 2 2 2n DH59	h% h%|h% ?@h% !!12	h% h% h% h%VR% R%x@`7a R%  {C  DI  DT  {U R% R% R% R%p DH59	N N ?@N !!12	N N N N N N N Nr1   r  c                   $     e Zd Z fdZd Z xZS )JambaMLPc                    t                                                       || _        |j        | _        |j        | _        t          j        | j        | j        d          | _        t          j        | j        | j        d          | _        t          j        | j        | j        d          | _	        t          |j                 | _        d S NFr   )r\   r]   r   rc   r   r   r   	gate_projup_proj	down_projr   r  act_fnrb   r   re   s     r/   r]   zJambaMLP.__init__;  s    !-!'!94#3T5KRWXXXy!143IPUVVV4#94;KRWXXXV./r1   c                     |                      |                     |                     |                    |                     |          z            }|S r   )rY  rZ  rW  rX  )rb   xrY  s      r/   ro   zJambaMLP.forwardE  sA    NN4;;t~~a/@/@#A#ADLLQROO#STT	r1   )rr   rs   rt   r]   ro   ru   rv   s   @r/   rT  rT  :  sG        0 0 0 0 0      r1   rT  c                   f     e Zd ZdZdef fdZdej        deej        ej        f         fdZ	 xZ
S )JambaSparseMoeBlocka  
    This implementation is
    strictly equivalent to standard MoE with full capacity (no
    dropped tokens). It's faster since it formulates MoE operations
    in terms of block-sparse operations to accommodate imbalanced
    assignments of tokens to experts, whereas standard MoE either
    (1) drop tokens at the cost of reduced performance or (2) set
    capacity factor to number of experts and thus waste computation
    and memory on padding.
    r   c                 d   t                                                       j        | _        j        | _        j        | _        j        | _        t          j
        | j        | j        d          | _        t          j        fdt          | j                  D                       | _        d S )NFr   c                 .    g | ]}t                    S r*   )rT  )r,   rJ   r   s     r/   r0   z0JambaSparseMoeBlock.__init__.<locals>.<listcomp>_  s!    %X%X%X1hv&6&6%X%X%Xr1   )r\   r]   rc   
hidden_dimr   ffn_dimr%   num_experts_per_tokrG   r   r   router
ModuleListr   expertsr[  s    `r/   r]   zJambaSparseMoeBlock.__init__W  s     ,/!-/
i1ANNN}%X%X%X%XdFV@W@W%X%X%XYYr1   rl   r'   c                    |j         \  }}}|                    d|          }|                     |          }t          j        |dt
          j                  }t          j        || j        d          \  }}|	                    |j
                  }t          j        ||z  |f|j
        |j                  }t
          j        j                            || j                                      ddd          }	t%          | j                  D ]}
| j        |
         }t          j        |	|
                   \  }}|j         d         dk    r>|d	|f                             d|          } ||          |||d	f         z  }|                    d||	                    |j
                             |                    |||          }||fS )
 r4   r   r   r2   )rh   r7   )num_classesr#   r   N)r@   r   re  Fr;   r8   r?   r<   rG   r+   rh   r   r7   r   r:   r=   r%   permuter   rg  whererB   
index_add_)rb   rl   rO   rP   rb  r$   rI   rK   final_hidden_statesrL   
expert_idxexpert_layeridxtop_xcurrent_statecurrent_hidden_statess                   r/   ro   zJambaSparseMoeBlock.forwarda  s   2?2E/
OZ%**2z::M22)MqLLL,1J
XZ,[,[,[))),,]-@AA#k/):6m>QZgZn
 
 
 h)112BPTP`1aaiijkmnpqrr   011 	d 	dJ<
3L[%<==JC{1~""
 *$+6>>r:NNM$0L$?$?/RWY\^bRbBc$c!  **1e5J5M5MmNa5b5bcccc199*oWabb"M11r1   )rr   rs   rt   r   r   r]   r8   r   r6   ro   ru   rv   s   @r/   r_  r_  K  s        	 	Z{ Z Z Z Z Z Z&2U\ &2eEL%,<V6W &2 &2 &2 &2 &2 &2 &2 &2r1   r_  c                   R    e Zd Zdedef fdZ eddd          	 	 	 	 	 	 	 dd
ej        de	ej                 de	ej
                 de	e         de	e         de	e         de	e         de	ej
                 deej        e	eej        ej        f                  f         fd            Z xZS )JambaAttentionDecoderLayerr   r   c                 j   t                                                       |j        |         }t          |j                 ||          | _        |dk    rt          nt          } ||          | _        t          |j
        |j                  | _        t          |j
        |j                  | _        d S )Nr   r  )r\   r]   layers_num_expertsJAMBA_ATTENTION_CLASSES_attn_implementation	self_attnr_  rT  feed_forwardrY   rc   r!  input_layernormpre_ff_layernormrb   r   r   r%   ffn_layer_classre   s        r/   r]   z#JambaAttentionDecoderLayer.__init__  s    /	:01LMfV_``1<q--h+OF33+F,>FDWXXX ,V-?VEX Y Y Yr1   r   r   r   r   NFrl   r&   r   r   output_router_logitsr   r   r'   c	           	      T   |}	|                      |          }|                     |||||||          \  }}
}|	|z   }|}	|                     |          }|                     |          }t	          |t
                    r|\  }}n|d}}|	|z   }|f}|r||
fz  }|r||fz  }|r||fz  }|S )  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        )rl   r&   r   r   r   r   r   N)r~  r|  r  r}  r5   r6   )rb   rl   r&   r   r   r   r  r   r   residualself_attn_weightspresent_key_value
ff_outputsr$   outputss                  r/   ro   z"JambaAttentionDecoderLayer.forward  s   > !,,];;>Bnn')%+/) ?M ?
 ?
;(*; !=0 !--m<<&&}55
j%(( 	<+5(M==+5t=M =0 " 	,)++G 	,)++G 	(''Gr1   NNNFFFNrr   rs   rt   r   rE   r]   r   r8   r   r   r   r   r   r6   FloatTensorro   ru   rv   s   @r/   rw  rw    s[       Z{ Zs Z Z Z Z Z Z _%0A6RRR 2637FJ,1/4$)59D D|D !.D u/0	D
 ""BCD $D>D 'tnD D>D !!12D 
u (51BEDU1U+V"WW	XD D D SRD D D D Dr1   rw  c                   R    e Zd Zdedef fdZ eddd          	 	 	 	 	 	 	 dd
ej        de	ej                 de	ej
                 de	e         de	e         de	e         de	e         de	ej
                 deej        e	eej        ej        f                  f         fd            Z xZS )JambaMambaDecoderLayerr   r   c                 V   t                                                       |j        |         }t          ||          | _        |dk    rt
          nt          } ||          | _        t          |j	        |j
                  | _        t          |j	        |j
                  | _        d S )N)r   r   r   r  )r\   r]   ry  r  r   r_  rT  r}  rY   rc   r!  r~  r  r  s        r/   r]   zJambaMambaDecoderLayer.__init__  s    /	:$FiHHH
1<q--h+OF33+F,>FDWXXX ,V-?VEX Y Y Yr1   r   r   r   r   NFrl   r&   r   r   r  r   r   r'   c	                 H   |}	|                      |          }|                     |||          }d}
|	|z   }|}	|                     |          }|                     |          }t	          |t
                    r|\  }}n|d}}|	|z   }|f}|r||
fz  }|r||fz  }|r||fz  }|S )r  )rl   r'  r&   N)r~  r   r  r}  r5   r6   )rb   rl   r&   r   r   r   r  r   r   r  r  r  r$   r  s                 r/   ro   zJambaMambaDecoderLayer.forward  s    > !,,];;

'() # 
 

 ! !=0 !--m<<&&}55
j%(( 	<+5(M==+5t=M =0 " 	,)++G 	*))G 	(''Gr1   r  r  rv   s   @r/   r  r    s[       Z{ Zs Z Z Z Z Z Z _%0A6RRR 2637FJ,1/4$)59A A|A !.A u/0	A
 ""BCA $D>A 'tnA D>A !!12A 
u (51BEDU1U+V"WW	XA A A SRA A A A Ar1   r  c                   @    e Zd ZU eed<   dZdZddgZdZdZ	dZ
dZd ZdS )	JambaPreTrainedModelr   modelTrw  r  r   c                    | j         j        }t          |t          j        t          j        f          rJ|j        j                            d|           |j	         |j	        j        
                                 d S d S t          |t          j                  rU|j        j                            d|           |j        +|j        j        |j                 
                                 d S d S t          |t                    r!|j        j                            d           d S t          |t                    rt!          j        d|j        dz             d d d f         }|                    |j        d                                          }|j        j                            t!          j        |                     |j        j                            d           d S d S )Nr   )r>   stdg      ?r   r4   )r   initializer_ranger5   r   r   r  r`   r1  normal_r   zero_	Embeddingpadding_idxrY   fill_r  r8   r  r   rA   r   r   r  r/  r  r  )rb   moduler  r&  s       r/   _init_weightsz"JambaPreTrainedModel._init_weights9  s   k+fry")455 	%M&&CS&999{& &&((((( '&-- 
	%M&&CS&999!-"6#56<<>>>>> .--- 	%M$$S)))))00 	%Q 5 9::47CA1266AACCAL##EIaLL111HM$$$$$		% 	%r1   N)rr   rs   rt   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulr  r*   r1   r/   r  r  -  s_         &*#57OP"3NL% % % % %r1   r  )	attentionr   c                   X    e Zd ZdZdef fdZee	 	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j                 dee	j
                 dee         d	ee	j                 d
ee         dee         dee         dee         dee	j
                 dee         defd                        Zd Zd Z xZS )
JambaModelz
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`JambaDecoderLayer`]

    Args:
        config: JambaConfig
    r   c                 0   t                                          |           |j        | _        |j        | _        t          j        |j        |j        | j                  | _        g }t          |j
                  D ]:}t          |j        |                  }|                     |||                     ;t          j        |          | _        |j        | _        t#          |j        |j                  | _        d| _        |                                  d S )N)r   r  F)r\   r]   pad_token_idr  
vocab_sizer   r  rc   embed_tokensr   rQ   ALL_DECODER_LAYER_TYPESr   r   rf  layersr{  rY   r!  final_layernormgradient_checkpointing	post_init)rb   r   decoder_layersr   layer_classre   s        r/   r]   zJambaModel.__init__Y  s       !. +L):F<NPTP`aav/00 	D 	DA1&2J12MNK!!++f"B"B"BCCCCmN33$*$?!+F,>FDWXXX&+#r1   N	input_idsr&   r   r   inputs_embedsr   r   output_hidden_statesr  r   r   r'   c                    ||n| j         j        }|	|	n| j         j        }	||n| j         j        }||n| j         j        }|d u |d uz  rt          d          | j        r%| j        r|rt          	                    d           d}|| 
                    |          }|}|r|t          	                    d           |
&t          j        |j        d         |j                  }
||
                    d          }|                     |||
          }|                     ||
          }|rdnd }|rdnd }|	rdnd }| j        D ]j}t'          |t(                    r|n|}|r||fz  } |||||||	||
	          }|d         }|r|d         ||d         fz  }|	r|d
         ||d
         fz  }k|                     |          }|r||fz  }|r|j        sd|_        |sd n|}t/          |||||          S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzJamba requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. None was provided, so no cache will be returned.r   r   r   r*   )r&   r   r   r   r  r   r   r4   T)last_hidden_stater   rl   
attentionsr$   )r   r   r  r  r   r   r  r   r   r   r  r8   r  r@   r7   rF   _update_causal_mask_update_mamba_maskr  r5   r  r  r   r   )rb   r  r&   r   r   r  r   r   r  r  r   r   rl   r   
mamba_maskall_hidden_statesall_self_attnsall_router_logitsdecoder_layer
layer_masklayer_outputs
next_caches                         r/   ro   zJambaModel.forwardl  s     2C1N--TXT_Tq$8$D  $+Jj 	 %9$D  $+Jj 	 "+!6IIDK<Q	-t";< 	[YZZZ& 	4= 	Y 	j   I  --i88M% 	0:  
 !"\-*=a*@I]^^^N)33A66L..~}n]],,^^LL
"6@BBD0:d"6@BBD![ 	> 	>M'1-AW'X'Xi^iJ# 6!m%55!)M)) /"3%9#-	 	 	M *!,M  : #/"}Q'7&99N# > $0%-*;)==%,,];;   	2-!11 	6?#E 	615O.!*?TT
%+&+%+
 
 
 	
r1   c                    | j         j        dk    r
|d|v r|S d S |j        |j        }}t	          j        |          j        }|j        d         }|d         dz   }t	          j        ||f|||          }	|dk    rt	          j	        |	d          }	|	t	          j
        ||          |                    dd          k    z  }	|	d d d d d d f                             |j        d         ddd          }	||	                                }	|                                d	k    rw|j        d         }
|	d
d |
f                             d          |d d d d d d f                             d          z  }|	d
d |
f                             ||          |	d
d |
f<   | j         j        dk    r%|#|j        j        dv rt%          j        |	|          }	|	S )Nr  r   r   r4   )
fill_valuerh   r7   )diagonalr   r   r#   .r  )r   xpunpu)r   r{  rh   r7   r8   finfominr@   fulltriur  rB   rA   rG  r3   eqmasked_fillr   r
   _unmask_unattended)rb   r&   input_tensorr   rh   r7   	min_dtyperP   target_lengthr   mask_lengthpadding_masks               r/   r  zJambaModel._update_causal_mask  s   ;+/BBB)c^.C.C%%4$*L,?vK&&*	&,Q/&r*Q.j/=!Ai_dmsttta*[1===Ku|M&AAANDZDZ[]_`DaDaaa!$aaa"23::<;Ma;PRSUWY[\\%%++--K!!##q((,226*3+<=@@EEWXWXWXZ^`dfgfgfgWgHhHkHkloHpHpp1<S,;,=N1O1[1[\hjs1t1tC+-. K,66*%*.DDD
 1CKQZ[[Kr1   c                 Z    |}|d         dk    s|t          j        |dk              rd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr   )r8   all)rb   r&   r   r  s       r/   r  zJambaModel._update_mamba_mask  s>     $
!q  ^%?EIn`aNaDbDb%?Jr1   )
NNNNNNNNNN)rr   rs   rt   r   r   r]   r   r   r   r8   r   r   r   r  r   r   r   r   ro   r  r  ru   rv   s   @r/   r  r  P  s        {      &  151537FJ59$(,0/3/359e
 e
E,-e
 !.e
 u/0	e

 ""BCe
   12e
 D>e
 $D>e
 'tne
 'tne
 !!12e
 +,e
 
 e
 e
 e
 ^ e
N! ! !F	 	 	 	 	 	 	r1   r  c                       e Zd ZdgZdef fdZee	 	 	 	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j                 dee	j
                 d	ee         d
ee	j                 dee	j
                 dee         dee         dee         dee         dee	j
                 deee	j        f         dee         defd                        Z	 	 	 	 	 	 	 ddZ xZS )JambaForCausalLMzlm_head.weightr   c                 F   t                                          |           t          |          | _        |j        | _        t          j        |j        |j        d          | _        |j	        | _	        |j
        | _
        |j        | _        |                                  d S rV  )r\   r]   r  r  r  r   r   rc   lm_headrouter_aux_loss_coefr%   rd  r  r[  s     r/   r]   zJambaForCausalLM.__init__  s       ''
 +y!3V5FUSSS$*$?!!-#)#= r1   Nr   r  r&   r   r   r  labelsr   r   r  r  r   logits_to_keepr   r'   c                 `   ||n| j         j        }|
|
n| j         j        }
|	|	n| j         j        }	|                     ||||||||	|
|
  
        }|j        }t          |t                    rt          | d          n|}| 	                    |dd|ddf                   }d}| | j
        ||| j        fi |}d}|
rHt          |j        | j        | j        |          }|%|| j        |                    |j                  z  z  }t'          ||||j        |j        |j        |j                  S )aj  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, JambaForCausalLM

        >>> model = JambaForCausalLM.from_pretrained("ai21labs/Jamba-v0.1")
        >>> tokenizer = AutoTokenizer.from_pretrained("ai21labs/Jamba-v0.1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)
r  r&   r   r   r  r   r   r  r  r   )lossaux_losslogitsr   rl   r  r$   )r   r   r  r  r  r  r5   rE   slicer  loss_functionr  rW   r$   r%   rd  r  r+   r7   r   r   rl   r  )rb   r  r&   r   r   r  r  r   r   r  r  r   r  r   r  rl   slice_indicesr  r  r  s                       r/   ro   zJambaForCausalLM.forward  s   R 2C1N--TXT_Tq$8$D  $+Jj 	
 %9$D  $+Jj 	
 +/**)%+'/!5!5) +5 +
 +
  18B>SV8W8Wk~ot444]kmAAA}aaa,?@AA%4%ffdoPPPPD 	M/% (	 H !1HKK4L4LLL(#3!/)!/
 
 
 	
r1   FTc	           
         |d u }
|
s]||d         |j         d         k    r|d d |j         d          d f         }nV|j         d         |j         d         k    r|d d |f         }n-t          | j        |j         d         | j        | j                  }|b|`|                                                    d          dz
  }|                    |dk    d           |
s|d d |j         d          d f         }||
rd|i}nd|                                i}|	                    |||||| j        j
        |d           |	                                D ]\  }}||vr|||<   |S )Nr4   r   r   r   r  r  )r   r   r   r&   r  r  r   )r@   r   r   rh   r7   longcumsummasked_fill_r   r   num_logits_to_keepitems)rb   r  r   r&   r  r  r   r   r   r   empty_past_kvmodel_inputskeyvalues                 r/   prepare_inputs_for_generationz.JambaForCausalLM.prepare_inputs_for_generationp  s    (4/  	)!"%);;;%aaa.*>q*A)A)C)C&CD		#~';A'>>>%aaa&78	>Y_Q/DK  O %,*>)..0077;;a?L%%n&91===  F+AAA	0B/B/D/D,DE $$+];LL')=)=)?)?@L ,#2&"0(<"&+"@"0 
	
 
	
 
	
 !,,.. 	* 	*JC,&&$)S!r1   )NNNNNNNNNNNr   )NNNFNNT)rr   rs   rt   _tied_weights_keysr   r]   r   r   r   r8   r   r   r   r  r   r   rE   r   r   r   ro   r  ru   rv   s   @r/   r  r    s       *+	{ 	 	 	 	 	 	  151537FJ59-1$(,0/3/35934Y
 Y
E,-Y
 !.Y
 u/0	Y

 ""BCY
   12Y
 )*Y
 D>Y
 $D>Y
 'tnY
 'tnY
 !!12Y
 c5</0Y
 +,Y
 
#Y
 Y
 Y
 ^ Y
| "@ @ @ @ @ @ @ @r1   r  c                       e Zd ZdS )JambaForSequenceClassificationN)rr   rs   rt   r*   r1   r/   r  r    s          r1   r  )r  r  r  r  )Nr#   N)Or   r   typingr   r   r   r8   torch.nn.functionalr   r:   rk  activationsr   
generationr	   modeling_attn_mask_utilsr
   modeling_flash_attention_utilsr   r   modeling_layersr   r   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.import_utilsr   r   configuration_jambar   r   &mamba_ssm.ops.selective_scan_interfacer   r   +mamba_ssm.ops.triton.selective_state_updater   causal_conv1dr    r!   r  r%  
get_loggerrr   r   r   r6   rE   rW   ModulerY   r}   r   r   r   r   rz  r  rT  r_  rw  r  r  r  r  r  r  __all__r*   r1   r/   <module>r     s  (    ' ' ' ' ' ' ' ' ' '                 ! ! ! ! ! ! ) ) ) ) ) ) > > > > > > h h h h h h h h        R Q Q Q Q Q Q Q - - - - - - & & & & & & R R R R R R R R R R R R 0 0 0 0 0 0 T T T T T T T T , , , , , ,  KJJJJJJ  QXXXXXXXXRRRRRRR@P=-~ 8DDDDDDDDD-7**.0@BVXfg  
 
	H	%	% "&
-1	U& U&uU\':D@AU&#U& U\*	U&
 5<U& U& U& U&rJ J J J J29 J J J*	UU\ 	U# 	U%, 	U 	U 	U 	UN3 N3 N3 N3 N3 N3 N3 N3dX: X: X: X: X:RY X: X: X:xa: a: a: a: a:> a: a: a:JP2 P2 P2 P2 P2 P2 P2 P2h -  HN HN HN HN HNbi HN HN HNX    ry   "<2 <2 <2 <2 <2") <2 <2 <2~P P P P P!; P P PfM M M M M7 M M M` % % % % %? % % %< )CMcdd  p p p p p% p p phk k k k k+_ k k k\ b a a a a%EG[ a a a g
f
fr1   