
     `i                        d Z ddlZddlmZmZmZmZ ddlZddlmZ ddl	m
Z
mZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)  e'            rddl*m+Z+m,Z, ddl-m.Z. nd\  Z.Z,Z+ e&            r	ddl/m0Z0m1Z1 nd\  Z1Z0 e2e.e,e0e1e+f          Z3 e"j4        e5          Z6 G d dej7                  Z8dej9        de:dej9        fdZ; G d d           Z<	 dAd"ej7        d#ej9        d$ej9        d%ej9        d&eej9                 d'e=d(e=fd)Z> G d* d+ej7                  Z? G d, d-ej7                  Z@ G d. d/ej7                  ZA G d0 d1ej7                  ZB G d2 d3ej7                  ZC G d4 d5ej7                  ZDe! G d6 d7e                      ZEe! G d8 d9eE                      ZF G d: d;eEe          ZG e!d<=           G d> d?eE                      ZHg d@ZIdS )BzPyTorch Zamba model.    N)AnyCallableOptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)Cache)GenerationMixin)AttentionMaskConverter)FlashAttentionKwargs)BaseModelOutputWithPastCausalLMOutputWithPast SequenceClassifierOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringlogging)deprecate_kwarg)is_causal_conv1d_availableis_mamba_ssm_available   )ZambaConfig)mamba_inner_fnselective_scan_fn)selective_state_update)NNN)causal_conv1d_fncausal_conv1d_updateNNc                   ,     e Zd Zd fd	Zd Zd Z xZS )ZambaRMSNormư>c                     t                                                       t          j        t	          j        |                    | _        || _        dS )z;
        ZambaRMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      |/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/zamba/modeling_zamba.pyr)   zZambaRMSNorm.__init__@   sD     	l5:k#:#:;; #    c                    |j         }|                    t          j                  }|                    d                              dd          }|t          j        || j        z             z  }| j        |                    |          z  S )N   T)keepdim)	dtypetor+   float32powmeanrsqrtr.   r-   )r/   hidden_statesinput_dtypevariances       r3   forwardzZambaRMSNorm.forwardH   s|    #)%((77 $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::r4   c                 H    t          | j        j                   d| j         S )Nz, eps=)tupler-   shaper.   r/   s    r3   
extra_reprzZambaRMSNorm.extra_reprO   s&    )**II$2GIIIr4   )r&   )__name__
__module____qualname__r)   rB   rG   __classcell__r2   s   @r3   r%   r%   ?   sb        $ $ $ $ $ $; ; ;J J J J J J Jr4   r%   r?   n_repreturnc                     | j         \  }}}}|dk    r| S | dddddddddf                             |||||          } |                     |||z  ||          S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rE   expandreshape)r?   rM   batchnum_key_value_headsslenhead_dims         r3   	repeat_kvrV   T   s    
 2?1D.Ehzz!!!!QQQaaa"23::5BUW\^bdlmmM  (;e(CT8TTTr4   c                      e Zd ZdZdZej        dfdZd Zde	de
ej        ej        f         fdZ	 dd	ej        d
ej        de	deeeef                  de
ej        ej        f         f
dZdej        fdZddee	         de	fdZdS )ZambaHybridDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    FNc           
         || _         d| _        |j        | _        d| _        |j        |j        z  | _        |j        | _        |j	        | _
        |j        | _        g | _        g | _        g | _        i | _        i | _        i | _        t%          |j                  D ]}| xj        t)          j        | j        | j
        |          gz  c_        | j        | j        | j        z  | j        f}| xj        t)          j        ||          gz  c_        | j        |         dk    r| j                            |           fdt%          |j                  D             | _        fdt%          |j                  D             | _        d S )NFdevicer9   hybridc                 D    g | ]}t          j        g gz             S r[   r+   tensor.0_
batch_sizer[   s     r3   
<listcomp>z4ZambaHybridDynamicCache.__init__.<locals>.<listcomp>   s/    rrrQ%,tj'8HHHrrrr4   c                 D    g | ]}t          j        g gz             S r^   r`   rb   s     r3   rf   z4ZambaHybridDynamicCache.__init__.<locals>.<listcomp>   s/    tttqEL"
):6JJJtttr4   )r9   is_compileablelayers_block_typehas_previous_statemamba_expandr0   intermediate_sizemamba_d_statessm_state_sizemamba_d_convconv_kernel_sizen_mamba_headsconv_states
ssm_statestransformer_layers_modules_parameters_buffersrangenum_hidden_layersr+   zerosappend	key_cachevalue_cache)r/   configre   r9   r[   icache_shapes     ` `  r3   r)   z ZambaHybridDynamicCache.__init__p   s   
#!'!9"'!'!4v7I!I$2 & 3#1"$v/00 	2 	2AJ(>@U^dlqrrr!  "&$*<<#	K OOKe T T TUUOO%a(H44'..q111rrrrrRWX^XpRqRqrrrtttttTYZ`ZrTsTstttr4   c                 *    t          | j                  S N)lenr|   rF   s    r3   __len__zZambaHybridDynamicCache.__len__   s    4>"""r4   	layer_idxrN   c                 6    | j         |         | j        |         fS r   )r|   r}   r/   r   s     r3   __getitem__z#ZambaHybridDynamicCache.__getitem__   s    ~i($*:9*EEEr4   
key_statesvalue_statescache_kwargsc                 D   | j         |         j        d         dk    r|| j         |<   || j        |<   nVt          j        | j         |         |gd          | j         |<   t          j        | j        |         |gd          | j        |<   | j         |         | j        |         fS )Nr7   r   r6   dim)r|   rE   r}   r+   cat)r/   r   r   r   r   s        r3   updatezZambaHybridDynamicCache.update   s     >)$*2.!33(2DN9%*6DY''(-	4>)3Lj2Y_`(a(a(aDN9%*/)T5Ei5PR^4_ef*g*g*gDY'~i($*:9*EEEr4   beam_idxc                    t          t          | j                            D ];}| j        |         j        }| j        |                             d|                    |                    | j        |<   | j        |         j        }| j        |                             d|                    |                    | j        |<   | j        |         j        }| j        |                             d|                    |                    | j        |<   | j        |         j        }| j        |                             d|                    |                    | j        |<   =dS )zDReorders the cache for beam search, given the selected beam indices.r   N)	rx   r   r|   r[   index_selectr:   r}   rr   rs   )r/   r   r   r[   s       r3   reorder_cachez%ZambaHybridDynamicCache.reorder_cache   sE   s4>2233 		i 		iI^I.5F(,y(A(N(NqRZR]R]^dReRe(f(fDN9%%i07F*.*:9*E*R*RSTV^VaVabhViVi*j*jDY'%i07F*.*:9*E*R*RSTV^VaVabhViVi*j*jDY'_Y/6F)-)C)P)PQRT\T_T_`fTgTg)h)hDOI&&		i 		ir4   r   c                     || j         vr| j         d         n|}t          | j                  |k    rdS | j        |         j        d         S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   )rt   r   r|   rE   r   s     r3   get_seq_lengthz&ZambaHybridDynamicCache.get_seq_length   sS     3<4CZ2Z2ZD+A..`i	t~)++1~i(.r22r4   r   )r   )rH   rI   rJ   __doc__rh   r+   float16r)   r   intrD   Tensorr   r   dictstrr   r   
LongTensorr   r    r4   r3   rX   rX   `   sO         N16t u u u u@# # #FS FU5<3M-N F F F F 26F FLF lF 	F
 tCH~.F 
u|U\)	*F F F F$ie&6 i i i i3 3 3c 3 3 3 3 3 3r4   rX           modulequerykeyvalueattention_maskscalingdropoutc                 R   t          || j                  }t          || j                  }	t          j        ||                    dd                    |z  }
|$|d d d d d d d |j        d         f         }|
|z   }
t          j                            |
dt          j	                  
                    |j                  }
t          j                            |
|| j                  }
t          j        |
|	          }|                    dd                                          }||
fS )Nr6   r   r   r7   )r   r9   )ptrainingr   )rV   num_key_value_groupsr+   matmul	transposerE   r   
functionalsoftmaxr;   r:   r9   r   r   
contiguous)r   r   r   r   r   r   r   kwargsr   r   attn_weightscausal_maskattn_outputs                r3   eager_attention_forwardr      s    3 ;<<JUF$?@@L<z';';Aq'A'ABBWLL!$QQQ111.D
0@0D.D%DE#k1=((2U](SSVVW\WbccL=((6?([[L,|\::K''1--88::K$$r4   c                       e Zd ZdZdedef fdZ eddd          	 dd
ej	        dede
ej	                 de
e         dee         deej	        e
ej	                 e
eej	                          f         fd            Z xZS )ZambaAttentionaA  
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".

    Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
    The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
    The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
    (see fig. 2 in https://huggingface.co/papers/2405.16712).
    Additionally, replaced
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
    r~   r   c                    t                                                       || _        || _        |j        | _        |j        | _        |j        |j        z  | _	        |j
        | _
        | j        dz  dz  | _        d| _        |j        | _        t          j        |j        |j        | j        z  d          | _        t          j        |j        |j        | j        z  d          | _        t          j        |j        |j        | j        z  d          | _        t          j        |j        | j        z  |j        d          | _        d S )Nr6         TFbias)r(   r)   r~   r   attention_hidden_sizeattention_head_dimrU   num_attention_headsrS   r   max_position_embeddingsr   	is_causalattention_dropoutr   Linearq_projk_projv_projr0   o_projr/   r~   r   r2   s      r3   r)   zZambaAttention.__init__   s!   "%+%A"1$*$>&B\$\!'-'E$)d2!'!9i <f>X[_[h>hotuuui <f>X[_[h>hotuuui <f>X[_[h>hotuuui :T] JFL^ejkkkr4   past_key_valuepast_key_values4.58new_nameversionNr?   r   r   rN   c                    |j         d d         }g |d| j        R }|                     |                              |                              dd          }|                     |                              |                              dd          }	|                     |                              |                              dd          }
||                    |	|
|          \  }	}
t          }| j	        j
        dk    rt          | j	        j
                 } || ||	|
|f| j        sdn| j        | j        d|\  }} |j        g |dR                                  }|                     |          }||fS )Nr7   r   r6   eagerr   )r   r   )rE   rU   r   viewr   r   r   r   r   r~   _attn_implementationr   r   r   r   rQ   r   r   )r/   r?   r   r   r   r   input_shapehidden_shapequery_statesr   r   attention_interfacer   r   s                 r3   rB   zZambaAttention.forward   s    $)#2#.88b8$-88{{=1166|DDNNqRSTT[[//44\BBLLQPQRR
{{=1166|DDNNqRSTT&'6'='=j,Xa'b'b$J(?;+w66"9$+:Z"[$7$7	%
  $}HCC$2HL	%
 	%
 	%
 	%
!\ *k);;;;;;FFHHkk+..L((r4   r   )rH   rI   rJ   r   r   r   r)   r   r+   r   r   rX   r   r   rD   rB   rK   rL   s   @r3   r   r      s        l{ ls l l l l l l$ _%0A6RRR >B#) #)|#) #) !.	#)
 ""9:#) -.#) 
u|Xel3XeEL>Q5RR	S#) #) #) SR#) #) #) #) #)r4   r   c                   b     e Zd ZdZdef fdZ	 d
dej        defdZ	d
defdZ
d
defd	Z xZS )ZambaMambaMixeruE  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    This module differs from `transformers.models.mamba.modeling_mamba.MambaMixer` in two ways:
    - Added multi-head: the output of `self.in_proj` is split into `self.n_mamba_heads` heads, and each head
    undergoes an independent forward pass, identical to the original `MambaMixer`, up until the pre-activations of
    `self.out_proj`. The pre-activations, coming from different mamba heads, are then concatenated and fed into `self.out_proj`.
    r~   c           	         t                                                       || _        || _        |j        | _        |j        | _        |j        | _        |j	        |j        z  | _
        |j        | _        |j        | _        | j
        | j        z  | _        |j        | _        |j        | _        t'          j        | j
        | j
        | j        | j        | j
        | j        dz
            | _        |j        | _        t0          |j                 | _        |j        | _        t'          j        | j        | j
        dz  | j                  | _        t'          j        t?          j         | j        | j        | j        dz  z   | j                            | _!        t'          j        t?          j         | j        | j        | j                  dz
  dz  | j        dz  z            | _"        t'          j        t?          j         | j        | j                            | _#        t?          j$        d| j        dz   t>          j%                  d d d f         }|&                    | j
        d          '                                }t'          j        t?          j(        |          )                    | j        | j        d                    | _*        t'          j        t?          j+        | j        | j                            | _,        t'          j        | j
        | j        | j                  | _-        t\          st^          0                    d           d S d S )	Nr   )in_channelsout_channelsr   kernel_sizegroupspaddingr6   r   g      ?r9   r7   aq  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d. If you want to use the naive implementation, set `use_mamba_kernels=False` in the model config)1r(   r)   r~   r   r0   rm   rn   ro   rp   rk   rl   mamba_dt_ranktime_step_rankrq   mamba_head_dimmamba_conv_biasuse_conv_biasmamba_proj_biasuse_biasr   Conv1dconv1dhidden_mamba_act
activationr   actuse_mamba_kernelsuse_fast_kernelsr   in_projr*   r+   rz   x_proj_weightdt_proj_weightdt_proj_biasaranger;   rP   r   logrQ   A_logr,   Dout_projis_fast_path_availableloggerwarning_once)r/   r~   r   Ar2   s       r3   r)   zZambaMambaMixer.__init__.  s   "!-$2 & 3!'!4v7I!I$2#1"48JJ#3.i./#-))A-
 
 
 !1&12 & 8 y!143IA3MTXTabbb  \K"#d&9A&==# 
 
 !l[+T-@$BUVVY\\!3&'
 

 LT5GI\)])]^^ LD/!35=III$PQPQPQ'RHHT+R00;;==\%)A,,"6"6t7I4K^`b"c"cdd
ej);T=PQQRR	$"8$:JQUQ^___% 	^    	 	r4   Nr?   cache_paramsc                    |j         \  }}}|d uo|j        o|dk    }|                     |                              dd          }|                    |dd|                              dd          \  }}	|                    d                                          }|	                    d          }	|	                    || j	        d|                              dd          }	| j
        j                            | j
        j                            d          | j
        j                            d                    }
|r[t          |                    d          |j        | j                 |
| j
        j        | j                  }|                    d          }n|0t'          j        |dk              s||                    d          z  }|Zt*          j                            || j        |j         d         z
  df          }|j        | j                                     |           t5          ||
| j
        j        | j                  }|0t'          j        |dk              s||                    d          z  }|                    d| j	        | j        |                              dd          }| j        d d d d d d d f         |z                      dd          }t'          j        || j        | j        | j        gd          \  }}}| j         d d d f         |                    dd          z  }t'          j!        | j"        #                                           }| j$        | j$        #                                nd }t'          j%        |d|f|j&        |j'                  }|rtQ          | j	                  D ]}tS          |j*        | j                 d d |f         ||d	df         ||d	df         ||         ||d d df         ||d d df         | j+        |         |	|d	df         ||         d

  
                            d          }t'          j,        ||fd          }nOt'          j%        |d| j        | j        f|j&        |j'                  }tQ          | j	                  D ]}t[          ||         ||         ||         ||                             dd          ||                             dd          | j+        |         #                                |	|         ||         d
d

  
        \  }}t'          j,        ||fd                                          }t'          j,        ||                    d          fd          }|'|%|j*        | j                                     |           | .                    |                    dd                    }|S )Nr   r6   r7   r   r   )r   r   rZ   .T)dt_softplus)delta_softplusreturn_last_state)/rE   rj   r   r   r   chunksqueezer   rQ   rq   r   r-   sizer"   rr   r   r   r   	unsqueezer+   allr   r   padrp   copy_r!   r   r   splitr   rn   r   expr   floatr   emptyr[   r9   rx   r    rs   r   r   r   r   )r/   r?   r   r   re   seq_lenrd   use_precomputed_statesprojected_statesgateconv_weightsrr   ssm_parameters	time_stepBCdiscrete_time_stepr   time_proj_biasscan_outputsnscan_outputs_	ssm_state
ssm_state_contextualized_statess                            r3   cuda_kernels_forwardz$ZambaMambaMixer.cuda_kernels_forwardk  s    "/!4
GQ!-T!9!nl>]!nbimnbn  <<66@@AFF.33JAwOOUUVW]^U__t%--a00;;==||A||J(:BHHRRSTVWXX {)..t{/A/F/Fq/I/I4;K]KbKbcdKeKeff! 	L0%%b))(8  M *33B77MM)%)Na<O2P2P) -0H0H0K0K K' m//@UXeXklnXo@oqr?stt(8>>{KKK,]L$+JZgkgvwwwM)%)Na<O2P2P) -0H0H0K0K K
 &--b$2DdFY[bccmmnoqrss,QQQaaa];mKVVWY[]^^+T0$2EtGZ[ac
 
 
	1a "0D9I<O<OPRTV<W<WWYtz''))*** 7;6G6S*00222Y]{J7#;MDX`m`sttt! &	I4-.. O O 6 +DN;AAAqDA!!S!),&q#qy1aDaAgJaAgJF1ICO"1% $! ! ! )B--   %y,)FANNNO  Q 3T5HI$+#)  I
 4-.. S S,=!!$&q)aDaDNN1a((aDNN1a((F1IOO%%G"1%#'&*- - -)z  %y,)FANNNYY[[!Iy*2F2Fq2I2I&JPQRRR		$)A'7==iHHH !%l.D.DQ.J.J K K$$r4   c           
      *   |j         \  }}}|j        }|                     |                              dd          }|                    |dd|                              dd          \  }	}
|	                    d                                          }	|
                    d          }
|
                    || j	        d|                              dd          }
t          |t                    }|r|j        | j                 j         d         |k    re| j        r%|j        | j                                                 }n|j        | j                 }|                    |	j                  }|j        r|dk    r|j        | j                 j         d         |k    r|j        | j                 }t)          j        |dd          }|	d d d d df         |d d d d df<   ||j        | j        <   t)          j        || j        j        d d dd d f         z  d          }	| j        r|	| j        j        z  }	|                     |	                              |                              d          }	n|Ht)          j        |dk              s0|	|d d |	j         d          d f                             d          z  }	t<          j                             |	| j!        |	j         d         z
  df          }||j        | j        <   |                     |                     |	          dd |f                   }	|Ht)          j        |dk              s0|	|d d |	j         d          d f                             d          z  }	nt)          j"        || j	        | j#        | j$        f|	j        |          }|0t)          j        |dk              s|	|                    d          z  }	|                     |                     |	          dd |f                   }	|0t)          j        |dk              s|	|                    d          z  }	|	                    d| j	        | j#        |                              dd          }	| j%        d d d d d d d f         |	z                      dd	          }t)          j&        || j'        | j$        | j$        gd          \  }}}| j(        d d d f         |                    dd	          z  | j)        d d d d d d f         z   }t<          j        *                    |          }t)          j+        | j,        -                                           }t)          j+        |d d d d d d d d f         |d d d d d d d d d f         z            }|d d d d d d d d d f         |d d d d d d d d d f         -                                z  }||	d d d d d d d d d f         -                                z  }g }t]          |          D ]}|d d d d d d |d d f                             dd          |z  |d d d d d d |d d f                             dd          z   }t)          j/        |                    dd                              |          |d d d d |d d f                             d                    }|0                    |d d d d d d df                    t)          j1        |d          }||	| j2        d d d d d d f         z  z   }||                     |
          z  }|r||j        | j        <   | 3                    |                    dd                              |d|                              dd                    }|S )
Nr   r6   r7   r   r   )shiftsdims.rZ   r   )4rE   r9   r   r   r   r   r   r   rQ   rq   
isinstancerX   rs   r   r   cloner:   r[   rj   rr   r+   rollsumr   r-   r   r   r   r  r  r   r   r  rp   rz   r   rn   r   r  r   r   r   softplusr  r   r  rx   r   r{   stackr   r   )r/   input_statesr   r   re   r
  rd   r9   r  r?   r  	use_cacher  
conv_stater  r  r  r  r  r   
discrete_A
discrete_BdeltaB_ur  r   scan_outputr  s                              r3   slow_forwardzZambaMambaMixer.slow_forward  s"   !-!3
GQ"<<55??1EE.33JAwOOUUVW]^U__t%--a00;;==||A||J(:BHHRRSTVWXX|-DEE	 (	L0@FqIZWW} D(3DNCIIKK		(3DNC	!]%9::I /oqLL ,T^<B1ESS)5dnE
"Z
2BGGG
'4QQQ1W'=
111aaa8$;E(8 %	*t{7I!!!QPQPQPQ'7R*RXZ [ [ [% 6!T[%55M $ 7 7 : :5 A A K KB O O!-eiRS@S6T6T-$1N111}GZ[]G^F^F`F`C`4a4k4klm4n4n$nM]..}t?TWdWjkmWn?npq>rss
;E(8 $])C)CC'M)R S S!-eiRS@S6T6T-$1N111}GZ[]G^F^F`F`C`4a4k4klm4n4n$nMT/1DdFYZ$+  I
 )%)Na<O2P2P) -0H0H0K0K K HHT[[%?%?XgX%NOOM)%)Na<O2P2P) -0H0H0K0K K &--b$2DdFY[bccmmnoqrss,QQQaaa];mKVVWY[]^^+T0$2EtGZ[ac
 
 
	1a #1!!!T':Y=P=PQSUW=X=XX\`\mAAtQQQ]
 
  ]334FGG Ytz''))***YqD!!!T111!458J111aaaQRQRQRTUTUTUW[K[8\\]]
'111aaaD(89AaaaD!!!QQQ>N<O<U<U<W<WW
aaaAAAqqq$.> ? E E G GGw 	9 	9A"111aaaAqqq=1;;AqAAIMPXYZYZYZ\]\]\]_`_`_`bcefefefYfPgPqPqrsuvPwPwwI,y':':1a'@'@'C'CE'J'JAaaaQRQRQRTUWXWXWXjMLcLcdfLgLghhKAAAqqq!!!QJ 78888k,B777!]TVAAAtQQQ<L5M%MN!DHHTNN2 	@6?L#DN3 !%!!!Q''//
BHHRRSTVWXX!
 !
 %$r4   c                     | j         rAt          rd| j        j        j        vrt          d          |                     |||          S |                     |||          S )NcudazFast Mamba kernels are not available. Make sure to they are installed and that the mamba module is on a CUDA device. lease run 'pip install causal-conv1d>=1.2.0' and 'pip install mamba-ssm', or set use_mamba_kernels=False in the model's config.)r   )r   r   r   r[   type
ValueErrorr  r,  )r/   r?   r   r   s       r3   rB   zZambaMambaMixer.forward)  sz      	i) V4;M;T;Y-Y-Y i  
 ,,]LYg,hhh  ^ \\\r4   r#   )rH   rI   rJ   r   r   r)   r+   r   rX   r  r,  rB   rK   rL   s   @r3   r   r   !  s        
 
;{ ; ; ; ; ; ;| im_% _%"\_%9P_% _% _% _%B[% [%7N [% [% [% [%z	] 	]3J 	] 	] 	] 	] 	] 	] 	] 	]r4   r   c                   $     e Zd Z fdZd Z xZS )ZambaMLPc                    t                                                       || _        |j        | _        |j        | _        t          j        | j        | j        d          | _        t          j        | j        | j        d          | _        t          j        | j        | j        d          | _	        t          |j                 | _        d S NFr   )r(   r)   r~   r0   rl   r   r   	gate_projup_proj	down_projr   
hidden_actact_fnr/   r~   r2   s     r3   r)   zZambaMLP.__init__7  s    !-!'!94#3T5KRWXXXy!143IPUVVV4#94;KRWXXXV./r4   c                     |                      |                     |                     |                    |                     |          z            }|S r   )r7  r9  r5  r6  )r/   xr7  s      r3   rB   zZambaMLP.forwardA  sA    NN4;;t~~a/@/@#A#ADLLQROO#STT	r4   )rH   rI   rJ   r)   rB   rK   rL   s   @r3   r2  r2  6  sG        0 0 0 0 0      r4   r2  c                   8    e Zd Zddedee         f fdZ eddd          	 	 	 	 dd
ej	        dej	        dedeej	                 dee
         dee         dee         dee         deej        eeej        ej        f                  f         fd            Z xZS )ZambaAttentionDecoderLayerNr~   r   c                    t                                                       t          ||          | _        t	          |          | _        t          |j        |j                  | _	        t          |j
        |j                  | _        d S )Nr1   )r(   r)   r   	self_attnr2  feed_forwardr%   r   rms_norm_epsinput_layernormr0   pre_ff_layernormr   s      r3   r)   z#ZambaAttentionDecoderLayer.__init__G  sv    '	::$V,,+F,HfNabbb ,V-?VEX Y Y Yr4   r   r   r   r   Fr?   original_hidden_statesr   output_attentionsr&  r   rN   c           
          t          j        ||gd          }|                     |          } | j        d||||||d|\  }}	|                     |          }|                     |          }|f}
|r|
|	fz  }
|
S )a  
        Args:
            hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
                This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
                concatenated tensor is then used as input of the pre-attention RMSNorm
                (see fig. 2 in https://huggingface.co/papers/2405.16712).
            layer_idx (`int`): layer_idx in the forward pass. Used to distinguish Zamba's tied transformer layers.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        r7   r   )r?   r   r   r   rG  r&  r   )r+   concatenaterD  rA  rE  rB  )r/   r?   rF  r   r   r   rG  r&  r   self_attn_weightsoutputss              r3   rB   z"ZambaAttentionDecoderLayer.forwardO  s    @ )=:P*QWYZZZ,,];;+94> ,
')+/,
 ,
 ,
 ,
(( --m<<))-88 " 	,)++Gr4   r   )NNFF)rH   rI   rJ   r   r   r   r)   r   r+   r   rX   boolr   r   rD   FloatTensorrB   rK   rL   s   @r3   r>  r>  F  s>       Z Z{ Zx} Z Z Z Z Z Z _%0A6RRR 26=A,1$)3 3|3 !&3 	3
 !.3 ""9:3 $D>3 D>3 -.3 
u (51BEDU1U+V"WW	X3 3 3 SR3 3 3 3 3r4   r>  c                       e Zd Zdedef fdZ eddd          	 	 	 	 	 	 	 	 	 dd
ej        de	ej                 de	e         de	ej                 de	ej                 de	e
         de	e         de	e         de	ej                 de	ej                 deej        e	eej        ej        f                  f         fd            Z xZS )ZambaMambaDecoderLayerr~   r   c                     t                                                       t          ||          | _        t	          |j        |j                  | _        || _        d S )N)r~   r   r@  )	r(   r)   r   mambar%   r0   rC  rD  r   r   s      r3   r)   zZambaMambaDecoderLayer.__init__  sS    $FiHHH
+F,>FDWXXX"r4   r   r   r   r   NFr?   rF  r   r   rG  r&  cache_positiontransformer_hidden_statesrN   c                     |}|
||
z   n|}|                      |          }|                     |||          }d}||z   }|f}|r||fz  }|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        N)r?   r   r   )rD  rQ  )r/   r?   rF  r   r   r   r   rG  r&  rR  rS  r   residualrJ  rK  s                  r3   rB   zZambaMambaDecoderLayer.forward  s    > !
 :S9^M555dq 	 ,,];;

'() # 
 
 ! !=0 " 	,)++G 	*))Gr4   )	NNNNNFFNN)rH   rI   rJ   r   r   r)   r   r+   r   r   rX   rL  r   rD   rM  rB   rK   rL   s   @r3   rO  rO    sh       #{ #s # # # # # # _%0A6RRR :>#'15.2=A,1$)59<@: :|: !) 6: C=	:
 !.: el+: ""9:: $D>: D>: !!12: $,EL#9: 
u (51BEDU1U+V"WW	X: : : SR: : : : :r4   rO  c                   |    e Zd Zdedej        def fdZ eddd          	 	 	 	 	 	 	 	 dde	j
        dee	j
                 dee         dee	j
                 dee	j
                 dee         dee         dee         dee	j                 dee	j        eee	j        e	j        f                  f         fd            Z xZS )ZambaHybridLayershared_transflinearrQ  c                 r    t                                                       || _        || _        || _        d S r   )r(   r)   rX  rY  mamba_decoder)r/   rX  rY  rQ  r2   s       r3   r)   zZambaHybridLayer.__init__  s6    *"r4   r   r   r   r   NFr?   rF  r   r   r   rG  r&  rR  rN   c
           
          |                      ||||||||	          }
|
d         }|r|
d         }|                     |          }|                     |||||||	          }
|r|
d         |f|
dd         z   }
|
S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
            hidden activations to form the input of the shared transformer layer.
            layer_idx (`int`): layer number.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        )rF  r   r   r   rG  r&  rR  r   r   )rS  r   r   rG  r&  rR  r6   N)rX  rY  r[  )r/   r?   rF  r   r   r   r   rG  r&  rR  layer_outputsrS  rJ  s                r3   rB   zZambaHybridLayer.forward  s    @ **#9&+/) + 	
 	
 %2!$4! 	1 -a 0$(KK0I$J$J!**&?)+/) + 
 
  	V*1-/@AMRSRTRTDUUMr4   )NNNNNFFN)rH   rI   rJ   r>  r   r   rO  r)   r   r+   r   r   r   rX   rL  r   rD   rM  rB   rK   rL   s   @r3   rW  rW    s_       #&@ #") #\r # # # # # # _%0A6RRR :>#'15.2=A,1$)59> >|> !) 6> C=	>
 !.> el+> ""9:> $D>> D>> !!12> 
u (51BEDU1U+V"WW	X> > > SR> > > > >r4   rW  c                   @    e Zd ZU eed<   dZdZddgZdZdZ	dZ
dZd Zd	S )
ZambaPreTrainedModelr~   modelTr>  rO  r   Fc                    | j         j        }t          |t          j        t          j        f          rJ|j        j                            d|           |j	         |j	        j        
                                 d S d S t          |t          j                  rU|j        j                            d|           |j        +|j        j        |j                 
                                 d S d S t          |t                    r!|j        j                            d           d S t          |t                    rD|j        j                            d|           | j         j        dz  }t          j                            |j        | |           | j         j        | j         j        z  | j         j        z  }t1          j        t1          j        | j         j        |          t7          j        | j         j                  t7          j        | j         j                  z
  z  t7          j        | j         j                  z                                 | j         j                   }|t1          j        t1          j!        |                      z   }|j"        j        #                    |           t1          j$        d|j%        dz   t0          j&                  d d d f         }|'                    |j(        d          )                                }|j*        j        #                    t1          j        |          +                    |j        |j,        d                     |j-        j                            d           d S d S )	Nr   )r=   stdg      ?r   )minr   r   r7   ).r~   initializer_ranger  r   r   r   r-   datanormal_r   zero_	Embeddingpadding_idxr%   fill_r   r   r   inituniform_r   rk   r0   rq   r+   r  randmathr   time_step_maxtime_step_minclamptime_step_floorexpm1r   r  r   rn   r;   rP   rl   r   r   rQ   r   r   )r/   r   rb  dt_init_stdr   dtinv_dtr   s           r3   _init_weightsz"ZambaPreTrainedModel._init_weights   s    k+fry")455 	%M&&CS&999{& &&((((( '&-- 	%M&&CS&999!-"6#56<<>>>>> .--- 	%M$$S)))))00 	% %--3C-@@@+3T9KGV2[L+NNN![58OOSWS^SllN
4;4nEE8DK566$+B[9\9\\^(4;4556  e3e44	  %)U["%5%5$5666F$**6222Q 5 9OOOPTVWVWVWPWXA1266AACCAL##EIaLL$8$89MvOdfh$i$ijjjHM$$$$$%	% 	%r4   N)rH   rI   rJ   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulrw  r   r4   r3   r_  r_    s_         &*#57OP"3 NL% % % % %r4   r_  c                   B    e Zd ZdZdef fdZe	 	 	 	 	 	 	 	 	 	 ddeej	                 deej
                 deej	                 dee         d	eej                 d
ee         dee         dee         dee         deej	                 deeef         fd            Zd Z xZS )
ZambaModelz
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`ZambaDecoderLayer`]

    Args:
        config: ZambaConfig
    r~   c           
        
 t                                          |           |j        | _        |j        | _        t          j        |j        |j        | j                  | _        t          |          }g }g }|j
        | _
        t          |j                  D ]}|j
        |         dk    r%|                    t          ||                     8|j
        |         dk    rb|                    t          j        | j        j        | j        j        d                     |                    t          ||                     t#          |          }t#          |          }g }g | _        t'          | j
                  D ]\  }}|dk    red| d
g d}	g | j        
fd	|	D             | _        |                    t)          |t+          |          t+          |                               p|                    t+          |                     t          j        |          | _        |j        | _        t3          |j        |j        
          | _        d| _        |                                  d S )NrQ  )r   r\   Fr   zlayers..)	z%shared_transf.self_attn.q_proj.weightz%shared_transf.self_attn.k_proj.weightz%shared_transf.self_attn.v_proj.weightz%shared_transf.self_attn.o_proj.weightz+shared_transf.feed_forward.gate_proj.weightz)shared_transf.feed_forward.up_proj.weightz+shared_transf.feed_forward.down_proj.weightz$shared_transf.input_layernorm.weightz%shared_transf.pre_ff_layernorm.weightc                     g | ]}|z   S r   r   )rc   r   prefix_names     r3   rf   z'ZambaModel.__init__.<locals>.<listcomp>l  s     FnFnFn]`{UXGXFnFnFnr4   r@  )r(   r)   pad_token_idri  
vocab_sizer   rh  r0   embed_tokensr>  ri   rx   ry   r{   rO  r   r~   iter_tied_weights_keys	enumeraterW  next
ModuleListlayersr   r%   rC  final_layernormgradient_checkpointing	post_init)r/   r~   blockmamba_layerslinear_layersr   r  layer_id
layer_type	tied_keysr  r2   s             @r3   r)   zZambaModel.__init__J  sr      !. +L):F<NPTP`aa*622!'!9v/00 	Q 	QA'*g55##$:6Q$O$O$OPPPP)!,88$$RYt{/FH_fk%l%l%lmmm##$:6Q$O$O$OPPPL))]++"$$-d.D$E$E 	2 	2 HjX%%3333
 
 
	 +pD,C*oFnFnFnFndmFnFnFn*o'.ud=6I6I4P\K]K]^^____d<001111mF++$*$?!+F,>FDWXXX&+#r4   N	input_idsr   position_idsr   inputs_embedsr&  rG  output_hidden_statesreturn_dictrR  rN   c                 (   ||n| j         j        }||n| j         j        }||n| j         j        }|	|	n| j         j        }	|d u |d uz  rt          d          | j        r%| j        r|rt          	                    d           d}|| 
                    |          }|}t          j        |          }|r|t          	                    d           |
&t          j        |j        d         |j                  }
||
                    d          }|                     |||
          }|rdnd }|rdnd }t%          | j                  D ]q\  }}|r||fz  }| j        r+| j        r$|                     |j        |||||||||

  
        }n ||||||||||
		  	        }|d         }|r|d         ||d         fz  }r|                     |          }|r||fz  }|r|j        sd
|_        t1          ||r|nd ||          }|	r|n|                                S )NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onezX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fz{Zamba requires an initialized `ZambaHybridDynamicCache` to return a cache. None was provided, so no cache will be returned.r   r_   r   r   )rF  r   r   r   r   rG  r&  rR  T)last_hidden_stater   r?   
attentions)r~   rG  r  r&  use_return_dictr0  r  r   r   r   r  r+   r   r   rE   r[   r  _update_causal_maskr  r  _gradient_checkpointing_func__call__r  rj   r   to_tuple)r/   r  r   r  r   r  r&  rG  r  r  rR  r?   rF  r   all_hidden_statesall_self_attnsr   layerr]  outputs                       r3   rB   zZambaModel.forwardy  s    2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	%0%<kk$+B]-t";< 	s   & 	4= 	Y 	j   I  --i88M%!&]!;!;  	0:  
 !"\-*=a*@I]^^^N)33A66L..~}n]]"6@BBD0:d )$+ 6 6 "	: "	:Iu# 6!m%55!* t}  $ A AN!*"#%"! ! !&!+A'#1 +$3&7'#1
! 
! 
! *!,M  : #/"}Q'7&99N,,];;   	2-!11 	6?#E 	615O.(+/8BOOd+%	
 
 
 %;vv&//*;*;;r4   c                    | j         j        dk    r
|d|v r|S d S |j        |j        }}t	          j        |          j        }|j        d         }|d         dz   }t	          j        ||f|||          }	|dk    rt	          j	        |	d          }	|	t	          j
        ||          |                    dd          k    z  }	|	d d d d d d f                             |j        d         ddd          }	||	                                }	|                                d	k    rw|j        d         }
|	d
d |
f                             d          |d d d d d d f                             d          z  }|	d
d |
f                             ||          |	d
d |
f<   | j         j        dk    r%|#|j        j        dv rt%          j        |	|          }	|	S )Nflash_attention_2r   r   r7   )
fill_valuer9   r[   )diagonalr_   r   r6   .sdpa)r.  xpunpu)r~   r   r9   r[   r+   finforc  rE   fulltriur   rQ   rP   r   r   eqmasked_fillr/  r   _unmask_unattended)r/   r   input_tensorrR  r9   r[   	min_dtypesequence_lengthtarget_lengthr   mask_lengthpadding_masks               r3   r  zZambaModel._update_causal_mask  s   ;+/BBB)c^.C.C%%4$*L,?vK&&*	&,Q/&r*Q.j/=!Ai_dmsttta*[1===Ku|M&AAANDZDZ[]_`DaDaaa!$aaa"23::<;Ma;PRSUWY[\\%%++--K!!##q((,226*3+<=@@EEWXWXWXZ^`dfgfgfgWgHhHkHkloHpHpp1<S,;,=N1O1[1[\hjs1t1tC+-. K,66*%*.DDD
 1CKQZ[[Kr4   
NNNNNNNNNN)rH   rI   rJ   r   r   r)   r   r   r+   r   r   rX   rM  rL  r   rD   r   rB   r  rK   rL   s   @r3   r  r  A  sh        -{ - - - - - -^  151537=A59$(,0/3&*59k< k<E,-k< !.k< u/0	k<
 ""9:k<   12k< D>k< $D>k< 'tnk< d^k< !!12k< 
u--	.k< k< k< ^k<\! ! ! ! ! ! !r4   r  c                       e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej	                 deej                 dee
         d	eej                 d
eej                 dee         dee         dee         dee         deej                 deeej	        f         deeef         fd            Z	 	 	 	 	 	 ddZ xZS )ZambaForCausalLMr~   c                 &   t                                          |           t          |          | _        dg| j        j        | _        |j        | _        t          j        |j        |j        d          | _	        | 
                                 d S )Nzlm_head.weightFr   )r(   r)   r  r`  r  r  r   r   r0   lm_headr  r:  s     r3   r)   zZambaForCausalLM.__init__  s       ''
#3"Tdj6S"T +y!3V5FUSSS 	r4   Nr   r  r   r  r   r  labelsr&  rG  r  r  rR  logits_to_keeprN   c                    ||n| j         j        }|	|	n| j         j        }	|
|
n| j         j        }
|                     ||||||||	||

  
        }|d         }t          |t                    rt          | d          n|}|                     |dd|ddf                   }d}| | j	        ||| j
        fi |}|
s|f|dd         z   }||f|z   n|S t          |||j        |j        |j                  S )ah  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ZambaForCausalLM

        >>> model = ZambaForCausalLM.from_pretrained("Zyphra/Zamba-7B-v1")
        >>> tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba-7B-v1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)
r  r   r  r   r  r&  rG  r  rR  r  r   r   losslogitsr   r?   r  )r~   rG  r  r  r`  r  r   slicer  loss_functionr  r   r   r?   r  )r/   r  r   r  r   r  r  r&  rG  r  r  rR  r  r   rK  r?   slice_indicesr  r  r  s                       r3   rB   zZambaForCausalLM.forward  sj   P 2C1N--TXT_Tq %9$D  $+Jj 	 &1%<kk$+B] **)%+'/!5)#  
 
  
8B>SV8W8Wk~ot444]kmAAA}aaa,?@AA%4%ffdoPPPPD 	DY,F'+'7D7V##VC%#3!/)
 
 
 	
r4   Tc           	         |d u }	|	s]||d         |j         d         k    r|d d |j         d          d f         }nV|j         d         |j         d         k    r|d d |f         }n-t          | j        |j         d         | j        | j                  }|b|`|                                                    d          dz
  }|                    |dk    d           |	s|d d |j         d          d f         }||	rd|i}
nd|                                i}
|
	                    ||||| j        j
        |d           |                                D ]\  }}||
vr||
|<   |
S )Nr7   r   r   )r9   r[   r  r  )r  r   r&  r   r  rR  )rE   rX   r~   r9   r[   longcumsummasked_fill_r   r   num_logits_to_keepitems)r/   r  r   r   r  rR  r  r&  r   empty_past_kvmodel_inputsr   r   s                r3   prepare_inputs_for_generationz.ZambaForCausalLM.prepare_inputs_for_generationj  s    (4/  	 )!"%);;;%aaa.*>q*A)A)C)C&CD		#~';A'>>>%aaa&78	5Y_Q/tz$+  O %,*>)..0077;;a?L%%n&91===  F+AAA	0B/B/D/D,DE $$+];LL')=)=)?)?@L ,#2&"0"&+"@"0 		
 		
 		
 !,,.. 	* 	*JC,&&$)S!r4   )NNNNNNNNNNNr   )NNNNNT)rH   rI   rJ   r   r)   r   r   r+   r   r   rX   rM  rL  r   r   rD   r   rB   r  rK   rL   s   @r3   r  r    s       {        151537=A59-1$(,0/3&*5934O
 O
E,-O
 !.O
 u/0	O

 ""9:O
   12O
 )*O
 D>O
 $D>O
 'tnO
 d^O
 !!12O
 c5</0O
 
u,,	-O
 O
 O
 ^O
h ? ? ? ? ? ? ? ?r4   r  a  
    The Zamba Model with a sequence classification head on top (linear layer).

    [`ZambaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    )custom_introc                   X    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 dee	e
eej                 f                  deej                 deej                 d	ee         d
ee         dee         dee         de	eef         fd            Z xZS )ZambaForSequenceClassificationc                     t                                          |           |j        | _        t          |          | _        | j        j        | _        t          j        |j        | j        d          | _	        | 
                                 d S r4  )r(   r)   
num_labelsr  r`  r  r   r   r0   scorer  r:  s     r3   r)   z'ZambaForSequenceClassification.__init__  sv        +''
"&*"?Yv14?OOO
 	r4   Nr  r   r  r   r  r  r&  rG  r  r  rN   c                 4   |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }||j        d         }n|j        d         }| j         j        |dk    rt          d          | j         j        d}n|}|| j         j        k                        |j        t          j
                  }t          j        |j        d         |j        t          j
                  }||z                      d          }n)d}t                              | j        j         d           |t          j        ||j        	          |f         }d}|t|                    |j                  }| j         j        f| j        dk    rd
| j         _        nN| j        dk    r7|j        t          j        k    s|j        t          j        k    rd| j         _        nd| j         _        | j         j        d
k    rWt-                      }| j        dk    r1 ||                                |                                          }n |||          }n| j         j        dk    rGt1                      } ||                    d| j                  |                    d                    }n*| j         j        dk    rt5                      } |||          }|
s|f|dd         z   }||f|z   n|S t7          |||j        |j        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r  r   r  r&  rG  r  r  r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r7   rZ   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r_   
regressionsingle_label_classificationmulti_label_classificationr  )r~   r  r`  r  rE   r  r0  r:   r[   r+   int32r   argmaxr   r   r2   rH   problem_typer  r9   r  r   r
   r   r	   r   r   r   r   r?   r  )r/   r  r   r  r   r  r  r&  rG  r  r  transformer_outputsr?   r  re   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsr  loss_fctr  s                         r3   rB   z&ZambaForSequenceClassification.forward  sJ   ( &1%<kk$+B]"jj)%+'/!5# ) 

 

 ,A.M** "+JJ&,Q/J;#+
a\]]];#+!#"%)AAEEfmUZU`aaL!L)<V]Z_ZefffM"/,">!F!Fr!J!J!#>* Z Z Z  
 u|Jv}MMMOaabYYv}--F{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8M$9$9$;$;V^^=M=MNNDD#8M6::DD)-JJJ+--x 2 22t G GUWYY)-III,..xv66 	F#%(;ABB(??F)-)9TGf$$vE/ /?-;*5
 
 
 	
r4   r  )rH   rI   rJ   r)   r   r   r+   r   r   r   r   listrM  rL  rD   r   rB   rK   rL   s   @r3   r  r    sV             151537KO59-1$(,0/3&*[
 [
E,-[
 !.[
 u/0	[

 "%tE4E/F(F"GH[
   12[
 )*[
 D>[
 $D>[
 'tn[
 d^[
 
u66	7[
 [
 [
 ^[
 [
 [
 [
 [
r4   r  )r  r  r  r_  )r   )Jr   rn  typingr   r   r   r   r+   r   torch.nnr   r	   r
   activationsr   cache_utilsr   
generationr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   utils.deprecationr   utils.import_utilsr   r   configuration_zambar   &mamba_ssm.ops.selective_scan_interfacer   r   +mamba_ssm.ops.triton.selective_state_updater    causal_conv1dr!   r"   r  r   
get_loggerrH   r   Moduler%   r   r   rV   rX   r  r   r   r   r2  r>  rO  rW  r_  r  r  r  __all__r   r4   r3   <module>r     s}  (    1 1 1 1 1 1 1 1 1 1 1 1        A A A A A A A A A A ! ! ! ! ! !             ) ) ) ) ) ) > > > > > > B B B B B B q q q q q q q q q q F F F F F F F F & & & & & & , , , , , , , , 0 0 0 0 0 0 T T T T T T T T , , , , , ,  QXXXXXXXXRRRRRRR@P=-~ 8DDDDDDDDD-7**.0@BVXfg  
 
	H	%	%J J J J J29 J J J*	UU\ 	U# 	U%, 	U 	U 	U 	U]3 ]3 ]3 ]3 ]3 ]3 ]3 ]3N % %I%<% 
% <	%
 U\*% % % % % %4D) D) D) D) D)RY D) D) D)NQ] Q] Q] Q] Q]bi Q] Q] Q]j    ry    = = = = = = = =@B B B B BRY B B BJF F F F Fry F F FR )% )% )% )% )%? )% )% )%X G G G G G% G G GV\ \ \ \ \+_ \ \ \~   g
 g
 g
 g
 g
%9 g
 g
 g
T g
f
fr4   