
     `iL                       d dl Z d dlZd dlmZ d dlmZmZmZmZ d dl	Z	d dl	m
Z
 d dlmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+m,Z, ddl-m.Z.  e,            rd dl/m0Z0 d dl1m2Z2m3Z3 nd\  Z0Z2Z3 e+            r	d dl4m5Z5m6Z6 nd\  Z6Z5 e'j7        e8          Z9 G d de	j
        j:                  Z; G d de
j:                  Z< G d d          Z= G d  d!e
j:                  Z>d"e	j?        d#e@d$e	j?        fd%ZA	 dMd'e
j:        d(e	j?        d)e	j?        d*e	j?        d+ee	j?                 d,eBd-eBfd.ZCd/ ZDdNd0ZE G d1 d2e
j:                  ZFd3e	j?        d4e@fd5ZGd6 ZHd7 ZI eJe0e5e6f          ZK G d8 d9e
j:                  ZL G d: d;e
j:                  ZM G d< d=e
j:                  ZN G d> d?e
j:                  ZO G d@ dAe
j:                  ZP G dB dCe"          ZQe& G dD dEeQ                      ZR G dF dGeQe          ZS e&dHI           G dJ dKeQ                      ZTg dLZUdS )O    N)cycle)AnyCallableOptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)Cache)GenerationMixin)AttentionMaskConverter)FlashAttentionKwargs)BaseModelOutputWithPastCausalLMOutputWithPast SequenceClassifierOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringlogging)deprecate_kwarg)is_causal_conv1d_availableis_mamba_ssm_available   )Zamba2Config)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combinedNNN)causal_conv1d_fncausal_conv1d_updateNNc                   (     e Zd Zd fd	ZddZ xZS )Zamba2RMSNormGatedư>c                     t                                                       t          j        t	          j        |                    | _        || _        || _        d S N)	super__init__r   	Parametertorchonesweightvariance_epsilon
group_size)selfhidden_sizer4   eps	__class__s       ~/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/zamba2/modeling_zamba2.pyr.   zZamba2RMSNormGated.__init__=   sG    l5:k#:#:;; #$    Nc                    |j         }|                    t          j                  }|?|t          j                            |                    t          j                            z  }|j        ^ }}|| j        z  } |j	        g ||| j        R  }|
                    d                              dd          }|t          j        || j        z             z  } |j	        g ||| j        z  R  }| j        |                    |          z  S N   T)keepdim)dtypetor0   float32r   
functionalsilushaper4   viewpowmeanrsqrtr3   r2   )	r5   hidden_statesgateinput_dtypeprefix_dimslast_dimgroup_counthidden_states_groupvariances	            r9   forwardzZamba2RMSNormGated.forwardC   s   #)%((77)BM,>,>twwu}?U?U,V,VVM!.!4h$/10m0\+\{\DO\\\&**1--222t2DD1EK4K`@`4a4aa0+0]+]{T_?\]]]{]--k::::r:   r*   r,   )__name__
__module____qualname__r.   rR   __classcell__r8   s   @r9   r)   r)   <   sQ        % % % % % %; ; ; ; ; ; ; ;r:   r)   c                   ,     e Zd Zd fd	Zd Zd Z xZS )Zamba2RMSNormr*   c                     t                                                       t          j        t	          j        |                    | _        || _        dS )z<
        Zamba2RMSNorm is equivalent to T5LayerNorm
        N)r-   r.   r   r/   r0   r1   r2   r3   )r5   r6   r7   r8   s      r9   r.   zZamba2RMSNorm.__init__R   sD     	l5:k#:#:;; #r:   c                    |j         }|                    t          j                  }|                    d                              dd          }|t          j        || j        z             z  }| j        |                    |          z  S r<   )	r@   rA   r0   rB   rG   rH   rI   r3   r2   )r5   rJ   rL   rQ   s       r9   rR   zZamba2RMSNorm.forwardZ   s|    #)%((77 $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::r:   c                 H    t          | j        j                   d| j         S )Nz, eps=)tupler2   rE   r3   r5   s    r9   
extra_reprzZamba2RMSNorm.extra_repra   s&    )**II$2GIIIr:   rS   )rT   rU   rV   r.   rR   r`   rW   rX   s   @r9   rZ   rZ   Q   sb        $ $ $ $ $ $; ; ;J J J J J J Jr:   rZ   c                   r   e Zd ZdZdZej        dfdededej	        de
e         fdZd	 Zd
edeej        ej        f         fdZ	 ddej        dej        d
ede
eeef                  deej        ej        f         f
dZdej        fdZdd
e
e         defdZd
edej        dej        dej        fdZd ZdS )Zamba2HybridDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    FNconfig
batch_sizer@   devicec           	         || _         |j        | _        d| _        t          |j        |j        z            | _        |j        | _        |j	        | _
        |j        | _        g | _        i | _        i | _        i | _        i | _        i | _        t%          |j                  D ]}t)          j        | j        d|j        z  |j        z  z   | j
        |          | j        |<   t)          j        | j        |j        | j        |          | j        |<   | j        |         dk    r| j                            |           fdt%          |j                  D             | _        fdt%          |j                  D             | _        d S )NFr=   re   r@   hybridc                 D    g | ]}t          j        g gz             S re   r0   tensor.0_rd   re   s     r9   
<listcomp>z5Zamba2HybridDynamicCache.__init__.<locals>.<listcomp>   s/    rrrQ%,tj'8HHHrrrr:   c                 D    g | ]}t          j        g gz             S rj   rl   rn   s     r9   rq   z5Zamba2HybridDynamicCache.__init__.<locals>.<listcomp>   s/    tttqEL"
):6JJJtttr:   )r@   layers_block_typehas_previous_stateintmamba_expandr6   intermediate_sizemamba_d_statessm_state_sizemamba_d_convconv_kernel_sizen_mamba_headstransformer_layers_modules_parameters_buffersconv_states
ssm_statesrangenum_hidden_layersr0   zerosmamba_ngroupsmamba_headdimappend	key_cachevalue_cache)r5   rc   rd   r@   re   is     ` ` r9   r.   z!Zamba2HybridDynamicCache.__init__u   s    
!'!9"'!$V%86;M%M!N!N$2 & 3#1"$v/00 	2 	2A"'+&V-A)AFDX)XX%# # #DQ "'D.0DdFYbhpu" " "DOA %a(H44'..q111rrrrrRWX^XpRqRqrrrtttttTYZ`ZrTsTstttr:   c                 *    t          | j                  S r,   )lenr   r_   s    r9   __len__z Zamba2HybridDynamicCache.__len__   s    4>"""r:   	layer_idxreturnc                 6    | j         |         | j        |         fS r,   )r   r   r5   r   s     r9   __getitem__z$Zamba2HybridDynamicCache.__getitem__   s    ~i($*:9*EEEr:   
key_statesvalue_statescache_kwargsc                 D   | j         |         j        d         dk    r|| j         |<   || j        |<   nVt          j        | j         |         |gd          | j         |<   t          j        | j        |         |gd          | j        |<   | j         |         | j        |         fS )Nr>   r   r=   dim)r   rE   r   r0   cat)r5   r   r   r   r   s        r9   updatezZamba2HybridDynamicCache.update   s     >)$*2.!33(2DN9%*6DY''(-	4>)3Lj2Y_`(a(a(aDN9%*/)T5Ei5PR^4_ef*g*g*gDY'~i($*:9*EEEr:   beam_idxc                    t          t          | j                            D ];}| j        |         j        }| j        |                             d|                    |                    | j        |<   | j        |         j        }| j        |                             d|                    |                    | j        |<   | j        |         j        }| j        |                             d|                    |                    | j        |<   | j        |         j        }| j        |                             d|                    |                    | j        |<   =dS )zDReorders the cache for beam search, given the selected beam indices.r   N)	r   r   r   re   index_selectrA   r   r   r   )r5   r   r   re   s       r9   reorder_cachez&Zamba2HybridDynamicCache.reorder_cache   sE   s4>2233 		i 		iI^I.5F(,y(A(N(NqRZR]R]^dReRe(f(fDN9%%i07F*.*:9*E*R*RSTV^VaVabhViVi*j*jDY'%i07F*.*:9*E*R*RSTV^VaVabhViVi*j*jDY'_Y/6F)-)C)P)PQRT\T_T_`fTgTg)h)hDOI&&		i 		ir:   r   c                     || j         vr| j         d         n|}t          | j                  |k    s#| j        |                                         dk    rdS | j        |         j        d         S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   )r}   r   r   numelrE   r   s     r9   get_seq_lengthz'Zamba2HybridDynamicCache.get_seq_length   sq     3<4CZ2Z2ZD+A..`i	t~)++t~i/H/N/N/P/PTU/U/U1~i(.r22r:   new_conv_statecache_positionc                 P   | j         |         }|                    d| j        dz
            }|                    dd          }|                    |j                  |d d d d |f<   | j         |                                          | j         |xx         |z  cc<   | j         |         S )Nr   r   r>   shiftsdims)r   clampr{   rollrA   re   zero_)r5   r   r   r   
conv_states        r9   update_conv_statez*Zamba2HybridDynamicCache.update_conv_state   s     %i0
'--a1F1JKK__BR_88
+9+<+<Z=N+O+O
111aaa'(#))+++###z1###	**r:   c                 j    | j                                          | j                                         d S r,   )r   r   r   r_   s    r9   resetzZamba2HybridDynamicCache.reset   s1       r:   r,   )r   )rT   rU   rV   __doc__is_compileabler0   float16r    ru   r@   r   strr.   r   r^   Tensorr   dictr   r   
LongTensorr   r   r   r    r:   r9   rb   rb   e   s         N KP-quu u"u03u<AKuaijmanu u u u@# # #FS FU5<3M-N F F F F 26F FLF lF 	F
 tCH~.F 
u|U\)	*F F F F"ie&6 i i i i3 3 3c 3 3 3 3
+
+.3l
+LQL\
+	
+ 
+ 
+ 
+         r:   rb   c                   |     e Zd ZU ej        ed<   ddef fdZ ej                    e	d                         Z
 xZS )Zamba2RotaryEmbeddinginv_freqNrc   c                    t                                                       t          |d          rSt          |j        t
                    r9|j                            d|j                            d                    | _        nd| _        |j        | _	        |j        | _
        || _        t          | j                 | _        |                     | j        |          \  }| _        |                     d|d           | j        | _        d S )Nrope_scaling	rope_typetypedefaultr   F)
persistent)r-   r.   hasattr
isinstancer   r   getr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrc   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r5   rc   re   r   r8   s       r9   r.   zZamba2RotaryEmbedding.__init__   s    6>** 	'z&:Mt/T/T 	'#044[&BUBYBYZ`BaBabbDNN&DN"("@$*$B!/?+/+<+<T[&+Q+Q($(ZeDDD!%r:   c                 X   | j         d d d d f                                                             |j        d         dd                              |j                  }|d d d d d f                                         }t          |j        j        t                    r|j        j        dk    r|j        j        nd}t          j
        |d          5  |                                |                                z                      dd          }t          j        ||fd	          }|                                | j        z  }|                                | j        z  }	d d d            n# 1 swxY w Y   |                    |j        
          |	                    |j        
          fS )Nr   r>   r   mpscpuF)device_typeenabledr=   r   r@   )r   floatexpandrE   rA   re   r   r   r   r0   autocast	transposer   cosr   sinr@   )
r5   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r9   rR   zZamba2RotaryEmbedding.forward   s    !M$4-8>>@@GGHZ[\H]_acdeehhijiqrr ,QQQaaaZ 8 > > @ @'1!(-'E'Ek!(-[`J`J`ahmmfk^UCCC 	5 	5&,,..1F1L1L1N1NNYYZ[]^__E)UEN333C''))d44C''))d44C		5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 vvAGv$$cff17f&;&;;;s   BE++E/2E/r,   )rT   rU   rV   r0   r   __annotations__r    r.   no_gradr   rR   rW   rX   s   @r9   r   r      s         l/ /| / / / / / /" U]__< <  _< < < < <r:   r   rJ   n_repr   c                     | j         \  }}}}|dk    r| S | dddddddddf                             |||||          } |                     |||z  ||          S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rE   r   reshape)rJ   r   batchnum_key_value_headsslenhead_dims         r9   	repeat_kvr      s    
 2?1D.Ehzz!!!!QQQaaa"23::5BUW\^bdlmmM  (;e(CT8TTTr:           modulequerykeyvalueattention_maskscalingdropoutc                 R   t          || j                  }t          || j                  }	t          j        ||                    dd                    |z  }
|$|d d d d d d d |j        d         f         }|
|z   }
t          j                            |
dt          j	                  
                    |j                  }
t          j                            |
|| j                  }
t          j        |
|	          }|                    dd                                          }||
fS )Nr=   r   r   r>   )r   r@   )ptrainingr   )r   num_key_value_groupsr0   matmulr   rE   r   rC   softmaxrB   rA   r@   r   r   
contiguous)r   r   r   r   r   r   r   kwargsr   r   attn_weightscausal_maskattn_outputs                r9   eager_attention_forwardr     s    3 ;<<JUF$?@@L<z';';Aq'A'ABBWLL!$QQQ111.D
0@0D.D%DE#k1=((2U](SSVVW\WbccL=((6?([[L,|\::K''1--88::K$$r:   c                     | dd| j         d         dz  f         }| d| j         d         dz  df         }t          j        | |fd          S )z*Rotates half the hidden dims of the input..Nr>   r=   r   )rE   r0   r   )r   x1x2s      r9   rotate_halfr     s]    	
3"!'"+"""	#B	
3q """	#B9rc2YB''''r:   c                     |                     |          }|                     |          }| |z  t          |           |z  z   }||z  t          |          |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   r   unsqueeze_dimq_embedk_embeds           r9   apply_rotary_pos_embr  #  sc    ( --
&
&C
--
&
&C3w;q>>C/0G3w;q>>C/0GGr:   c                   p    e Zd ZdZ	 	 	 ddedee         dee         dee         f fdZ edd	d
          	 	 	 dde	j
        dedee	j
                 d	ee         deee	j
        e	j
        f                  dee         dee	j
        ee	j
                 eee	j
                          f         fd            Z xZS )Zamba2AttentionaZ  
    Multi-headed attention from 'Attention Is All You Need' paper.

    Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
    The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
    The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
    (see fig. 2 in https://huggingface.co/papers/2405.16712).
    Additionally, replaced
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
    Finally, this attention layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this
    layer is tied, un-tied adapters (formally the same as LoRA but used in the base model) modules are added to the q, k, v projectors to increase
    expressivity with a small memory overhead (see Fig. 2 of https://huggingface.co/papers/2411.15242).
    Nrc   r   num_fwd_mem_blocksblock_idc           	      <   t                                                       || _        || _        |j        | _        |j        | _        |j        |j        z  | _	        |j
        | _
        | j        dz  dz  | _        d| _        |j        | _        t          j        |j        |j        | j        z  d          | _        t          j        |j        |j        | j        z  d          | _        t          j        |j        |j        | j        z  d          | _        t          j        |j        | j        z  |j        d          | _        || _        |j        | _        || _        |j        rt          j        g           | _        t          j        g           | _        t          j        g           | _        t=          | j                  D ]}||j        z  |k    rt          j         t          j        | j        | j        j!        d          t          j        | j        j!        | j        d                    }t          j         t          j        | j        | j        j!        d          t          j        | j        j!        | j        d                    }t          j         t          j        | j        | j        j!        d          t          j        | j        j!        | j        d                    }n9t          j"                    }t          j"                    }t          j"                    }| j        #                    |           | j        #                    |           | j        #                    |           d tI          | j                  D             | _%        d S )Nr=   g      TFbiasc                     i | ]\  }}||	S r   r   ro   indexr   s      r9   
<dictcomp>z,Zamba2Attention.__init__.<locals>.<dictcomp>  s    [[[<5%%[[[r:   )&r-   r.   rc   r   attention_hidden_sizeattention_head_dimr   num_attention_headsr   r   r   r   	is_causalattention_dropoutr   Linearq_projk_projv_projr6   o_projr
  hybrid_layer_idslayer_block_mapr  use_shared_attention_adapter
ModuleListlinear_q_adapter_listlinear_k_adapter_listlinear_v_adapter_listr   num_mem_blocks
Sequentialadapter_rankIdentityr   	enumerate	layer_dic)
r5   rc   r   r
  r  r   linear_q_adapterlinear_k_adapterlinear_v_adapterr8   s
            r9   r.   zZamba2Attention.__init__N  s*    	"%+%A"1$*$>&B\$\!'-'E$)d2!'!9i <f>X[_[h>hotuuui <f>X[_[h>hotuuui <f>X[_[h>hotuuui :T] JFL^ejkkk"4%6 . 	D)+r):):D&)+r):):D&)+r):):D&4233 D Dv,,88')}	$"<dk>V]bccc	$+":D<V]bccc( ($ (*}	$"<dk>V]bccc	$+":D<V]bccc( ($ (*}	$"<dk>V]bccc	$+":D<V]bccc( ($$
 (*{}}$'){}}$'){}}$*112BCCC*112BCCC*112BCCCC[[9TEY;Z;Z[[[r:   past_key_valuepast_key_values4.58new_nameversionrJ   r   position_embeddingsr   r   c                    |j         d d         }g |d| j        R }|                     |          }	|                     |          }
|                     |          }| j        j        rX| j        |         }|	 | j        |         |          z   }	|
 | j	        |         |          z   }
| | j
        |         |          z   }|	                    |                              dd          }	|
                    |                              dd          }
|                    |                              dd          }| j        j        r|\  }}t          |	|
||          \  }	}
||                    |
||          \  }
}t           }| j        j        dk    rt$          | j        j                 } || |	|
||f| j        sdn| j        | j        d|\  }} |j        g |dR                                  }|                     |          }||fS )Nr>   r   r=   eagerr   )r   r   )rE   r   r  r  r  rc   r  r)  r!  r"  r#  rF   r   use_mem_roper  r   r   _attn_implementationr   r   r  r   r   r   r  )r5   rJ   r   r   r.  r3  r   input_shapehidden_shapequery_statesr   r   adapter_layer_idxr   r   attention_interfacer   r   s                     r9   rR   zZamba2Attention.forward  sS    $)#2#.88b8$-88{{=11[[//
{{=11;3 	g $y 9'*W$*DEV*WXe*f*ffL#&Sd&@AR&STa&b&bbJ'*W$*DEV*WXe*f*ffL#((66@@AFF__\22<<QBB
#((66@@AFF;# 	`*HC';L*VY[^'_'_$L*&'6'='=j,Xa'b'b$J(?;+w66"9$+:Z"[$7$7	%
  $}HCC$2HL	%
 	%
 	%
 	%
!\ *k);;;;;;FFHHkk+..L((r:   r$   )rT   rU   rV   r   r    r   ru   r.   r   r0   r   rb   r^   r   r   rR   rW   rX   s   @r9   r	  r	  >  sg        $ $(,0"&6\ 6\6\ C=6\ %SM	6\
 3-6\ 6\ 6\ 6\ 6\ 6\p _%0A6RRR
 26>BKO1) 1)|1) 1) !.	1)
 "":;1) &eEL%,,F&GH1) -.1) 
u|Xel3XeEL>Q5RR	S1) 1) 1) SR1) 1) 1) 1) 1)r:   r	  input_tensorpad_sizec                     t          | j                  dk    r
ddddd|ddfnddd|ddf}t          j        j                            | |dd          S )z
    Padding x tensor with `pad_size` on the seq_len dim (dim=1)

    Assumes that we only have tensors of either size 4 or 3
       r   constant)moder   )r   rE   r0   r   rC   pad)r=  r>  	pad_shapes      r9   pad_tensor_by_sizerE    sj     47|7I3J3Ja3O3OAq!Q!Q//VWYZ\]_gijlmUnI8""<ST"UUUr:   c                 "   t          | |          } t          | j                  dk    r.|                     | j        d         d|| j        d                   S |                     | j        d         d|| j        d         | j        d                   S )z
    Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
    simultaneously splitting it into chunk sequences.

    Assumes that we only have tensors of either size 4 or 3
    r   r   r>   r=   )rE  r   rE   r   )r=  r>  
chunk_sizes      r9   reshape_into_chunksrH    s     &lH==L
<!####L$6q$92z<K]^_K`aaa ##q!2z<3Ea3H,J\]^J_
 
 	
r:   c                    |                      d          } | d         j        g |                                  |R  } t          j        t          j        ||| j        t          j                  d          }|                     | d          } t          j        | d          }t          j        t          j        ||| j        t          j                  d          }|                    | t          j	                   }|S )zo
    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
    r>   .Nrg   diagonalr   r   r   )
sizer   r0   trilr1   re   boolmasked_fillcumsuminf)r=  rG  masktensor_segsums       r9   segment_sumrU    s     ""2&&J 2<	*1S<3D3D3F3FS
SSSL:ejZ@S[`[efffqstttD++TE155LL2666M :ejZ@S[`[efffqrsssD!--teeiZ@@Mr:   c                        e Zd ZdZddedee         f fdZ	 	 ddej	        dee
         deej	                 fd	Zddee
         deej	                 fd
Z	 	 ddee
         deej	                 fdZ xZS )Zamba2MambaMixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    Nrc   r   c           	      z   t                                                       || _        |j        | _        |j        | _        |j        | _        t          |j	        | j        z            | _
        || _        |j        | _        d| _        t          j                    | _        |j        | _        |j        | _        |j        | _        | j        j        | _        |j        | _        |j        | _        |j        | _        |j        | _        | j
        d| j        z  | j        z  z   | _        t          j        | j        | j        d|j        | j        |j        dz
            | _        | j
        | j        z   | j        z   }t          j        | j        ||j                   | _!        t          j"        tG          j$        | j                            | _%        tG          j&        d| j        dz             }t          j"        tG          j'        |                    | _(        tS          | j
        | j
        | j        z  d          | _*        t          j"        tG          j$        | j                            | _+        t          j        | j
        | j        |j                   | _,        tZ          st\          /                    d	           d S d S )
NrD   r=   Tr   )in_channelsout_channelsr  kernel_sizegroupspaddingr  gh㈵>)r4   r7   a  The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d)0r-   r.   rc   r6   rx   ry   rz   r{   ru   rv   rw   r   use_conv_bias
activationr   SiLUactuse_mem_eff_pathr   n_groupsr   r   r|   	num_headsrG  time_step_limittime_step_mintime_step_maxconv_dimConv1dconv1dr  add_bias_linearin_projr/   r0   r1   dt_biasarangelogA_logr)   normDout_projis_fast_path_availableloggerwarning_once)r5   rc   r   projection_sizeAr8   s        r9   r.   zZamba2MambaMixer.__init__  sg   !-$2 & 3!$V%84;K%K!L!L"#1 799 & 7,,2 +%5#1#1.T]1BTEX1XXi+='!+
 
 
 04=@4>Qy'
 
 
 |EJt~$>$>?? LDNQ.//\%)A,,//
&"t/E/V\`
 
 
	 ej8899	$"8$:JQWQghhh% 	>    	 	r:   rJ   cache_paramsr   c                    |j         \  }}}| j        | j        z  }d| j        z  d| j        z  | j        z  z   | j        z   }||j        r|                     |                    d                    }	|	j         d         |z
  dz  }
|
|
| j        | j        | j        g}t          j
        |	|d          \  }}}}}t          ||j        | j                 | j        j                            d          | j        j        | j                  }t          j
        || j        ||gd          \  }}}t          j        | j                                                   }|d d d df         d d d d d f                             d| j        | j                                      t          j                  }|d d d d d f                             dd| j                  }| j        d d d df                             d| j                  }| j        d d d df                             d| j                  }|                    || j        |j         d         | j        z            }|                    || j        |j         d         | j        z            }|                    || j        | j                  }t9          |j        | j                 ||||||d |d
  
        }|                    || j        | j        z            }|                     ||          }|                     |          d d d df         }n'|Dt          j         |dk              s,|j!        }||d d d d d f         z                      |          }|                     |          }t          j        | j                                                   }| j"        i nd	| j"        i}|t          j         |dk              }nd}| j#        r| j$        r||rtK          || j        j                            d          | j        j        | j        |f| j        | j&        d | j        | j        j        | j        j'        | j        j        | j        j        | j        | j        d
dd|\  }}nt          j
        || j        | j        | j        gd          \  }}}|p|(                    dd          }tR          j*        +                    || j,        |j         d         z
  df          }|j        | j                 -                    |           t\          	| j        dvr]| /                    |                     |(                    dd                    (                    dd          d d d |f                   }nst]          |(                    dd          | j        j                            d          | j        j        | j                  (                    dd          d d d |f         }t          j
        || j        ||gd          \  }}}|Dt          j         |dk              s,|j!        }||d d d d d f         z                      |          }ta          |                    ||d| j                  |||                    ||| j        d          |                    ||| j        d          f| j&        | j        d d d| j        dd|\  }}|'|%|j        | j                 -                    |           |                    ||d          }|                     ||          }|                     |          }|S )Nr=   r   r>   r   .r   T)zrm  dt_softplusdt_limitF)rr  rG  seq_idxr_  rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr   )rD   swish)r   r2   r  r_  )rG  rr  r{  r~  r  rm  r|  )1rE   rc  ry   rw   rd  rt   rl  squeezerh  r0   splitr&   r   r   rj  r2   r  r_  exprp  r   r   r   rA   rB   rm  rr  rF   r!   r   rq  rs  allr@   re  rb  r   r#   rG  r3   r   r   rC   rC  r{   copy_r%   ra  r"   )r5   rJ   ry  r   rd   seq_lenrp   groups_time_state_sized_to_removein_projected_statesd_mlpsplit_projection_dimrK   hidden_states_B_CdtBCrx  rm  rr  hidden_states_reshapedoutr@   projected_statesdt_limit_kwargsinput_not_masked	ssm_state	time_stephidden_states_B_C_tr   scan_outputs                                  r9   cuda_kernels_forwardz%Zamba2MambaMixer.cuda_kernels_forward;  s    "/!4
GQ!%1D!D$001t}3DtGZ3ZZ]a]kk #(G#"&,,}/D/DQ/G/G"H"H(.r2[@QFE$)5$2H$-Y]Yg#h 05<OQekm0n0n0n-Aq$)2 4!(8"**1-- ! ! #(+!')?AWX# # #M1a
 4:++--...A!!!T3,111d
+222t}dFYZZ]]didq]rrAAAAqqq$J&&r2t}==Bl111dC<077DMJJGqqq$|$++B>>Az4=!'!*2MNNAz4=!'!*2MNNA%2%7%7
DNTXTa%b%b"2'7&   M *..z4>DM;YZZM IImT::M--..qqq$|<CC )%)Na<O2P2P)%+!.111d
1K!K O OPU V V#||M::4:++--...A$($8$@bbzSWSgFhO)#(9^q-@#A#A  #' $ L1 L1<;OTd;O!A$K&..q11K$L" f# ##'9#3 $	 :#'=#7!%!3 M M%*(,#" "$ &%" "YY, 6;[$+T]DNK6 6 62'  +*;*E*Ea*K*K'!#!2!2+d.CFYF_`bFc.cef-g" "J !,T^<BB:NNN#+tFW/W/W(,$5$?$?1$E$EFFPPQRTUVVWXWXWXZb[bZbWbc) )%% )9+55a;;#{199!<<![-#'?	) ) )
  i1ooaaa'k)3% ',k%+-CE[\' ' '#q!
 "-eiRS@S6T6T-)/E%2^AAAqqq$J5O%O$S$STY$Z$ZM)B!&&z7BNNFF:wrBBFF:wrBB*  $f (, L $* * &* *&Y (\-E +DN;AA)LLL)..z7BGG"iiT::mmK00
r:   c                 \   1 |j         \  }}}|j        }|0|j        r)                     |                    d                    }nT|=t          j        |dk              s%||d d d d d f         z                      |          }                     |          }|j         d         d j        z  z
  d j	        z   j
        z  z
   j        z
  dz  }	|                    |	|	 j         j         j        gd          \  }}}
}}|d|j         j                                                 }|                    |j                  }|j        r|
                    d          }
|j         j                 }t          j        |dd          }|j        dk    r|d d dd d f         n||d d d d df<   |j         j                                     |           t          j        |                    |j                   j        j        d d dd d f         z  d          } j        r| j        j        z  }                     |                              |          d d d df         }n|                    dd          }t<          j                             | j!        |j         d         z
  df          }|j         j                                     |                                                     |                              dd                    d d d |d d f         }|Dt          j        |dk              s,|j        }||d d d d d f         z                      |          }nt          j"        | j         j#         j
        f|j        |	          }                                          |                    dd                    dd |f                             dd                    }t          j        | j         j	         j
        z   j	         j
        z  gd          \  }}}t          j$         j%        &                                           }|)|j        r!|j        dk    r|d d d df         n|d d dd d f         d d d df         }|                    dd          '                    ||j         d          j#                  } j(        d
         '                     j(        j         d          j#                  }t
          j        j        )                    ||                    |j                  z             }t          j*        | j+                  }|d         '                     j         j#         j
                                      t
          j,                  }t          j$        |d
         |z            }|-                    | j	        d          dd d d f         }|'                    | j	         j         j	        z  |j         d                   .                                }|-                    |d|j         d                   }|d
         |dd d d f         z  }|-                    |d j#                  }||d
         z  }|j         j                                     |j         j                 |z  |z              |-                    | j	        d          dd d d f         }|'                    | j	         j         j	        z  |j         d                   .                                }|-                    |d|j         d                   }|j         j                                     |j                  }|/                    | j        z   j#         j
                  }|/                    | j        z   j
        d          }t          j0        ||          }|/                    | j         j#                  } j1        d
         '                     j1        j         d          j#                  }|||z  z                       |j                  }|-                    |d          d d d df         }ngt<          j        )                    | j(        z             }t          j*        | j+                  }|-                    ||d j#                  &                                }|-                    ||d j
                  &                                }|-                    ||d j
                  &                                }|2                     j         j	        z  d j                  }|2                     j         j	        z  d j                  } j3        | j3        z  z
   j3        z  1 j1        d
         ti          |1          z  }||d
         z  }|                    |j                  |z  }1 fd||||fD             \  }}}}|5                    dddd          }t          j6        |d          }t          j$        to          |                    }|d d d d d d d d d d d f         |d d d d d d d d d d d f         z  }|                    d          }|d
         |5                    ddddd          d
         z  } |                     d          }!|!d
         |d d d d d f         z                      d          }"t          j$        |d d d d d d dd f         |z
            }#||#5                    dddd          d
         z  }$|$5                    ddddd          d
         |5                    ddddd          dd d d f         z                      d          5                    ddddd          }%|%|j        r|j         j                 d d d df         }&n t          j8        |%d d d df                   }&t          j9        |&|%gd          }%t          j$        to          t<          j                             |d d d d d d df         d                              }'|%5                    ddddd          }(|'d         |(d d d d d df         z                      d          })|)5                    ddddd          }*|*d d d df         |*d d df         }}%t          j$        |          }+|dd d d f         |%d d d d d df         z  },|+5                    dddd          }-|,                    d          |-d
         z  }.|"|.z   }|-                    |d j         j#                  }||z   }1dk    r|d d d |d d d d f         }|-                    ||d          }|'|%|j         j                                     |            :                    ||
          }/ ;                    |/                    |                    }0|0S )Nr   r>   r=   r   r   r   r   .rg   rJ  ).NNr   )r   output_sizec                 <    g | ]}t          |j                  S r   )rH  rG  )ro   tr>  r5   s     r9   rq   z2Zamba2MambaMixer.torch_forward.<locals>.<listcomp>M  s)    %z%z%z\]&9!Xt&W&W%z%z%zr:   r@  )r   r   )<rE   r@   rt   rl  r  r0   r  rA   rw   rc  ry   rd  r  rh  r   r   clonere   r  r   r   ndimr  sumrj  r2   r^  r  ra  r   r   rC   rC  r{   r   r   r  rp  r   r   rm  softplusr   rf  rB   r   r   rF   bmmrr  repeat_interleaverG  rE  permuterQ  rU  
zeros_liker   rq  rs  )2r5   input_statesry  r   rd   r  rp   r@   r  r  rK   rJ   r  r  r   r  r  rx  rm  dAdBdBxr   ssm_states_reshaped
C_reshapedyrr  
D_residualA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decay_contractionstatesprevious_statesdecay_chunkstates_permutedresult
new_statesstate_decay_outC_times_statesstate_decay_out_permutedY_offr  contextualized_statesr>  s2   `                                                @r9   torch_forwardzZamba2MambaMixer.torch_forward  s   !-!3
GQ"#(G##||L,@,@,C,CDD)%)NA<M2N2N) ,~aaaDj/I IMMeTT#||L99!'+a$2H.HH1t}K\_c_rKrrtx  uC  C  HI  I(8(>(>t5t~V\^ )? )
 )
%1dM2
 #$/?EEGGI!]%9::I. [~~a(()5dnE
"Z
2BGGG
ANASWXAXAX}QQQ111W'='=^k
111aaa8$(8>>zJJJ %	*--8H8O*P*PSWS^SefgfgfgijlmlmlmfmSn*ntv w w w% 6!T[%55M $ 7 7 : :5 A A!!!T3, O - 7 7! < <]..!*]-@-DDaH 
 (8>>zJJJ $])C)C)M)MaPQ)R)R S STUTUTUW_X_W_abababTb c!-eiPQ@Q6R6R-)/E%2^AAAqqq$J5O%O$S$STY$Z$ZMT^T]D<OP$+5  I !HHT[[1H1HA1N1N%O%OPSU]V]U]P]%^%h%hijlm%n%nooM#k-$:PRVR_bfbuRuw{  xE  HL  H[  x[  :\  bd  e  e  eq!Ytz''))***#(G# &(W\\AAAtSL!!r!!!Q'{111dC<7PBa##**:rx|T]SSBl9-44T\5G5JDMZZG$--b7::bh3G3G.GHHBR!344B/"))$.$-I\]]``glgt`uuA2i=1,--B
 		*dmR88dAAAFAT]DNdm4SUVU\]_U`aallnnA		*b!'"+66AI3aaa<0B *11*b$-PPM}Y//C #DN399'7"<sB   		*dmR88dAAAFAT]DNdm4SUVU\]_U`aallnnA		*b!'"+66A &0@CCAGLLJ",//*t~2Mt}^b^q"r"r
T^ ;T=PRSTTJ	-z::Az4>4=AAA y!((a$-HHA]Q&&**1733A 		*b))!!!T3,7AA ''T\(9::BR!344B)11*gr4=YY__aaM		*gD4GHHNNPPA		*gr43FGGMMOOA##DNdm$CX\Xf#ggA##DNdm$CX\Xf#ggA'DO*CCtVH	*-?x-X-XXJ *ByM9M]())B.A &{%z%z%z%zboqrtuwxay%z%z%z"M1a 		!Q1%%A|A2...H 	+a..))A qqq!!!QQQaaa23a111dAAAqqq!!!8K6LLN""r"**A y\AIIaAq!,D,DY,OON""r"**A 	l]111aaa:%>>CCAFFF !9XaaaAAArssl%;h%FGGL"#l&:&:1aA&F&Fy&Q"Q)11!Q1a@@K}OdOdefhiklnoqrOsOstwy}  @A  @A  @A  uA  PB  B  G  G  LM  G  N  N  V  V  WX  Z[  ]^  `a  cd  e  eF'L,K'"."9$."I!!!TSV,"W"'"26!!!RaR%="A"AY8a@@@F)K0A0A(111aaaQRQRQRTV;BWY_0`0`$a$abbK$nnQ1a;;O!/2_QQQ4QT_5UUZZ_`ZaaF1aA66J *111crc6 2Jqqq"u4EIF $i11OT111oqqq!!!T30GGN'6'>'>q!Q'J'J$#''++.Fy.QQE A		*b$.$-HHAJA!||aaa'111aaa'(		*gr22A$)A'7==iHHHii4((
 !%knnU.C.C D D$$r:   c                     t           r/d| j        j        j        j        v r|                     |||          S |                     |||          S )Ncuda)rt  rl  r2   re   r   r  r  )r5   rJ   ry  r   s       r9   rR   zZamba2MambaMixer.forward  sR     " 	Zf0C0J0O&O&O,,]L.YYY!!-~NNNr:   r,   r'   )rT   rU   rV   r   r    r   ru   r.   r0   r   rb   r  r  rR   rW   rX   s   @r9   rW  rW    sJ        = =| = = = = = = =D <@15	T T|T 78T !.	T T T Tn% %AY8Z %qyz  {G  rH % % % %J <@15		O 	O 78	O !.		O 	O 	O 	O 	O 	O 	O 	Or:   rW  c                   >     e Zd Zddedee         f fdZddZ xZS )	Zamba2MLPNrc   r  c           	      n   t                                                       || _        |j        | _        |j        | _        || _        || _        t          j        | j        d| j        z  |j	                  | _
        t          j        | j        | j        |j	                  | _        t          |j                 | _        t          j        g           | _        t#          | j                  D ]}||j        z  |k    rft          j        t          j        | j        j        | j        j        d          t          j        | j        j        d| j        z  d                    }nt          j                    }| j                            |           |j        }d t1          |          D             | _        dS )aQ  
        This MLP layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this layer
        is tied, un-tied adapter modules (formally same as LoRA, but used in the base model) are added to the up and gate projectors to increase expressivity with a small memory overhead.
        r=   r  Fc                     i | ]\  }}||	S r   r   r  s      r9   r  z&Zamba2MLP.__init__.<locals>.<dictcomp>  s    VVV<5%%VVVr:   N)r-   r.   rc   r6   rw   r
  r  r   r  rk  gate_up_proj	down_projr   
hidden_actact_fnr   gate_up_proj_adapter_listr   r$  r%  r&  r'  r   r  r(  r)  )r5   rc   r
  r  r   gate_up_proj_adapterr  r8   s          r9   r.   zZamba2MLP.__init__  s   
 	!-!'!9"4 Id&6D<R8RY_Yoppp4#94;KRXRhiiiV./)+r):):&t.// 	H 	HA6((H44')}Idk5t{7OV[\\\Idk6D<R8RY^___( ($$
 (*{}}$*112FGGGG 1VV9_;U;UVVVr:   c                    |                      |          }| j        |         }| | j        |         |          z   }t          j        |dd          }|                     |d                   |d         z  }|                     |          }|S )Nr=   r>   r   r   r   )r  r)  r  r0   chunkr  r  )r5   hidden_stater   gate_up_stateoutputs        r9   rR   zZamba2MLP.forward  s    )),77N9-	%(Q(Fy(QR^(_(__M1"==={{=#344}Q7GG--r:   r'   r,   )	rT   rU   rV   r    r   ru   r.   rR   rW   rX   s   @r9   r  r    st        W W| WPXY\P] W W W W W W<       r:   r  c                   R    e Zd Zddedee         dee         f fdZ eddd	          	 	 	 	 ddej	        dej	        dedeej	                 dee
         dee         deej                 dee         deej        eeej        ej        f                  f         fd            Z xZS )Zamba2AttentionDecoderLayerNrc   r  r   c                 \   t                                                       || _        t          |j                  }t          |d||          | _        t          |||          | _        t          |j
        |j                  | _        t          |j        |j                  | _        d S )Nr>   )r   r
  r  )r
  r  r7   )r-   r.   r  r   r  r	  	self_attnr  feed_forwardrZ   r  rms_norm_epsinput_layernormr6   pre_ff_layernorm)r5   rc   r  r   num_gsr8   s        r9   r.   z$Zamba2AttentionDecoderLayer.__init__  s     V,--(2RXcklll%fRZ[[[,V-IvObccc -f.@fFY Z Z Zr:   r-  r.  r/  r0  FrJ   original_hidden_statesr   output_attentionsr3  r   r   c           
          t          j        ||gd          }|                     |          } | j        d||||||d|\  }}	|                     |          }|                     ||          }|f}
|r|
|	fz  }
|
S )a  
        Args:
            hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
                This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
                concatenated tensor is then used as input of the pre-attention RMSNorm
                (see fig. 2 in https://huggingface.co/papers/2405.16712).
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        r>   r   )rJ   r   r   r.  r  r3  r   )r0   concatenater  r  r  r  )r5   rJ   r  r   r   r.  r  r3  r   self_attn_weightsoutputss              r9   rR   z#Zamba2AttentionDecoderLayer.forward  s    @ )=:P*QWYZZZ,,];;+94> ,
')+/ 3,
 ,
 ,
 ,
(( --m<<))-CC " 	,)++Gr:   r'   )NNFN)rT   rU   rV   r    r   ru   r.   r   r0   r   rb   rO  r   r   r   r^   FloatTensorrR   rW   rX   s   @r9   r  r    sQ       [ [| [x} [X`adXe [ [ [ [ [ [ _%0A6RRR 26>B,1:>3 3|3 !&3 	3
 !.3 "":;3 $D>3 &e&673 -.3 
u (51BEDU1U+V"WW	X3 3 3 SR3 3 3 3 3r:   r  c                       e Zd Zdedef fdZ eddd          	 	 	 	 	 	 	 	 	 dd
ej        de	ej                 de	e         de	ej                 de	ej                 de	e
         de	e         de	e         de	ej                 de	ej                 deej        e	eej        ej        f                  f         fd            Z xZS )Zamba2MambaDecoderLayerrc   r   c                     t                                                       t          ||          | _        t	          |j        |j                  | _        || _        d S )N)rc   r   r  )	r-   r.   rW  mambarZ   r6   r  r  r   )r5   rc   r   r8   s      r9   r.   z Zamba2MambaDecoderLayer.__init__  sS    %VyIII
,V-?VEXYYY"r:   r-  r.  r/  r0  NFrJ   r  r   r   r  	use_cacher   transformer_hidden_statesr   c                     |}|
||
z   n|}|                      |          }|                     |||          }d}||z   }|f}|r||fz  }|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        N)rJ   ry  r   )r  r  )r5   rJ   r  r   r   r   r.  r  r  r   r  r   residualr  r  s                  r9   rR   zZamba2MambaDecoderLayer.forward  s    > !
 :S9^M555dq 	 ,,];;

'() # 
 
 ! !=0 " 	,)++G 	*))Gr:   )	NNNNNFFNN)rT   rU   rV   r    ru   r.   r   r0   r   r   rb   rO  r   r^   r  rR   rW   rX   s   @r9   r  r    sh       #| # # # # # # # _%0A6RRR :>#'15.2>B,1$)59<@: :|: !) 6: C=	:
 !.: el+: "":;: $D>: D>: !!12: $,EL#9: 
u (51BEDU1U+V"WW	X: : : SR: : : : :r:   r  c                   |    e Zd Zdedej        def fdZ eddd          	 	 	 	 	 	 	 	 dde	j
        dee	j
                 dee         dee	j
                 dee	j
                 dee         dee         dee         dee	j                 dee	j        eee	j        e	j        f                  f         fd            Z xZS )Zamba2HybridLayershared_transformerlinearr  c                 r    t                                                       || _        || _        || _        d S r,   )r-   r.   r  mamba_decoderr  )r5   r  r  r  r8   s       r9   r.   zZamba2HybridLayer.__init__Q  s8     	""4r:   r-  r.  r/  r0  NFrJ   r  r   r   r   r  r  r3  r   c
           	          |                      |||||||	          }
|
d         }|r|
d         }|                     |          }|                     |||||||	          }
|r|
d         |f|
dd         z   }
|
S )aY  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
            hidden activations to form the input of the shared transformer layer.
            layer_idx (`int`): layer number.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        )r  r   r   r.  r  r3  r   r   )r  r   r.  r  r  r3  r=   N)r  r  r  )r5   rJ   r  r   r   r   r.  r  r  r3  layer_outputsr  r  s                r9   rR   zZamba2HybridLayer.forwardY  s    B //#9&+/ 3 0 
 
 %2!$4! 	1 -a 0$(KK0I$J$J!**&?)+/ 3 + 
 
  	V*1-/@AMRSRTRTDUUMr:   )NNNNNFFN)rT   rU   rV   r  r   r  r  r.   r   r0   r   r   ru   rb   rO  r   r^   r  rR   rW   rX   s   @r9   r  r  P  s`       5"=5GIy5Yp5 5 5 5 5 5 _%0A6RRR :>#'15.2>B,1$):>> >|> !) 6> C=	>
 !.> el+> "":;> $D>> D>> &e&67> 
u (51BEDU1U+V"WW	X> > > SR> > > > >r:   r  c                   N     e Zd ZU eed<   dZdZddgZdZdZ	dZ
dZdZ fdZ xZS )Zamba2PreTrainedModelrc   modelTr  r  r.  c                 >   t                                          |           t          |t                    rdt	          j        t	          j        | j        j                  t          j
        | j        j                  t          j
        | j        j                  z
  z  t          j
        | j        j                  z                                 | j        j                  }|t	          j
        t	          j        |                      z   }|j        j                            |           t	          j        d|j        dz             }|j        j                            t	          j
        |                     |j        j                            d           d S d S )N)minr   g      ?)r-   _init_weightsr   rW  r0   r  randrc   r|   mathro  rg  rf  r   time_step_floorexpm1rm  datar  rn  rd  rp  rr  fill_)r5   r   r  inv_dtrx  r8   s        r9   r  z#Zamba2PreTrainedModel._init_weights  sM   f%%%f.// 	%
4;4558DK566$+B[9\9\\^(4;4556  e3e44	  %)U["%5%5$5666FN%%f---Q 01 455AL##EIaLL111HM$$$$$	% 	%r:   )rT   rU   rV   r    r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_flex_attn_supports_sdpa_is_statefulr  rW   rX   s   @r9   r  r    sz         &*#68QR"3NL% % % % % % % % %r:   r  c                   H    e Zd ZdZdef fdZe	 	 	 	 	 	 	 	 	 	 ddeej	                 deej
                 deej	                 dee         d	eej                 d
ee         dee         dee         dee         deej	                 deeef         fd            Zd Zd Z xZS )Zamba2Modelzh
    Model consisting of *config.num_hidden_layers* layers.

    Args:
        config: Zamba2Config
    rc   c                 ~   t                                                     | _        j        | _        j        | _        t          j        j        j        | j                  | _	        fdt          j                  D             }g }g }j        | _        t          j                  D ]}j        |         dk    r%|                    t          |                     8j        |         dk    rb|                    t          j        | j        j        | j        j        d                     |                    t          |                     t#          |          }t#          |          }t%          |          }|                     |||          }t          j        |          | _        j        | _        t/          j        j                  | _        j        r5j        rt8                              d           t=                    | _        d| _         | !                                 d S )	Nc                 2    g | ]}t          |           S ))r  )r  )ro   r  rc   s     r9   rq   z(Zamba2Model.__init__.<locals>.<listcomp>  s'    hhha-fqAAAhhhr:   r  r   rh   Fr  r  ze`use_long_context` set to `True`: using rescaled `rope_theta` and extended `max_position_embeddings`.)"r-   r.   rc   pad_token_idpadding_idx
vocab_sizer   	Embeddingr6   embed_tokensr   r$  rs   r   r   r  r  iterr   
get_layersr   layersr7  rZ   r  final_layernormr6  use_long_contextru  rv  r   
rotary_embgradient_checkpointing	post_init)r5   rc   blocksmamba_layerslinear_layersr   r  r8   s    `     r9   r.   zZamba2Model.__init__  s      !. +L):F<NPTP`aahhhh5QWQfKgKghhh!'!9v/00 	R 	RA'*g55##$;Fa$P$P$PQQQQ)!,88$$RYt{/FH_fk%l%l%lmmm##$;Fa$P$P$PQQQL))]++vEEmF++$*$?!,V-?VEXYYY 	<& ##{   4F;;DO&+# 	r:   N	input_idsr   r   r.  inputs_embedsr  r  output_hidden_statesreturn_dictr   r   c                     ||n| j         j        }||n| j         j        }||n| j         j        }|	|	n| j         j        }	|d u |d uz  rt          d          | j        r%| j        r|rt          	                    d           d}|| 
                    |          }|}t          j        |          }|r@|>||j        d         n|j        d         }t          | j         || j        | j                  }|
I||                    | j                  nd}t          j        |||j        d         z   |j                  }
||
                    d          }|                     |||
          }| j         j        r|                     ||          }nd }|rd	nd }|rd	nd }t1          | j                  D ]q\  }}|r||fz  }| j        r+| j        r$|                     |j        |||||||||
  
        }n ||||||||||
	  	        }|d         }|r|d         ||d         fz  }r|                     |          }|r||fz  }||j        sd|_        t=          ||r|nd ||          }|	r|n|                                S )NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onezX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r@   re   r  r   rk   r   )r  r   r   r   r.  r  r  r3  T)last_hidden_stater.  rJ   
attentions) rc   r  r(  r  use_return_dict
ValueErrorr!  r   ru  rv  r  r0   r  rE   rb   r@   re   r   first_transformer_layer_idrn  r  _update_causal_maskr6  r   r(  r  _gradient_checkpointing_func__call__r  rt   r   to_tuple)r5   r&  r   r   r.  r'  r  r  r(  r)  r   rJ   r  rd   past_seen_tokensr   r3  all_hidden_statesall_self_attnsr   layerr  r  s                          r9   rR   zZamba2Model.forward  s    2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	%0%<kk$+B]-t";< 	s   & 	4= 	Y 	j   I  --i88M%!&]!;!;  	v0/8/D++-J]^_J`J6t{JVZV`imituuuO! #.  ..9X.YYY 
 #\ "2]5H5K"KTaTh  N )33A66L..~}n]] ;# 	'"&//-"N"N"&"6@BBD0:d )$+ 6 6 "	: "	:Iu# 6!m%55!* t}  $ A AN!*"#%'! ! !&!+A'#1 +$3&7'(;
! 
! 
! *!,M  : #/"}Q'7&99N,,];;   	2-!11&/Q&15O.(+/8BOOd+%	
 
 
 %;vv&//*;*;;r:   c                    | j         j        dk    r
|d|v r|S d S |j        |j        }}t	          j        |          j        }|j        d         }|d         dz   }t	          j        ||f|||          }	|dk    rt	          j	        |	d          }	|	t	          j
        ||          |                    dd          k    z  }	|	d d d d d d f                             |j        d         ddd          }	||	                                }	|                                d	k    rw|j        d         }
|	d
d |
f                             d          |d d d d d d f                             d          z  }|	d
d |
f                             ||          |	d
d |
f<   | j         j        dk    r%|#|j        j        dv rt%          j        |	|          }	|	S )Nflash_attention_2r   r   r>   )
fill_valuer@   re   rK  rk   r   r=   .sdpa)r  xpunpu)rc   r7  r@   re   r0   finfor   rE   fulltriurn  r   r   r  r   eqrP  r   r   _unmask_unattended)r5   r   r=  r   r@   re   	min_dtypesequence_lengthtarget_lengthr   mask_lengthpadding_masks               r9   r1  zZamba2Model._update_causal_mask^  s   ;+/BBB)c^.C.C%%4$*L,?vK&&*	&,Q/&r*Q.j/=!Ai_dmsttta*[1===Ku|M&AAANDZDZ[]_`DaDaaa!$aaa"23::<;Ma;PRSUWY[\\%%++--K!!##q((,226*3+<=@@EEWXWXWXZ^`dfgfgfgWgHhHkHkloHpHpp1<S,;,=N1O1[1[\hjs1t1tC+-. K,66*%*.DDD
 1CKQZ[[Kr:   c           
      &   g }g | _         d| _        t          | j                  D ]\  }}|dk    r| j        dk    r|| _        t	          |          }| j        j        t          | j        j                  z  dk    r/d| d}t          j
        |dz   dz   dz   d	z   d
z             }	| j                             |	           d}
| j        D ]f}|dk    rY|
| j        j        z  |j        k    rAt          j
        dt          |
          z   dz             }| j                             |           |
dz  }
g| j        j        rpd}
| j        D ]f}|dk    rY|
| j        j        z  |j        k    rAt          j
        dt          |
          z   dz             }| j                             |           |
dz  }
g|                    t          |t	          |          t	          |                               |                    t	          |                     |S )Nr   rh   r   z	^layers\.z\.shared_transformer\.z(?:z3self_attn\.(?:q_proj|k_proj|v_proj|o_proj)\.weight|z1feed_forward\.(?:gate_up_proj|down_proj)\.weight|z,(?:input_layernorm|pre_ff_layernorm)\.weightz)$z>^shared_transformer\.feed_forward\.gate_up_proj_adapter_list\.z\.(?:0|1)\.weight$zg^shared_transformer\.self_attn\.(?:linear_q_adapter_list|linear_k_adapter_list|linear_v_adapter_list)\.)_tied_weights_keysr0  r(  rs   nextrc   r$  r   r  recompiler   r  r   r  r  )r5   r#  r%  r$  r  layer_id
layer_typeblockprefix_patternmain_keys_pattern
adapter_id_layer_typeadapter_patternattn_adapter_patterns                 r9   r  zZamba2Model.get_layers  sp   "$*+'$-d.D$E$E )	2 )	2 HjX%%2a776>D3V;-DK4P0Q0QQTUUU%R(%R%R%RN(*
& !PQ OO J	J
   ) )% +223DEEE!"J'+'= ( (&(22zDKD^7^bgbp7p7p.0j a"%j//!2"7!8/ /O
 !3::?KKK"a

{? ,%&
+/+A 	, 	,K*h66:Hb;bfkft;t;t79z%q&)*oo%6 '<%<8" 8" 4 !% 7 > >?S T T T&!OJJ/tM7J7JDQ]L^L^__````d<001111r:   
NNNNNNNNNN)rT   rU   rV   r   r    r.   r   r   r0   r   r   rb   r  rO  r   r^   r   rR   r1  r  rW   rX   s   @r9   r  r    sx        "| " " " " " "H  151537>B59$(,0/3&*59v< v<E,-v< !.v< u/0	v<
 "":;v<   12v< D>v< $D>v< 'tnv< d^v< !!12v< 
u--	.v< v< v< ^v<p! ! !F. . . . . . .r:   r  c                       e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej	                 deej                 dee
         d	eej                 d
eej                 dee         dee         dee         dee         deej                 deeej	        f         deeef         fd            Z	 	 	 	 	 	 ddZ xZS )Zamba2ForCausalLMrc   c                 &   t                                          |           t          |          | _        dg| j        j        | _        |j        | _        t          j        |j        |j        d          | _	        | 
                                 d S )Nzlm_head.weightFr  )r-   r.   r  r  rJ  r  r   r  r6   lm_headr"  r5   rc   r8   s     r9   r.   zZamba2ForCausalLM.__init__  s        ((
#3"Tdj6S"T +y!3V5FUSSS 	r:   Nr   r&  r   r   r.  r'  labelsr  r  r(  r)  r   logits_to_keepr   c                    ||n| j         j        }|	|	n| j         j        }	|
|
n| j         j        }
|                     ||||||||	||

  
        }|d         }t          |t                    rt          | d          n|}|                     |dd|ddf                   }d}| | j	        ||| j
        fi |}|
s|f|dd         z   }||f|z   n|S t          |||j        |j        |j                  S )al  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Zamba2ForCausalLM

        >>> model = Zamba2ForCausalLM.from_pretrained("Zyphra/Zamba2-7B-v1")
        >>> tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba2-7B-v1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)
r&  r   r   r.  r'  r  r  r(  r   r)  r   r   losslogitsr.  rJ   r-  )rc   r  r(  r.  r  r   ru   slicer[  loss_functionr  r   r.  rJ   r-  )r5   r&  r   r   r.  r'  r]  r  r  r(  r)  r   r^  r   r  rJ   slice_indicesrb  ra  r  s                       r9   rR   zZamba2ForCausalLM.forward  sj   P 2C1N--TXT_Tq %9$D  $+Jj 	 &1%<kk$+B] **)%+'/!5)#  
 
  
8B>SV8W8Wk~ot444]kmAAA}aaa,?@AA%4%ffdoPPPPD 	DY,F'+'7D7V##VC%#3!/)
 
 
 	
r:   Tc           	         |d u }	|	s]||d         |j         d         k    r|d d |j         d          d f         }nV|j         d         |j         d         k    r|d d |f         }n-t          | j        |j         d         | j        | j                  }|b|`|                                                    d          dz
  }|                    |dk    d           |	s|d d |j         d          d f         }||	rd|i}
nd|                                i}
|
	                    ||||| j        j
        |d           |                                D ]\  }}||
vr||
|<   |
S )Nr>   r   r   r+  r'  r&  )r   r.  r  r   r^  r   )rE   rb   rc   r@   re   longrQ  masked_fill_r   r   num_logits_to_keepitems)r5   r&  r.  r   r'  r   r   r  r   empty_past_kvmodel_inputsr   r   s                r9   prepare_inputs_for_generationz/Zamba2ForCausalLM.prepare_inputs_for_generation  s    (4/  	 )!"%);;;%aaa.*>q*A)A)C)C&CD		#~';A'>>>%aaa&78	6Y_Q/tz$+  O %,*>)..0077;;a?L%%n&91===  F+AAA	0B/B/D/D,DE $$+];LL')=)=)?)?@L ,#2&"0"&+"@"0 		
 		
 		
 !,,.. 	* 	*JC,&&$)S!r:   )NNNNNNNNNNNr   )NNNNNT)rT   rU   rV   r    r.   r   r   r0   r   r   rb   r  rO  r   ru   r^   r   rR   rm  rW   rX   s   @r9   rY  rY    s       |        151537>B59-1$(,0/3&*5934O
 O
E,-O
 !.O
 u/0	O

 "":;O
   12O
 )*O
 D>O
 $D>O
 'tnO
 d^O
 !!12O
 c5</0O
 
u,,	-O
 O
 O
 ^O
h ? ? ? ? ? ? ? ?r:   rY  a  
    The Zamba2 Model with a sequence classification head on top (linear layer).

    [`Zamba2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    )custom_introc                   X    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 dee	e
eej                 f                  deej                 deej                 d	ee         d
ee         dee         dee         de	eef         fd            Z xZS )Zamba2ForSequenceClassificationc                     t                                          |           |j        | _        t          |          | _        | j        j        | _        t          j        |j        | j        d          | _	        | 
                                 d S )NFr  )r-   r.   
num_labelsr  r  rJ  r   r  r6   scorer"  r\  s     r9   r.   z(Zamba2ForSequenceClassification.__init__a  sv        + ((
"&*"?Yv14?OOO
 	r:   Nr&  r   r   r.  r'  r]  r  r  r(  r)  r   c                 4   |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }||j        d         }n|j        d         }| j         j        |dk    rt          d          | j         j        d}n|}|| j         j        k                        |j        t          j
                  }t          j        |j        d         |j        t          j
                  }||z                      d          }n)d}t                              | j        j         d           |t          j        ||j        	          |f         }d}|t|                    |j                  }| j         j        f| j        dk    rd
| j         _        nN| j        dk    r7|j        t          j        k    s|j        t          j        k    rd| j         _        nd| j         _        | j         j        d
k    rWt-                      }| j        dk    r1 ||                                |                                          }n |||          }n| j         j        dk    rGt1                      } ||                    d| j                  |                    d                    }n*| j         j        dk    rt5                      } |||          }|
s|f|dd         z   }||f|z   n|S t7          |||j        |j        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r   r.  r'  r  r  r(  r)  r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r>   rg   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`rk   
regressionsingle_label_classificationmulti_label_classificationr`  )rc   r.  r  rs  rE   r  r/  rA   re   r0   int32rn  argmaxru  rv  r8   rT   problem_typerr  r@   rg  ru   r   r  r
   rF   r	   r   r.  rJ   r-  )r5   r&  r   r   r.  r'  r]  r  r  r(  r)  transformer_outputsrJ   rb  rd   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsra  loss_fctr  s                         r9   rR   z'Zamba2ForSequenceClassification.forwardk  sJ   ( &1%<kk$+B]"jj)%+'/!5# ) 

 

 ,A.M** "+JJ&,Q/J;#+
a\]]];#+!#"%)AAEEfmUZU`aaL!L)<V]Z_ZefffM"/,">!F!Fr!J!J!#>* Z Z Z  
 u|Jv}MMMOaabYYv}--F{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8M$9$9$;$;V^^=M=MNNDD#8M6::DD)-JJJ+--x 2 22t G GUWYY)-III,..xv66 	F#%(;ABB(??F)-)9TGf$$vE/ /?-;*5
 
 
 	
r:   rW  )rT   rU   rV   r.   r   r   r0   r   r   r   r   listr  rO  r^   r   rR   rW   rX   s   @r9   rp  rp  R  sV             151537KO59-1$(,0/3&*[
 [
E,-[
 !.[
 u/0	[

 "%tE4E/F(F"GH[
   12[
 )*[
 D>[
 $D>[
 'tn[
 d^[
 
u66	7[
 [
 [
 ^[
 [
 [
 [
 [
r:   rp  )rY  rp  r  r  )r   )Nr   )Vr  rL  	itertoolsr   typingr   r   r   r   r0   r   torch.nnr	   r
   r   activationsr   cache_utilsr   
generationr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   utils.deprecationr   utils.import_utilsr   r   configuration_zamba2r    +mamba_ssm.ops.triton.selective_state_updater!   !mamba_ssm.ops.triton.ssd_combinedr"   r#   causal_conv1dr%   r&   
get_loggerrT   ru  Moduler)   rZ   rb   r   r   ru   r   r   r   r   r  r	  rE  rH  rU  r  rt  rW  r  r  r  r  r  r  rY  rp  __all__r   r:   r9   <module>r     s[  ,  				       1 1 1 1 1 1 1 1 1 1 1 1        A A A A A A A A A A ! ! ! ! ! !             ) ) ) ) ) ) > > > > > > B B B B B B q q q q q q q q q q K K K K K K K K F F F F F F F F & & & & & & , , , , , , , , 0 0 0 0 0 0 T T T T T T T T . . . . . .  kRRRRRRmmmmmmmmmZjW57W 8DDDDDDDDD-7**		H	%	%; ; ; ; ; ; ; ;*J J J J JBI J J J(j  j  j  j  j  j  j  j Z!< !< !< !< !<BI !< !< !<H	UU\ 	U# 	U%, 	U 	U 	U 	U& % %I%<% 
% <	%
 U\*% % % % % %4( ( (   6z) z) z) z) z)bi z) z) z)@VU\ VS V V V V
 
 
(  ( 46FH\]^^ iO iO iO iO iOry iO iO iOX' ' ' ' '	 ' ' 'T> > > > >") > > >BB B B B Bbi B B BJH H H H H	 H H HV% % % % %O % % %: v v v v v' v v vt\ \ \ \ \- \ \ \~   g
 g
 g
 g
 g
&; g
 g
 g
T k
j
jr:   