
     `i}                        d dl mZmZmZmZ d dlZd dlmc mZ	 d dlmZ ddl
mZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZmZ ddlmZ ddlm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z*  e(            r	d dl+m,Z,m-Z- nd\  Z,Z- ed           G d dej.                              Z/ G d dej.                  Z0 G d dej.                  Z1 G d d          Z2d Z3d=d Z4d!ej5        d"e6d#ej5        fd$Z7	 d>d&ej.        d'ej5        d(ej5        d)ej5        d*eej5                 d+e8d,e8d-ee          fd.Z9 G d/ d0ej.                  Z:d1 Z;e,e-fZ< e=e<          Z> G d2 d3ej.                  Z? G d4 d5e          Z@e! G d6 d7e                      ZAe! G d8 d9eA                      ZBe! G d: d;eAe                      ZCg d<ZDdS )?    )AnyCallableOptionalUnionN)nn   )Cache)GenerationMixin)use_kernel_forward_from_hub)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)deprecate_kwarg)check_model_inputs)is_causal_conv1d_available   )
Lfm2Config)causal_conv1d_fncausal_conv1d_updateNNRMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )Lfm2RMSNormư>c                     t                                                       t          j        t	          j        |                    | _        || _        dS )z:
        Lfm2RMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/lfm2/modeling_lfm2.pyr&   zLfm2RMSNorm.__init__2   sD     	l5:k#:#:;; #    c                    |j         }|                    t          j                  }|                    d                              dd          }|t          j        || j        z             z  }| j        |                    |          z  S )N   T)keepdim)	dtypetor(   float32powmeanrsqrtr+   r*   )r,   hidden_statesinput_dtypevariances       r0   forwardzLfm2RMSNorm.forward:   s|    #)%((77 $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::r1   c                 H    t          | j        j                   d| j         S )Nz, eps=)tupler*   shaper+   r,   s    r0   
extra_reprzLfm2RMSNorm.extra_reprA   s&    )**II$2GIIIr1   )r#   )__name__
__module____qualname__r&   r?   rD   __classcell__r/   s   @r0   r"   r"   0   sb        $ $ $ $ $ $; ; ;J J J J J J Jr1   r"   c                   |     e Zd ZU ej        ed<   ddef fdZ ej                    e	d                         Z
 xZS )Lfm2RotaryEmbeddinginv_freqNconfigc                    t                                                       t          |d          rSt          |j        t
                    r9|j                            d|j                            d                    | _        nd| _        |j        | _	        |j        | _
        || _        t          | j                 | _        |                     | j        |          \  }| _        |                     d|d           | j        | _        d S )Nrope_scaling	rope_typetypedefaultrL   F)
persistent)r%   r&   hasattr
isinstancerO   dictgetrP   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrM   r   rope_init_fnattention_scalingregister_bufferrL   original_inv_freq)r,   rM   devicerL   r/   s       r0   r&   zLfm2RotaryEmbedding.__init__H   s    6>** 	'z&:Mt/T/T 	'#044[&BUBYBYZ`BaBabbDNN&DN"("@$*$B!/?+/+<+<T[&+Q+Q($(ZeDDD!%r1   c                 X   | j         d d d d f                                                             |j        d         dd                              |j                  }|d d d d d f                                         }t          |j        j        t                    r|j        j        dk    r|j        j        nd}t          j
        |d          5  |                                |                                z                      dd          }t          j        ||fd	          }|                                | j        z  }|                                | j        z  }	d d d            n# 1 swxY w Y   |                    |j        
          |	                    |j        
          fS )Nr   r4   r   mpscpuF)device_typeenabledr3   dim)r6   )rL   floatexpandrB   r7   r_   rU   rQ   strr(   autocast	transposecatcosr\   sinr6   )
r,   xposition_idsinv_freq_expandedposition_ids_expandedrc   freqsembrm   rn   s
             r0   r?   zLfm2RotaryEmbedding.forwardY   s    !M$4-8>>@@GGHZ[\H]_acdeehhijiqrr ,QQQaaaZ 8 > > @ @'1!(-'E'Ek!(-[`J`J`ahmmfk^UCCC 	5 	5&,,..1F1L1L1N1NNYYZ[]^__E)UEN333C''))d44C''))d44C		5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 vvAGv$$cff17f&;&;;;s   BE++E/2E/N)rE   rF   rG   r(   Tensor__annotations__r   r&   no_gradr   r?   rH   rI   s   @r0   rK   rK   E   s         l/ /z / / / / / /" U]__< <  _< < < < <r1   rK   c                   *     e Zd Zdef fdZd Z xZS )Lfm2MLPrM   c                    t                                                       |j        }|j        rPt	          d|z  dz            }|j        4t	          |j        |z            }|j        ||j        z   dz
  |j        z  z  }t          j        |j	        |d          | _
        t          j        |j	        |d          | _        t          j        ||j	        d          | _        d S )Nr3   r   r   Fbias)r%   r&   intermediate_sizeblock_auto_adjust_ff_dimintblock_ffn_dim_multiplierblock_multiple_ofr   Linearr-   w1w3w2)r,   rM   r~   r/   s      r0   r&   zLfm2MLP.__init__j   s    "4* 	 #A(9$9A$= > >.:$'(GJ[([$\$\!$*$<&)AAAE&Jbb%! )F.0ANNN)F.0ANNN)-v/ANNNr1   c                     |                      t          j        |                     |                    |                     |          z            S ru   )r   Fsilur   r   )r,   ro   s     r0   r?   zLfm2MLP.forwardy   s7    wwqvdggajj))DGGAJJ6777r1   )rE   rF   rG   r   r&   r?   rH   rI   s   @r0   rz   rz   i   sZ        Oz O O O O O O8 8 8 8 8 8 8r1   rz   c                      e Zd ZdZdZdZdZdZej	        dfde
dedej        deej        edf         fdZ	 dd	ej        d
ej        dedeeeef                  deej        ej        f         f
dZdej        fdZddee         defdZdej        dedeeef         fdZdefdZdefdZdedeej        ej        f         fdZd ZdS )Lfm2HybridConvCachea  
    Attention and conv cache for Lfm2.

    It stores the Key and Value states as a list of tensors, one for each layer.
    Attention layer cache shape: `[batch_size, num_heads, seq_len, head_dim]`.
    Conv layer cache shape: `[batch_size, hidden_size, L_cache-1]`.
    NFrM   max_batch_sizer6   r_   c                    g | _         g | _        || _        |j        | _        | j                            d          | _        |j        | _        || _        g | _        |t          j
        |          nd }t          |j                  D ]h}t          j        | j        |j        | j        | j        |          }t          j                            |           | j                            |           id S )Nfull_attention)r6   r_   )	key_cachevalue_cacher   layer_typesindexfirst_attention_layerconv_L_cache_dtype
conv_cacher(   r_   rangenum_hidden_layerszerosr-   _dynamomark_static_addressappend)r,   rM   r   r6   r_   _
conv_states          r0   r&   zLfm2HybridConvCache.__init__   s     ,!-%)%5%;%;<L%M%M""/.0)/);f%%%v/00 		/ 		/A#"!k  J M--j999O"":....		/ 		/r1   
key_statesvalue_states	layer_idxcache_kwargsreturnc                    |Tt          | j                  |k    rt          t          | j                  |          D ]Z}| j                            t	          j        g                      | j                            t	          j        g                      [| j                            |           | j                            |           n| j        |                                         s|| j        |<   || j        |<   nVt	          j        | j        |         |gd          | j        |<   t	          j        | j        |         |gd          | j        |<   | j        |         | j        |         fS )a  
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.

        Parameters:
            key_states (`torch.Tensor`):
                The new key states to cache.
            value_states (`torch.Tensor`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`.

        Return:
            A tuple containing the updated key and value states.
        Nre   )	lenr   r   r   r(   tensorr   numelrl   )r,   r   r   r   r   r   s         r0   updatezLfm2HybridConvCache.update   s^   0 !4>""i//s4>22I>> > >AN))%,r*:*:;;;$++EL,<,<====%%j111 ''5555N9-3355m -7y).: ++,1It~i7PR\6]ce,f,f,fy).3i9I)9TVb8cik.l.l.l +~i($*:9*EEEr1   beam_idxc                 "   t          t          | j                            D ]}| j        |         j        }| j        |                             d|                    |                    | j        |<   | j        |         j        }| j        |                             d|                    |                    | j        |<   | j        |         j        }| j        |                             d|                    |                    | j        |<   dS )zDReorders the cache for beam search, given the selected beam indices.r   N)r   r   r   r_   index_selectr7   r   r   )r,   r   r   r_   s       r0   reorder_cachez!Lfm2HybridConvCache.reorder_cache   s    s4>2233 	i 	iI^I.5F(,y(A(N(NqRZR]R]^dReRe(f(fDN9%%i07F*.*:9*E*R*RSTV^VaVabhViVi*j*jDY'_Y/6F)-)C)P)PQRT\T_T_`fTgTg)h)hDOI&&	i 	ir1   r   c                     | j         |         dk    r| j        n|}t          | j                  |k    s#| j        |                                         dk    rdS | j        |         j        d         S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   r   r   )r   r   r   r   r   rB   r,   r   s     r0   get_seq_lengthz"Lfm2HybridConvCache.get_seq_length   sr     372B92MQa2a2aD..gp	t~)++t~i/H/N/N/P/PTU/U/U1~i(.r22r1   cache_positionc                 Z    d}|j         d         }|                                 }||z   }||fS )aB  
        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
        the given layer at `layer_idx`.
        The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns (i.e. sliding_window, chunk_size),
        for each layer.
        r   )rB   r   )r,   r   r   full_mask_kv_offsetquery_lengthpast_seen_tokens	kv_lengths          r0   get_mask_sizesz"Lfm2HybridConvCache.get_mask_sizes   s@      %+A...00 #33	---r1   
max_lengthc                    |dk     r$|                                  t          |          z
  }|                                  |k    rdS t          t          | j                            D ]e}| j        |                                         rD| j        |         dd|ddf         | j        |<   | j        |         dd|ddf         | j        |<   fdS )z"Crop the cache to the given lengthr   N.)r   absr   r   r   r   r   )r,   r   idxs      r0   cropzLfm2HybridConvCache.crop   s    >>,,..Z@J  J..FT^,,-- 	S 	SC~c"((** S&*nS&9#{
{AAA:M&Ns#(,(8(=c;J;PQPQPQ>Q(R %	S 	Sr1   c                 *    t          | j                  S ru   )r   r   rC   s    r0   __len__zLfm2HybridConvCache.__len__  s    4>"""r1   c                 6    | j         |         | j        |         fS ru   )r   r   r   s     r0   __getitem__zLfm2HybridConvCache.__getitem__  s    ~i($*:9*EEEr1   c                     t          t          | j                            D ]!}| j        |                                          "d S ru   )r   r   r   zero_r   s     r0   resetzLfm2HybridConvCache.reset  sI    s4?3344 	/ 	/IOI&,,....	/ 	/r1   ru   )r   )rE   rF   rG   __doc__r   is_compileabler   r   r(   r8   r   r   r6   r   r_   ri   r&   rv   r   rV   r   rA   r   
LongTensorr   r   r   r   r   r   r    r1   r0   r   r   }   s
         NNIK #]15/ // / {	/
 elC-./ / / /D 26)F )FL)F l)F 	)F
 tCH~.)F 
u|U\)	*)F )F )F )FV	ie&6 	i 	i 	i 	i3 3 3c 3 3 3 3.U\ .c .eTWY\T\o . . . .Ss S S S S# # # # #FS FU5<3M-N F F F F/ / / / /r1   r   c                     | dd| j         d         dz  f         }| d| j         d         dz  df         }t          j        | |fd          S )z*Rotates half the hidden dims of the input..Nr4   r3   re   )rB   r(   rl   )ro   x1x2s      r0   rotate_halfr     s]    	
3"!'"+"""	#B	
3q """	#B9rc2YB''''r1   c                     |                     |          }|                     |          }| |z  t          |           |z  z   }||z  t          |          |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkrm   rn   rp   unsqueeze_dimq_embedk_embeds           r0   apply_rotary_pos_embr     sc    ( --
&
&C
--
&
&C3w;q>>C/0G3w;q>>C/0GGr1   r<   n_repr   c                     | j         \  }}}}|dk    r| S | dddddddddf                             |||||          } |                     |||z  ||          S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rB   rh   reshape)r<   r   batchnum_key_value_headsslenhead_dims         r0   	repeat_kvr   /  s    
 2?1D.Ehzz!!!!QQQaaa"23::5BUW\^bdlmmM  (;e(CT8TTTr1           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 R   t          || j                  }t          || j                  }	t          j        ||                    dd                    |z  }
|$|d d d d d d d |j        d         f         }|
|z   }
t          j                            |
dt          j	                  
                    |j                  }
t          j                            |
|| j                  }
t          j        |
|	          }|                    dd                                          }||
fS )Nr3   r   r   r4   )rf   r6   )ptrainingr   )r   num_key_value_groupsr(   matmulrk   rB   r   
functionalsoftmaxr8   r7   r6   r   r   
contiguous)r   r   r   r   r   r   r   r   r   r   attn_weightscausal_maskattn_outputs                r0   eager_attention_forwardr   ;  s    3 ;<<JUF$?@@L<z';';Aq'A'ABBWLL!$QQQ111.D
0@0D.D%DE#k1=((2U](SSVVW\WbccL=((6?([[L,|\::K''1--88::K$$r1   c                       e Zd ZdZdedef fdZ eddd          	 	 dd
ej	        de
ej	        ej	        f         deej	                 dee         deej                 de
ej	        eej	                 f         fd            Z xZS )Lfm2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperrM   r   c                    t                                                       || _        || _        t	          |d|j        |j        z            | _        |j        |j        z  | _	        | j        dz  | _
        d| _        t          j        |j        |j        | j        z  d          | _        t          j        |j        |j        | j        z  d          | _        t          j        |j        |j        | j        z  d          | _        t          j        |j        | j        z  |j        d          | _        t%          | j        |j                  | _        t%          | j        |j                  | _        d S )Nr   g      TFr|   r.   )r%   r&   rM   r   getattrr-   num_attention_headsr   r   r   r   	is_causalr   r   q_projk_projv_projout_projr"   norm_epsq_layernormk_layernormr,   rM   r   r/   s      r0   r&   zLfm2Attention.__init__X  sC   "
F4F&Jd4dee$*$>&B\$\!}d*i 2F4NQUQ^4^ejkkki 2F4NQUQ^4^ejkkki 2F4NQUQ^4^ejkkk	&"<t}"LfN`glmmm&t}&/JJJ&t}&/JJJr1   past_key_valuepast_key_values4.58new_nameversionNr<   position_embeddingsr   r   r   c                 2   |j         d d         }g |d| j        R }|                      |                     |          j        |                               dd          }	|                      |                     |          j        |                               dd          }
 |                     |          j        |                     dd          }|\  }}t          |	|
||          \  }	}
|&|||d}|
                    |
|| j        |          \  }
}t          }| j        j        dk    rt          | j        j                 } || |	|
||fd| j        d|\  }} |j        g |dR                                  }|                     |          }||fS )Nr4   r   r3   )rn   rm   r   eagerr   )r   r   )rB   r   r   r   viewrk   r   r   r   r   r   r   r   rM   _attn_implementationr   r   r   r   r   )r,   r<   r  r   r  r   r   input_shapehidden_shapequery_statesr   r   rm   rn   r   attention_interfacer   r   outputs                      r0   r?   zLfm2Attention.forwardg  s    $)#2#.88b8$-88''(GM(B(B(G(VWWaabcefgg%%&Edkk-&@&@&E|&TUU__`acdee
6t{{=116EOOPQSTUU&S#7jRUWZ#[#[ j&#&snUUL'6'='=j,X\Xfht'u'u$J(?;+w66"9$+:Z"[$7$7	%
 L	%
 	%
 	%
 	%
!\ *k);;;;;;FFHH{++|##r1   r   )rE   rF   rG   r   r   r   r&   r   r(   rv   rA   r   r   r   r?   rH   rI   s   @r0   r   r   U  s       GGKz Kc K K K K K K _%0A6RRR :>59'$ '$|'$ #5<#=>'$ !.	'$
 ""56'$ !!12'$ 
u|Xel33	4'$ '$ '$ SR'$ '$ '$ '$ '$r1   r   c                     |N|j         d         dk    r=|j         d         dk    r,| j        }| |dddddf         z                      |          } | S )zm
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    Nr   r   )rB   r6   r7   )r<   r   r6   s      r0   apply_mask_to_padding_statesr    si     !n&:1&=&A&AnFZ[\F]`aFaFa#&111d
)CCGGNNr1   c                       e Zd Zdedef fdZ eddd          	 	 	 dd	ej        de	e
         d
e	ej                 de	ej                 fd            Z eddd          	 	 	 dd	ej        de	e
         d
e	ej                 de	ej                 fd            Z eddd          	 	 	 ddej        de	e
         d
e	ej                 de	ej                 fd            Z xZS )Lfm2ShortConvrM   r   c           	         t                                                       || _        || _        |j        | _        |j        | _        t          j	        |j
        |j
        | j        |j
        | j        | j        dz
            | _        t          j        |j
        d|j
        z  | j                  | _        t          j        |j
        |j
        | j                  | _        d S )Nr   )in_channelsout_channelskernel_sizegroupsr}   paddingr   r|   )r%   r&   rM   r   r   L_cache	conv_biasr}   r   Conv1dr-   convr   in_projr   r   s      r0   r&   zLfm2ShortConv.__init__  s    
 	"*$	I*+%L1$
 
 
	 y!3Q9K5KRVR[\\\	&"4f6HtyYYYr1   r   r  r  r  Nro   r   r   c                    t          ||          }|                     |                              dd          }|                    dd          \  }}}||z  }| j        j                            | j        j                            d          | j        j                            d                    }	|b|d         dk    rVt          |	                    d          |j
        | j                 |	| j        j        d           }
|
                    d          }
ny|Zt          j                            || j        |j        d         z
  df          }|j
        | j                                     |           t)          ||	| j        j        d           }
||
z  }|                     |                    dd                                                    }|S )Nr4   r   r   re   r   r3   )
activation)r  r  rk   chunkr  r*   r	  sizer   squeezer   r   r}   r   r   r   padr  rB   copy_r   r   r   )r,   ro   r  r   r   BCxBCBxconv_weightsconv_outr   ys                r0   cuda_kernels_forwardz"Lfm2ShortConv.cuda_kernels_forward  s    )N;;ll1oo''B//))A2)&&1aUy',,TY-=-B-B1-E-EtyGWG\G\]^G_G_``&>!+<q+@+@+

2*4>:	 H  ))"--HH*]..rDL28B<4OQR3STT
*4>:@@LLL'L$).UYZZZHLMM!++b"--88::;;r1   c                    |j         d         }t          ||          }|                     |                              dd          }|                    dd          \  }}}||z  }	||d         dk    r|j        | j                 }
|                    d| j        dz
            }|
	                    dd          }
|	
                    |
j        |
j                  |
d d d d |f<   |j        | j                                     |
           t          j        |

                    |	j                  | j        j        d d dd d f         z  d          }| j        r|| j        j        z  }|                    d          }n{|Zt(          j                            |	| j        |	j         d         z
  df          }
|j        | j                                     |
           |                     |	          d	d |f         }||z  }|                    dd                                          }|                     |          }|S )
Nr   r4   r   r   re   r   )shiftsdims)r_   r6   .)rB   r  r  rk   r!  r   r   clampr  rollr7   r_   r6   r%  r(   sumr  r*   r}   r   r   r   r$  r   r   )r,   ro   r  r   r   seqlenr&  r'  r(  r)  r   r+  r,  s                r0   slow_forwardzLfm2ShortConv.slow_forward  s    (N;;ll1oo''B//))A2)&&1aU&>!+<q+@+@(3DNCJ+11!T\A5EFFN#<<J/1uuJ<MU_Ueu/f/fJqqq!!!^+,&t~6<<ZHHHyry!9!9DI<LQQQPQSTSTSTW<U!U[]^^^Hy +DIN*))"--HH*]..rDL28B<4OQR3STT
*4>:@@LLLyy}}S'6'\2HLKKB**,,MM!r1   r<   c                     t           rDd|j        j        v r6t          j                                        s|                     ||||          S |                     ||||          S )Ncuda)is_fast_path_availabler_   rQ   r(   r   is_compilingr-  r5  )r,   r<   r  r   r   s        r0   r?   zLfm2ShortConv.forward  sh     " 	mf0D0I&I&IRWR_RlRlRnRn&I,,]O^]klll  Q_```r1   )NNN)rE   rF   rG   r   r   r&   r   r(   rv   r   r   r   r-  r5  r?   rH   rI   s   @r0   r  r    s       ZZ Z Z Z Z Z Z, _%0A6RRR :>5915   <  ""56  !!12	 
 !.      SR D _%0A6RRR :>5915$ $<$ ""56$ !!12	$
 !.$ $ $ SR$L _%0A6RRR :>5915	a 	a|	a ""56	a !!12		a
 !.	a 	a 	a SR	a 	a 	a 	a 	ar1   r  c                       e Zd Zdedef fdZ eddd          	 	 	 	 dd	ej        d
e	ej        ej        f         de
ej                 de
ej                 de
e         de
ej                 dej        fd            Z xZS )Lfm2DecoderLayerrM   r   c                    t                                                       |j        |         dk    | _        | j        rt	          ||          | _        nt          ||          | _        t          |          | _	        t          |j        |j                  | _        t          |j        |j                  | _        d S )Nr   r   )r%   r&   r   is_attention_layerr   	self_attnr  r  rz   feed_forwardr"   r-   r   operator_normffn_normr   s      r0   r&   zLfm2DecoderLayer.__init__  s    "("4Y"?CS"S" 	9*69==DNN%fi88DI#FOO();QQQ#F$6FOLLLr1   r   r  r  r  Nr<   r  r   rp   r   r   c           
      &   |}| j         r+ | j        d|                     |          |||||d|\  }}	n,|                     |                     |          |||          }||z   }||                     |                     |                    z   }|S )N)r<   r  r   rp   r  r   )r<   r  r   r   r   )r=  r>  r@  r  r?  rA  )
r,   r<   r  r   rp   r  r   r   residualr   s
             r0   r?   zLfm2DecoderLayer.forward  s     !" 	-t~  "00??$7-) /-       M11 !II"00?? /--	 &  M &0%(9(9$--:V:V(W(WWr1   )NNNN)rE   rF   rG   r   r   r&   r   r(   rv   rA   r   r   r   r?   rH   rI   s   @r0   r;  r;    s       
Mz 
Mc 
M 
M 
M 
M 
M 
M _%0A6RRR
 26379=59 | #5<#=> !.	
 u/0 ""56 !!12 
   SR    r1   r;  c                   L    e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZdZeedZdS )	Lfm2PreTrainedModelrM   modelTr;  r  F)r<   
attentionsN)rE   rF   rG   r   rw   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr;  r   _can_record_outputsr   r1   r0   rE  rE  ?  sl         &*#+,#4"5N""&)# r1   rE  c                       e Zd Zdef fdZee	 	 	 	 	 	 	 ddeej	                 deej
                 deej	                 dee         deej                 d	ee         d
eej	                 dee         defd                        Z xZS )	Lfm2ModelrM   c                    t                                                     j        | _        j        | _        t          j        j        j        | j                  | _        t          j	        fdt          j                  D                       | _        t                    | _        d| _        t                    | _        t#          j        j                  | _        |                                  d S )Nc                 0    g | ]}t          |          S r   )r;  ).0r   rM   s     r0   
<listcomp>z&Lfm2Model.__init__.<locals>.<listcomp>Z  s$    bbbYfi00bbbr1   )rM   Fr   )r%   r&   pad_token_idpadding_idx
vocab_sizer   	Embeddingr-   embed_tokens
ModuleListr   r   layersrK   
rotary_embgradient_checkpointingpos_embr"   r   embedding_norm	post_initr,   rM   r/   s    `r0   r&   zLfm2Model.__init__S  s       !. +L):F<NPTP`aambbbb%H`BaBabbb
 
 .V<<<&+#*622)&*<&/RRR 	r1   N	input_idsr   rp   r  inputs_embeds	use_cacher   r   r   c           
         |d u |d uz  rt          d          ||                     |          }|r1|/|j        d         }	t          | j        |	| j        | j                  }|B||                                nd}
t          j	        |
|
|j        d         z   |j                  }||
                    d          }t          | j        |||||          }|}|                     ||          }| j        d | j        j                 D ]} ||f|||||d|}|                     |          }t!          ||          S )	Nz:You must specify exactly one of input_ids or inputs_embedsr   )rM   r   r6   r_   r   )r_   )rM   input_embedsr   r   r  rp   )r   rp   r  r   r  )last_hidden_stater  )
ValueErrorr\  rB   r   rM   r6   r_   r   r(   aranger   r   ra  r^  r   rb  r   )r,   re  r   rp   r  rf  rg  r   r   
batch_sizer   r   r<   r  decoder_layers                  r0   r?   zLfm2Model.forwardd  s    -t";< 	[YZZZ  --i88M 	0&,Q/J1{:TZX\Xc  O !CRC^==???de"\ "2]5H5K"KTaTh  N )33A66L(;&))+%
 
 
 &"ll=,GG "[)H4;+H)HI 		 		M)M*) /-$7   MM ++M::&++
 
 
 	
r1   )NNNNNNN)rE   rF   rG   r   r&   r   r   r   r(   r   rv   r   FloatTensorboolr   r   r   r?   rH   rI   s   @r0   rS  rS  Q  s       z      "  1515379=59$(59=
 =
E,-=
 !.=
 u/0	=

 ""56=
   12=
 D>=
 !!12=
 +,=
 
!=
 =
 =
 ^ =
 =
 =
 =
 =
r1   rS  c                   f    e Zd ZdgZddiZddgdgfiZ fdZee	 	 	 	 	 	 	 	 	 dd	e	e
j                 d
e	e
j                 de	e
j                 de	e         de	e
j                 de	e
j                 de	e         de	e
j                 deee
j        f         dee         defd                        Z xZS )Lfm2ForCausalLMzlm_head.weightlm_headcolwise_repr<   logitsc                     t                                          |           t          |          | _        |j        | _        t          j        |j        |j        d          | _        | 	                                 d S )NFr|   )
r%   r&   rS  rF  rZ  r   r   r-   rs  rc  rd  s     r0   r&   zLfm2ForCausalLM.__init__  sj       v&&
 +y!3V5FUSSS 	r1   Nr   re  r   rp   r  rf  labelsrg  r   logits_to_keepr   r   c
                 R    | j         d|||||||d|
}|j        }t          |	t                    rt	          |	 d          n|	}|                     |dd|ddf                   }d}| | j        d||| j        j        d|
}t          |||j
        |j        |j                  S )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Lfm2ForCausalLM

        >>> model = Lfm2ForCausalLM.from_pretrained("meta-lfm2/Lfm2-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-lfm2/Lfm2-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)re  r   rp   r  rf  rg  r   N)ru  rw  rZ  )lossru  r  r<   rG  r   )rF  rj  rU   r   slicers  loss_functionrM   rZ  r   r  r<   rG  )r,   re  r   rp   r  rf  rw  rg  r   rx  r   outputsr<   slice_indicesru  rz  s                   r0   r?   zLfm2ForCausalLM.forward  s    @ ,64: 	,
)%+')	,
 	,
 	,
 	,
  18B>SV8W8Wk~ot444]kmAAA}aaa,?@AA%4%pVFt{OeppioppD%#3!/)
 
 
 	
r1   )	NNNNNNNNr   )rE   rF   rG   _tied_weights_keys_tp_plan_pp_planr&   r   r   r   r(   r   rv   r	   ro  rp  r   r   r   r   r   r?   rH   rI   s   @r0   rr  rr    sa       *+=)H_-z:;H      151537+/59-1$(59348
 8
E,-8
 !.8
 u/0	8

 "%8
   128
 )*8
 D>8
 !!128
 c5</08
 +,8
 
 8
 8
 8
 ^ 8
 8
 8
 8
 8
r1   rr  )rr  rS  rE  )Nr   )r   )Etypingr   r   r   r   r(   torch.nn.functionalr   r   r   cache_utilsr	   
generationr
   integrationsr   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.deprecationr   utils.genericr   utils.import_utilsr   configuration_lfm2r   causal_conv1dr   r   Moduler"   rK   rz   r   r   r   rv   r   r   rg   r   r   r  kernel_modulesallr8  r  r;  rE  rS  rr  __all__r   r1   r0   <module>r     s  ( 2 1 1 1 1 1 1 1 1 1 1 1                             ) ) ) ) ) ) 7 7 7 7 7 7 / / / / / / 9 9 9 9 9 9 O O O O O O O O K K K K K K K K F F F F F F F F & & & & & & I I I I I I I I I I 0 0 0 0 0 0 / / / / / / < < < < < < * * * * * *  8DDDDDDDDD-7** Y''J J J J J") J J ('J(!< !< !< !< !<") !< !< !<H8 8 8 8 8bi 8 8 8(M/ M/ M/ M/ M/ M/ M/ M/`( ( (   6	UU\ 	U# 	U%, 	U 	U 	U 	U& % %I%<% 
% <	%
 U\*% % % '(% % % %4:$ :$ :$ :$ :$BI :$ :$ :$z   #$89^,, ka ka ka ka kaBI ka ka ka\- - - - -1 - - -`     /   " Q
 Q
 Q
 Q
 Q
# Q
 Q
 Q
h H
 H
 H
 H
 H
)? H
 H
 H
V B
A
Ar1   