
     `i                    .   d dl Z d dlZd dlmZ d dlmZmZ d dlZd dlm	c m
Z d dlm	Z	 d dlmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z)  e$            rd dl*m+Z+ d dl,m-Z- d dl.m/Z/ ne0Z- e%j1        e2          Z3 G d dej4        j5                  Z6	 	 dUdeej7                 dee8         fdZ9 G d de-          Z: G d de	j;                  Z< G d de	j;                  Z= G d d e	j;                  Z>d! Z?dVd"Z@	 dWd$d%d&ej7        d'ej7        d(ej7        d)eejA                 d*eBe8e8f         d+e8d,e8d-eeC         d.eeBej7        ej7        f         eBej7                 f         fd/ZDejE        fd$d%d&ej7        d0e:dej7        de8d*eBe8e8f         d+e8d,e8d1ejF        d.eBej7                 fd2ZGd$d%d&ej7        d'ej7        d(ej7        d)eejA                 d*eBe8e8f         d+e8d,e8d.eBej7                 fd3ZHeGeDeHd4ZI G d5 d%e	j;                  ZJ G d6 d7e          ZKe# G d8 d9e!                      ZL	 	 dUd:ej7        d'ej7        d)eej7                 d;eej7                 d.eBej7        ej7        ej7        e8eej7                 eej7                 f         f
d<ZMd:ej7        d=ej7        d>e8d?e8d.ej7        f
d@ZNe# G dA dBeL                      ZO G dC dDe	j;                  ZP e#dEF           G dG dHeL                      ZQ e#dIF           G dJ dKeL                      ZR e#dLF           G dM dNeL                      ZSe# G dO dPeL                      ZT e#dQF           G dR dSeL                      ZUg dTZVdS )X    N)nullcontext)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)PreTrainedModel)auto_docstringis_flash_attn_2_availablelogging)is_triton_available   )ModernBertConfig) flash_attn_varlen_qkvpacked_func)RotaryEmbedding)apply_rotaryc                   l    e Zd Ze	 	 ddeej                 dee         fd            Zed             Z	dS )ApplyRotaryEmbUnpadN
cu_seqlens
max_seqlenc           
          |                                 }|j        \  }}}}	|d d d df                             |d|	          }
t          |
||d||dd           |                     |||           || _        |S )N   r   FT)seqlen_offsetsr"   r#   interleavedinplace)
contiguousshapeviewr   save_for_backwardr#   )ctxqkvcossinr"   r#   	total_nnz_three_nheadsheaddimqks              /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/modernbert/modeling_modernbert.pyforwardzApplyRotaryEmbUnpad.forward>   s     nn.1i+	67G BQBZ__YG44!!		
 		
 		
 		
 	c3
333#
    c                     | j         \  }}}|                                }|j        \  }}}}|d d d df                             |d|          }	t	          |	||d|| j        ddd	  	         |d d d d d d fS )Nr%   r&   r   FT)r'   r"   r#   r(   r)   	conjugate)saved_tensorsr*   r+   r,   r   r#   )
r.   dor0   r1   r"   r2   r3   r4   r5   dqks
             r7   backwardzApplyRotaryEmbUnpad.backward]   s    "0S*]]__.0h+	67G BQBinnYG44!~
	
 
	
 
	
 
	
 4tT455r9   NN)
__name__
__module____qualname__staticmethodr   torchTensorintr8   r?    r9   r7   r!   r!   =   sy         .2$( 
 U\* SM   \< 6 6 \6 6 6r9   r!   r"   r#   c                 >    t                               | ||||          S )a  
    Arguments:
        qkv: (total_nnz, 3, nheads, headdim) - input tensor for packed QKV.
        cos, sin: (seqlen_rotary, rotary_dim / 2)
        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
            of 1st half and 2nd half (GPT-NeoX style).
        inplace: if True, apply rotary embedding in-place.
        seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
            Most commonly used in inference when we have KV cache.
        cu_seqlens: (batch + 1,) or None
        max_seqlen: int
    Return:
        out: (total_nnz, dim)
    rotary_dim must be <= headdim
    Apply rotary embedding to the first rotary_dim of x.
    )r!   apply)r/   r0   r1   r"   r#   s        r7   apply_rotary_unpaddedrK   t   s     . $$S#sJ
KKKr9   c                        e Zd ZdZ	 	 	 	 ddededee         deej                 deej	                 f
 fd	Z
	 dd
ej        dej        dee         deej        eej        ej        f         f         fdZdefdZ xZS )!ModernBertUnpaddedRotaryEmbeddingzP
    The rotary position embeddings applied directly to unpadded sequences.
         @Ndimbaser#   devicedtypec                     t                                          |||d           || _        ||||                     |||           dS dS dS dS )a  
        max_seqlen: if max_seqlen, device, and dtype are provided, we precompute the cos_sin_cache
            up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
            the cos_sin_cache will be recomputed during the forward pass.
        F)rO   rP   rQ   r(   NrQ   rR   )super__init__r#   _update_cos_sin_cache)selfrO   rP   r#   rQ   rR   	__class__s         r7   rV   z*ModernBertUnpaddedRotaryEmbedding.__init__   sr     	StFNNN$!f&8U=N&&z&&NNNNN "!&8&8=N=Nr9   r/   r"   returnc                     |"|                      ||j        |j                   t          || j        | j        ||          }|S )z
        Apply rotary embedding *inplace* to qkv.
        qkv: (total_nnz, 3, nheads, headdim)
        cu_seqlens: (batch + 1,) cumulative sequence lengths
        max_seqlen: int max seq length in the batch
        NrT   r"   r#   )rW   rQ   rR   rK   _cos_cached_sin_cached)rX   r/   r"   r#   s       r7   r8   z)ModernBertUnpaddedRotaryEmbedding.forward   sY     !&&z#*CI&VVV#!!
 
 
 
r9   c                 6    d| j          d| j         d| j         S )Nzdim=z, base=z, scale_base=)rO   rP   
scale_baserX   s    r7   
extra_reprz,ModernBertUnpaddedRotaryEmbedding.extra_repr   s&    PdhPPtyPPtPPPr9   )rN   NNNN)rA   rB   rC   __doc__rG   floatr   rE   rQ   rR   rV   rF   r   tupler8   strrb   __classcell__rY   s   @r7   rM   rM      s2         $()-'+O OO O SM	O
 &O $O O O O O O. %)	 \ L SM	
 
u|U5<#=>>	?   2QC Q Q Q Q Q Q Q Qr9   rM   c                        e Zd ZdZdef fdZ ej        d          dej        dej	        fd            Z
	 ddeej                 d
eej	                 dej	        fdZ xZS )ModernBertEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    configc                 >   t                                                       || _        t          j        |j        |j        |j                  | _        t          j	        |j        |j
        |j                  | _        t          j        |j                  | _        d S )N)padding_idxepsbias)rU   rV   rl   r   	Embedding
vocab_sizehidden_sizepad_token_idtok_embeddings	LayerNormnorm_eps	norm_biasnormDropoutembedding_dropoutdroprX   rl   rY   s     r7   rV   zModernBertEmbeddings.__init__   s{     l6+<f>P^d^qrrrL!3vO_```	Jv788			r9   Tdynamic	input_idsrZ   c                 x    |                      |                     |                     |                              S rc   )r}   rz   rv   )rX   r   s     r7   compiled_embeddingsz(ModernBertEmbeddings.compiled_embeddings   s.    yy4#6#6y#A#ABBCCCr9   Ninputs_embedsc                    |)|                      |                     |                    }n\| j        j        r|                     |          n:|                      |                     |                     |                              }|S rc   )r}   rz   rl   reference_compiler   rv   )rX   r   r   hidden_statess       r7   r8   zModernBertEmbeddings.forward   s     $ IIdii&>&>??MM ;0J((333YYtyy)<)<Y)G)GHHII 
 r9   r@   )rA   rB   rC   rd   r   rV   rE   compile
LongTensorrF   r   r   r8   rh   ri   s   @r7   rk   rk      s         9/ 9 9 9 9 9 9 U]4   DU-= D%, D D D ! D ei !%"23KSTYT`Ka	       r9   rk   c                   L     e Zd ZdZdef fdZdej        dej        fdZ xZ	S )ModernBertMLPa6  Applies the GLU at the end of each ModernBERT layer.

    Compared to the default BERT architecture, this block replaces :class:`~transformers.model.bert.modeling_bert.BertIntermediate`
    and :class:`~transformers.model.bert.modeling_bert.SelfOutput` with a single module that has similar functionality.
    rl   c                    t                                                       || _        t          j        |j        t          |j                  dz  |j                  | _	        t          |j                 | _        t          j        |j                  | _        t          j        |j        |j        |j                  | _        d S )Nr%   rq   )rU   rV   rl   r   Linearrt   rG   intermediate_sizemlp_biasWir   hidden_activationactr{   mlp_dropoutr}   Wor~   s     r7   rV   zModernBertMLP.__init__   s    )F.F4L0M0MPQ0QX^Xghhh&23Jv122	)F4f6Hv___r9   r   rZ   c                     |                      |                              dd          \  }}|                     |                     |                     |          |z                      S )Nr%   r&   rO   )r   chunkr   r}   r   )rX   r   inputgates       r7   r8   zModernBertMLP.forward   sW    ggm,,221"2==twwtyy%4!788999r9   )
rA   rB   rC   rd   r   rV   rE   rF   r8   rh   ri   s   @r7   r   r      s|         `/ ` ` ` ` ` `:U\ :el : : : : : : : :r9   r   c                   |     e Zd ZU ej        ed<   ddef fdZ ej                    e	d                         Z
 xZS )ModernBertRotaryEmbeddinginv_freqNrl   c                    t                                                       t          |d          rSt          |j        t
                    r9|j                            d|j                            d                    | _        nd| _        |j        | _	        |j        | _
        || _        t          | j                 | _        |                     | j        |          \  }| _        |                     d|d           | j        | _        d S )Nrope_scaling	rope_typetypedefaultr   F)
persistent)rU   rV   hasattr
isinstancer   dictgetr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrl   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)rX   rl   rQ   r   rY   s       r7   rV   z"ModernBertRotaryEmbedding.__init__   s    6>** 	'z&:Mt/T/T 	'#044[&BUBYBYZ`BaBabbDNN&DN"("@$*$B!/?+/+<+<T[&+Q+Q($(ZeDDD!%r9   c                 X   | j         d d d d f                                                             |j        d         dd                              |j                  }|d d d d d f                                         }t          |j        j        t                    r|j        j        dk    r|j        j        nd}t          j
        |d          5  |                                |                                z                      dd          }t          j        ||fd	          }|                                | j        z  }|                                | j        z  }	d d d            n# 1 swxY w Y   |                    |j        
          |	                    |j        
          fS )Nr   r&   r   mpscpuF)device_typeenabledr%   r   )rR   )r   re   expandr+   torQ   r   r   rg   rE   autocast	transposecatr0   r   r1   rR   )
rX   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr0   r1   s
             r7   r8   z!ModernBertRotaryEmbedding.forward	  s    !M$4-8>>@@GGHZ[\H]_acdeehhijiqrr ,QQQaaaZ 8 > > @ @'1!(-'E'Ek!(-[`J`J`ahmmfk^UCCC 	5 	5&,,..1F1L1L1N1NNYYZ[]^__E)UEN333C''))d44C''))d44C		5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 vvAGv$$cff17f&;&;;;s   BE++E/2E/rc   )rA   rB   rC   rE   rF   __annotations__r   rV   no_gradr   r8   rh   ri   s   @r7   r   r      s         l/ // / / / / / /" U]__< <  _< < < < <r9   r   c                     | dd| j         d         dz  f         }| d| j         d         dz  df         }t          j        | |fd          S )z*Rotates half the hidden dims of the input..Nr&   r%   r   )r+   rE   r   )r   x1x2s      r7   rotate_halfr     s]    	
3"!'"+"""	#B	
3q """	#B9rc2YB''''r9   c                     |                     |          }|                     |          }| |z  t          |           |z  z   }||z  t          |          |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr0   r1   r   unsqueeze_dimq_embedk_embeds           r7   apply_rotary_pos_embr      sc    ( --
&
&C
--
&
&C3w;q>>C/0G3w;q>>C/0GGr9   FmoduleModernBertAttentionr/   attention_masksliding_window_maskr   local_attentionbsrO   output_attentionsrZ   c	                    |                      ||          \  }
}|                    dd                              d          \  }}}t          |||
|          \  }}| j        dz  }t          j        ||                    dd                    |z  }|dk    r|}||z   }t          j        	                    |dt
          j
        	                              |j                  }t          j                            || j        | j        
          }t          j        ||          }|                    dd                                          }|                    |d|          }|r||fS |fS )Nr   r
   r   r%   r         ࿩r&   r&   r&   rO   rR   )ptraining)
rotary_embr   unbindr   head_dimrE   matmulr   
functionalsoftmaxfloat32r   rR   dropoutattention_dropoutr   r*   r,   )r   r/   r   r   r   r   r   rO   r   _kwargsr0   r1   querykeyvaluescaleattn_weightsattn_outputs                     r7   eager_attention_forwardr   ;  se      < @@HCa++22q299E3%eS#s;;JE3OT!E<s}}Q':':;;eCL("",.0L =((2U](SSVVW\WbccL=((9Q\b\k(llL,|U33K''1--88::K""2r3//K +\**>r9   r   target_dtypec	                     ||||          }|j         t          j        t          j        fv}
|
rZ|j         }|                    |          }t          |||| j        r| j        nd| j        |          }|                    |          }n(t          |||| j        r| j        nd| j        |          }|	                    ||          fS )Nr\           )r"   r#   	dropout_pdeterministicwindow_size)
rR   rE   float16bfloat16r   r   r   r   deterministic_flash_attnr,   )r   r/   r   r"   r#   r   r   rO   r   r   convert_dtype
orig_dtypeattns                r7   flash_attention_forwardr   `  s     *SZJ
G
G
GCIemU^%DDM 
 Y
ff\""/!!28/Jf..s 9'
 
 
 wwz""/!!28/Jf..s 9'
 
 
 IIb#  r9   c                    |                      ||          \  }	}
|                    dd                              d          \  }}}t          |||	|
          \  }}|dk    r|}t	          j        |||| j        r| j        nd|                              dd                                          }|	                    |d	|          }|fS )
Nr   r
   r   r%   r   r   r   )r   	attn_maskr&   )
r   r   r   r   Fscaled_dot_product_attentionr   r   r*   r,   )r   r/   r   r   r   r   r   rO   r   r0   r1   r   r   r   r   s                  r7   sdpa_attention_forwardr     s       < @@HCa++22q299E3%eS#s;;JE3("", 	
&28/Jf..s$	
 	
 	
 
1a	  ""2r3//K>r9   )flash_attention_2eagersdpac                   r     e Zd ZdZddedee         f fdZ	 ddej	        dee
         d	ej	        fd
Z xZS )r   a  Performs multi-headed self attention on a batch of unpadded sequences.

    If Flash Attention 2 is installed, this module uses Flash Attention to improve throughput.
    If Flash Attention 2 is not installed, the implementation will use PyTorch's SDPA kernel,
    which requires padding and unpadding inputs, adding some overhead.

    See `forward` method for additional details.
    Nrl   layer_idc                    t                                                       || _        || _        |j        |j        z  dk    r t          d|j         d|j         d          |j        | _        |j        | _        |j        | _	        |j        |j        z  | _
        | j
        | j	        z  | _        t          j        |j        d| j        z  |j                  | _        ||j        z  dk    r6|j        dz  |j        dz  f| _        |j        |j        n|j        }|j        }nd| _        |j        }|j        }|j        d	k    rt-          | j
        ||
          | _        n0t1          j        |          }||_        t7          |          | _        t          j        |j        |j        |j                  | _        |j        dk    rt          j        |j                  nt          j                    | _        tA                      | _!        d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r
   r   r%   r   r   )rO   r#   rP   )rl   r   )"rU   rV   rl   r  rt   num_attention_heads
ValueErrorr   r   	num_headsr   all_head_sizer   r   attention_biasWqkvglobal_attn_every_n_layersr   local_rope_thetaglobal_rope_thetar   _attn_implementationrM   r   copydeepcopy
rope_thetar   r   r{   Identityout_dropsetpruned_heads)rX   rl   r  r  r   config_copyrY   s         r7   rV   zModernBertAttention.__init__  s     ::a?? LF$6  L  Lnt  oI  L  L  L   "(!9(.(G%3*f.HH!]T^;If0!d6H2HvOdeee	f771<<$*$:a$?AW[\A\#]D 4:4K4W00]c]uJ&,&<###+D &,&D#1J&*===?M.EJ  DOO -//K%/K"7{KKKDO)F.0BI^___@F@X[^@^@^
6#;<<<dfdodqdqEEr9   Fr   r   rZ   c           
         |                      |          }|j        d         }| j        j        dk    r#|                    dd| j        | j                  }n#|                    |dd| j        | j                  }t          | j        j                 | f|| j        | j	        || j
        |d|}|d         }|                     |                     |                    }|f|dd          z   S )Nr   r   r&   r
   )r/   r   r   r   rO   r   r   )r
  r+   rl   r  r,   r  r   MODERNBERT_ATTENTION_FUNCTIONr   r   r  r  r   )rX   r   r   kwargsr/   r   attn_outputss          r7   r8   zModernBertAttention.forward  s     ii&& #;+/BBB((2q$.$-@@CC((2r1dndmDDC4T[5UV	
 0"/	
 	
 	
 	
 %Qdggm&<&<==,qrr"222r9   rc   F)rA   rB   rC   rd   r   r   rG   rV   rE   rF   boolr8   rh   ri   s   @r7   r   r     s         %" %"/ %"8C= %" %" %" %" %" %"T -23 3|3 $D>3
 
3 3 3 3 3 3 3 3r9   c                   B    e Zd Zddedee         f fdZ ej        d          dej	        dej	        fd	            Z
	 	 	 	 	 	 ddej	        deej	                 deej	                 deej                 deej	                 dee         dee         dej	        fdZ xZS )ModernBertEncoderLayerNrl   r  c                    t                                                       || _        |dk    rt          j                    | _        n+t          j        |j        |j        |j	                  | _        t          ||          | _        t          j        |j        |j        |j	                  | _        t          |          | _        d S )Nr   ro   )rl   r  )rU   rV   rl   r   r  	attn_normrw   rt   rx   ry   r   r   mlp_normr   mlp)rX   rl   r  rY   s      r7   rV   zModernBertEncoderLayer.__init__  s    q==[]]DNN\&*<&/X^XhiiiDN'vIII	V%7V_SYScddd ((r9   Tr   r   rZ   c                 R    |                      |                     |                    S rc   )r"  r!  rX   r   s     r7   compiled_mlpz#ModernBertEncoderLayer.compiled_mlp  s     xxm44555r9   Fr   r   r   r"   r#   r   c           	      .   |                      |                     |          ||||||          }||d         z   }| j        j        r|                     |          n'|                     |                     |                    }	||	z   }|f|dd          z   S )Nr   r   r   r"   r#   r   r   r   )r   r   rl   r   r%  r"  r!  )
rX   r   r   r   r   r"   r#   r   r  
mlp_outputs
             r7   r8   zModernBertEncoderLayer.forward  s     yyNN=))) 3%!!/ ! 
 
 &Q7 {,8Dm,,,$--6677 	
 &
2,qrr"222r9   rc   )NNNNNF)rA   rB   rC   r   r   rG   rV   rE   r   rF   r%  r   r  r8   rh   ri   s   @r7   r  r    s;       	) 	)/ 	)8C= 	) 	) 	) 	) 	) 	) U]4   6%, 65< 6 6 6 ! 6 266:37-1$(,13 3|3 !.3 &el3	3
 u/03 U\*3 SM3 $D>3 
3 3 3 3 3 3 3 3r9   r  c                        e Zd ZU eed<   dZdZddgZdZdZ	dZ
dej        fdZ	 dd	ee         d
edef fdZd Z fdZ xZS )ModernBertPreTrainedModelrl   modelTrk   r  Fr   c                 p   | j         j        ddt          j        dt          ffd}| j         j        | j         j        t          j        d| j         j        z            z  | j         j        | j         j	        dz  d}t          |t                    r ||j        |d                    d S t          |t                    r0 ||j        |d	                     ||j        |d
                    d S t          |t                     r0 ||j        |d	                     ||j        |d
                    d S t          |t$                    r ||j        |d
                    d S t          |t(                    r ||j        |d
                    d S t          |t,          t.          t0          t2          f          r ||j        |d                    d S t          |t          j                  rF|j        j                            d           |j        "|j        j                                          d S d S d S )Nr
   r   stdc                     t           j                            | j        d| |z  |z             t	          | t           j                  r-| j        (t           j                            | j                   d S d S d S )Nr   )meanr-  ab)r   inittrunc_normal_weightr   r   rq   zeros_)r   r-  cutoff_factors     r7   init_weightz<ModernBertPreTrainedModel._init_weights.<locals>.init_weightA  s    G!! .3&#% "    &"),, 0;*GNN6;/////0 0**r9   g       @r   )inout	embedding	final_outr:  r8  r9  r;  g      ?)!rl   initializer_cutoff_factorr   Modulere   initializer_rangemathsqrtnum_hidden_layersrt   r   rk   rv   r   r   r   r   r
  ModernBertPredictionHeaddenseModernBertForMaskedLMdecoder#ModernBertForSequenceClassificationModernBertForMultipleChoice ModernBertForTokenClassificationModernBertForQuestionAnswering
classifierrw   r4  datafill_rq   zero_)rX   r   r7  stdsr6  s       @r7   _init_weightsz'ModernBertPreTrainedModel._init_weights<  sd   = M	0	 	0 	0 	0 	0 	0 	0 	0 +/;049S4;C`=`3a3aa60$6	
 
 f233 	)K-tK/@AAAAA.. 	)K	4:...K	4;///// 344 	)KT$Z000K	4;///// 899 	)Kd5k22222 566 	)KU444443+0.	
 
 	) K)4+<=====-- 	)M$$S))){& &&(((((	) 	)&&r9   attn_implementationis_init_checkrZ   c                     	 ||                                  rdn|}n# t          t          f$ r Y nw xY wt                                          ||          S )zR
        Checks and dispatches to hhe requested attention implementation.
        Nr   )rP  rQ  )_flash_attn_2_can_dispatchr  ImportErrorrU   %_check_and_adjust_attn_implementation)rX   rP  rQ  rY   s      r7   rU  z?ModernBertPreTrainedModel._check_and_adjust_attn_implementationp  s    	 '.43R3R3T3T. $#(  
 K( 	 	 	D	ww<< 3= = 
 
 	
s    22c                 .   | j         j        du rd S t          | d          rJt          | j                  dk    r2| j         j        rt
                              d           d| j         _        | j        j        dk    r2| j         j        rt
                              d           d| j         _        | j        j        dk    r2| j         j        rt
                              d           d| j         _        | j         j        t                      | j         _        d S d S )	NFhf_device_mapr   zqIf `accelerate` split the model across devices, `torch.compile` will not work. Falling back to non-compiled mode.r   z|Compiling the model with `torch.compile` and using a `torch.mps` device is not supported. Falling back to non-compiled mode.r   z|Compiling the model with `torch.compile` and using a `torch.cpu` device is not supported. Falling back to non-compiled mode.)
rl   r   r   lenrW  loggerwarning_oncerQ   r   r   ra   s    r7   _maybe_set_compilez,ModernBertPreTrainedModel._maybe_set_compile  s)   ;(E11F4)) 	2c$2D.E.E.I.I{, ##9   -2DK);u$${, ##9   -2DK);u$${, ##9   -2DK);(0,?,A,ADK))) 10r9   c                      t                      j        |i |}| j        j        dv r2| j        j        rt                              d           d| j        _        |S )N>   NTzcResizing token embeddings with `torch.compile` is not supported. Falling back to non-compiled mode.F)rU   resize_token_embeddingsrl   r   rY  rZ  )rX   argsr  model_embedsrY   s       r7   r]  z1ModernBertPreTrainedModel.resize_token_embeddings  sh    6uww6GGG;(L88{, ##y   -2DK)r9   r  )rA   rB   rC   r   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attnr   r=  rO  r   rg   r  rU  r[  r]  rh   ri   s   @r7   r*  r*  2  s         &*#/1IJN2)BI 2) 2) 2) 2)j IN
 
#+C=
AE
	
 
 
 
 
 
.B B B>
 
 
 
 
 
 
 
 
r9   r*  inputslabelsc                    |                     dt          j                  }t          j        |                                d                                          }t          |                                                                          }t          j        j	        
                    t          j        |dt          j                  d          }|                                 dk    r|                                 |         }n#| j        ^}	}
}|	|
z  } | j        |g|R  |         }||                                |         nd}||                                |         nd}||||||fS )	a  
    Remove padding from input sequences.

    Args:
        inputs: (batch, seqlen, ...) or (batch, seqlen)
        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
        position_ids: (batch, seqlen), int, position ids
        labels: (batch, seqlen), int, labels

    Returns:
        unpadded_inputs: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz)
        cu_seqlens: (batch + 1), the cumulative sequence lengths
        max_seqlen_in_batch: int
        unpadded_position_ids: (total_nnz) or None
        unpadded_labels: (total_nnz) or None
    r&   r   F)as_tupler   )r   r   r%   N)sumrE   int32nonzeroflattenrG   maxitemr   r   padcumsumrO   r+   r,   )rf  r   r   rg  seqlens_in_batchindicesmax_seqlen_in_batchr"   unpadded_inputsbatchseqlenrestr+   unpadded_position_idsunpadded_labelss                  r7   _unpad_modernbert_inputr{    sW   . &))b)DDmN2244uEEEMMOOG.224499;;<<$((6FAUZU`)a)a)acijjJzz||q ..**73%|v%&+e3d333G<?K?WL00227;;]a393Efnn&&w//4OGZ1DF[]lllr9   rs  rv  rw  c                 6   |                                  dk    r@t          j        ||z  | j        | j                  }| ||<   |                    ||          }n@| j        ^}}t          j        ||z  g|R | j        | j        d}| ||<    |j        ||g|R  }|S )aQ  
    Add padding to sequences.

    Args:
        inputs: (total_nnz, ...) or (total_nnz,), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz)
        batch: int, batch size
        seqlen: int, max sequence length

    Returns:
        padded_inputs: (batch, seqlen, ...) or (batch, seqlen)
    r   rR   rQ   )rO   rE   zerosrR   rQ   r,   r+   )rf  rs  rv  rw  outputpadded_inputs_rx  s           r7   _pad_modernbert_outputr    s    $ zz||qUV^6<VVV wE622<DUV^]d]]&,v}]]] w#E69D999r9   c            !           e Zd Zdef fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j                 dee	j                 d	ee	j
                 d
ee	j                 dee	j                 dee	j                 dee         dee         dee         dee         dee         dee         deee	j        df         ef         fd            Zde	j        dede	j        fdZ xZS )ModernBertModelrl   c                 |   t                                                     | _        t                    | _        t          j        fdt          j                  D                       | _	        t          j
        j        j        j                  | _        d| _        |                                  d S )Nc                 0    g | ]}t          |          S rH   )r  ).0r  rl   s     r7   
<listcomp>z,ModernBertModel.__init__.<locals>.<listcomp>  s$    fff(#FH55fffr9   ro   F)rU   rV   rl   rk   
embeddingsr   
ModuleListrangerA  layersrw   rt   rx   ry   
final_normgradient_checkpointing	post_initr~   s    `r7   rV   zModernBertModel.__init__  s       .v66mffffeFLdFeFefff
 
 ,v'9vU[Uefff&+#r9   c                     | j         j        S rc   r  rv   ra   s    r7   get_input_embeddingsz$ModernBertModel.get_input_embeddings  s    --r9   c                     || j         _        d S rc   r  )rX   r   s     r7   set_input_embeddingsz$ModernBertModel.set_input_embeddings  s    ).&&&r9   Nr   r   r   r   r   rs  r"   r#   
batch_sizeseq_lenr   output_hidden_statesreturn_dictrZ   .c           
        	
 ||n| j         j        }||n| j         j        }||n| j         j        }|du |duz  rt	          d          |rdnd}|rdnd}|                                  ||                     ||           	)
'||j        dd         \  	
n|j        dd         \  	
||j        n|j        }|#t          j
        	
f|t          j                  }d}| j         j        dk    rc`|^|\d}|Bt          j                    5  t          ||	          ^}}}}ddd           n# 1 swxY w Y   n\t          ||	          ^}}}}nE|)t          j        
|
                              d          }|                     ||          \  }}|                     ||          }| j        D ]E}|r||fz   } ||||||||          }|d         }|rt)          |          dk    r||d         fz   }F|r||fz   }|                     |          }|r3t-          |	
          }|t/          	
fd|D                       }n^| j         j        dk    rN|L|d                                         dk    r.|                    d          }t/          d |D                       }|st/          d |||fD                       S t3          |||          S )  
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        Nz:You must specify exactly one of input_ids or inputs_embedsrH   r%   rT   Fr   T)rf  r   rQ   r   )r   )r   r   r'  r   rf  rs  rv  rw  c              3   >   K   | ]}t          |           V  dS )r  N)r  )r  hsr  rs  r  s     r7   	<genexpr>z*ModernBertModel.forward.<locals>.<genexpr>|  sI       * * +"gZ`ghhh* * * * * *r9   r&   c              3   @   K   | ]}|                     d           V  dS )r   N)r   )r  r  s     r7   r  z*ModernBertModel.forward.<locals>.<genexpr>  s,      %R%R"bll1oo%R%R%R%R%R%Rr9   c              3      K   | ]}||V  	d S rc   rH   )r  vs     r7   r  z*ModernBertModel.forward.<locals>.<genexpr>  s(      mmq_`_l_l_l_l_lmmr9   )last_hidden_stater   
attentions)rl   r   r  use_return_dictr  r[  %warn_if_padding_and_no_attention_maskr+   rQ   rE   onesr  r  r   r{  aranger   _update_attention_maskr  r  rX  r  r  rf   rO   r   )rX   r   r   r   r   r   rs  r"   r#   r  r  r   r  r  all_hidden_statesall_self_attentionsrQ   repadr  r   encoder_layerlayer_outputss         `  ``           r7   r8   zModernBertModel.forward  sb   B 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]-t";< 	[YZZZ"6@BBD$5?bb4!!! 66y.QQQ'/(&3&9"1"&=#
GG&/obqb&9#
G%.%:!!@T!"ZW(=fTYT^___N;+/BBB:#5*:L (  I`#,^J J JF	7J
Q              
 Ja,^J J JFM7J
Q #$|GFCCCMMaPP262M2M2C 3N 3 3/N/ )=YY![ 	P 	PM# I$58H$H!)M-$7)%%"3  M *!,M  PS%7%7!%;%;&9]1=M<O&O# 	E 1]4D D66 	S2$gZPW  M !,$) * * * * * */* * * % %! K,0CCC!-!"%))++q00)33A66M %%R%R@Q%R%R%R R R 	nmm]4EGZ$[mmmmmm++*
 
 
 	
s   D66D:=D:c                    |ro| j         j        dk    r't                              d           d| j         _        n8| j         j        dk    r(t                              d| j         j         d           t	          || j                  }t          j        |j        d                   	                    d          }t          j
        ||j        z
            }|| j         j        dz  k    	                    d          	                    d                              |j                  }|                    |                                t          j        | j                  j                  }||fS )Nr   zOutputting attentions is only supported with the 'eager' attention implementation, not with "sdpa". Falling back to `attn_implementation="eager"`.r   zZOutputting attentions is only supported with the eager attention implementation, not with zT. Consider setting `attn_implementation="eager"`. Setting `output_attentions=False`.r%   r   )rl   r  rY  rZ  r   rR   rE   r  r+   r   absTr   r   rQ   masked_filllogical_notfinfomin)rX   r   r   global_attention_maskrowsdistancewindow_maskr   s           r7   r  z&ModernBertModel._update_attention_mask  sg    	{/699##V   4;001W<<##: $ @: : :   !;>4: V V |17:;;EEaHH9TDF]++ 499DDQGGQQRSTTWWXfXmnn 	 4??@W@W@Y@Y[`[fgkgq[r[r[vww$&999r9   NNNNNNNNNNNNN)rA   rB   rC   r   rV   r  r  r   r   rE   r   rF   rG   r  r   rf   r   r8   r  rh   ri   s   @r7   r  r    s       	/ 	 	 	 	 	 	. . ./ / /  15156:3704*.-1$($(!%,0/3&*A
 A
E,-A
 !.A
 &el3	A

 u/0A
  -A
 %,'A
 U\*A
 SMA
 SMA
 #A
 $D>A
 'tnA
 d^A
 
uU\3&'8	9A
 A
 A
 ^A
F:U\ :VZ :_d_k : : : : : : : :r9   r  c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )rB  rl   c                 .   t                                                       || _        t          j        |j        |j        |j                  | _        t          |j	                 | _
        t          j        |j        |j        |j                  | _        d S )Nro   )rU   rV   rl   r   r   rt   classifier_biasrC  r   classifier_activationr   rw   rx   ry   rz   r~   s     r7   rV   z!ModernBertPredictionHead.__init__  sq    Yv163EvG]^^
&67L!3vO_```			r9   r   rZ   c                 x    |                      |                     |                     |                              S rc   )rz   r   rC  r$  s     r7   r8   z ModernBertPredictionHead.forward  s,    yy$**]";";<<===r9   )	rA   rB   rC   r   rV   rE   rF   r8   rh   ri   s   @r7   rB  rB    sr        a/ a a a a a a>U\ >el > > > > > > > >r9   rB  zd
    The ModernBert Model with a decoder head on top that is used for masked language modeling.
    )custom_introc            "       (    e Zd ZdgZdef fdZd Zdej        fdZ	 e
j        d          d	e
j        d
e
j        fd            Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddee
j                 dee
j                 dee
j                 dee
j                 dee
j                 dee
j                 dee
j                 dee
j                 dee         dee         dee         dee         dee         dee         d
eee
j                 ef         fd            Z xZS )rD  zdecoder.weightrl   c                 j   t                                          |           || _        t          |          | _        t          |          | _        t          j        |j	        |j
        |j                  | _        | j        j        | _        | j        j        | _        |                                  d S )Nr   )rU   rV   rl   r  r+  rB  headr   r   rt   rs   decoder_biasrE  sparse_predictionsparse_pred_ignore_indexr  r~   s     r7   rV   zModernBertForMaskedLM.__init__  s       $V,,
,V44	y!3V5FVM`aaa!%!>(,(L% 	r9   c                     | j         S rc   rE  ra   s    r7   get_output_embeddingsz+ModernBertForMaskedLM.get_output_embeddings  s
    |r9   new_embeddingsc                     || _         d S rc   r  )rX   r  s     r7   set_output_embeddingsz+ModernBertForMaskedLM.set_output_embeddings  s    %r9   Tr   r  rZ   c                 R    |                      |                     |                    S rc   )rE  r  )rX   r  s     r7   compiled_headz#ModernBertForMaskedLM.compiled_head  s     ||DIIf--...r9   Nr   r   r   r   r   rg  rs  r"   r#   r  r  r   r  r  c                 (   ||n| j         j        }|                                  | j         j        dk    r|||	|
)|'||j        dd         \  }
}n|j        dd         \  }
}||j        n|j        }|#t          j        |
|f|t          j                  }|Ft          j	                    5  t          ||||          \  }}}}	}}ddd           n# 1 swxY w Y   nt          ||||          \  }}}}	}}|                     ||||||||	|
||||          }|d         }| j        rS|Q|                    d          }|                    |j        d         d          }|| j        k    }||         }||         }| j         j        r|                     |          n'|                     |                     |                    }d}| | j        ||fd	| j         j        i|}| j         j        dk    r| j         j        s|t-                      nt          j	                    5  t/          |||
|
          }ddd           n# 1 swxY w Y   t1          |dd          g }|j        D ]f}|                                dk    r&|j        d         dk    r|                    d          }|                    t/          |||
|
                     gt;          |          |_        |s|f}||f|z   n|S t=          |||j        |j                  S )r  Nr   r%   rT   )rf  r   r   rg  r   r   r   r   r   rs  r"   r#   r  r  r   r  r  r   r&   rs   r  r   r
   r   losslogitsr   r  ) rl   r  r[  r  r+   rQ   rE   r  r  r   r{  r+  r  r,   r  r   r  rE  r  loss_functionrs   repad_logits_with_gradr   r  getattrr   rO   squeezeappendrf   r   r  )rX   r   r   r   r   r   rg  rs  r"   r#   r  r  r   r  r  r  rQ   outputsr  mask_tokensr  r  padded_hidden_statesr  r  s                            r7   r8   zModernBertForMaskedLM.forward  sX   F &1%<kk$+B]!!!;+/BBB:#5*:L%'/$0.;.A"1".E+
GG.7obqb.A+
G-6-B))H\!)%*ZW0Ef\a\f%g%g%gN (  [r#,^Zfou\ \ \X	7J
LRX              
 \s,^Zfou\ \ \XM7J
LRX **) 3%'!!!/!5#  
 
 $AJ! 	)f&8[[__F 1 6 6v|A K K !D$AAK 1+ >K(F {,<D0111dii(9::;; 	 %4%ffbbAWb[abbD;+/BBB"&+"Dk\a\i\k\k r r/vwV`ipqqqr r r r r r r r r r r r r r r w66B')$!/  Bvvxx1}}!)9)9ZZ]](//.b'Q[dklll    )..B(C(C% 	FYF)-)9TGf$$vE!/)	
 
 
 	
s$   0CCC(IIINNNNNNNNNNNNNN)rA   rB   rC   _tied_weights_keysr   rV   r  r   r   r  rE   r   rF   r  r   r   r   rG   r  r   rf   r   r8   rh   ri   s   @r7   rD  rD    s        ++/        &BI & & & & U]4   /EL /U\ / / / ! /  15156:/304)-*.-1$($(!%,0/3&*x
 x
E,-x
 !.x
 &el3	x

 u|,x
  -x
 &x
 %,'x
 U\*x
 SMx
 SMx
 #x
 $D>x
 'tnx
 d^x
" 
uU\"N2	3#x
 x
 x
 ^x
 x
 x
 x
 x
r9   rD  z`
    The ModernBert Model with a sequence classification head on top that performs pooling.
    c            "           e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej	                 deej	                 deej	                 deej	                 d	eej	                 d
eej	                 deej	                 dee
         dee
         dee
         dee         dee         dee         deeej	                 ef         fd            Z xZS )rF  rl   c                    t                                          |           |j        | _        || _        t	          |          | _        t          |          | _        t          j	        
                    |j                  | _        t          j        |j        |j                  | _        |                                  d S rc   )rU   rV   
num_labelsrl   r  r+  rB  r  rE   r   r{   classifier_dropoutr}   r   rt   rJ  r  r~   s     r7   rV   z,ModernBertForSequenceClassification.__init___  s        +$V,,
,V44	H$$V%>??	)F$68IJJ 	r9   Nr   r   r   r   r   rg  rs  r"   r#   r  r  r   r  r  rZ   c                    ||n| j         j        }|                                  ||                     ||           |
)|'||j        dd         \  }
}n|j        dd         \  }
}||j        n|j        }|#t          j        |
|f|t          j                  }| 	                    ||||||||	|
||||          }|d         }| j         j
        dk    r|dddf         }nT| j         j
        dk    rD||                    d          z                      d	
          |                    d	d          z  }|                     |          }|                     |          }|                     |          }d}|Z| j         j        f| j        d	k    rd| j         _        nN| j        d	k    r7|j        t          j        k    s|j        t          j        k    rd| j         _        nd| j         _        | j         j        dk    rWt+                      }| j        d	k    r1 ||                                |                                          }n |||          }n| j         j        dk    rGt/                      } ||                    d| j                  |                    d                    }n*| j         j        dk    rt3                      } |||          }|s|f}||f|z   n|S t5          |||j        |j                  S )aB  
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        Nr%   rT   r  r   clsr/  r&   r   r   TrO   keepdim
regressionsingle_label_classificationmulti_label_classificationr  )rl   r  r[  r  r+   rQ   rE   r  r  r+  classifier_poolingr   rj  r  r}   rJ  problem_typer  rR   longrG   r	   r  r   r,   r   r   r   r  )rX   r   r   r   r   r   rg  rs  r"   r#   r  r  r   r  r  r  rQ   r  r  pooled_outputr  r  loss_fctr  s                           r7   r8   z+ModernBertForSequenceClassification.forwardl  s]   N &1%<kk$+B]!!! 66y.QQQ'/(&3&9"1"&=#
GG&/obqb&9#
G%.%:!!@T!"ZW(=fTYT^___N**) 3%'!!!/!5#  
 
 $AJ;)U22 1!!!Q$ 7[+v55!2^5M5Mb5Q5Q!Q V V[\ V ] ]`n`r`rt as a a ! 		"344		-00//{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB @ @&++b//RR)-III,..x// 	FYF)-)9TGf$$vE'!/)	
 
 
 	
r9   r  )rA   rB   rC   r   rV   r   r   rE   r   rF   rG   r  r   rf   r   r8   rh   ri   s   @r7   rF  rF  Y  s       /        15156:/304)-*.-1$($(!%,0/3&*r
 r
E,-r
 !.r
 &el3	r

 u|,r
  -r
 &r
 %,'r
 U\*r
 SMr
 SMr
 #r
 $D>r
 'tnr
 d^r
" 
uU\"$<<	=#r
 r
 r
 ^r
 r
 r
 r
 r
r9   rF  zv
    The ModernBert Model with a token classification head on top, e.g. for Named Entity Recognition (NER) tasks.
    c            "           e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej	                 deej	                 deej	                 deej	                 d	eej	                 d
eej	                 deej	                 dee
         dee
         dee
         dee         dee         dee         deeej	                 ef         fd            Z xZS )rH  rl   c                 t   t                                          |           |j        | _        t          |          | _        t          |          | _        t          j        	                    |j
                  | _        t          j        |j        |j                  | _        |                                  d S rc   rU   rV   r  r  r+  rB  r  rE   r   r{   r  r}   r   rt   rJ  r  r~   s     r7   rV   z)ModernBertForTokenClassification.__init__  s        +$V,,
,V44	H$$V%>??	)F$68IJJ 	r9   Nr   r   r   r   r   rg  rs  r"   r#   r  r  r   r  r  rZ   c                    ||n| j         j        }|                                  |                     ||||||||	|
||||          }|d         }|                     |          }|                     |          }|                     |          }d}|Ft                      } ||                    d| j	                  |                    d                    }|s|f|dd         z   }||f|z   n|S t          |||j        |j                  S )a  
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        Nr  r   r&   r   r  )rl   r  r[  r+  r  r}   rJ  r   r,   r  r   r   r  )rX   r   r   r   r   r   rg  rs  r"   r#   r  r  r   r  r  r  r  r  r  r  r  s                        r7   r8   z(ModernBertForTokenClassification.forward  sK   H &1%<kk$+B]!!!**) 3%'!!!/!5#  
 
 $AJ II&788 II&788!233'))H8FKKDO<<fkk"ooNND 	FY,F)-)9TGf$$vE$!/)	
 
 
 	
r9   r  )rA   rB   rC   r   rV   r   r   rE   r   rF   rG   r  r   rf   r   r8   rh   ri   s   @r7   rH  rH    s       
/ 
 
 
 
 
 
  15156:/304)-*.-1$($(!%,0/3&*I
 I
E,-I
 !.I
 &el3	I

 u|,I
  -I
 &I
 %,'I
 U\*I
 SMI
 SMI
 #I
 $D>I
 'tnI
 d^I
  
uU\"$99	:!I
 I
 I
 ^I
 I
 I
 I
 I
r9   rH  c            "           e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 dee	         dee	         dee	         dee
         dee
         dee
         deeej                 ef         fd            Z xZS )rI  rl   c                 t   t                                          |           |j        | _        t          |          | _        t          |          | _        t          j        	                    |j
                  | _        t          j        |j        |j                  | _        |                                  d S rc   r  r~   s     r7   rV   z'ModernBertForQuestionAnswering.__init__C  s        +$V,,
,V44	H$$V%>??	)F$68IJJr9   Nr   r   r   r   start_positionsend_positionsrs  r"   r#   r  r  r   r  r  rZ   c                    ||n| j         j        }|                                  |                     |||||||	|
||||          }|d         }|                     |          }|                     |          }|                     |          }|                    dd          \  }}|                    d          	                                }|                    d          	                                }d}|| | j
        ||||fi |}|s||f|dd         z   }||f|z   n|S t          ||||j        |j                  S )r  N)r   r   r   rs  r"   r#   r  r  r   r  r  r   r   r&   r   )r  start_logits
end_logitsr   r  )rl   r  r[  r+  r  r}   rJ  splitr  r*   r  r   r   r  )rX   r   r   r   r   r  r  rs  r"   r#   r  r  r   r  r  r  r  r  r  r  r  r  r  s                          r7   r8   z&ModernBertForQuestionAnswering.forwardN  s   F &1%<kk$+B]!!!**) 3%!!!/!5#  
 
 $AJ II&788 II&788!233#)<<r<#:#: j#++B//::<<''++6688
&=+D%4%lJQ^iibhiiD 	F"J/'!""+=F)-)9TGf$$vE+%!!/)
 
 
 	
r9   r  )rA   rB   rC   r   rV   r   r   rE   rF   rG   r  r   rf   r   r8   rh   ri   s   @r7   rI  rI  A  s       	/ 	 	 	 	 	 	  266:/32604*.-1$($(!%,0/3&*K
 K
EL)K
 !.K
 &el3	K

 u|,K
 "%,/K
  -K
 %,'K
 U\*K
 SMK
 SMK
 #K
 $D>K
 'tnK
 d^K
" 
uU\"$@@	A#K
 K
 K
 ^K
 K
 K
 K
 K
r9   rI  z
    The ModernBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.
    c            "           e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej	                 deej	                 deej	                 deej	                 d	eej	                 d
eej	                 deej	                 dee
         dee
         dee
         dee         dee         dee         deeej	                 ef         fd            Z xZS )rG  rl   c                 `   t                                          |           || _        t          |          | _        t          |          | _        t          j        	                    |j
                  | _        t          j        |j        d          | _        |                                  d S Nr   )rU   rV   rl   r  r+  rB  r  rE   r   r{   r  r}   r   rt   rJ  r  r~   s     r7   rV   z$ModernBertForMultipleChoice.__init__  s       $V,,
,V44	H$$V%>??	)F$6:: 	r9   Nr   r   r   r   r   rg  rs  r"   r#   r  r  r   r  r  rZ   c                    ||n| j         j        }||j        d         n|j        d         }|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|=|                    d|                    d          |                    d                    nd}|                                  |                     ||||||||	|
||||          }|d         }| j         j        dk    rt          j	        |j        d         |j
                  }|/|                    d	                              |j
                  }n&t          j        dt          j        |j
        
          }|||f         }nV| j         j        dk    rF|                    dd          }||                    d          z                      d	          |z  }|                     |          }|                     |          }|                     |          }|                    d|          }d}|t)          j                    } |||          }|s|f|dd         z   }||f|z   n|S t-          |||j        |j                  S )a  
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        Nr   r&   r  r   r  r  r   r}  r/  Tr  r  )rl   r  r+   r,   sizer[  r+  r  rE   r  rQ   argmaxr   tensorr  rj  r   r  r}   rJ  r   r   r   r   r  )rX   r   r   r   r   r   rg  rs  r"   r#   r  r  r   r  r  r  num_choicesr  r  	indices_0cls_masknum_non_pad_tokensr  r  reshaped_logitsr  r  r  s                               r7   r8   z#ModernBertForMultipleChoice.forward  s!   L &1%<kk$+B],5,Aioa((}GZ[\G]>G>SINN2y~~b'9'9:::Y]	M[Mg,,R1D1DR1H1HIIImqGSG_|((\->->r-B-BCCCei ( r=#5#5b#9#9=;M;Mb;Q;QRRR 	 	!!!**) 3%'!!!/!5#  
 
 $AJ ;)U22%6%<Q%?HYH`aaaI))00R088;;<M<TUU !<DUD\]]] 1)X2E F [+v55!/!3!34!3!H!H!2^5M5Mb5Q5Q!Q V V[\ V ] ]`r r		"344		-00// ++b+66*,,H8OV44D 	F%''!""+5F)-)9TGf$$vE("!/)	
 
 
 	
r9   r  )rA   rB   rC   r   rV   r   r   rE   r   rF   rG   r  r   rf   r   r8   rh   ri   s   @r7   rG  rG    s       
/ 
 
 
 
 
 
  15156:/304)-*.-1$($(!%,0/3&*i
 i
E,-i
 !.i
 &el3	i

 u|,i
  -i
 &i
 %,'i
 U\*i
 SMi
 SMi
 #i
 $D>i
 'tni
 d^i
" 
uU\"$==	>#i
 i
 i
 ^i
 i
 i
 i
 i
r9   rG  )r  r*  rD  rF  rH  rI  rG  r@   r  r  )Wr  r?  
contextlibr   typingr   r   rE   torch.nn.functionalr   r   r   torch.nnr   r   r	   activationsr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   utilsr   r   r   utils.import_utilsr   configuration_modernbertr   flash_attn.flash_attn_interfacer   flash_attn.layers.rotaryr   flash_attn.ops.triton.rotaryr   object
get_loggerrA   rY  autogradFunctionr!   rF   rG   rK   rM   r=  rk   r   r   r   r   r   rf   r  r   r   rR   r   r   r  r   r  r*  r{  r  r  rB  rD  rF  rH  rI  rG  __all__rH   r9   r7   <module>r     s  ,   " " " " " " " " " " " " " "                 A A A A A A A A A A ! ! ! ! ! ! B B B B B B 9 9 9 9 9 9                L K K K K K K K - - - - - - G G G G G G G G G G 5 5 5 5 5 5 6 6 6 6 6 6  PPPPPP8888889999999O 
	H	%	%46 46 46 46 46%.1 46 46 46v *. $L L &	L
 L L L L42Q 2Q 2Q 2Q 2Q 2Q 2Q 2Qj    29   <: : : : :BI : : :(!< !< !< !< !<	 !< !< !<H( ( (   H )." "!"	" L" 	"
 5+," 38_" 	" 
"  ~" 5u|+,eEL.AAB" " " "\ !&(! (!!(!	(! 2(! 	(!
 (! 38_(! 	(! 
(! +(! 5<(! (! (! (!V ! 	  L  	 
 5+,  38_  	  
  5<       H 1$"! ! L3 L3 L3 L3 L3") L3 L3 L3^+3 +3 +3 +3 +37 +3 +3 +3\ } } } } } } } }F ,0%)	&m &mL&mL&m 5<(&m U\"	&m
 5<u|S(5<:PRZ[`[gRhhi&m &m &m &mRL\  	
 \   > s: s: s: s: s:/ s: s: s:l	> 	> 	> 	> 	>ry 	> 	> 	>   
S
 S
 S
 S
 S
5 S
 S
 
S
l   
A
 A
 A
 A
 A
*C A
 A
 
A
H   
W
 W
 W
 W
 W
'@ W
 W
 
W
t X
 X
 X
 X
 X
%> X
 X
 X
v   
w
 w
 w
 w
 w
"; w
 w
 
w
t  r9   