
     `i                     X   d dl mZmZ d dlZd dlmZ d dlZd dlm	Z
 d dl	Zd dlmZmZmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d	d
lmZmZmZmZmZm Z m!Z!m"Z" d	dl#m$Z$m%Z%m&Z&m'Z'm(Z( d	dl)m*Z*m+Z+m,Z,m-Z- ddl.m/Z/  e-j0        e1          Z2dZ3dZ4ej5        Z5ej6        j7         G d de*                      Z8dZ9dZ: G d dej;                  Z< G d dej;                  Z= G d dej;                  Z> G d dej;                  Z? G d dej;                  Z@ G d d ej;                  ZA G d! d"ej;                  ZB G d# d$ej;                  ZC G d% d&ej;                  ZD G d' d(ej;                  ZE G d) d*ej;                  ZF G d+ d,e%          ZG G d- d.ej;                  ZH e+d/e9           G d0 d1eG                      ZI e&eIe3ee4            G d2 d3ej;                  ZJ G d4 d5ej;                  ZK e+d6e9           G d7 d8eG                      ZL e&eLe3ee4            G d9 d:ej;                  ZM e+d;e9           G d< d=eG                      ZNd>ZO e(eNe:P                    d?          eOz               e'eNe8e4@            G dA dBej;                  ZQ e+dCe9           G dD dEeG                      ZR e&eRe3e"e4           dF ZS G dG dHej;                  ZT G dI dJej;                  ZU e+dKe9           G dL dMeG                      ZV e(eVe:P                    dN                      e&eVe3ee4            G dO dPej;                  ZW e+dQe9           G dR dSeG                      ZX e&eXe3e e4            G dT dUej;                  ZY G dV dWej;                  ZZ e+dXe9           G dY dZeG                      Z[ e&e[e3e!e4            G d[ d\ej;                  Z\ e+d]e9           G d^ d_eG                      Z] e&e]e3ee4           g d`Z^dS )a    )CallableOptionalN)
FrozenDictfreezeunfreeze)combine_masksmake_causal_mask)partitioning)dot_product_attention_weights)flatten_dictunflatten_dict)lax   )FlaxBaseModelOutput-FlaxBaseModelOutputWithPastAndCrossAttentions%FlaxCausalLMOutputWithCrossAttentionsFlaxMaskedLMOutputFlaxMultipleChoiceModelOutput FlaxQuestionAnsweringModelOutputFlaxSequenceClassifierOutputFlaxTokenClassifierOutput)ACT2FNFlaxPreTrainedModelappend_call_sample_docstring append_replace_return_docstringsoverwrite_call_docstring)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )ElectraConfigz"google/electra-small-discriminatorr"   c                       e Zd ZU dZdZej        ed<   dZe	e
ej                          ed<   dZe	e
ej                          ed<   dS )FlaxElectraForPreTrainingOutputaa  
    Output type of [`ElectraForPreTraining`].

    Args:
        logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlogitshidden_states
attentions)__name__
__module____qualname____doc__r%   jnpndarray__annotations__r&   r   tupler'        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/electra/modeling_flax_electra.pyr$   r$   ;   sh          & FCK26M8E#+./666/3Js{+,33333r1   r$   a  

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)

    This model is also a Flax Linen
    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
    regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`ElectraConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`numpy.ndarray` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        head_mask (`numpy.ndarray` of shape `({0})`, `optional):
            Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

c                   T    e Zd ZU dZeed<   ej        Zej        ed<   d Z	d	de
fdZdS )
FlaxElectraEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.configdtypec                    t          j        | j        j        | j        j        t
          j         j                            | j        j                            | _	        t          j        | j        j
        | j        j        t
          j         j                            | j        j                            | _        t          j        | j        j        | j        j        t
          j         j                            | j        j                            | _        t          j        | j        j        | j                  | _        t          j        | j        j                  | _        d S )N)stddev)embedding_initepsilonr6   rate)nnEmbedr5   
vocab_sizeembedding_sizejaxinitializersnormalinitializer_rangeword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsr6   Dropouthidden_dropout_probdropoutselfs    r2   setupzFlaxElectraEmbeddings.setup   s   !xK"K&6.55T[=Z5[[ 
  
  

 $&8K/K&6.55T[=Z5[[$
 $
 $
 
 &(XK'K&6.55T[=Z5[[&
 &
 &
"
 dk.HPTPZ[[[zt{'FGGGr1   Tdeterministicc                 ^   |                      |                    d                    }|                     |                    d                    }|                     |                    d                    }||z   |z   }	|                     |	          }	|                     |	|          }	|	S )Ni4rS   )rF   astyperH   rJ   rK   rO   )
rQ   	input_idstoken_type_idsposition_idsattention_maskrS   inputs_embedsposition_embedsrJ   r&   s
             r2   __call__zFlaxElectraEmbeddings.__call__   s    ,,Y-=-=d-C-CDD22<3F3Ft3L3LMM $ : :>;P;PQU;V;V W W &(==O }55]-PPr1   NTr(   r)   r*   r+   r"   r.   r,   float32r6   rR   boolr^   r0   r1   r2   r4   r4      so         QQ{E39"""H H H( _c      r1   r4   c                       e Zd ZU eed<   dZeed<   ej        Z	ej	        ed<   d Z
d Zd Zej        d             Z	 	 	 	 ddeej                 dedefdZd	S )FlaxElectraSelfAttentionr5   Fcausalr6   c                 "   | j         j        | j         j        z  | _        | j         j        | j         j        z  dk    rt	          d          t          j        | j         j        | j        t          j        j	        
                    | j         j                            | _        t          j        | j         j        | j        t          j        j	        
                    | j         j                            | _        t          j        | j         j        | j        t          j        j	        
                    | j         j                            | _        | j        r8t!          t#          j        d| j         j        fd          d          | _        d S d S )Nr   z`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads`                    : {self.config.num_attention_heads})r6   kernel_initr!   rb   r6   )r5   hidden_sizenum_attention_headshead_dim
ValueErrorr>   Denser6   rB   rC   rD   rE   querykeyvaluere   r	   r,   onesrG   causal_maskrP   s    r2   rR   zFlaxElectraSelfAttention.setup   sb   /4;3RR;"T[%DDIII  
 XK#*+224;3PQQ
 
 


 8K#*+224;3PQQ
 
 

 XK#*+224;3PQQ
 
 

 ; 	/!T[@APPPX^     D	 	r1   c                 n    |                     |j        d d         | j        j        | j        fz             S N   )reshapeshaper5   rj   rk   rQ   r&   s     r2   _split_headsz%FlaxElectraSelfAttention._split_heads   s5    $$]%8!%<@_aean?o%opppr1   c                 b    |                     |j        d d         | j        j        fz             S rt   )rv   rw   r5   ri   rx   s     r2   _merge_headsz%FlaxElectraSelfAttention._merge_heads   s/    $$]%8!%<@W?Y%YZZZr1   c                    |                      dd          }|                     ddt          j        |j        |j                  }|                     ddt          j        |j        |j                  }|                     ddd           }|r|j        j        ^ }	}
}}|j        }dt          |	          z  |ddfz   }t          j	        |j        ||          }t          j	        |j        ||          }||_        ||_        |j        d         }|j        |z   |_        t          j
        t          j        |
          ||z   k     t          |	          d||
fz             }t          ||          }|||fS )	a\  
        This function takes projected key, value states from a single input token and concatenates the states to cached
        states from previous steps. This function is slightly adapted from the official Flax repository:
        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
        cache
cached_keycached_valuecache_indexc                  B    t          j        dt           j                  S )Nr   rh   )r,   arrayint32r0   r1   r2   <lambda>z@FlaxElectraSelfAttention._concatenate_to_cache.<locals>.<lambda>   s    CIaWZW`DaDaDa r1   r   r   r!   )has_variablevariabler,   zerosrw   r6   rp   lenr   dynamic_update_slicebroadcast_toaranger/   r   )rQ   ro   rp   rn   r[   is_initializedr~   r   r   
batch_dims
max_length	num_headsdepth_per_head	cur_indexindicesnum_updated_cache_vectorspad_masks                    r2   _concatenate_to_cachez.FlaxElectraSelfAttention._concatenate_to_cache   sp    **7LAA]]7L#)SYPSPYZZ
}}WnciV[VabbmmG]<a<abb 	EAKAQAW>ZY#)IS__,	1a/@@G*:+;S'JJC,\-?PPE"J!&L(-A% + 14M MK'
:&&5N)NNj!!Q(A:$NN H +8^DDNE>))r1   NTkey_value_states
init_cacheoutput_attentionsc                    |d u}|j         d         }	|                     |          }
|r+|                     |          }|                     |          }n*|                     |          }|                     |          }|                     |
          }
|                     |          }|                     |          }| j        r|
j         d         |j         d         }}|                     dd          rU| j        d         d         }| j        d         d         j         d         }t          j	        | j
        dd|dfdd||f          }n| j
        d d d d d |d |f         }t          j        ||	f|j         dd          z             }|F| j        r?t          j        t          j        |d          |j                   }t          ||          }n"| j        r|}n|t          j        |d          }| j        r4|                     dd          s|r|                     |||
|          \  }}}|t          j        |dk    t          j        |j         d                              | j                  t          j        |j         t          j        | j                  j                                      | j                            }nd }d }|s%| j        j        dk    r|                     d	          }t3          |
|||| j        j        d
|| j        d 	  	        }|t          j        d||          }t          j        d||          }|                    |j         d d         dz             }|r||fn|f}|S )Nr   r!   r}   r~   r   )axisg        rO   T)biasdropout_rngdropout_ratebroadcast_dropoutrS   r6   	precisionz...hqk,h->...hqkz...hqk,...khd->...qhdru   ))rw   rn   ro   rp   ry   re   r   	variablesr   dynamic_slicerr   r,   r   expand_dimsr   r   selectfullrW   r6   finfominr5   attention_probs_dropout_probmake_rngr   einsumrv   )rQ   r&   r[   layer_head_maskr   r   rS   r   is_cross_attention
batch_sizequery_states
key_statesvalue_statesquery_length
key_length
mask_shiftmax_decoder_lengthrr   attention_biasr   attn_weightsattn_outputoutputss                          r2   r^   z!FlaxElectraSelfAttention.__call__  s    .T9"(+
 zz-00 	5"233J::&677LL -00J::m44L((66&&z22
((66 ; 
	_'3'9!'<j>Nq>Q*L  ,77 Q!^G4]C
%)^G%<\%J%PQR%S"!/$q!Z&;aLRd=e  #.qqq!!!]l]KZK/OP*;HYZ[Z\Z\H]8]^^K %$+% -conS[.\.\.\^i^oppN*>;GGNN[ 	L(NN' _^(KKKN ; 	D--g|DD 	
 	7;7Q7QL,8 84Jn
 % Z"-s33::4:FF-sy/D/D/HIIPPQUQ[\\ NN "N 	3!IC!O!O--	22K4#A"'*

 

 

 &:&8,XXLj!8,UU!))+*;BQB*?%*GHH1BV;--r1   NFTF)r(   r)   r*   r"   r.   re   rb   r,   ra   r6   rR   ry   r{   r>   compactr   r   r-   r^   r0   r1   r2   rd   rd      s         FD{E39"""  :q q q[ [ [ Z* * Z*H 37 "'_ _
 #3;/_ _  _ _ _ _ _ _r1   rd   c                   P    e Zd ZU eed<   ej        Zej        ed<   d Zdde	fdZ
dS )	FlaxElectraSelfOutputr5   r6   c                 P   t          j        | j        j        t          j         j                            | j        j                  | j                  | _	        t          j
        | j        j        | j                  | _
        t          j        | j        j                  | _        d S )Nrg   r6   r:   r<   )r>   rm   r5   ri   rB   rC   rD   rE   r6   denserK   rL   rM   rN   rO   rP   s    r2   rR   zFlaxElectraSelfOutput.setupl  s    XK#+224;3PQQ*
 
 


 dk.HPTPZ[[[zt{'FGGGr1   TrS   c                     |                      |          }|                     ||          }|                     ||z             }|S NrV   r   rO   rK   )rQ   r&   input_tensorrS   s       r2   r^   zFlaxElectraSelfOutput.__call__u  sD    

=11]-PP}|'CDDr1   Nr_   r(   r)   r*   r"   r.   r,   ra   r6   rR   rb   r^   r0   r1   r2   r   r   h  sh         {E39"""H H H 4      r1   r   c                   f    e Zd ZU eed<   dZeed<   ej        Z	ej	        ed<   d Z
	 	 	 	 d
defd	ZdS )FlaxElectraAttentionr5   Fre   r6   c                     t          | j        | j        | j                  | _        t          | j        | j                  | _        d S )Nre   r6   rh   )rd   r5   re   r6   rQ   r   outputrP   s    r2   rR   zFlaxElectraAttention.setup  s<    ,T[TXT^___	+DKtzJJJr1   NTr   c           	          |                      |||||||          }|d         }	|                     |	||          }|f}
|r|
|d         fz  }
|
S )N)r   r   r   rS   r   r   rV   r!   )rQ   r   )rQ   r&   r[   r   r   r   rS   r   attn_outputsr   r   s              r2   r^   zFlaxElectraAttention.__call__  sy     yy+-!'/ ! 
 
 #1oKm\\ " 	*Q))Gr1   r   )r(   r)   r*   r"   r.   re   rb   r,   ra   r6   rR   r^   r0   r1   r2   r   r   }  s         FD{E39"""K K K "'        r1   r   c                   H    e Zd ZU eed<   ej        Zej        ed<   d Zd Z	dS )FlaxElectraIntermediater5   r6   c                     t          j        | j        j        t          j         j                            | j        j                  | j                  | _	        t          | j        j                 | _        d S )Nr   )r>   rm   r5   intermediate_sizerB   rC   rD   rE   r6   r   r   
hidden_act
activationrP   s    r2   rR   zFlaxElectraIntermediate.setup  sY    XK)+224;3PQQ*
 
 


 !!78r1   c                 Z    |                      |          }|                     |          }|S N)r   r   rx   s     r2   r^   z FlaxElectraIntermediate.__call__  s*    

=1166r1   N
r(   r)   r*   r"   r.   r,   ra   r6   rR   r^   r0   r1   r2   r   r     sT         {E39"""9 9 9    r1   r   c                   P    e Zd ZU eed<   ej        Zej        ed<   d Zdde	fdZ
dS )	FlaxElectraOutputr5   r6   c                 P   t          j        | j        j        t          j         j                            | j        j                  | j                  | _	        t          j
        | j        j                  | _        t          j        | j        j        | j                  | _        d S )Nr   r<   r:   )r>   rm   r5   ri   rB   rC   rD   rE   r6   r   rM   rN   rO   rK   rL   rP   s    r2   rR   zFlaxElectraOutput.setup  s    XK#+224;3PQQ*
 
 


 zt{'FGGGdk.HPTPZ[[[r1   TrS   c                     |                      |          }|                     ||          }|                     ||z             }|S r   r   )rQ   r&   attention_outputrS   s       r2   r^   zFlaxElectraOutput.__call__  sE    

=11]-PP}7G'GHHr1   Nr_   r   r0   r1   r2   r   r     sh         {E39"""\ \ \ t      r1   r   c                       e Zd ZU eed<   ej        Zej        ed<   d Z	 	 	 	 	 dde	ej
                 de	ej
                 d	ed
edef
dZdS )FlaxElectraLayerr5   r6   c                 :   t          | j        | j        j        | j                  | _        t          | j        | j                  | _        t          | j        | j                  | _        | j        j	        r#t          | j        d| j                  | _
        d S d S )Nr   rh   F)r   r5   
is_decoderr6   	attentionr   intermediater   r   add_cross_attentioncrossattentionrP   s    r2   rR   zFlaxElectraLayer.setup  s    -dk$+BX`d`jkkk3DKtzRRR'4:FFF;* 	d"6t{5X\Xb"c"c"cD	d 	dr1   NFTencoder_hidden_statesencoder_attention_maskr   rS   r   c	                 .   |                      ||||||          }	|	d         }
|#|                     |
|||||          }|d         }
|                     |
          }|                     ||
|          }|f}|r||	d         fz  }|||d         fz  }|S )N)r   r   rS   r   r   )r[   r   r   rS   r   rV   r!   )r   r   r   r   )rQ   r&   r[   r   r   r   r   rS   r   attention_outputsr   cross_attention_outputsr   s                r2   r^   zFlaxElectraLayer.__call__  s     !NN+!'/ + 
 
 -Q/ !,&*&9&9 5 /!6+"3 ': ' '#  7q9))*:;;M3CS`aa " 	9)!,..G$03A688r1   )NNFTF)r(   r)   r*   r"   r.   r,   ra   r6   rR   r   r-   rb   r^   r0   r1   r2   r   r     s         {E39"""d d d 8<8< ""'+ +
  (4+ !) 5+ + +  + + + + + +r1   r   c                       e Zd ZU eed<   ej        Zej        ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 ddeej                 d	eej                 d
e	de	de	de	de	fdZdS )FlaxElectraLayerCollectionr5   r6   Fgradient_checkpointingc                       j         rCt          t          d           fdt           j        j                  D              _        d S  fdt           j        j                  D              _        d S )N)         )static_argnumsc                 Z    g | ]'} j         t          |          j                   (S )namer6   )r5   strr6   ).0iFlaxElectraCheckpointLayerrQ   s     r2   
<listcomp>z4FlaxElectraLayerCollection.setup.<locals>.<listcomp>  sE        +*4;SVV4:VVV  r1   c                 b    g | ]+}t          j        t          |          j                   ,S r   )r   r5   r   r6   )r   r   rQ   s     r2   r   z4FlaxElectraLayerCollection.setup.<locals>.<listcomp>  sB        !3q66LLL  r1   )r   rematr   ranger5   num_hidden_layerslayers)rQ   r   s   `@r2   rR   z FlaxElectraLayerCollection.setup  s    & 
	)./?PY)Z)Z)Z&    t{<==  DKKK
   t{<==  DKKKr1   NTr   r   r   rS   r   output_hidden_statesreturn_dictc                    |rdnd }|	rdnd }|r|dnd }|V|j         d         t          | j                  k    r3t          dt          | j                   d|j         d          d          t	          | j                  D ]M\  }}|	r||fz  } ||||||         nd |||||          }|d         }|r||d         fz  }|||d         fz  }N|	r||fz  }||||f}|
st          d |D                       S t          ||||	          S )
Nr0   r   z&The head_mask should be specified for z/ layers, but it is for                         .r!   ru   c              3      K   | ]}||V  	d S r   r0   )r   vs     r2   	<genexpr>z6FlaxElectraLayerCollection.__call__.<locals>.<genexpr>R  s"      ==qq}}}}}==r1   )last_hidden_stater&   r'   cross_attentions)rw   r   r   rl   	enumerater/   r   )rQ   r&   r[   	head_maskr   r   r   rS   r   r   r  all_attentionsall_hidden_statesall_cross_attentionsr   layerlayer_outputsr   s                     r2   r^   z#FlaxElectraLayerCollection.__call__  s     1:d"6@BBD&7h<Q<]rrdh  q!c$+&6&677 4S=M=M 4 4'oa04 4 4  
 "$+.. 	@ 	@HAu# 6!m%55!!E ) 5	!4%&!	 	M *!,M  @=#3"55(4(]1-=,??( 	2-!11 "3^EYZ 	>==G======<++%1	
 
 
 	
r1   NNFTFFTr(   r)   r*   r"   r.   r,   ra   r6   r   rb   rR   r   r-   r^   r0   r1   r2   r   r   
  s         {E39"""#(D(((  $ 8<8< ""'%* =
 =

  (4=
 !) 5=
 =
 =
  =
 #=
 =
 =
 =
 =
 =
 =
r1   r   c                       e Zd ZU eed<   ej        Zej        ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 ddeej                 d	eej                 d
e	de	de	de	de	fdZdS )FlaxElectraEncoderr5   r6   Fr   c                 R    t          | j        | j        | j                  | _        d S )Nr6   r   )r   r5   r6   r   r  rP   s    r2   rR   zFlaxElectraEncoder.setupb  s,    /K*#'#>
 
 



r1   NTr   r   r   rS   r   r   r  c                 @    |                      |||||||||	|

  
        S )N)r
  r   r   r   rS   r   r   r  )r  )rQ   r&   r[   r
  r   r   r   rS   r   r   r  s              r2   r^   zFlaxElectraEncoder.__call__i  s=     zz"7#9!'/!5#  
 
 	
r1   r  r  r0   r1   r2   r  r  ]  s         {E39"""#(D(((
 
 
 8<8< ""'%* 
 

  (4
 !) 5
 
 
  
 #
 
 
 
 
 
 
r1   r  c                   H    e Zd ZU eed<   ej        Zej        ed<   d Zd Z	dS )FlaxElectraGeneratorPredictionsr5   r6   c                     t          j        | j        j        | j                  | _        t          j        | j        j        | j                  | _        d S )Nr:   rh   )r>   rK   r5   rL   r6   rm   rA   r   rP   s    r2   rR   z%FlaxElectraGeneratorPredictions.setup  sB    dk.HPTPZ[[[Xdk8
KKK


r1   c                     |                      |          }t          | j        j                 |          }|                     |          }|S r   )r   r   r5   r   rK   rx   s     r2   r^   z(FlaxElectraGeneratorPredictions.__call__  sA    

=11t{56}EE}55r1   Nr   r0   r1   r2   r  r    sW         {E39"""L L L    r1   r  c                   L    e Zd ZU dZeed<   ej        Zej        ed<   d Z	d Z
dS )#FlaxElectraDiscriminatorPredictionszEPrediction module for the discriminator, made up of two dense layers.r5   r6   c                     t          j        | j        j        | j                  | _        t          j        d| j                  | _        d S )Nrh   r!   )r>   rm   r5   ri   r6   r   dense_predictionrP   s    r2   rR   z)FlaxElectraDiscriminatorPredictions.setup  s>    Xdk5TZHHH
 "$* = = =r1   c                     |                      |          }t          | j        j                 |          }|                     |                              d          }|S )Nr   )r   r   r5   r   r  squeezerx   s     r2   r^   z,FlaxElectraDiscriminatorPredictions.__call__  sQ    

=11t{56}EE--m<<DDRHHr1   N)r(   r)   r*   r+   r"   r.   r,   ra   r6   rR   r^   r0   r1   r2   r  r    sZ         OO{E39"""> > >    r1   r  c                       e Zd ZU dZeZdZdZej	        e
d<   ddej        ddfd	ed
ededej        dedef fdZd Zddej        j        d
ededefdZd Z ee                    d                    	 	 	 	 	 	 	 	 	 	 	 	 	 ddee         dej        j        dedee         dee         dee         dee         fd            Z xZS ) FlaxElectraPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    electraNmodule_class)r!   r!   r   TFr5   input_shapeseedr6   _do_initr   c                 x     | j         d|||d|}t                                          ||||||           d S )Nr5   r6   r   )r%  r&  r6   r'  r0   )r$  super__init__)
rQ   r5   r%  r&  r6   r'  r   kwargsmodule	__class__s
            r2   r+  z#FlaxElectraPreTrainedModel.__init__  sU     #"w&Vlwwpvww[tSXcklllllr1   c                 T    |                      | j        | j        d          | _        d S )NTr)  )r$  r5   r6   _modulerP   s    r2   enable_gradient_checkpointingz8FlaxElectraPreTrainedModel.enable_gradient_checkpointing  s/    ((;*#' ) 
 
r1   rngparamsreturnc                    t          j        |d          }t          j        |          }t          j        t          j        t          j        |          j        d                   |          }t          j        |          }t          j        | j	        j
        | j	        j        f          }t          j                            |          \  }	}
|	|
d}| j	        j        rHt          j        || j	        j        fz             }|}| j                            ||||||||d	  	        }n!| j                            ||||||d          }|d         }||t'          t)          |                    }t'          t)          |                    }| j        D ]}||         ||<   t-                      | _        t/          t1          |                    S |S )NrU   rh   r   )r3  rO   F)r  r3  )r,   r   
zeros_liker   r   
atleast_2drw   	ones_likerq   r5   r   rj   rB   randomsplitr   ri   r-  initr   r   _missing_keyssetr   r   )rQ   r2  r%  r3  rX   rY   rZ   r[   r
  
params_rngr   rngsr   r   module_init_outputsrandom_paramsmissing_keys                    r2   init_weightsz'FlaxElectraPreTrainedModel.init_weights  s   Ik666		22'
3>)3L3L3RSU3V(W(WYdeey11Hdk;T[=\]^^	"%*"2"23"7"7
K$==;* 	$'IkT[=T<V.V$W$W!%3""&+"2"2%&! #3 
# 
# #'+"2"2iyfk #3 # # ,H5(-)@)@AAM!(6"2"233F#1 A A&3K&@{##!$D.00111  r1   c                    t          j        ||fd          }t          j        |d          }t          j        t          j        t          j        |          j        d                   |j                  }| j                            t          j
                            d          |||dd          }t          |d                   S )	aW  
        Args:
            batch_size (`int`):
                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
            max_length (`int`):
                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
                cache.
        rU   rh   r   r   FT)r  r   r}   )r,   rq   r8  r   r   r7  rw   r-  r;  rB   r9  PRNGKeyr   )rQ   r   r   rX   r[   rZ   init_variabless          r2   r   z%FlaxElectraPreTrainedModel.init_cache  s     Hj*5TBBB	y==='
3>)3L3L3RSU3V(W(WYbYhii))Jq!!9nlX]jn * 
 
 w/000r1   batch_size, sequence_lengthr   trainr   r   r  past_key_valuesc                    ||n| j         j        }||n| j         j        }||n| j         j        }|t	          j        |          }|It	          j        t	          j        t	          j        |          j	        d                   |j	                  }|t	          j        |          }|*t	          j
        | j         j        | j         j        f          }i }|	|	|d<   d|p| j        i}| j         j        r|r	||d<   dg}nd}| j                            |t	          j        |d          t	          j        |d          t	          j        |d          t	          j        |d          t	          j        |d          |||
 |||||          }|!|r|\  }}t%          |d                   |d	<   |S |3|s1|\  }}|d d
         t%          |d                   fz   |d
d          z   }n| j                            |t	          j        |d          t	          j        |d          t	          j        |d          t	          j        |d          t	          j        |d          |
 ||||          }|S )Nr   rO   r3  r}   FrU   rh   )rY   rZ   r
  r   r   rS   r   r   r  r?  mutablerI  r!   )rY   rZ   r
  rS   r   r   r  r?  )r5   r   r   r  r,   r8  r   r   r7  rw   rq   r   rj   r3  r   r-  applyr   r   )rQ   rX   r[   rY   rZ   r
  r   r   r3  r   rH  r   r   r  rI  r?  inputsrK  r   s                      r2   r^   z#FlaxElectraPreTrainedModel.__call__  s   $ 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+BY ! ]955N+CJs~i7P7P7VWY7Z,[,[]f]lmmL! ]955N$+"?A`!abbI ")DOF1dk2;* 1	   "1w")k''	)4000	.555"ytDDD Y|4@@@)IT:::&;'="'i"3%9' (  G$ *{*+2(-5og6N-O-O)* ,[,+2(!"1"+/'2J)K)K(MMPWXYXZXZP[[ k''	)4000	.555"ytDDD Y|4@@@)IT:::"'i"3%9' (  G r1   r   )NNNNNNNNFNNNN) r(   r)   r*   r+   r"   config_classbase_model_prefixr$  r>   Moduler.   r,   ra   r/   intr6   rb   r+  r1  rB   r9  rE  r   rC  r   r   ELECTRA_INPUTS_DOCSTRINGformatr   dictr^   __classcell__)r.  s   @r2   r"  r"    s         
 !L!"L")"""
 $;',m mm m 	m
 ym m !%m m m m m m
 
 
(! (!
 2 (! (!PZ (!fp (! (! (! (!V1 1 1& +*+C+J+JKh+i+ijj "#!%*.,0/3&**.^ ^ ^ Z'^ ^ $D>^ 'tn^ d^^ "$^ ^ ^ kj^ ^ ^ ^ ^r1   r"  c                       e Zd ZU eed<   ej        Zej        ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 	 ddeej                 d	eej                 d
eej                 de	de	de	de	de	fdZdS )FlaxElectraModuler5   r6   Fr   c                    t          | j        | j                  | _        | j        j        | j        j        k    r*t          j        | j        j        | j                  | _        t          | j        | j        | j
                  | _        d S )Nrh   r  )r4   r5   r6   
embeddingsrA   ri   r>   rm   embeddings_projectr  r   encoderrP   s    r2   rR   zFlaxElectraModule.setupi  su    /4:NNN;%)@@@&(ht{/Fdj&Y&Y&YD#)Ktz$B]
 
 
r1   NTr
  r   r   r   rS   r   r   r  c                     |                      |||||	          }t          | d          r|                     |          }|                     ||||	||||
||
  
        S )NrV   rZ  )r
  rS   r   r   r   r   r   r  )rY  hasattrrZ  r[  )rQ   rX   r[   rY   rZ   r
  r   r   r   rS   r   r   r  rY  s                 r2   r^   zFlaxElectraModule.__call__q  s     __~|^S` % 
 

 4-.. 	=00<<J||'"7#9!/!5#  
 
 	
r1   )NNNFTFFT)r(   r)   r*   r"   r.   r,   ra   r6   r   rb   rR   r   npr-   r^   r0   r1   r2   rW  rW  d  s         {E39"""#(D(((
 
 
 +/7;8< ""'%*  
  
 BJ' 
  (4 
 !) 5 
  
  
   
 # 
  
  
  
  
  
  
r1   rW  zaThe bare Electra Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd ZeZdS )FlaxElectraModelN)r(   r)   r*   rW  r$  r0   r1   r2   r`  r`    s        
 %LLLr1   r`  c                       e Zd ZU eed<   ej        Zej        ed<   dZe	j
        j        j        Zedej        f         ed<   d Zd ZdS )FlaxElectraTiedDenserA   r6   N.	bias_initc                 T    |                      d| j        | j        f          | _        d S )Nr   )paramrc  rA   r   rP   s    r2   rR   zFlaxElectraTiedDense.setup  s%    JJvt~8K7MNN			r1   c                    t          j        || j                  }t          j        || j                  }t          j        |||j        dz
  fdfdf| j                  }t          j        | j        | j                  }||z   S )Nr!   r   )r0   r0   )r   )r,   asarrayr6   r   dot_generalndimr   r   )rQ   xkernelyr   s        r2   r^   zFlaxElectraTiedDense.__call__  s~    K4:&&VTZ00OvzmT"H-n	
 
 
 {49dj114xr1   )r(   r)   r*   rQ  r.   r,   ra   r6   r   rB   r>   rC   r   rc  r   r^  r-   rR   r^   r0   r1   r2   rb  rb    s         {E39"""I+.6+>+DIxRZ(DDDO O O
 
 
 
 
r1   rb  c            	       z    e Zd ZU eed<   ej        Zej        ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 	 dde	d	e	d
e	de	fdZdS )FlaxElectraForMaskedLMModuler5   r6   Fr   c                 L   t          | j        | j        | j                  | _        t          | j        | j                  | _        | j        j        r't          | j        j	        | j                  | _
        d S t          j        | j        j	        | j                  | _
        d S Nr)  r5   r6   rh   rW  r5   r6   r   r#  r  generator_predictionstie_word_embeddingsrb  r@   generator_lm_headr>   rm   rP   s    r2   rR   z"FlaxElectraForMaskedLMModule.setup      (;djId
 
 
 &EDK_c_i%j%j%j";* 	X%9$+:PX\Xb%c%c%cD"""%'Xdk.DDJ%W%W%WD"""r1   NTrS   r   r   r  c
                    |                      |||||||||		  	        }
|
d         }|                     |          }| j        j        r@| j         j        d         d         d         d         }|                     ||j                  }n|                     |          }|	s|f|
dd          z   S t          ||
j        |
j	                  S )	NrS   r   r   r  r   r3  rY  rF   	embeddingr!   r%   r&   r'   )
r#  rs  r5   rt  r   ru  Tr   r&   r'   )rQ   rX   r[   rY   rZ   r
  rS   r   r   r  r   r&   prediction_scoresshared_embeddings                 r2   r^   z%FlaxElectraForMaskedLMModule.__call__  s     ,,'/!5#  

 

  
 66}EE;* 	J#|5h?MN_`alm $ 6 67HJZJ\ ] ] $ 6 67H I I 	6%''!""+55!$!/)
 
 
 	
r1   NNNNTFFTr(   r)   r*   r"   r.   r,   ra   r6   r   rb   rR   r^   r0   r1   r2   rn  rn    s         {E39"""#(D(((X X X ""'%* '
 '
 '
  '
 #'
 '
 '
 '
 '
 '
 '
r1   rn  z5Electra Model with a `language modeling` head on top.c                       e Zd ZeZdS )FlaxElectraForMaskedLMN)r(   r)   r*   rn  r$  r0   r1   r2   r  r    s        /LLLr1   r  c            	       z    e Zd ZU eed<   ej        Zej        ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 	 dde	d	e	d
e	de	fdZdS )FlaxElectraForPreTrainingModuler5   r6   Fr   c                     t          | j        | j        | j                  | _        t          | j        | j                  | _        d S Nr)  rq  )rW  r5   r6   r   r#  r  discriminator_predictionsrP   s    r2   rR   z%FlaxElectraForPreTrainingModule.setup  sL    (;djId
 
 
 *MTXT_gkgq)r)r)r&&&r1   NTrS   r   r   r  c
                     |                      |||||||||		  	        }
|
d         }|                     |          }|	s|f|
dd          z   S t          ||
j        |
j                  S )Nrx  r   r!   rz  )r#  r  r$   r&   r'   rQ   rX   r[   rY   rZ   r
  rS   r   r   r  r   r&   r%   s                r2   r^   z(FlaxElectraForPreTrainingModule.__call__  s     ,,'/!5#  

 

  
//>> 	+9wqrr{**.!/)
 
 
 	
r1   r~  r  r0   r1   r2   r  r    s         {E39"""#(D(((s s s ""'%* #
 #
 #
  #
 ##
 #
 #
 #
 #
 #
 #
r1   r  z
    Electra model with a binary classification head on top as used during pretraining for identifying generated tokens.

    It is recommended to load the discriminator checkpoint into that model.
    c                       e Zd ZeZdS )FlaxElectraForPreTrainingN)r(   r)   r*   r  r$  r0   r1   r2   r  r  '  s         3LLLr1   r  a  
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxElectraForPreTraining

    >>> tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")
    >>> model = FlaxElectraForPreTraining.from_pretrained("google/electra-small-discriminator")

    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
    >>> outputs = model(**inputs)

    >>> prediction_logits = outputs.logits
    ```
rG  )output_typerN  c            	       z    e Zd ZU eed<   ej        Zej        ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 	 dde	d	e	d
e	de	fdZdS )'FlaxElectraForTokenClassificationModuler5   r6   Fr   c                     t          | j        | j        | j                  | _        | j        j        | j        j        n| j        j        }t          j        |          | _	        t          j
        | j        j        | j                  | _        d S Nr)  rh   )rW  r5   r6   r   r#  classifier_dropoutrN   r>   rM   rO   rm   
num_labels
classifierrQ   r  s     r2   rR   z-FlaxElectraForTokenClassificationModule.setupS  s    (;djId
 
 

 {-9 K**0 	
 z"455(4;#9LLLr1   NTrS   r   r   r  c
                     |                      |||||||||		  	        }
|
d         }|                     ||          }|                     |          }|	s|f|
dd          z   S t          ||
j        |
j                  S Nrx  r   rV   r!   rz  )r#  rO   r  r   r&   r'   r  s                r2   r^   z0FlaxElectraForTokenClassificationModule.__call___  s     ,,'/!5#  

 

  
]-PP// 	+9wqrr{**(!/)
 
 
 	
r1   r~  r  r0   r1   r2   r  r  N  s         {E39"""#(D(((
M 
M 
M ""'%* $
 $
 $
  $
 #$
 $
 $
 $
 $
 $
 $
r1   r  z
    Electra model with a token classification head on top.

    Both the discriminator and generator may be loaded into this model.
    c                       e Zd ZeZdS )!FlaxElectraForTokenClassificationN)r(   r)   r*   r  r$  r0   r1   r2   r  r    s         ;LLLr1   r  c                     | S r   r0   )rj  r,  s     r2   identityr    s    Hr1   c                   T    e Zd ZU dZeed<   ej        Zej        ed<   d Z	d	de
fdZdS )
FlaxElectraSequenceSummarya  
    Compute a single vector summary of a sequence hidden states.

    Args:
        config ([`PretrainedConfig`]):
            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
            config class of your model for the default values it uses):

            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
              (otherwise to `config.hidden_size`).
            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
              another string or `None` will add no activation.
            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
    r5   r6   c                    t           | _        t          | j        d          rv| j        j        rjt          | j        d          r)| j        j        r| j        j        dk    r| j        j        }n| j        j        }t          j	        || j
                  | _        t          | j        dd           }|rt          |         nd | _        t           | _        t          | j        d          r3| j        j        dk    r#t          j        | j        j                  | _        t           | _        t          | j        d          r5| j        j        dk    r't          j        | j        j                  | _        d S d S d S )	Nsummary_use_projsummary_proj_to_labelsr   rh   summary_activationc                     | S r   r0   )rj  s    r2   r   z2FlaxElectraSequenceSummary.setup.<locals>.<lambda>  s    XY r1   summary_first_dropoutsummary_last_dropout)r  summaryr]  r5   r  r  r  ri   r>   rm   r6   getattrr   r   first_dropoutr  rM   last_dropoutr  )rQ   num_classesactivation_strings      r2   rR   z FlaxElectraSequenceSummary.setup  s^   4; 233 		C8T 		C%=>>6K66 K*Q.."k4"k58KtzBBBDL#DK1EtLL7HY&!233kk%4; 788 	OT[=^ab=b=b!#DK,M!N!ND$4; 677 	MDK<\_`<`<` "
4;+K L LD	M 	M<`<`r1   NTrS   c                     |dddf         }|                      ||          }|                     |          }|                     |          }|                     ||          }|S )aZ  
        Compute a single vector summary of a sequence hidden states.

        Args:
            hidden_states (`jnp.ndarray` of shape `[batch_size, seq_len, hidden_size]`):
                The hidden states of the last layer.
            cls_index (`jnp.ndarray` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
                Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

        Returns:
            `jnp.ndarray`: The summary of the sequence hidden states.
        Nr   rV   )r  r  r   r  )rQ   r&   	cls_indexrS   r   s        r2   r^   z#FlaxElectraSequenceSummary.__call__  sm     qqq!t$##F-#HHf%%((""6"GGr1   )NTr`   r0   r1   r2   r  r    st          " {E39"""M M M0 T      r1   r  c            	       z    e Zd ZU eed<   ej        Zej        ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 	 dde	d	e	d
e	de	fdZdS )"FlaxElectraForMultipleChoiceModuler5   r6   Fr   c                     t          | j        | j        | j                  | _        t          | j        | j                  | _        t          j        d| j                  | _	        d S )Nr)  rq  r!   rh   )
rW  r5   r6   r   r#  r  sequence_summaryr>   rm   r  rP   s    r2   rR   z(FlaxElectraForMultipleChoiceModule.setup  s_    (;djId
 
 
 !;$+UYU_ ` ` `(1DJ777r1   NTrS   r   r   r  c
                 l   |j         d         }
|!|                    d|j         d                   nd }|!|                    d|j         d                   nd }|!|                    d|j         d                   nd }|!|                    d|j         d                   nd }|                     |||||||||		  	        }|d         }|                     ||          }|                     |          }|                    d|
          }|	s|f|dd          z   S t          ||j        |j                  S )Nr!   r   rx  r   rV   rz  )rw   rv   r#  r  r  r   r&   r'   )rQ   rX   r[   rY   rZ   r
  rS   r   r   r  num_choicesr   r&   pooled_outputr%   reshaped_logitss                   r2   r^   z+FlaxElectraForMultipleChoiceModule.__call__  so     oa(BKBWI%%b)/"*=>>>]a	Q_Qk//N4H4LMMMquQ_Qk//N4H4LMMMquKWKc|++B0B20FGGGim ,,'/!5#  

 

  
--m=-YY// ..[99 	4#%33,"!/)
 
 
 	
r1   r~  r  r0   r1   r2   r  r    s         {E39"""#(D(((8 8 8 ""'%* +
 +
 +
  +
 #+
 +
 +
 +
 +
 +
 +
r1   r  z
    ELECTRA Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    c                       e Zd ZeZdS )FlaxElectraForMultipleChoiceN)r(   r)   r*   r  r$  r0   r1   r2   r  r    s         6LLLr1   r  z(batch_size, num_choices, sequence_lengthc            	       z    e Zd ZU eed<   ej        Zej        ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 	 dde	d	e	d
e	de	fdZdS )%FlaxElectraForQuestionAnsweringModuler5   r6   Fr   c                     t          | j        | j        | j                  | _        t          j        | j        j        | j                  | _        d S r  )	rW  r5   r6   r   r#  r>   rm   r  
qa_outputsrP   s    r2   rR   z+FlaxElectraForQuestionAnsweringModule.setup7  sI    (;djId
 
 
 (4;#9LLLr1   NTrS   r   r   r  c
                 p   |                      |||||||||		  	        }
|
d         }|                     |          }t          j        || j        j        d          \  }}|                    d          }|                    d          }|	s||f|
dd          z   S t          |||
j        |
j	                  S )Nrx  r   r   r   r!   )start_logits
end_logitsr&   r'   )
r#  r  r,   r:  r5   r  r   r   r&   r'   )rQ   rX   r[   rY   rZ   r
  rS   r   r   r  r   r&   r%   r  r  s                  r2   r^   z.FlaxElectraForQuestionAnsweringModule.__call__=  s     ,,'/!5#  

 

  
//#&9VT[5KRT#U#U#U j#++B//''++
 	< *-;;/%!!/)	
 
 
 	
r1   r~  r  r0   r1   r2   r  r  2  s         {E39"""#(D(((M M M ""'%* &
 &
 &
  &
 #&
 &
 &
 &
 &
 &
 &
r1   r  z
    ELECTRA Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       e Zd ZeZdS )FlaxElectraForQuestionAnsweringN)r(   r)   r*   r  r$  r0   r1   r2   r  r  f  s         9LLLr1   r  c                   T    e Zd ZU dZeed<   ej        Zej        ed<   d Z	d	de
fdZdS )
FlaxElectraClassificationHeadz-Head for sentence-level classification tasks.r5   r6   c                 (   t          j        | j        j        | j                  | _        | j        j        | j        j        n| j        j        }t          j        |          | _	        t          j        | j        j
        | j                  | _        d S )Nrh   )r>   rm   r5   ri   r6   r   r  rN   rM   rO   r  out_projr  s     r2   rR   z#FlaxElectraClassificationHead.setup  s{    Xdk5TZHHH
 {-9 K**0 	
 z"455!7tzJJJr1   TrS   c                     |d d dd d f         }|                      ||          }|                     |          }t          d         |          }|                      ||          }|                     |          }|S )Nr   rV   gelu)rO   r   r   r  )rQ   r&   rS   rj  s       r2   r^   z&FlaxElectraClassificationHead.__call__  sz    !!!Q'"LL-L88JJqMM6N1LL-L88MM!r1   Nr_   r`   r0   r1   r2   r  r  y  sn         77{E39"""K K K T      r1   r  c            	       z    e Zd ZU eed<   ej        Zej        ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 	 dde	d	e	d
e	de	fdZdS )*FlaxElectraForSequenceClassificationModuler5   r6   Fr   c                     t          | j        | j        | j                  | _        t          | j        | j                  | _        d S r  )rW  r5   r6   r   r#  r  r  rP   s    r2   rR   z0FlaxElectraForSequenceClassificationModule.setup  sF    (;djId
 
 
 8t{RVR\]]]r1   NTrS   r   r   r  c
                     |                      |||||||||		  	        }
|
d         }|                     ||          }|	s|f|
dd          z   S t          ||
j        |
j                  S r  )r#  r  r   r&   r'   r  s                r2   r^   z3FlaxElectraForSequenceClassificationModule.__call__  s     ,,'/!5#  

 

  
mLL 	+9wqrr{**+!/)
 
 
 	
r1   r~  r  r0   r1   r2   r  r    s         {E39"""#(D(((^ ^ ^ ""'%* "
 "
 "
  "
 #"
 "
 "
 "
 "
 "
 "
r1   r  z
    Electra Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                       e Zd ZeZdS )$FlaxElectraForSequenceClassificationN)r(   r)   r*   r  r$  r0   r1   r2   r  r    s         >LLLr1   r  c                       e Zd ZU eed<   ej        Zej        ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 	 	 	 	 ddeej                 d	eej                 d
eej                 deej                 deej                 deej                 de	de	de	de	de	fdZdS )FlaxElectraForCausalLMModuler5   r6   Fr   c                 L   t          | j        | j        | j                  | _        t          | j        | j                  | _        | j        j        r't          | j        j	        | j                  | _
        d S t          j        | j        j	        | j                  | _
        d S rp  rr  rP   s    r2   rR   z"FlaxElectraForCausalLMModule.setup  rv  r1   NTr[   rY   rZ   r
  r   r   r   rS   r   r   r  c                    |                      |||||||||	|
||          }|d         }|                     |          }| j        j        r@| j         j        d         d         d         d         }|                     ||j                  }n|                     |          }|s|f|dd          z   S t          ||j        |j	        |j
                  S )	N)r   r   r   rS   r   r   r  r   r3  rY  rF   ry  r!   )r%   r&   r'   r  )r#  rs  r5   rt  r   ru  r{  r   r&   r'   r  )rQ   rX   r[   rY   rZ   r
  r   r   r   rS   r   r   r  r   r&   r|  r}  s                    r2   r^   z%FlaxElectraForCausalLMModule.__call__  s    ,,"7#9!'/!5#  
 
  
 66}EE;* 	J#|5h?MN_`alm $ 6 67HJZJ\ ] ] $ 6 67H I I 	6%''!""+554$!/)$5	
 
 
 	
r1   )NNNNNNFTFFTr  r0   r1   r2   r  r    s-        {E39"""#(D(((X X X 1504.2+/7;8< ""'%* .
 .
 !-.
 !-	.

 s{+.
 CK(.
  (4.
 !) 5.
 .
 .
  .
 #.
 .
 .
 .
 .
 .
 .
r1   r  z
    Electra Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for
    autoregressive tasks.
    c                   <    e Zd ZeZddeej                 fdZd Z	dS )FlaxElectraForCausalLMNr[   c                 L   |j         \  }}|                     ||          }t          j        ||fd          }|0|                    d          dz
  }t          j        ||d          }n5t          j        t          j        |d          d d d f         ||f          }|||dS )NrU   rh   r   r   r!   )r   r   )rI  r[   rZ   )	rw   r   r,   rq   cumsumr   r   r   r   )	rQ   rX   r   r[   r   
seq_lengthrI  extended_attention_maskrZ   s	            r2   prepare_inputs_for_generationz4FlaxElectraForCausalLM.prepare_inputs_for_generation!  s    !*
J//*jAA #&(J
+C4"P"P"P%)00b099A=L&)&>?VXfhn&o&o##+CJz,N,N,NtUVUVUVw,WZdfpYqrrL  /5(
 
 	
r1   c                 N    |j         |d<   |d         d d dd f         dz   |d<   |S )NrI  rZ   r   r!   )rI  )rQ   model_outputsmodel_kwargss      r2   update_inputs_for_generationz3FlaxElectraForCausalLM.update_inputs_for_generation6  s;    *7*G&''3N'CAAArssF'Ka'O^$r1   r   )
r(   r)   r*   r  r$  r   rB   Arrayr  r  r0   r1   r2   r  r    sT         0L
 
S[\_\eSf 
 
 
 
*    r1   r  )	r  r  r  r  r  r  r  r`  r"  )_typingr   r   flax
flax.linenlinenr>   rB   	jax.numpynumpyr,   r^  flax.core.frozen_dictr   r   r   r   r	   r
   nn_partitioningflax.linen.attentionr   flax.traverse_utilr   r   r   modeling_flax_outputsr   r   r   r   r   r   r   r   modeling_flax_utilsr   r   r   r   r   utilsr   r   r   r    configuration_electrar"   
get_loggerr(   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCr   struct	dataclassr$   ELECTRA_START_DOCSTRINGrR  rP  r4   rd   r   r   r   r   r   r   r  r  r  r"  rW  r`  rb  rn  r  r  r  &FLAX_ELECTRA_FOR_PRETRAINING_DOCSTRINGrS  r  r  r  r  r  r  r  r  r  r  r  r  r  __all__r0   r1   r2   <module>r     sr
    & % % % % % % %        



           > > > > > > > > > > 6 6 6 6 6 6 6 6 6 6 6 6 6 6 > > > > > > ; ; ; ; ; ; ; ;      	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	              g f f f f f f f f f f f 0 0 0 0 0 0 
	H	%	%: ! 4 4 4 4 4k 4 4 42 ,$ N& & & & &BI & & &Th h h h hry h h hX    BI   *' ' ' ' '29 ' ' 'V    bi   &    	   *6 6 6 6 6ry 6 6 6tO
 O
 O
 O
 O
 O
 O
 O
f$
 $
 $
 $
 $
 $
 $
 $
N    bi       ")   "} } } } }!4 } } }@-
 -
 -
 -
 -
	 -
 -
 -
` g % % % % %1 % %	 %  -/BDWYh i i i    29   ,6
 6
 6
 6
 6
29 6
 6
 6
r QSjkk0 0 0 0 07 0 0 lk0  35HJ\^m n n n.
 .
 .
 .
 .
bi .
 .
 .
b 
  3 3 3 3 3 : 3 3 3* &$  ##$ABBEkk   !  +JYh   
5
 5
 5
 5
 5
bi 5
 5
 5
p 
  ; ; ; ; ;(B ; ; ;  %	    @ @ @ @ @ @ @ @F7
 7
 7
 7
 7
 7
 7
 7
t   6 6 6 6 6#= 6 6 6
   ":"A"ABl"m"m     !	  1
 1
 1
 1
 1
BI 1
 1
 1
h   9 9 9 9 9&@ 9 9 9  #$	      BI   4-
 -
 -
 -
 -
 -
 -
 -
`   > > > > >+E > > >  ( 	  =
 =
 =
 =
 =
29 =
 =
 =
@       7   <  )	  
 
 
r1   