
     `i                     J	   d dl mZmZ d dlZd dlmZ d dlZd dlm	Z
 d dl	Zd dlmZmZmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d	d
lmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$ d	dl%m&Z&m'Z'm(Z(m)Z)m*Z* d	dl+m,Z,m-Z-m.Z.m/Z/ ddl0m1Z1  e/j2        e3          Z4dZ5dZ6ej7        Z7ej8        j9         G d de,                      Z:dZ;dZ< G d dej=                  Z> G d dej=                  Z? G d dej=                  Z@ G d dej=                  ZA G d dej=                  ZB G d d ej=                  ZC G d! d"ej=                  ZD G d# d$ej=                  ZE G d% d&ej=                  ZF G d' d(ej=                  ZG G d) d*ej=                  ZH G d+ d,ej=                  ZI G d- d.ej=                  ZJ G d/ d0ej=                  ZK G d1 d2ej=                  ZL G d3 d4e'          ZM G d5 d6ej=                  ZN e-d7e;           G d8 d9eM                      ZO e(eOe5ee6            G d: d;ej=                  ZP e-d<e;           G d= d>eM                      ZQd?ZR e*eQe<S                    d@          eRz               e)eQe:e6A            G dB dCej=                  ZT e-dDe;           G dE dFeM                      ZU e(eUe5ee6            G dG dHej=                  ZV e-dIe;           G dJ dKeM                      ZWdLZX e*eWe<S                    d@          eXz               e)eWe!e6A            G dM dNej=                  ZY e-dOe;           G dP dQeM                      ZZ e(eZe5e#e6            G dR dSej=                  Z[ e-dTe;           G dU dVeM                      Z\ e*e\e<S                    dW                      e(e\e5e e6            G dX dYej=                  Z] e-dZe;           G d[ d\eM                      Z^ e(e^e5e$e6            G d] d^ej=                  Z_ e-d_e;           G d` daeM                      Z` e(e`e5e"e6            G db dcej=                  Za e-dde;           G de dfeM                      Zb e(ebe5ee6           g dgZcdS )h    )CallableOptionalN)
FrozenDictfreezeunfreeze)combine_masksmake_causal_mask)partitioning)dot_product_attention_weights)flatten_dictunflatten_dict)lax   )
-FlaxBaseModelOutputWithPastAndCrossAttentionsFlaxBaseModelOutputWithPooling0FlaxBaseModelOutputWithPoolingAndCrossAttentions%FlaxCausalLMOutputWithCrossAttentionsFlaxMaskedLMOutputFlaxMultipleChoiceModelOutputFlaxNextSentencePredictorOutput FlaxQuestionAnsweringModelOutputFlaxSequenceClassifierOutputFlaxTokenClassifierOutput)ACT2FNFlaxPreTrainedModelappend_call_sample_docstring append_replace_return_docstringsoverwrite_call_docstring)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )
BertConfigzgoogle-bert/bert-base-uncasedr$   c                       e Zd ZU dZdZej        ed<   dZej        ed<   dZ	e
eej                          ed<   dZe
eej                          ed<   dS )FlaxBertForPreTrainingOutputaI  
    Output type of [`BertForPreTraining`].

    Args:
        prediction_logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        seq_relationship_logits (`jnp.ndarray` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nprediction_logitsseq_relationship_logitshidden_states
attentions)__name__
__module____qualname____doc__r'   jnpndarray__annotations__r(   r)   r   tupler*        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/bert/modeling_flax_bert.pyr&   r&   =   s|          , &*s{)))+/S[///26M8E#+./666/3Js{+,33333r4   r&   a
  

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`BertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).

            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified all the computation will be performed with the given `dtype`.

            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**

            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).

            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified all the computation will be performed with the given `dtype`.

            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**

            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].

a  
    Args:
        input_ids (`numpy.ndarray` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        head_mask (`numpy.ndarray` of shape `({0})`, `optional):
            Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

c                   T    e Zd ZU dZeed<   ej        Zej        ed<   d Z	d	de
fdZdS )
FlaxBertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.configdtypec                    t          j        | j        j        | j        j        t
          j         j                            | j        j                  | j	                  | _
        t          j        | j        j        | j        j        t
          j         j                            | j        j                  | j	                  | _        t          j        | j        j        | j        j        t
          j         j                            | j        j                  | j	                  | _        t          j        | j        j        | j	                  | _        t          j        | j        j                  | _        d S )N)stddev)embedding_initr9   epsilonr9   rate)nnEmbedr8   
vocab_sizehidden_sizejaxinitializersnormalinitializer_ranger9   word_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutselfs    r5   setupzFlaxBertEmbeddings.setup   s(   !xK"K#6.55T[=Z5[[*	 
  
  
 $&8K/K#6.55T[=Z5[[*	$
 $
 $
  &(XK'K#6.55T[=Z5[[*	&
 &
 &
" dk.HPTPZ[[[zt{'FGGGr4   Tdeterministicc                 ^   |                      |                    d                    }|                     |                    d                    }|                     |                    d                    }||z   |z   }	|                     |	          }	|                     |	|          }	|	S )Ni4rV   )rI   astyperK   rM   rN   rR   )
rT   	input_idstoken_type_idsposition_idsattention_maskrV   inputs_embedsposition_embedsrM   r)   s
             r5   __call__zFlaxBertEmbeddings.__call__   s    ,,Y-=-=d-C-CDD22<3F3Ft3L3LMM $ : :>;P;PQU;V;V W W &(==O }55]-PPr4   NT)r+   r,   r-   r.   r$   r1   r/   float32r9   rU   boolra   r3   r4   r5   r7   r7      so         QQ{E39"""H H H, _c      r4   r7   c                       e Zd ZU eed<   dZeed<   ej        Z	ej	        ed<   d Z
d Zd Zej        d             Z	 	 	 	 ddeej                 dedefdZd	S )FlaxBertSelfAttentionr8   Fcausalr9   c                 "   | j         j        | j         j        z  | _        | j         j        | j         j        z  dk    rt	          d          t          j        | j         j        | j        t          j        j	        
                    | j         j                            | _        t          j        | j         j        | j        t          j        j	        
                    | j         j                            | _        t          j        | j         j        | j        t          j        j	        
                    | j         j                            | _        | j        r8t!          t#          j        d| j         j        fd          d          | _        d S d S )Nr   z`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads`                    : {self.config.num_attention_heads})r9   kernel_initr#   rd   r9   )r8   rD   num_attention_headshead_dim
ValueErrorrA   Denser9   rE   rF   rG   rH   querykeyvaluerg   r	   r/   onesrJ   causal_maskrS   s    r5   rU   zFlaxBertSelfAttention.setup   sb   /4;3RR;"T[%DDIII  
 XK#*+224;3PQQ
 
 


 8K#*+224;3PQQ
 
 

 XK#*+224;3PQQ
 
 

 ; 	/!T[@APPPX^     D	 	r4   c                 n    |                     |j        d d         | j        j        | j        fz             S N   )reshapeshaper8   rk   rl   rT   r)   s     r5   _split_headsz"FlaxBertSelfAttention._split_heads   s5    $$]%8!%<@_aean?o%opppr4   c                 b    |                     |j        d d         | j        j        fz             S ru   )rw   rx   r8   rD   ry   s     r5   _merge_headsz"FlaxBertSelfAttention._merge_heads  s/    $$]%8!%<@W?Y%YZZZr4   c                    |                      dd          }|                     ddt          j        |j        |j                  }|                     ddt          j        |j        |j                  }|                     ddd           }|r|j        j        ^ }	}
}}|j        }dt          |	          z  |ddfz   }t          j	        |j        ||          }t          j	        |j        ||          }||_        ||_        |j        d         }|j        |z   |_        t          j
        t          j        |
          ||z   k     t          |	          d||
fz             }t          ||          }|||fS )	a\  
        This function takes projected key, value states from a single input token and concatenates the states to cached
        states from previous steps. This function is slightly adapted from the official Flax repository:
        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
        cache
cached_keycached_valuecache_indexc                  B    t          j        dt           j                  S )Nr   rj   )r/   arrayint32r3   r4   r5   <lambda>z=FlaxBertSelfAttention._concatenate_to_cache.<locals>.<lambda>  s    CIaWZW`DaDaDa r4   )r   r   r#   )has_variablevariabler/   zerosrx   r9   rq   lenr   dynamic_update_slicebroadcast_toaranger2   r   )rT   rp   rq   ro   r^   is_initializedr   r   r   
batch_dims
max_length	num_headsdepth_per_head	cur_indexindicesnum_updated_cache_vectorspad_masks                    r5   _concatenate_to_cachez+FlaxBertSelfAttention._concatenate_to_cache  sp    **7LAA]]7L#)SYPSPYZZ
}}WnciV[VabbmmG]<a<abb 	EAKAQAW>ZY#)IS__,	1a/@@G*:+;S'JJC,\-?PPE"J!&L(-A% + 14M MK'
:&&5N)NNj!!Q(A:$NN H +8^DDNE>))r4   NTkey_value_states
init_cacheoutput_attentionsc                    |d u}|j         d         }	|                     |          }
|r+|                     |          }|                     |          }n*|                     |          }|                     |          }|                     |
          }
|                     |          }|                     |          }| j        r|
j         d         |j         d         }}|                     dd          rU| j        d         d         }| j        d         d         j         d         }t          j	        | j
        dd|dfdd||f          }n| j
        d d d d d |d |f         }t          j        ||	f|j         dd          z             }|F| j        r?t          j        t          j        |d          |j                   }t          ||          }n"| j        r|}n|t          j        |d          }| j        r4|                     dd          s|r|                     |||
|          \  }}}|t          j        |dk    t          j        |j         d                              | j                  t          j        |j         t          j        | j                  j                                      | j                            }nd }d }|s%| j        j        dk    r|                     d	          }t3          |
|||| j        j        d
|| j        d 	  	        }|t          j        d||          }t          j        d||          }|                    |j         d d         dz             }|r||fn|f}|S )Nr   r#   r~   r   r   )axisg        rR   T)biasdropout_rngdropout_ratebroadcast_dropoutrV   r9   	precisionz...hqk,h->...hqkz...hqk,...khd->...qhdrv   ))rx   ro   rp   rq   rz   rg   r   	variablesr   dynamic_slicers   r/   r   expand_dimsr   r   selectfullrZ   r9   finfominr8   attention_probs_dropout_probmake_rngr   einsumrw   )rT   r)   r^   layer_head_maskr   r   rV   r   is_cross_attention
batch_sizequery_states
key_statesvalue_statesquery_length
key_length
mask_shiftmax_decoder_lengthrs   attention_biasr   attn_weightsattn_outputoutputss                          r5   ra   zFlaxBertSelfAttention.__call__&  s    .T9"(+
 zz-00 	5"233J::&677LL -00J::m44L((66&&z22
((66 ; 
	_'3'9!'<j>Nq>Q*L  ,77 Q!^G4]C
%)^G%<\%J%PQR%S"!/$q!Z&;aLRd=e  #.qqq!!!]l]KZK/OP*;HYZ[Z\Z\H]8]^^K %$+% -conS[.\.\.\^i^oppN*>;GGNN[ 	L(NN' _^(KKKN ; 	D--g|DD 	
 	7;7Q7QL,8 84Jn
 % Z"-s33::4:FF-sy/D/D/HIIPPQUQ[\\ NN "N 	3!IC!O!O--	22K4#A"'*

 

 

 &:&8,XXLj!8,UU!))+*;BQB*?%*GHH1BV;--r4   NFTF)r+   r,   r-   r$   r1   rg   rd   r/   rc   r9   rU   rz   r|   rA   compactr   r   r0   ra   r3   r4   r5   rf   rf      s         FD{E39"""  :q q q[ [ [ Z* * Z*H 37 "'_ _
 #3;/_ _  _ _ _ _ _ _r4   rf   c                   P    e Zd ZU eed<   ej        Zej        ed<   d Zdde	fdZ
dS )	FlaxBertSelfOutputr8   r9   c                 P   t          j        | j        j        t          j         j                            | j        j                  | j                  | _	        t          j
        | j        j        | j                  | _
        t          j        | j        j                  | _        d S )Nri   r9   r=   r?   )rA   rn   r8   rD   rE   rF   rG   rH   r9   denserN   rO   rP   rQ   rR   rS   s    r5   rU   zFlaxBertSelfOutput.setup  s    XK#+224;3PQQ*
 
 


 dk.HPTPZ[[[zt{'FGGGr4   TrV   c                     |                      |          }|                     ||          }|                     ||z             }|S NrY   r   rR   rN   )rT   r)   input_tensorrV   s       r5   ra   zFlaxBertSelfOutput.__call__  sD    

=11]-PP}|'CDDr4   Nrb   r+   r,   r-   r$   r1   r/   rc   r9   rU   rd   ra   r3   r4   r5   r   r     sh         {E39"""H H H 4      r4   r   c                   f    e Zd ZU eed<   dZeed<   ej        Z	ej	        ed<   d Z
	 	 	 	 d
defd	ZdS )FlaxBertAttentionr8   Frg   r9   c                     t          | j        | j        | j                  | _        t          | j        | j                  | _        d S )Nrg   r9   rj   )rf   r8   rg   r9   rT   r   outputrS   s    r5   rU   zFlaxBertAttention.setup  s<    )$+dkQUQ[\\\	(DJGGGr4   NTr   c           	          |                      |||||||          }|d         }	|                     |	||          }|f}
|r|
|d         fz  }
|
S )N)r   r   r   rV   r   r   rY   r#   )rT   r   )rT   r)   r^   r   r   r   rV   r   attn_outputsr   r   s              r5   ra   zFlaxBertAttention.__call__  sy     yy+-!'/ ! 
 
 #1oKm\\ " 	*Q))Gr4   r   )r+   r,   r-   r$   r1   rg   rd   r/   rc   r9   rU   ra   r3   r4   r5   r   r     s         FD{E39"""H H H "'        r4   r   c                   H    e Zd ZU eed<   ej        Zej        ed<   d Zd Z	dS )FlaxBertIntermediater8   r9   c                     t          j        | j        j        t          j         j                            | j        j                  | j                  | _	        t          | j        j                 | _        d S Nr   )rA   rn   r8   intermediate_sizerE   rF   rG   rH   r9   r   r   
hidden_act
activationrS   s    r5   rU   zFlaxBertIntermediate.setup  sY    XK)+224;3PQQ*
 
 


 !!78r4   c                 Z    |                      |          }|                     |          }|S N)r   r   ry   s     r5   ra   zFlaxBertIntermediate.__call__  s*    

=1166r4   N
r+   r,   r-   r$   r1   r/   rc   r9   rU   ra   r3   r4   r5   r   r     sT         {E39"""9 9 9    r4   r   c                   P    e Zd ZU eed<   ej        Zej        ed<   d Zdde	fdZ
dS )	FlaxBertOutputr8   r9   c                 P   t          j        | j        j        t          j         j                            | j        j                  | j                  | _	        t          j
        | j        j                  | _        t          j        | j        j        | j                  | _        d S )Nr   r?   r=   )rA   rn   r8   rD   rE   rF   rG   rH   r9   r   rP   rQ   rR   rN   rO   rS   s    r5   rU   zFlaxBertOutput.setup  s    XK#+224;3PQQ*
 
 


 zt{'FGGGdk.HPTPZ[[[r4   TrV   c                     |                      |          }|                     ||          }|                     ||z             }|S r   r   )rT   r)   attention_outputrV   s       r5   ra   zFlaxBertOutput.__call__  sE    

=11]-PP}7G'GHHr4   Nrb   r   r3   r4   r5   r   r     sh         {E39"""\ \ \ t      r4   r   c                       e Zd ZU eed<   ej        Zej        ed<   d Z	 	 	 	 	 dde	ej
                 de	ej
                 d	ed
edef
dZdS )FlaxBertLayerr8   r9   c                 :   t          | j        | j        j        | j                  | _        t          | j        | j                  | _        t          | j        | j                  | _        | j        j	        r#t          | j        d| j                  | _
        d S d S )Nr   rj   F)r   r8   
is_decoderr9   	attentionr   intermediater   r   add_cross_attentioncrossattentionrS   s    r5   rU   zFlaxBertLayer.setup  s    *4;t{?U]a]ghhh0DJOOO$T[
CCC;* 	a"3DKUYU_"`"`"`D	a 	ar4   NFTencoder_hidden_statesencoder_attention_maskr   rV   r   c	                 .   |                      ||||||          }	|	d         }
|#|                     |
|||||          }|d         }
|                     |
          }|                     ||
|          }|f}|r||	d         fz  }|||d         fz  }|S )N)r   r   rV   r   r   )r^   r   r   rV   r   rY   r#   )r   r   r   r   )rT   r)   r^   r   r   r   r   rV   r   attention_outputsr   cross_attention_outputsr   s                r5   ra   zFlaxBertLayer.__call__  s     !NN+!'/ + 
 
 -Q/ !,&*&9&9 5 /!6+"3 ': ' '#  7q9))*:;;M3CS`aa " 	9)!,..G$03A688r4   )NNFTF)r+   r,   r-   r$   r1   r/   rc   r9   rU   r   r0   rd   ra   r3   r4   r5   r   r     s         {E39"""a a a 8<8< ""'+ +
  (4+ !) 5+ + +  + + + + + +r4   r   c                       e Zd ZU eed<   ej        Zej        ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 ddeej                 d	eej                 d
e	de	de	de	de	fdZdS )FlaxBertLayerCollectionr8   r9   Fgradient_checkpointingc                       j         rCt          t          d           fdt           j        j                  D              _        d S  fdt           j        j                  D              _        d S )N)         )static_argnumsc                 Z    g | ]'} j         t          |          j                   (S )namer9   )r8   strr9   ).0iFlaxBertCheckpointLayerrT   s     r5   
<listcomp>z1FlaxBertLayerCollection.setup.<locals>.<listcomp>-  sE        ('#a&&
SSS  r4   c                 b    g | ]+}t          j        t          |          j                   ,S r   )r   r8   r   r9   )r   r   rT   s     r5   r   z1FlaxBertLayerCollection.setup.<locals>.<listcomp>2  s?       NOdkAdjIII  r4   )r   rematr   ranger8   num_hidden_layerslayers)rT   r   s   `@r5   rU   zFlaxBertLayerCollection.setup*  s    & 		&+M)&T&T&T#    t{<==  DKKK
   SXY]YdYvSwSw  DKKKr4   NTr   r   r   rV   r   output_hidden_statesreturn_dictc                    |rdnd }|	rdnd }|r|dnd }|V|j         d         t          | j                  k    r3t          dt          | j                   d|j         d          d          t	          | j                  D ]M\  }}|	r||fz  } ||||||         nd |||||          }|d         }|r||d         fz  }|||d         fz  }N|	r||fz  }||||f}|
st          d |D                       S t          ||||	          S )
Nr3   r   z&The head_mask should be specified for z/ layers, but it is for                         .r#   rv   c              3      K   | ]}||V  	d S r   r3   )r   vs     r5   	<genexpr>z3FlaxBertLayerCollection.__call__.<locals>.<genexpr>l  s"      ==qq}}}}}==r4   )last_hidden_stater)   r*   cross_attentions)rx   r   r   rm   	enumerater2   r   )rT   r)   r^   	head_maskr   r   r   rV   r   r  r  all_attentionsall_hidden_statesall_cross_attentionsr   layerlayer_outputsr   s                     r5   ra   z FlaxBertLayerCollection.__call__6  s     1:d"6@BBD&7h<Q<]rrdh  q!c$+&6&677 4S=M=M 4 4'oa04 4 4  
 "$+.. 	@ 	@HAu# 6!m%55!!E ) 5	!4%&!	 	M *!,M  @=#3"55(4(]1-=,??( 	2-!11 "3^EYZ 	>==G======<++%1	
 
 
 	
r4   NNFTFFTr+   r,   r-   r$   r1   r/   rc   r9   r   rd   rU   r   r0   ra   r3   r4   r5   r   r   %  s         {E39"""#(D(((
 
 
" 8<8< ""'%* =
 =

  (4=
 !) 5=
 =
 =
  =
 #=
 =
 =
 =
 =
 =
 =
r4   r   c                       e Zd ZU eed<   ej        Zej        ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 ddeej                 d	eej                 d
e	de	de	de	de	fdZdS )FlaxBertEncoderr8   r9   Fr   c                 R    t          | j        | j        | j                  | _        d S )Nr9   r   )r   r8   r9   r   r  rS   s    r5   rU   zFlaxBertEncoder.setup{  s,    ,K*#'#>
 
 



r4   NTr   r   r   rV   r   r  r  c                 @    |                      |||||||||	|

  
        S )N)r  r   r   r   rV   r   r  r  )r  )rT   r)   r^   r  r   r   r   rV   r   r  r  s              r5   ra   zFlaxBertEncoder.__call__  s=     zz"7#9!'/!5#  
 
 	
r4   r  r  r3   r4   r5   r  r  v  s         {E39"""#(D(((
 
 
 8<8< ""'%* 
 

  (4
 !) 5
 
 
  
 #
 
 
 
 
 
 
r4   r  c                   H    e Zd ZU eed<   ej        Zej        ed<   d Zd Z	dS )FlaxBertPoolerr8   r9   c                     t          j        | j        j        t          j         j                            | j        j                  | j                  | _	        d S r   )
rA   rn   r8   rD   rE   rF   rG   rH   r9   r   rS   s    r5   rU   zFlaxBertPooler.setup  sF    XK#+224;3PQQ*
 
 



r4   c                 l    |d d df         }|                      |          }t          j        |          S )Nr   )r   rA   tanh)rT   r)   cls_hidden_states      r5   ra   zFlaxBertPooler.__call__  s9    (A.::&677w'(((r4   Nr   r3   r4   r5   r  r    sT         {E39"""
 
 
) ) ) ) )r4   r  c                   H    e Zd ZU eed<   ej        Zej        ed<   d Zd Z	dS )FlaxBertPredictionHeadTransformr8   r9   c                     t          j        | j        j        | j                  | _        t          | j        j                 | _        t          j	        | j        j
        | j                  | _	        d S )Nrj   r=   )rA   rn   r8   rD   r9   r   r   r   r   rN   rO   rS   s    r5   rU   z%FlaxBertPredictionHeadTransform.setup  sS    Xdk5TZHHH
 !78dk.HPTPZ[[[r4   c                     |                      |          }|                     |          }|                     |          S r   )r   r   rN   ry   s     r5   ra   z(FlaxBertPredictionHeadTransform.__call__  s6    

=1166~~m,,,r4   Nr   r3   r4   r5   r  r    sW         {E39"""\ \ \
- - - - -r4   r  c                       e Zd ZU eed<   ej        Zej        ed<   ej	        j
        j        Zedej        f         ed<   d ZddZdS )	FlaxBertLMPredictionHeadr8   r9   .	bias_initc                     t          | j        | j                  | _        t	          j        | j        j        | j        d          | _        |                     d| j	        | j        j        f          | _
        d S )Nrj   F)r9   use_biasr   )r  r8   r9   	transformrA   rn   rC   decoderparamr$  r   rS   s    r5   rU   zFlaxBertLMPredictionHead.setup  s_    8DJWWWx 6djSXYYYJJvt~8N7PQQ			r4   Nc                     |                      |          }|%| j                            dd|j        ii|          }n|                     |          }t	          j        | j        | j                  }||z  }|S )Nparamskernel)r'  r(  applyTr/   asarrayr   r9   )rT   r)   shared_embeddingr   s       r5   ra   z!FlaxBertLMPredictionHead.__call__  sw    }55' L..8EUEW:X/Y[hiiMM LL77M{49dj11r4   r   )r+   r,   r-   r$   r1   r/   rc   r9   rE   rA   rF   r   r$  r   npr0   rU   ra   r3   r4   r5   r#  r#    s         {E39"""+.6+>+DIxRZ(DDDR R R

 
 
 
 
 
r4   r#  c                   J    e Zd ZU eed<   ej        Zej        ed<   d ZddZ	dS )FlaxBertOnlyMLMHeadr8   r9   c                 F    t          | j        | j                  | _        d S )Nrj   )r#  r8   r9   predictionsrS   s    r5   rU   zFlaxBertOnlyMLMHead.setup  s!    3DKtzRRRr4   Nc                 4    |                      ||          }|S Nr0  )r5  )rT   r)   r0  s      r5   ra   zFlaxBertOnlyMLMHead.__call__  s!    ((IY(ZZr4   r   r   r3   r4   r5   r3  r3    s\         {E39"""S S S     r4   r3  c                   >    e Zd ZU ej        Zej        ed<   d Zd ZdS )FlaxBertOnlyNSPHeadr9   c                 F    t          j        d| j                  | _        d S )Nrv   rj   )rA   rn   r9   seq_relationshiprS   s    r5   rU   zFlaxBertOnlyNSPHead.setup  s!     "$* = = =r4   c                 ,    |                      |          S r   )r<  )rT   pooled_outputs     r5   ra   zFlaxBertOnlyNSPHead.__call__  s    $$]333r4   N)	r+   r,   r-   r/   rc   r9   r1   rU   ra   r3   r4   r5   r:  r:    sH         {E39"""> > >4 4 4 4 4r4   r:  c                   J    e Zd ZU eed<   ej        Zej        ed<   d ZddZ	dS )FlaxBertPreTrainingHeadsr8   r9   c                     t          | j        | j                  | _        t	          j        d| j                  | _        d S )Nrj   rv   )r#  r8   r9   r5  rA   rn   r<  rS   s    r5   rU   zFlaxBertPreTrainingHeads.setup  s:    3DKtzRRR "$* = = =r4   Nc                 b    |                      ||          }|                     |          }||fS r7  )r5  r<  )rT   r)   r>  r0  prediction_scoresseq_relationship_scores         r5   ra   z!FlaxBertPreTrainingHeads.__call__  s<     ,,]M],^^!%!6!6}!E!E "888r4   r   r   r3   r4   r5   r@  r@    sY         {E39"""> > >9 9 9 9 9 9r4   r@  c                       e Zd ZU dZeZdZdZej	        e
d<   ddej        ddfd	ed
ededej        dedef fdZd Zddej        j        d
ededefdZd Z ee                    d                    	 	 	 	 	 	 	 	 	 	 	 	 	 ddee         dej        j        dedee         dee         dee         dee         fd            Z xZS ) FlaxBertPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    bertNmodule_class)r#   r#   r   TFr8   input_shapeseedr9   _do_initr   c                 x     | j         d|||d|}t                                          ||||||           d S )Nr8   r9   r   )rI  rJ  r9   rK  r3   )rH  super__init__)
rT   r8   rI  rJ  r9   rK  r   kwargsmodule	__class__s
            r5   rO  z FlaxBertPreTrainedModel.__init__  se     #" 
#9
 
 	
 
 	[tSXcklllllr4   c                 T    |                      | j        | j        d          | _        d S )NTrM  )rH  r8   r9   _modulerS   s    r5   enable_gradient_checkpointingz5FlaxBertPreTrainedModel.enable_gradient_checkpointing  s/    ((;*#' ) 
 
r4   rngr+  returnc                    t          j        |d          }t          j        |          }t          j        t          j        t          j        |          j        d                   |          }t          j        |          }t          j        | j	        j
        | j	        j        f          }t          j                            |          \  }	}
|	|
d}| j	        j        rHt          j        || j	        j        fz             }|}| j                            ||||||||d	  	        }n!| j                            ||||||d          }|d         }||t'          t)          |                    }t'          t)          |                    }| j        D ]}||         ||<   t-                      | _        t/          t1          |                    S |S )NrX   rj   r   )r+  rR   F)r  r+  )r/   r   
zeros_liker   r   
atleast_2drx   	ones_likerr   r8   r   rk   rE   randomsplitr   rD   rQ  initr   r   _missing_keyssetr   r   )rT   rV  rI  r+  r[   r\   r]   r^   r  
params_rngr   rngsr   r   module_init_outputsrandom_paramsmissing_keys                    r5   init_weightsz$FlaxBertPreTrainedModel.init_weights  s   Ik666		22'
3>)3L3L3RSU3V(W(WYdeey11Hdk;T[=\]^^	"%*"2"23"7"7
K$==;* 	$'IkT[=T<V.V$W$W!%3""&+"2"2%&! #3 
# 
# #'+"2"2iyfk #3 # # ,H5(-)@)@AAM!(6"2"233F#1 A A&3K&@{##!$D.00111  r4   c                    t          j        ||fd          }t          j        |d          }t          j        t          j        t          j        |          j        d                   |j                  }| j                            t          j
                            d          |||dd          }t          |d                   S )	aW  
        Args:
            batch_size (`int`):
                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
            max_length (`int`):
                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
                cache.
        rX   rj   r   r   FT)r  r   r~   )r/   rr   r[  r   r   rZ  rx   rQ  r^  rE   r\  PRNGKeyr   )rT   r   r   r[   r^   r]   init_variabless          r5   r   z"FlaxBertPreTrainedModel.init_cacheF  s     Hj*5TBBB	y==='
3>)3L3L3RSU3V(W(WYbYhii))Jq!!9nlX]jn * 
 
 w/000r4   batch_size, sequence_lengthr   trainr   r  r  past_key_valuesc                    ||n| j         j        }||n| j         j        }||n| j         j        }|t	          j        |          }|It	          j        t	          j        t	          j        |          j	        d                   |j	                  }|t	          j
        |          }|*t	          j        | j         j        | j         j        f          }i }|	|	|d<   d|p| j        i}| j         j        r|r	||d<   dg}nd}| j                            |t	          j        |d          t	          j        |d          t	          j        |d          t	          j        |d          t	          j        |d          |||
 |||||          }|!|r|\  }}t'          |d                   |d	<   |S |3|s1|\  }}|d d
         t'          |d                   fz   |d
d          z   }n| j                            |t	          j        |d          t	          j        |d          t	          j        |d          t	          j        |d          t	          j        |d          |
 ||||          }|S )Nr   rR   r+  r~   FrX   rj   )r\   r]   r  r   r   rV   r   r  r  rb  mutablerl  r#   )r\   r]   r  rV   r   r  r  rb  )r8   r   r  r  r/   rY  r   r   rZ  rx   r[  rr   r   rk   r+  r   rQ  r-  r   r   )rT   r[   r^   r\   r]   r  r   r   r+  r   rk  r   r  r  rl  rb  inputsrn  r   s                      r5   ra   z FlaxBertPreTrainedModel.__call__Y  s   $ 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+BY ! ^I66N+CJs~i7P7P7VWY7Z,[,[]f]lmmL! ]955N$+"?A`!abbI ")DOF1dk2;* 1	   "1w")k''	)4000	.555"ytDDD Y|4@@@)IT:::&;'="'i"3%9' (  G$ *{*+2(-5og6N-O-O)* ,[,+2(!"1"+/'2J)K)K(MMPWXYXZXZP[[ k''	)4000	.555"ytDDD Y|4@@@)IT:::"'i"3%9' (  G r4   r   )NNNNNNNNFNNNN) r+   r,   r-   r.   r$   config_classbase_model_prefixrH  rA   Moduler1   r/   rc   r2   intr9   rd   rO  rU  rE   r\  rh  r   rf  r   r!   BERT_INPUTS_DOCSTRINGformatr   dictra   __classcell__)rR  s   @r5   rF  rF    s         
 L"L")"""
 $;',m mm m 	m
 ym m !%m m m m m m$
 
 
(! (!
 2 (! (!PZ (!fp (! (! (! (!V1 1 1& +*+@+G+GHe+f+fgg "#!%*.,0/3&**.^ ^ ^ Z'^ ^ $D>^ 'tn^ d^^ "$^ ^ ^ hg^ ^ ^ ^ ^r4   rF  c                      e Zd ZU eed<   ej        Zej        ed<   dZe	ed<   dZ
e	ed<   d Z	 	 	 	 	 	 	 	 	 	 dd	eej                 d
eej                 deej                 deej                 deej                 de	de	de	de	de	fdZdS )FlaxBertModuler8   r9   Tadd_pooling_layerFr   c                     t          | j        | j                  | _        t	          | j        | j        | j                  | _        t          | j        | j                  | _        d S )Nrj   r  )	r7   r8   r9   
embeddingsr  r   encoderr  poolerrS   s    r5   rU   zFlaxBertModule.setup  s^    ,T[
KKK&K*#'#>
 
 

 %T[
CCCr4   Nr\   r]   r  r   r   r   rV   r   r  r  c                 
   |t          j        |          }|It          j        t          j        t          j        |          j        d                   |j                  }|                     |||||	          }|                     ||||	||||
||
  
        }|d         }| j        r| 	                    |          nd }|s||f|dd          z   S ||f|dd          z   S t          |||j        |j        |j                  S )Nr   rY   )r  rV   r   r   r   r   r  r  r   r#   )r  pooler_outputr)   r*   r	  )r/   rY  r   r   rZ  rx   r|  r}  rz  r~  r   r)   r*   r	  )rT   r[   r^   r\   r]   r  r   r   r   rV   r   r  r  r)   r   pooleds                   r5   ra   zFlaxBertModule.__call__  sC     ! ^I66N +CJs~i7P7P7VWY7Z,[,[]f]lmmL~|^S` ( 
 
 ,,'"7#9!/!5#  
 
  
/3/EO]+++4 	9~%''!""+55!6*WQRR[88?+ !/)$5
 
 
 	
r4   )
NNNNNFTFFT)r+   r,   r-   r$   r1   r/   rc   r9   rz  rd   r   rU   r   r0   ra   r3   r4   r5   ry  ry    s*        {E39""""t"""#(D(((D D D 15.2+/7;8< ""'%* 5
 5
 !-	5

 s{+5
 CK(5
  (45
 !) 55
 5
 5
  5
 #5
 5
 5
 5
 5
 5
 5
r4   ry  z^The bare Bert Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd ZeZdS )FlaxBertModelN)r+   r,   r-   ry  rH  r3   r4   r5   r  r    s        
 "LLLr4   r  c            	       r    e Zd ZU eed<   ej        Zej        ed<   dZe	ed<   d Z
	 	 	 	 dde	de	d	e	d
e	fdZdS )FlaxBertForPreTrainingModuler8   r9   Fr   c                     t          | j        | j        | j                  | _        t          | j        | j                  | _        d S )NrM  r8   r9   )ry  r8   r9   r   rG  r@  clsrS   s    r5   rU   z"FlaxBertForPreTrainingModule.setup  sF    ";*#'#>
 
 
	
 ,4;djQQQr4   TrV   r   r  r  c
                 V   |                      |||||||||		  	        }
| j        j        r%| j         j        d         d         d         d         }nd }|
d         }|
d         }|                     |||          \  }}|	s||f|
d	d          z   S t          |||
j        |
j        
          S )NrV   r   r  r  r+  r|  rI   	embeddingr   r#   r8  rv   )r'   r(   r)   r*   )rG  r8   tie_word_embeddingsr   r  r&   r)   r*   )rT   r[   r^   r\   r]   r  rV   r   r  r  r   r0  r)   r>  rC  rD  s                   r5   ra   z%FlaxBertForPreTrainingModule.__call__  s     ))'/!5#  

 

 ;* 	$#y28<\JK\]^ij#

48HH=;K 5= 5
 5
11  	M%'=>LL+/$:!/)	
 
 
 	
r4   NTFFTr+   r,   r-   r$   r1   r/   rc   r9   r   rd   rU   ra   r3   r4   r5   r  r    s         {E39"""#(D(((R R R #"'%* -
 -
 -
  -
 #-
 -
 -
 -
 -
 -
 -
r4   r  z
    Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
    sentence prediction (classification)` head.
    c                       e Zd ZeZdS )FlaxBertForPreTrainingN)r+   r,   r-   r  rH  r3   r4   r5   r  r  J  s         0LLLr4   r  a  
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxBertForPreTraining

    >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
    >>> model = FlaxBertForPreTraining.from_pretrained("google-bert/bert-base-uncased")

    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
    >>> outputs = model(**inputs)

    >>> prediction_logits = outputs.prediction_logits
    >>> seq_relationship_logits = outputs.seq_relationship_logits
    ```
rj  )output_typerp  c            	       r    e Zd ZU eed<   ej        Zej        ed<   dZe	ed<   d Z
	 	 	 	 dde	de	d	e	d
e	fdZdS )FlaxBertForMaskedLMModuler8   r9   Fr   c                     t          | j        d| j        | j                  | _        t          | j        | j                  | _        d S NF)r8   rz  r9   r   r  ry  r8   r9   r   rG  r3  r  rS   s    r5   rU   zFlaxBertForMaskedLMModule.setupv  I    ";#*#'#>	
 
 
	 'dkLLLr4   TrV   r   r  r  c
                 :   |                      |||||||||		  	        }
|
d         }| j        j        r%| j         j        d         d         d         d         }nd }|                     ||          }|	s|f|
dd          z   S t          ||
j        |
j        	          S )
Nr  r   r+  r|  rI   r  r8  r#   logitsr)   r*   )rG  r8   r  r   r  r   r)   r*   )rT   r[   r^   r\   r]   r  rV   r   r  r  r   r)   r0  r  s                 r5   ra   z"FlaxBertForMaskedLMModule.__call__  s     ))'/!5#  

 

  
;* 	$#y28<\JK\]^ij# -:JKK 	+9wqrr{**!!/)
 
 
 	
r4   Nr  r  r3   r4   r5   r  r  q  s         {E39"""#(D(((M M M  #"'%* )
 )
 )
  )
 #)
 )
 )
 )
 )
 )
 )
r4   r  z2Bert Model with a `language modeling` head on top.c                       e Zd ZeZdS )FlaxBertForMaskedLMN)r+   r,   r-   r  rH  r3   r4   r5   r  r    s        ,LLLr4   r  c            	       r    e Zd ZU eed<   ej        Zej        ed<   dZe	ed<   d Z
	 	 	 	 dde	de	d	e	d
e	fdZdS )'FlaxBertForNextSentencePredictionModuler8   r9   Fr   c                     t          | j        | j        | j                  | _        t          | j                  | _        d S )NrM  rj   )ry  r8   r9   r   rG  r:  r  rS   s    r5   rU   z-FlaxBertForNextSentencePredictionModule.setup  sB    ";*#'#>
 
 
	
 'TZ888r4   TrV   r   r  r  c
                     |	|	n| j         j        }	|                     |||||||||		  	        }
|
d         }|                     |          }|	s|f|
dd          z   S t	          ||
j        |
j                  S )Nr  r#   rv   r  )r8   r  rG  r  r   r)   r*   )rT   r[   r^   r\   r]   r  rV   r   r  r  r   r>  seq_relationship_scoress                r5   ra   z0FlaxBertForNextSentencePredictionModule.__call__  s     &1%<kk$+BY ))'/!5#  

 

  
"&((="9"9 	<+-;;.*!/)
 
 
 	
r4   Nr  r  r3   r4   r5   r  r    s         {E39"""#(D(((9 9 9 #"'%* %
 %
 %
  %
 #%
 %
 %
 %
 %
 %
 %
r4   r  zJBert Model with a `next sentence prediction (classification)` head on top.c                       e Zd ZeZdS )!FlaxBertForNextSentencePredictionN)r+   r,   r-   r  rH  r3   r4   r5   r  r    s        
 ;LLLr4   r  a  
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxBertForNextSentencePrediction

    >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
    >>> model = FlaxBertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")

    >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
    >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
    >>> encoding = tokenizer(prompt, next_sentence, return_tensors="jax")

    >>> outputs = model(**encoding)
    >>> logits = outputs.logits
    >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
    ```
c            	       r    e Zd ZU eed<   ej        Zej        ed<   dZe	ed<   d Z
	 	 	 	 dde	de	d	e	d
e	fdZdS )'FlaxBertForSequenceClassificationModuler8   r9   Fr   c                 "   t          | j        | j        | j                  | _        | j        j        | j        j        n| j        j        }t          j        |          | _	        t          j
        | j        j        | j                  | _        d S )NrM  r?   rj   ry  r8   r9   r   rG  classifier_dropoutrQ   rA   rP   rR   rn   
num_labels
classifierrT   r  s     r5   rU   z-FlaxBertForSequenceClassificationModule.setup  s    ";*#'#>
 
 
	 {-9 K**0 	
 z'9:::(K"*
 
 
r4   TrV   r   r  r  c
                     |                      |||||||||		  	        }
|
d         }|                     ||          }|                     |          }|	s|f|
dd          z   S t          ||
j        |
j                  S )Nr  r#   rY   rv   r  )rG  rR   r  r   r)   r*   )rT   r[   r^   r\   r]   r  rV   r   r  r  r   r>  r  s                r5   ra   z0FlaxBertForSequenceClassificationModule.__call__%  s     ))'/!5#  

 

  
]-PP// 	+9wqrr{**+!/)
 
 
 	
r4   Nr  r  r3   r4   r5   r  r    s         {E39"""#(D(((
 
 
0 #"'%* $
 $
 $
  $
 #$
 $
 $
 $
 $
 $
 $
r4   r  z
    Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                       e Zd ZeZdS )!FlaxBertForSequenceClassificationN)r+   r,   r-   r  rH  r3   r4   r5   r  r  L  s         ;LLLr4   r  c            	       r    e Zd ZU eed<   ej        Zej        ed<   dZe	ed<   d Z
	 	 	 	 dde	de	d	e	d
e	fdZdS )FlaxBertForMultipleChoiceModuler8   r9   Fr   c                     t          | j        | j        | j                  | _        t          j        | j        j                  | _        t          j	        d| j                  | _
        d S )NrM  r?   r#   rj   )ry  r8   r9   r   rG  rA   rP   rQ   rR   rn   r  rS   s    r5   rU   z%FlaxBertForMultipleChoiceModule.setupd  s_    ";*#'#>
 
 
	
 zt{'FGGG(1DJ777r4   TrV   r   r  r  c
                 l   |j         d         }
|!|                    d|j         d                   nd }|!|                    d|j         d                   nd }|!|                    d|j         d                   nd }|!|                    d|j         d                   nd }|                     |||||||||		  	        }|d         }|                     ||          }|                     |          }|                    d|
          }|	s|f|dd          z   S t          ||j        |j                  S )Nr#   r   r  rY   rv   r  )rx   rw   rG  rR   r  r   r)   r*   )rT   r[   r^   r\   r]   r  rV   r   r  r  num_choicesr   r>  r  reshaped_logitss                  r5   ra   z(FlaxBertForMultipleChoiceModule.__call__m  sl     oa(BKBWI%%b)/"*=>>>]a	Q_Qk//N4H4LMMMquQ_Qk//N4H4LMMMquKWKc|++B0B20FGGGim ))'/!5#  

 

  
]-PP// ..[99 	4#%33,"!/)
 
 
 	
r4   Nr  r  r3   r4   r5   r  r  _  s         {E39"""#(D(((8 8 8  #"'%* ,
 ,
 ,
  ,
 #,
 ,
 ,
 ,
 ,
 ,
 ,
r4   r  z
    Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    c                       e Zd ZeZdS )FlaxBertForMultipleChoiceN)r+   r,   r-   r  rH  r3   r4   r5   r  r    s         3LLLr4   r  z(batch_size, num_choices, sequence_lengthc            	       r    e Zd ZU eed<   ej        Zej        ed<   dZe	ed<   d Z
	 	 	 	 dde	de	d	e	d
e	fdZdS )$FlaxBertForTokenClassificationModuler8   r9   Fr   c                 $   t          | j        | j        d| j                  | _        | j        j        | j        j        n| j        j        }t          j        |          | _	        t          j
        | j        j        | j                  | _        d S )NFr8   r9   rz  r   r?   rj   r  r  s     r5   rU   z*FlaxBertForTokenClassificationModule.setup  s    ";*##'#>	
 
 
	 {-9 K**0 	
 z'9:::(4;#9LLLr4   TrV   r   r  r  c
                     |                      |||||||||		  	        }
|
d         }|                     ||          }|                     |          }|	s|f|
dd          z   S t          ||
j        |
j                  S )Nr  r   rY   r#   r  )rG  rR   r  r   r)   r*   )rT   r[   r^   r\   r]   r  rV   r   r  r  r   r)   r  s                r5   ra   z-FlaxBertForTokenClassificationModule.__call__  s     ))'/!5#  

 

  
]-PP// 	+9wqrr{**(!/)
 
 
 	
r4   Nr  r  r3   r4   r5   r  r    s         {E39"""#(D(((M M M, #"'%* $
 $
 $
  $
 #$
 $
 $
 $
 $
 $
 $
r4   r  z
    Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                       e Zd ZeZdS )FlaxBertForTokenClassificationN)r+   r,   r-   r  rH  r3   r4   r5   r  r    s         8LLLr4   r  c            	       r    e Zd ZU eed<   ej        Zej        ed<   dZe	ed<   d Z
	 	 	 	 dde	de	d	e	d
e	fdZdS )"FlaxBertForQuestionAnsweringModuler8   r9   Fr   c                     t          | j        | j        d| j                  | _        t          j        | j        j        | j                  | _        d S )NFr  rj   )	ry  r8   r9   r   rG  rA   rn   r  
qa_outputsrS   s    r5   rU   z(FlaxBertForQuestionAnsweringModule.setup  sN    ";*##'#>	
 
 
	 (4;#9LLLr4   TrV   r   r  r  c
                 p   |                      |||||||||		  	        }
|
d         }|                     |          }t          j        || j        j        d          \  }}|                    d          }|                    d          }|	s||f|
dd          z   S t          |||
j        |
j	                  S )Nr  r   r   r   r#   )start_logits
end_logitsr)   r*   )
rG  r  r/   r]  r8   r  squeezer   r)   r*   )rT   r[   r^   r\   r]   r  rV   r   r  r  r   r)   r  r  r  s                  r5   ra   z+FlaxBertForQuestionAnsweringModule.__call__  s     ))'/!5#  

 

  
//#&9VT[5KRT#U#U#U j#++B//''++
 	< *-;;/%!!/)	
 
 
 	
r4   Nr  r  r3   r4   r5   r  r    s         {E39"""#(D(((M M M  #"'%* (
 (
 (
  (
 #(
 (
 (
 (
 (
 (
 (
r4   r  z
    Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       e Zd ZeZdS )FlaxBertForQuestionAnsweringN)r+   r,   r-   r  rH  r3   r4   r5   r  r  3  s         6LLLr4   r  c                       e Zd ZU eed<   ej        Zej        ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 	 	 ddeej                 d	eej                 d
eej                 deej                 de	de	de	de	de	fdZdS )FlaxBertForCausalLMModuler8   r9   Fr   c                     t          | j        d| j        | j                  | _        t          | j        | j                  | _        d S r  r  rS   s    r5   rU   zFlaxBertForCausalLMModule.setupK  r  r4   NTr\   r  r   r   r   rV   r   r  r  c                 L   |                      |||||||||	|
||          }|d         }| j        j        r%| j         j        d         d         d         d         }nd }|                     ||          }|s|f|dd          z   S t          ||j        |j        |j        	          S )
N)r   r   r   rV   r   r  r  r   r+  r|  rI   r  r8  r#   )r  r)   r*   r	  )	rG  r8   r  r   r  r   r)   r*   r	  )rT   r[   r^   r]   r\   r  r   r   r   rV   r   r  r  r   r)   r0  r  s                    r5   ra   z"FlaxBertForCausalLMModule.__call__T  s      ))"7#9!'/!5#  
 
  
;* 	$#y28<\JK\]^ij# -:JKK 	+9wqrr{**4!/)$5	
 
 
 	
r4   )	NNNNFTFFTr  r3   r4   r5   r  r  F  s        {E39"""#(D(((M M M 15+/7;8< ""'%* 0
 0

 !-0
 CK(0
  (40
 !) 50
 0
 0
  0
 #0
 0
 0
 0
 0
 0
 0
r4   r  z
    Bert Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for
    autoregressive tasks.
    c                   <    e Zd ZeZddeej                 fdZd Z	dS )FlaxBertForCausalLMNr^   c                 L   |j         \  }}|                     ||          }t          j        ||fd          }|0|                    d          dz
  }t          j        ||d          }n5t          j        t          j        |d          d d d f         ||f          }|||dS )NrX   rj   r   r   r#   )r   r   )rl  r^   r]   )	rx   r   r/   rr   cumsumr   r   r   r   )	rT   r[   r   r^   r   
seq_lengthrl  extended_attention_maskr]   s	            r5   prepare_inputs_for_generationz1FlaxBertForCausalLM.prepare_inputs_for_generation  s    !*
J//*jAA #&(J
+C4"P"P"P%)00b099A=L&)&>?VXfhn&o&o##+CJz,N,N,NtUVUVUVw,WZdfpYqrrL  /5(
 
 	
r4   c                 N    |j         |d<   |d         d d dd f         dz   |d<   |S )Nrl  r]   r   r#   )rl  )rT   model_outputsmodel_kwargss      r5   update_inputs_for_generationz0FlaxBertForCausalLM.update_inputs_for_generation  s;    *7*G&''3N'CAAArssF'Ka'O^$r4   r   )
r+   r,   r-   r  rH  r   rE   Arrayr  r  r3   r4   r5   r  r    sT         -L
 
S[\_\eSf 
 
 
 
*    r4   r  )
r  r  r  r  r  r  r  r  r  rF  )dtypingr   r   flax
flax.linenlinenrA   rE   	jax.numpynumpyr/   r1  flax.core.frozen_dictr   r   r   r   r	   r
   nn_partitioningflax.linen.attentionr   flax.traverse_utilr   r   r   modeling_flax_outputsr   r   r   r   r   r   r   r   r   r   modeling_flax_utilsr   r   r   r   r   utilsr   r    r!   r"   configuration_bertr$   
get_loggerr+   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCr   struct	dataclassr&   BERT_START_DOCSTRINGrt  rr  r7   rf   r   r   r   r   r   r   r  r  r  r#  r3  r:  r@  rF  ry  r  r  r  #FLAX_BERT_FOR_PRETRAINING_DOCSTRINGru  r  r  r  r  &FLAX_BERT_FOR_NEXT_SENT_PRED_DOCSTRINGr  r  r  r  r  r  r  r  r  r  __all__r3   r4   r5   <module>r     s{    & % % % % % % %        



           > > > > > > > > > > 6 6 6 6 6 6 6 6 6 6 6 6 6 6 > > > > > > ; ; ; ; ; ; ; ;                                           g f f f f f f f f f f f * * * * * * 
	H	%	%5  4 4 4 4 4; 4 4 4:. `$ N( ( ( ( ( ( ( (Vh h h h hBI h h hV       (' ' ' ' '	 ' ' 'T    29   $    RY   (6 6 6 6 6BI 6 6 6rN
 N
 N
 N
 N
bi N
 N
 N
b$
 $
 $
 $
 $
bi $
 $
 $
N) ) ) ) )RY ) ) )"- - - - -bi - - -    ry   .	 	 	 	 	") 	 	 	4 4 4 4 4") 4 4 49 9 9 9 9ry 9 9 9@ @ @ @ @1 @ @ @FD
 D
 D
 D
 D
RY D
 D
 D
N d " " " " "+ " "	 "  ],?A_ap q q q:
 :
 :
 :
 :
29 :
 :
 :
z   0 0 0 0 04 0 0 0' #&    !>??Bee   !  (DSb   
7
 7
 7
 7
 7
	 7
 7
 7
t NPdee- - - - -1 - - fe-  02EGY[j k k k2
 2
 2
 2
 2
bi 2
 2
 2
j T ; ; ; ; ;(? ; ;	 ;* &,  %  !>??Bhh   !  %3Rap   
:
 :
 :
 :
 :
bi :
 :
 :
z   ; ; ; ; ;(? ; ; ;  % 	  :
 :
 :
 :
 :
bi :
 :
 :
z   3 3 3 3 3 7 3 3 3  4;;<fgg    24QSb  
8
 8
 8
 8
 8
29 8
 8
 8
v   8 8 8 8 8%< 8 8 8  "$79RTc  
6
 6
 6
 6
 6
 6
 6
 6
r   6 6 6 6 6#: 6 6 6   $	  >
 >
 >
 >
 >
	 >
 >
 >
B       1   <  )	    r4   