
     `iK                        d dl mZmZ d dlZd dlmZ d dlZd dlm	Z
 d dl	Zd dlmZmZmZ d dlmZ d dlmZmZ d dlmZ ddlmZmZmZmZmZmZmZ dd	lmZm Z m!Z!m"Z"m#Z# dd
l$m%Z%m&Z&m'Z'm(Z( ddl)m*Z*  e(j+        e,          Z-dZ.dZ/ej0        j1         G d de%                      Z2dZ3dZ4 G d dej5                  Z6 G d dej5                  Z7 G d dej5                  Z8 G d dej5                  Z9 G d dej5                  Z: G d dej5                  Z; G d d ej5                  Z< G d! d"ej5                  Z= G d# d$ej5                  Z> G d% d&e           Z? G d' d(ej5                  Z@ e&d)e3           G d* d+e?                      ZA e!eAe.ee/            G d, d-ej5                  ZB e&d.e3           G d/ d0e?                      ZCd1ZD e#eCe4E                    d2          eDz               e"eCe2e/3            G d4 d5ej5                  ZF e&d6e3           G d7 d8e?                      ZG e!eGe.ee/d9:            G d; d<ej5                  ZH e&d=e3           G d> d?e?                      ZI e!eIe.ee/            G d@ dAej5                  ZJ e&dBe3           G dC dDe?                      ZK e#eKe4E                    dE                      e!eKe.ee/            G dF dGej5                  ZL e&dHe3           G dI dJe?                      ZM e!eMe.ee/            G dK dLej5                  ZN e&dMe3           G dN dOe?                      ZO e!eOe.ee/           g dPZPdS )Q    )CallableOptionalN)
FrozenDictfreezeunfreeze)dot_product_attention_weights)flatten_dictunflatten_dict)lax   )FlaxBaseModelOutputFlaxBaseModelOutputWithPoolingFlaxMaskedLMOutputFlaxMultipleChoiceModelOutput FlaxQuestionAnsweringModelOutputFlaxSequenceClassifierOutputFlaxTokenClassifierOutput)ACT2FNFlaxPreTrainedModelappend_call_sample_docstring append_replace_return_docstringsoverwrite_call_docstring)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )AlbertConfigzalbert/albert-base-v2r   c                       e Zd ZU dZdZej        ed<   dZej        ed<   dZ	e
eej                          ed<   dZe
eej                          ed<   dS )FlaxAlbertForPreTrainingOutputaB  
    Output type of [`FlaxAlbertForPreTraining`].

    Args:
        prediction_logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        sop_logits (`jnp.ndarray` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nprediction_logits
sop_logitshidden_states
attentions)__name__
__module____qualname____doc__r!   jnpndarray__annotations__r"   r#   r   tupler$        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/albert/modeling_flax_albert.pyr    r    6   s{          , &*s{)))"J"""26M8E#+./666/3Js{+,33333r.   r    a  

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`AlbertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).

            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified all the computation will be performed with the given `dtype`.

            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**

            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].
a  
    Args:
        input_ids (`numpy.ndarray` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

c                   T    e Zd ZU dZeed<   ej        Zej        ed<   d Z	d	de
fdZdS )
FlaxAlbertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.configdtypec                    t          j        | j        j        | j        j        t
          j         j                            | j        j                            | _	        t          j        | j        j
        | j        j        t
          j         j                            | j        j                            | _        t          j        | j        j        | j        j        t
          j         j                            | j        j                            | _        t          j        | j        j        | j                  | _        t          j        | j        j                  | _        d S )N)stddev)embedding_initepsilonr3   rate)nnEmbedr2   
vocab_sizeembedding_sizejaxinitializersnormalinitializer_rangeword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsr3   Dropouthidden_dropout_probdropoutselfs    r/   setupzFlaxAlbertEmbeddings.setup   s   !xK"K&6.55T[=Z5[[ 
  
  

 $&8K/K&6.55T[=Z5[[$
 $
 $
 
 &(XK'K&6.55T[=Z5[[&
 &
 &
"
 dk.HPTPZ[[[zt{'FGGGr.   Tdeterministicc                 ^   |                      |                    d                    }|                     |                    d                    }|                     |                    d                    }||z   |z   }|                     |          }|                     ||          }|S )Ni4rP   )rC   astyperE   rG   rH   rL   )	rN   	input_idstoken_type_idsposition_idsrP   inputs_embedsposition_embedsrG   r#   s	            r/   __call__zFlaxAlbertEmbeddings.__call__   s    ,,Y-=-=d-C-CDD22<3F3Ft3L3LMM $ : :>;P;PQU;V;V W W &(==O }55]-PPr.   NT)r%   r&   r'   r(   r   r+   r)   float32r3   rO   boolrZ   r-   r.   r/   r1   r1      sn         QQ{E39"""H H H& t      r.   r1   c                   P    e Zd ZU eed<   ej        Zej        ed<   d Zd	de	fdZ
dS )
FlaxAlbertSelfAttentionr2   r3   c                    | j         j        | j         j        z  dk    rt          d          t	          j        | j         j        | j        t          j        j        	                    | j         j
                            | _        t	          j        | j         j        | j        t          j        j        	                    | j         j
                            | _        t	          j        | j         j        | j        t          j        j        	                    | j         j
                            | _        t	          j        | j         j        t          j        j        	                    | j         j
                  | j                  | _        t	          j        | j         j        | j                  | _        t	          j        | j         j                  | _        d S )Nr   z`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads`                    : {self.config.num_attention_heads})r3   kernel_initra   r3   r7   r9   )r2   hidden_sizenum_attention_heads
ValueErrorr;   Denser3   r?   r@   rA   rB   querykeyvaluedenserH   rI   rJ   rK   rL   rM   s    r/   rO   zFlaxAlbertSelfAttention.setup   sz   ;"T[%DDIII  
 XK#*+224;3PQQ
 
 


 8K#*+224;3PQQ
 
 

 XK#*+224;3PQQ
 
 


 XK#+224;3PQQ*
 
 


 dk.HPTPZ[[[zt{'FGGGr.   TFoutput_attentionsc                    | j         j        | j         j        z  }|                     |                              |j        d d         | j         j        |fz             }|                     |                              |j        d d         | j         j        |fz             }|                     |                              |j        d d         | j         j        |fz             }|t          j	        |d          }t          j        |dk    t          j        |j        d                              | j                  t          j        |j        t          j        | j                  j                                      | j                            }	nd }	d }
|s%| j         j        dk    r|                     d          }
t'          |||	|
| j         j        d|| j        d 	  	        }t          j        d	||          }|                    |j        d d         d
z             }|                     |          }|                     ||          }|                     ||z             }|r||fn|f}|S )N   )axisr   g        rL   T)biasdropout_rngdropout_ratebroadcast_dropoutrP   r3   	precisionz...hqk,...khd->...qhd)rS   )r2   rc   rd   rg   reshapeshaperi   rh   r)   expand_dimsr   selectfullrT   r3   finfominattention_probs_dropout_probmake_rngr   einsumrj   rL   rH   )rN   r#   attention_maskrP   rk   head_dimquery_statesvalue_states
key_statesattention_biasrs   attn_weightsattn_outputprojected_attn_outputlayernormed_attn_outputoutputss                   r/   rZ   z FlaxAlbertSelfAttention.__call__   sg   ;*dk.MMzz-0088#t{'F&QQ
 
 zz-0088#t{'F&QQ
 
 XXm,,44#t{'F&QQ
 


 % _^(KKKN Z"-s33::4:FF-sy/D/D/HIIPPQUQ[\\ NN "N 	3!IC!O!O--	22K4#A"'*

 

 

 j!8,UU!))+*;BQB*?%*GHH $

; 7 7 $-BR_ ` `"&..1F1V"W"W=Nn*L99UlTnr.   NTFr%   r&   r'   r   r+   r)   r\   r3   rO   r]   rZ   r-   r.   r/   r_   r_      si         {E39"""H H H<0 0]a 0 0 0 0 0 0r.   r_   c                   X    e Zd ZU eed<   ej        Zej        ed<   d Z	 	 d
de	de	fdZ
d	S )FlaxAlbertLayerr2   r3   c                 v   t          | j        | j                  | _        t	          j        | j        j        t          j        j        	                    | j        j
                  | j                  | _        t          | j        j                 | _        t	          j        | j        j        t          j        j        	                    | j        j
                  | j                  | _        t	          j        | j        j        | j                  | _        t	          j        | j        j                  | _        d S )Nr3   rb   r7   r9   )r_   r2   r3   	attentionr;   rf   intermediate_sizer?   r@   rA   rB   ffnr   
hidden_act
activationrc   
ffn_outputrH   rI   full_layer_layer_normrJ   rK   rL   rM   s    r/   rO   zFlaxAlbertLayer.setup  s    0DJOOO8K)+224;3PQQ*
 
 

 !!78(K#+224;3PQQ*
 
 

 &(\$+:T\`\f%g%g%g"zt{'FGGGr.   TFrP   rk   c                 F   |                      ||||          }|d         }|                     |          }|                     |          }|                     |          }|                     ||          }|                     ||z             }|f}|r||d         fz  }|S )NrP   rk   r   rS   r   )r   r   r   r   rL   r   )	rN   r#   r   rP   rk   attention_outputsattention_outputr   r   s	            r/   rZ   zFlaxAlbertLayer.__call__)  s     !NN>Zk + 
 
 -Q/XX.//
__Z00
__Z00
\\*M\JJ
22:@P3PQQ " 	/)!,..Gr.   Nr   r   r-   r.   r/   r   r     s}         {E39"""H H H( #"'  	
       r.   r   c                   ^    e Zd ZU eed<   ej        Zej        ed<   d Z	 	 	 dde	de	de	fd	Z
d
S )FlaxAlbertLayerCollectionr2   r3   c                 \      fdt           j        j                  D              _        d S )Nc                 b    g | ]+}t          j        t          |          j                   ,S ))namer3   )r   r2   strr3   .0irN   s     r/   
<listcomp>z3FlaxAlbertLayerCollection.setup.<locals>.<listcomp>F  s?     
 
 
LMODKc!ffDJGGG
 
 
r.   )ranger2   inner_group_numlayersrM   s   `r/   rO   zFlaxAlbertLayerCollection.setupE  s@    
 
 
 
QVW[WbWrQsQs
 
 
r.   TFrP   rk   output_hidden_statesc                     d}d}t          | j                  D ]2\  }}	 |	||||          }
|
d         }|r||
d         fz   }|r||fz   }3|f}|r||fz   }|r||fz   }|S )Nr-   r   r   r   )	enumerater   )rN   r#   r   rP   rk   r   layer_hidden_stateslayer_attentionslayer_indexalbert_layerlayer_outputr   s               r/   rZ   z"FlaxAlbertLayerCollection.__call__J  s     !)24;)?)? 	M 	M%K'<+"3	  L )OM  I#3|A6H#H # M&9]<L&L# " 	7!4 66G 	4!1 33Gr.   NTFFr   r-   r.   r/   r   r   A  s         {E39"""
 
 
 #"'%*  	
   #     r.   r   c                   x    e Zd ZU eed<   ej        Zej        ed<   dZe	e
         ed<   d Z	 	 	 dded	ed
efdZdS )FlaxAlbertLayerCollectionsr2   r3   Nr   c                 F    t          | j        | j                  | _        d S )Nr   )r   r2   r3   albert_layersrM   s    r/   rO   z FlaxAlbertLayerCollections.setupq  s!    6t{$*UUUr.   TFrP   rk   r   c                 :    |                      |||||          }|S NrP   rk   r   )r   )rN   r#   r   rP   rk   r   r   s          r/   rZ   z#FlaxAlbertLayerCollections.__call__t  s4     $$'/!5 % 
 
 r.   r   )r%   r&   r'   r   r+   r)   r\   r3   r   r   r   rO   r]   rZ   r-   r.   r/   r   r   l  s         {E39"""!%K#%%%V V V #"'%*  	
   #     r.   r   c            	       d    e Zd ZU eed<   ej        Zej        ed<   d Z	 	 	 	 dde	de	de	d	e	fd
Z
dS )FlaxAlbertLayerGroupsr2   r3   c                 \      fdt           j        j                  D              _        d S )Nc           	      ~    g | ]9}t          j        t          |          t          |          j                   :S ))r   r   r3   )r   r2   r   r3   r   s     r/   r   z/FlaxAlbertLayerGroups.setup.<locals>.<listcomp>  sM     
 
 
 't{QSQRVV[_[efff
 
 
r.   )r   r2   num_hidden_groupsr   rM   s   `r/   rO   zFlaxAlbertLayerGroups.setup  s>    
 
 
 
4;899
 
 
r.   TFrP   rk   r   return_dictc                 v   |rdnd }|r|fnd }t          | j        j                  D ]c}	t          |	| j        j        | j        j        z  z            }
 | j        |
         |||||          }|d         }|r||d         z   }|r||fz   }d|st          d |||fD                       S t          |||          S )Nr-   r   r   rw   c              3      K   | ]}||V  	d S Nr-   )r   vs     r/   	<genexpr>z1FlaxAlbertLayerGroups.__call__.<locals>.<genexpr>  s(      hhqZ[ZgZgZgZgZghhr.   )last_hidden_stater#   r$   )r   r2   num_hidden_layersintr   r   r,   r   )rN   r#   r   rP   rk   r   r   all_attentionsall_hidden_statesr   	group_idxlayer_group_outputs               r/   rZ   zFlaxAlbertLayerGroups.__call__  s     1:d0DN],,$t{455 	I 	IAA!>A^!^_``I!7Y!7+"3%9" " " /q1M  I!/2DR2H!H# I$58H$H! 	ihh]4E~$Vhhhhhh"+;LYg
 
 
 	
r.   NTFFTr   r-   r.   r/   r   r     s         {E39"""
 
 
 #"'%* "
 "
 	"

  "
 #"
 "
 "
 "
 "
 "
 "
r.   r   c            	       d    e Zd ZU eed<   ej        Zej        ed<   d Z	 	 	 	 dde	de	de	d	e	fd
Z
dS )FlaxAlbertEncoderr2   r3   c                     t          j        | j        j        t          j         j                            | j        j                  | j                  | _	        t          | j        | j                  | _        d S )Nrb   r   )r;   rf   r2   rc   r?   r@   rA   rB   r3   embedding_hidden_mapping_inr   albert_layer_groupsrM   s    r/   rO   zFlaxAlbertEncoder.setup  sd    +-8K#+224;3PQQ*,
 ,
 ,
(
 $9DJ#W#W#W   r.   TFrP   rk   r   r   c                 `    |                      |          }|                     |||||          S r   )r   r   )rN   r#   r   rP   rk   r   r   s          r/   rZ   zFlaxAlbertEncoder.__call__  sC     88GG'''/!5 ( 
 
 	
r.   Nr   r   r-   r.   r/   r   r     s         {E39"""X X X #"'%* 
 
 	

  
 #
 
 
 
 
 
 
r.   r   c                       e Zd ZU eed<   ej        Zej        ed<   ej	        j
        j        Zedej        f         ed<   d ZddZdS )	FlaxAlbertOnlyMLMHeadr2   r3   .	bias_initc                    t          j        | j        j        | j                  | _        t          | j        j                 | _        t          j	        | j        j
        | j                  | _	        t          j        | j        j        | j        d          | _        |                     d| j        | j        j        f          | _        d S )Nr   r7   F)r3   use_biasrr   )r;   rf   r2   r>   r3   rj   r   r   r   rH   rI   r=   decoderparamr   rr   rM   s    r/   rO   zFlaxAlbertOnlyMLMHead.setup  s    Xdk8
KKK
 !78dk.HPTPZ[[[x 6djSXYYYJJvt~8N7PQQ			r.   Nc                    |                      |          }|                     |          }|                     |          }|%| j                            dd|j        ii|          }n|                     |          }|| j        z  }|S )Nparamskernel)rj   r   rH   r   applyTrr   )rN   r#   shared_embeddings      r/   rZ   zFlaxAlbertOnlyMLMHead.__call__  s    

=1166}55' L..8EUEW:X/Y[hiiMM LL77M"r.   r   )r%   r&   r'   r   r+   r)   r\   r3   r?   r;   r@   zerosr   r   npr*   rO   rZ   r-   r.   r/   r   r     s         {E39"""+.6+>+DIxRZ(DDDR R R     r.   r   c                   J    e Zd ZU eed<   ej        Zej        ed<   d ZddZ	dS )FlaxAlbertSOPHeadr2   r3   c                     t          j        | j        j                  | _        t          j        d| j                  | _        d S )Nrm   r   )r;   rJ   r2   classifier_dropout_probrL   rf   r3   
classifierrM   s    r/   rO   zFlaxAlbertSOPHead.setup  s4    z$+"EFF(1DJ777r.   Tc                 ^    |                      ||          }|                     |          }|S )NrS   )rL   r   )rN   pooled_outputrP   logitss       r/   rZ   zFlaxAlbertSOPHead.__call__  s-    ]-PP//r.   Nr[   )
r%   r&   r'   r   r+   r)   r\   r3   rO   rZ   r-   r.   r/   r   r     sY         {E39"""8 8 8     r.   r   c                   f    e Zd ZU dZeZdZdZej	        e
d<   ddej        dfded	ed
edej        def
 fdZddej        j        d	ededefdZ ee                    d                    	 	 	 	 	 	 	 	 	 ddee         dej        j        dedee         dee         dee         fd            Z xZS )FlaxAlbertPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    albertNmodule_class)r   r   r   Tr2   input_shapeseedr3   _do_initc                 v     | j         d||d|}t                                          ||||||           d S )Nr2   r3   )r   r   r3   r   r-   )r   super__init__)	rN   r2   r   r   r3   r   kwargsmodule	__class__s	           r/   r   z"FlaxAlbertPreTrainedModel.__init__  sQ     #"H&HHHH[tSXcklllllr.   rngr   returnc                    t          j        |d          }t          j        |          }t          j        t          j        t          j        |          j        d                   |          }t          j        |          }t          j	        
                    |          \  }}	||	d}
| j                            |
||||d          d         }||t          t          |                    }t          t          |                    }| j        D ]}||         ||<   t!                      | _        t#          t%          |                    S |S )NrR   r   rw   )r   rL   F)r   r   )r)   r   
zeros_likebroadcast_toarange
atleast_2dry   	ones_liker?   randomsplitr   initr	   r   _missing_keyssetr   r
   )rN   r   r   r   rU   rV   rW   r   
params_rngrs   rngsrandom_paramsmissing_keys                r/   init_weightsz&FlaxAlbertPreTrainedModel.init_weights  s=   Ik666		22'
3>)3L3L3RSU3V(W(WYdeey11"%*"2"23"7"7
K$==(()^^\W\ ) 
 

 (-)@)@AAM!(6"2"233F#1 A A&3K&@{##!$D.00111  r.   batch_size, sequence_lengthFrs   trainrk   r   r   c                 ^   ||n| j         j        }|	|	n| j         j        }	|
|
n| j         j        }
|t	          j        |          }|It	          j        t	          j        t	          j        |          j	        d                   |j	                  }|t	          j
        |          }i }|||d<   | j                            d|p| j        it	          j        |d          t	          j        |d          t	          j        |d          t	          j        |d          | ||	|
|
  
        S )Nrw   rL   r   rR   r   )r  )r2   rk   r   r   r)   r   r   r   r   ry   r   r   r   r   array)rN   rU   r   rV   rW   r   rs   r  rk   r   r   r  s               r/   rZ   z"FlaxAlbertPreTrainedModel.__call__*  sL    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+BY ! ^I66N+CJs~i7P7P7VWY7Z,[,[]f]lmmL! ]955N ")DO{  v,-Iit,,,InD111InD111Il$///I  ! 
 
 	
r.   r   )	NNNNNFNNN)r%   r&   r'   r(   r   config_classbase_model_prefixr   r;   Moduler+   r)   r\   r,   r   r3   r]   r   r?   r  PRNGKeyr   r
  r   ALBERT_INPUTS_DOCSTRINGformatr   dictrZ   __classcell__)r   s   @r/   r   r     s         
  L "L")"""
 $;
m 
m
m 
m 	
m
 y
m 
m 
m 
m 
m 
m 
m! !
 2 ! !PZ !fp ! ! ! !0 +*+B+I+IJg+h+hii !%*.,0/3&*-
 -
 -
 Z'-
 -
 $D>-
 'tn-
 d^-
 -
 -
 ji-
 -
 -
 -
 -
r.   r   c                       e Zd ZU eed<   ej        Zej        ed<   dZe	ed<   d Z
	 	 	 	 	 	 ddeej                 d	eej                 d
e	de	de	de	fdZdS )FlaxAlbertModuler2   r3   Tadd_pooling_layerc                    t          | j        | j                  | _        t	          | j        | j                  | _        | j        rkt          j        | j        j	        t          j        j                            | j        j                  | j        d          | _        t          j        | _        d S d | _        d | _        d S )Nr   pooler)ra   r3   r   )r1   r2   r3   
embeddingsr   encoderr  r;   rf   rc   r?   r@   rA   rB   r  tanhpooler_activationrM   s    r/   rO   zFlaxAlbertModule.setup`  s    .t{$*MMM(DJGGG! 
	*('F/66t{7TUUj	  DK &(WD"""DK%)D"""r.   NFrV   rW   rP   rk   r   r   c	                 4   |t          j        |          }|It          j        t          j        t          j        |          j        d                   |j                  }|                     ||||          }	|                     |	|||||          }
|
d         }	| j        r5| 	                    |	d d df                   }| 
                    |          }nd }|s||	f|
dd          z   S |	|f|
dd          z   S t          |	||
j        |
j                  S )Nrw   rS   rP   rk   r   r   r   r   )r   pooler_outputr#   r$   )r)   r   r   r   r   ry   r  r  r  r  r  r   r#   r$   )rN   rU   r   rV   rW   rP   rk   r   r   r#   r   pooleds               r/   rZ   zFlaxAlbertModule.__call__o  sJ    ! ^I66N +CJs~i7P7P7VWY7Z,[,[]f]lmmL	><_lmm,,'/!5#  
 
  
! 	[[qqq!t!455F++F33FFF 	9~%''!""+55!6*WQRR[88-+ !/)	
 
 
 	
r.   )NNTFFT)r%   r&   r'   r   r+   r)   r\   r3   r  r]   rO   r   r   r*   rZ   r-   r.   r/   r  r  [  s         {E39""""t"""* * *& 04-1""'%* /
 /
 !,	/

 rz*/
 /
  /
 #/
 /
 /
 /
 /
 /
 /
r.   r  z`The bare Albert Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd ZeZdS )FlaxAlbertModelN)r%   r&   r'   r  r   r-   r.   r/   r%  r%    s        
 $LLLr.   r%  c            	       d    e Zd ZU eed<   ej        Zej        ed<   d Z	 	 	 	 dde	de	de	d	e	fd
Z
dS )FlaxAlbertForPreTrainingModuler2   r3   c                     t          | j        | j                  | _        t	          | j        | j                  | _        t          | j        | j                  | _        d S )Nr   )r  r2   r3   r   r   predictionsr   sop_classifierrM   s    r/   rO   z$FlaxAlbertForPreTrainingModule.setup  sR    &dkLLL04:VVV/t{$*UUUr.   TFrP   rk   r   r   c	           
      z   |                      ||||||||          }	| j        j        r%| j         j        d         d         d         d         }
nd }
|	d         }|	d         }|                     ||
          }|                     ||	          }|s||f|	d
d          z   S t          |||	j        |	j                  S )Nr!  r   r  rC   	embeddingr   r   r   rS   rm   )r!   r"   r#   r$   )	r   r2   tie_word_embeddings	variablesr)  r*  r    r#   r$   )rN   rU   r   rV   rW   rP   rk   r   r   r   r   r#   r   prediction_scores
sop_scoress                  r/   rZ   z'FlaxAlbertForPreTrainingModule.__call__  s     ++'/!5#  	
 	
 ;* 	$#{4X>|LM^_`kl#

 ,,]M],^^((m(TT
 	A%z2WQRR[@@-/!!/)	
 
 
 	
r.   Nr   r   r-   r.   r/   r'  r'    s         {E39"""V V V #"'%* *
 *
 *
  *
 #*
 *
 *
 *
 *
 *
 *
r.   r'  z
    Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
    `sentence order prediction (classification)` head.
    c                       e Zd ZeZdS )FlaxAlbertForPreTrainingN)r%   r&   r'   r'  r   r-   r.   r/   r3  r3    s         2LLLr.   r3  a  
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxAlbertForPreTraining

    >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
    >>> model = FlaxAlbertForPreTraining.from_pretrained("albert/albert-base-v2")

    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
    >>> outputs = model(**inputs)

    >>> prediction_logits = outputs.prediction_logits
    >>> seq_relationship_logits = outputs.sop_logits
    ```
r  )output_typer  c            	       d    e Zd ZU eed<   ej        Zej        ed<   d Z	 	 	 	 dde	de	de	d	e	fd
Z
dS )FlaxAlbertForMaskedLMModuler2   r3   c                     t          | j        d| j                  | _        t	          | j        | j                  | _        d S )NF)r2   r  r3   r   )r  r2   r3   r   r   r)  rM   s    r/   rO   z!FlaxAlbertForMaskedLMModule.setup  s=    &dkUZ^Zdeee04:VVVr.   TFrP   rk   r   r   c	           
      8   |                      ||||||||          }	|	d         }
| j        j        r%| j         j        d         d         d         d         }nd }|                     |
|          }|s|f|	dd          z   S t          ||	j        |	j        	          S )
Nr!  r   r   r  rC   r,  r-  r   r   r#   r$   )r   r2   r.  r/  r)  r   r#   r$   )rN   rU   r   rV   rW   rP   rk   r   r   r   r#   r   r   s                r/   rZ   z$FlaxAlbertForMaskedLMModule.__call__  s     ++'/!5#  	
 	
  
;* 	$#{4X>|LM^_`kl# !!-BR!SS 	+9wqrr{**!!/)
 
 
 	
r.   Nr   r   r-   r.   r/   r6  r6  	  s         {E39"""W W W #"'%* '
 '
 '
  '
 #'
 '
 '
 '
 '
 '
 '
r.   r6  z4Albert Model with a `language modeling` head on top.c                       e Zd ZeZdS )FlaxAlbertForMaskedLMN)r%   r&   r'   r6  r   r-   r.   r/   r;  r;  ;  s        .LLLr.   r;  z
refs/pr/11)revisionc            	       d    e Zd ZU eed<   ej        Zej        ed<   d Z	 	 	 	 dde	de	de	d	e	fd
Z
dS ))FlaxAlbertForSequenceClassificationModuler2   r3   c                    t          | j        | j                  | _        | j        j        | j        j        n| j        j        }t          j        |          | _        t          j	        | j        j
        | j                  | _        d S )Nr   r9   r   r  r2   r3   r   r   rK   r;   rJ   rL   rf   
num_labelsr   rN   classifier_dropouts     r/   rO   z/FlaxAlbertForSequenceClassificationModule.setupI  s    &dkLLL {2> K//0 	
 z'9:::(K"*
 
 
r.   TFrP   rk   r   r   c	           
          |                      ||||||||          }	|	d         }
|                     |
|          }
|                     |
          }|s|f|	dd          z   S t          ||	j        |	j                  S )Nr!  r   rS   rm   r9  )r   rL   r   r   r#   r$   )rN   rU   r   rV   rW   rP   rk   r   r   r   r   r   s               r/   rZ   z2FlaxAlbertForSequenceClassificationModule.__call__V  s     ++'/!5#  	
 	
  
]-PP// 	+9wqrr{**+!/)
 
 
 	
r.   Nr   r   r-   r.   r/   r>  r>  E  s         {E39"""
 
 
& #"'%* "
 "
 "
  "
 #"
 "
 "
 "
 "
 "
 "
r.   r>  z
    Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                       e Zd ZeZdS )#FlaxAlbertForSequenceClassificationN)r%   r&   r'   r>  r   r-   r.   r/   rF  rF  {  s         =LLLr.   rF  c            	       d    e Zd ZU eed<   ej        Zej        ed<   d Z	 	 	 	 dde	de	de	d	e	fd
Z
dS )!FlaxAlbertForMultipleChoiceModuler2   r3   c                     t          | j        | j                  | _        t	          j        | j        j                  | _        t	          j        d| j                  | _	        d S )Nr   r9   r   r   )
r  r2   r3   r   r;   rJ   rK   rL   rf   r   rM   s    r/   rO   z'FlaxAlbertForMultipleChoiceModule.setup  sO    &dkLLLzt{'FGGG(1DJ777r.   TFrP   rk   r   r   c	           
      j   |j         d         }	|!|                    d|j         d                   nd }|!|                    d|j         d                   nd }|!|                    d|j         d                   nd }|!|                    d|j         d                   nd }|                     ||||||||          }
|
d         }|                     ||          }|                     |          }|                    d|	          }|s|f|
dd          z   S t          ||
j        |
j                  S )Nr   rw   r!  rS   rm   r9  )ry   rx   r   rL   r   r   r#   r$   )rN   rU   r   rV   rW   rP   rk   r   r   num_choicesr   r   r   reshaped_logitss                 r/   rZ   z*FlaxAlbertForMultipleChoiceModule.__call__  si     oa(BKBWI%%b)/"*=>>>]a	Q_Qk//N4H4LMMMquQ_Qk//N4H4LMMMquKWKc|++B0B20FGGGim ++'/!5#  	
 	
  
]-PP// ..[99 	4#%33,"!/)
 
 
 	
r.   Nr   r   r-   r.   r/   rH  rH    s         {E39"""8 8 8 #"'%* *
 *
 *
  *
 #*
 *
 *
 *
 *
 *
 *
r.   rH  z
    Albert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    c                       e Zd ZeZdS )FlaxAlbertForMultipleChoiceN)r%   r&   r'   rH  r   r-   r.   r/   rN  rN    s         5LLLr.   rN  z(batch_size, num_choices, sequence_lengthc            	       d    e Zd ZU eed<   ej        Zej        ed<   d Z	 	 	 	 dde	de	de	d	e	fd
Z
dS )&FlaxAlbertForTokenClassificationModuler2   r3   c                    t          | j        | j        d          | _        | j        j        | j        j        n| j        j        }t          j        |          | _        t          j	        | j        j
        | j                  | _        d S )NFr2   r3   r  r9   r   r@  rB  s     r/   rO   z,FlaxAlbertForTokenClassificationModule.setup  s|    &dk_deee {2> K//0 	
 z'9:::(4;#9LLLr.   TFrP   rk   r   r   c	           
          |                      ||||||||          }	|	d         }
|                     |
|          }
|                     |
          }|s|f|	dd          z   S t          ||	j        |	j                  S )Nr!  r   rS   r   r9  )r   rL   r   r   r#   r$   )rN   rU   r   rV   rW   rP   rk   r   r   r   r#   r   s               r/   rZ   z/FlaxAlbertForTokenClassificationModule.__call__  s     ++'/!5#  	
 	
  
]-PP// 	+9wqrr{**(!/)
 
 
 	
r.   Nr   r   r-   r.   r/   rP  rP    s         {E39"""M M M  #"'%* "
 "
 "
  "
 #"
 "
 "
 "
 "
 "
 "
r.   rP  z
    Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                       e Zd ZeZdS ) FlaxAlbertForTokenClassificationN)r%   r&   r'   rP  r   r-   r.   r/   rU  rU    s         :LLLr.   rU  c            	       d    e Zd ZU eed<   ej        Zej        ed<   d Z	 	 	 	 dde	de	de	d	e	fd
Z
dS )$FlaxAlbertForQuestionAnsweringModuler2   r3   c                     t          | j        | j        d          | _        t	          j        | j        j        | j                  | _        d S )NFrR  r   )r  r2   r3   r   r;   rf   rA  
qa_outputsrM   s    r/   rO   z*FlaxAlbertForQuestionAnsweringModule.setup$  s>    &dk_deee(4;#9LLLr.   TFrP   rk   r   r   c	           
      n   |                      ||||||||          }	|	d         }
|                     |
          }t          j        || j        j        d          \  }}|                    d          }|                    d          }|s||f|	dd          z   S t          |||	j        |	j	                  S )Nr!  r   rw   rp   r   )start_logits
end_logitsr#   r$   )
r   rY  r)   r  r2   rA  squeezer   r#   r$   )rN   rU   r   rV   rW   rP   rk   r   r   r   r#   r   r[  r\  s                 r/   rZ   z-FlaxAlbertForQuestionAnsweringModule.__call__(  s     ++'/!5#  	
 	
  
//#&9VT[5KRT#U#U#U j#++B//''++
 	< *-;;/%!!/)	
 
 
 	
r.   Nr   r   r-   r.   r/   rW  rW     s         {E39"""M M M #"'%* &
 &
 &
  &
 #&
 &
 &
 &
 &
 &
 &
r.   rW  z
    Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       e Zd ZeZdS )FlaxAlbertForQuestionAnsweringN)r%   r&   r'   rW  r   r-   r.   r/   r_  r_  Q  s         8LLLr.   r_  )r   r%  r3  r;  rF  rN  rU  r_  )Qtypingr   r   flax
flax.linenlinenr;   r?   	jax.numpynumpyr)   r   flax.core.frozen_dictr   r   r   flax.linen.attentionr   flax.traverse_utilr	   r
   r   modeling_flax_outputsr   r   r   r   r   r   r   modeling_flax_utilsr   r   r   r   r   utilsr   r   r   r   configuration_albertr   
get_loggerr%   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCstruct	dataclassr    ALBERT_START_DOCSTRINGr  r  r1   r_   r   r   r   r   r   r   r   r   r  r%  r'  r3  %FLAX_ALBERT_FOR_PRETRAINING_DOCSTRINGr  r6  r;  r>  rF  rH  rN  rP  rU  rW  r_  __all__r-   r.   r/   <module>rv     s    & % % % % % % %        



           > > > > > > > > > > > > > > > > ; ; ; ; ; ; ; ;                                     g f f f f f f f f f f f . . . . . . 
	H	%	%-   4 4 4 4 4[ 4 4 4:! F B% % % % %29 % % %PR R R R Rbi R R Rj) ) ) ) )bi ) ) )X( ( ( ( (	 ( ( (V       4,
 ,
 ,
 ,
 ,
BI ,
 ,
 ,
^
 
 
 
 
	 
 
 
>    BI   4    	   \
 \
 \
 \
 \
 3 \
 \
 \
~C
 C
 C
 C
 C
ry C
 C
 C
L f $ $ $ $ $/ $ $	 $  _.ACacr s s s3
 3
 3
 3
 3
RY 3
 3
 3
l   2 2 2 2 28 2 2 2) %&  ""#@AADii   !  *HWf   
/
 /
 /
 /
 /
") /
 /
 /
d PRhii/ / / / /5 / / ji/  .0BO^j   
3
 3
 3
 3
 3
	 3
 3
 3
l   = = = = =*C = = =  ' 	  3
 3
 3
 3
 3
	 3
 3
 3
l   5 5 5 5 5"; 5 5 5  !8!?!?@j!k!k    !	  0
 0
 0
 0
 0
RY 0
 0
 0
f   : : : : :'@ : : :  $	  .
 .
 .
 .
 .
29 .
 .
 .
b   8 8 8 8 8%> 8 8 8  "$	  	 	 	r.   