
     `i                        d dl Z d dlmZmZ d dlmZ d dlZd dlm	Z
 d dl	Zd dlmZmZmZ d dlmZmZ d dlmZ ddlmZmZmZmZmZmZ ddlmZmZmZmZ dd	l m!Z!m"Z"m#Z# d
dl$m%Z%  e#j&        e'          Z(dZ)dZ*dZ+dZ,d Z-d Z. G d dej/                  Z0 G d dej/                  Z1 G d dej/                  Z2 G d dej/                  Z3 G d dej/                  Z4 G d dej/                  Z5 G d dej/                  Z6 G d  d!e          Z7 G d" d#ej/                  Z8 e!d$e+           G d% d&e7                      Z9 ee9e)de*            G d' d(ej/                  Z: e!d)e+           G d* d+e7                      Z; ee;e)ee*            G d, d-ej/                  Z< e!d.e+           G d/ d0e7                      Z= ee=e)ee*            G d1 d2ej/                  Z> e!d3e+           G d4 d5e7                      Z? ee?e,@                    d6                      ee?e)ee*            G d7 d8ej/                  ZA e!d9e+           G d: d;e7                      ZB eeBe)ee*            G d< d=ej/                  ZC e!d>e+           G d? d@e7                      ZD eeDe)ee*           g dAZEdS )B    N)CallableOptional)
FrozenDictfreezeunfreeze)flatten_dictunflatten_dict)lax   )FlaxBaseModelOutputFlaxMaskedLMOutputFlaxMultipleChoiceModelOutput FlaxQuestionAnsweringModelOutputFlaxSequenceClassifierOutputFlaxTokenClassifierOutput)ACT2FNFlaxPreTrainedModelappend_call_sample_docstringoverwrite_call_docstring)add_start_docstrings%add_start_docstrings_to_model_forwardlogging   )DistilBertConfigzdistilbert-base-uncasedr   a  

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`DistilBertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`numpy.ndarray` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
c                 r    dt          j        dd|dz  z  t          j        |          z            z  }| |z  S )Nr   i'     )nppowerfloat32)posid_modelangle_ratess       /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/distilbert/modeling_flax_distilbert.py
get_anglesr%   `   s:    bhuqAF|rz'7J7J&JKKKK    c                    t          t          j        |           d d t          j        f         t          j        |          t          j        d d f         |          }t          j        |d d dd df                   |d d dd df<   t          j        |d d dd df                   |d d dd df<   |t          j        df         }t          j        |          S )Nr   r   r   .)r%   r   arangenewaxissincosjnparray)positionr"   
angle_radspos_encodings       r$   positional_encodingr1   e   s    BIh//2:>	'@R@RSUS]_`_`_`S`@acjkkJ &AAAqt!tG!455Jqqq!$Q$w &AAAqt!tG!455Jqqq!$Q$wbj#o.L9\"""r&   c                   T    e Zd ZU dZeed<   ej        Zej        ed<   d Z	d	de
fdZdS )
FlaxEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.configdtypec                 n   t          j        | j        j        | j        j        t
          j         j                            | j        j                            | _	        | j        j
        s^t          j        | j        j        | j        j        t
          j         j                            | j        j                            | _        n)t          | j        j        | j        j                  | _        t          j        d| j                  | _        t          j        | j        j                  | _        d S )Nstddev)embedding_init-q=epsilonr5   rate)nnEmbedr4   
vocab_sizedimjaxinitializersnormalinitializer_rangeword_embeddingssinusoidal_pos_embdsmax_position_embeddingsposition_embeddingsr1   r0   	LayerNormr5   Dropoutdropoutselfs    r$   setupzFlaxEmbeddings.setupz   s    !xK"KO6.55T[=Z5[[ 
  
  

 {/ 	j')x3"v299A^9__( ( (D$$ !4DK4WY]YdYh i iDe4:FFFzt{':;;;r&   Tdeterministicc                    |j         \  }}|                     |                    d                    }| j        j        sht          j        |                              d          }t          j        |||f          }|                     |                    d                    }n0| j	        d d d |d d f         }|                    |j
                  }||z   }|                     |          }|                     ||          }|S )Ni4)shaperQ   )rT   rG   astyper4   rH   r,   r(   broadcast_torJ   r0   r5   rK   rM   )	rO   	input_idsrQ   
batch_size
seq_lengthinputs_embedsposition_idsposition_embedshidden_statess	            r$   __call__zFlaxEmbeddings.__call__   s   !*
J,,Y-=-=d-C-CDD{/ 	J:j1188>>L+LZ@XYYYL"66|7J7J47P7PQQOO"/;J;0ABO-44]5HIIO &7 }55]-PPr&   NT)__name__
__module____qualname____doc__r   __annotations__r,   r   r5   rP   boolr_    r&   r$   r3   r3   t   sk         QQ{E39"""< < <"       r&   r3   c                   X    e Zd ZU eed<   ej        Zej        ed<   d Z	 	 d
de	de	fdZ
d	S )FlaxMultiHeadSelfAttentionr4   r5   c                    | j         j        | _        | j         j        | _        t          j        | j         j                  | _        | j        | j        z  dk    st          d| j         d| j                   t          j        | j        | j	        t          j        j                            | j         j                            | _        t          j        | j        | j	        t          j        j                            | j         j                            | _        t          j        | j        | j	        t          j        j                            | j         j                            | _        t          j        | j        | j	        t          j        j                            | j         j                            | _        d S )Nr=   r   Hidden size " not dividable by number of heads r7   r5   kernel_init)r4   n_headsrB   r?   rL   attention_dropoutrM   
ValueErrorDenser5   rC   rD   rE   rF   q_link_linv_linout_linrN   s    r$   rP   z FlaxMultiHeadSelfAttention.setup   su   {*;?zt{'DEEE4<'1,,fDHffX\XdffgggXH*+22$+:W2XX
 
 


 XH*+22$+:W2XX
 
 


 XH*+22$+:W2XX
 
 


 xH*+22$+:W2XX
 
 
r&   TFrQ   output_attentionsc           	           |j         \  }}|j         d         }	 j         j        z  dd|	f}
 fd} fd} |                     |                    } |                     |                    } |                     |                    }|t          j                  z  }t          j	        ||
                    dddd                    }t          j        ||
          }|                    |j                  }|dd|z
  z  z
  }t          j        |d	
          }                     ||          }t          j	        ||          } ||          }                     |          }|r||fS |fS )Nr   c                 j    |                      dj                                      dddd          S )zseparate headsr   r   r   r   )reshapero   	transposexbsdim_per_headrO   s    r$   rT   z2FlaxMultiHeadSelfAttention.__call__.<locals>.shape   s3    99RT\<@@JJ1aQRTUVVVr&   c                 n    |                      dddd                              dj        z            S )zgroup headsr   r   r   r   rz   )r|   r{   ro   r}   s    r$   unshapez4FlaxMultiHeadSelfAttention.__call__.<locals>.unshape   s4    ;;q!Q**222r4<,;VWWWr&   r   r   r   gꌠ9Y>)Fg      ?rz   axisrU   )rT   rB   ro   rs   rt   ru   mathsqrtr,   matmulr|   r{   rV   r5   r?   softmaxrM   rv   )rO   querykeyvaluemaskrQ   rw   q_lenrB   k_len
mask_reshprT   r   qkvscoresweightscontextr   r   s   `                  @@r$   r_   z#FlaxMultiHeadSelfAttention.__call__   s    E3	! x4</!Q&
	W 	W 	W 	W 	W 	W 	W	X 	X 	X 	X 	X 	X 	X E$**U##$$E$**S//""E$**U##$$	,'''Aq{{1aA6677{4,,{{6<(($#*--*V"---,,wm,DD*Wa((''"",,w'' 	W%%:r&   N)TFra   rb   rc   r   re   r,   r   r5   rP   rf   r_   rg   r&   r$   ri   ri      s{         {E39"""
 
 
F #"'/ / /  / / / / / /r&   ri   c                   P    e Zd ZU eed<   ej        Zej        ed<   d Zdde	fdZ
dS )	FlaxFFNr4   r5   c                    t          j        | j        j                  | _        | j        j        | _        d| _        t          j        | j        j        | j        t          j         j
                            | j        j                            | _        t          j        | j        j        | j        t          j         j
                            | j        j                            | _        t           | j        j                 | _        d S )Nr=   r   r7   rm   )r?   rL   r4   rM   chunk_size_feed_forwardseq_len_dimrr   
hidden_dimr5   rC   rD   rE   rF   lin1rB   lin2r   
activationrN   s    r$   rP   zFlaxFFN.setup   s    zt{':;;;'+{'J$HK"*+22$+:W2XX
 
 
	
 HKO*+22$+:W2XX
 
 
	 !!78r&   TrQ   c                     |                      |          }|                     |          }|                     |          }|                     ||          }|S )NrU   )r   r   r   rM   )rO   r^   rQ   s      r$   r_   zFlaxFFN.__call__	  sP    		-0066		-00]-PPr&   Nr`   r   rg   r&   r$   r   r      se         {E39"""9 9 9" T      r&   r   c                   X    e Zd ZU eed<   ej        Zej        ed<   d Z	 	 d
de	de	fdZ
d	S )FlaxTransformerBlockr4   r5   c                    | j         j        | j         j        z  dk    s$J d| j         j         d| j         j                     t          | j         | j                  | _        t          j        d| j                  | _        t          | j         | j                  | _
        t          j        d| j                  | _        d S )Nr   rk   rl   r5   r:   r;   )r4   rB   ro   ri   r5   	attentionr?   rK   sa_layer_normr   ffnoutput_layer_normrN   s    r$   rP   zFlaxTransformerBlock.setup  s    {!44999c4;?ccdkNacc :99 4DKtzRRR\%tzJJJ4;dj999!#e4:!N!N!Nr&   FTrw   rQ   c                 0   |                      ||||||          }|r|\  }}n t          |          t          u sJ |d         }|                     ||z             }|                     ||          }|                     ||z             }|f}|r|f|z   }|S )N)r   r   r   r   rw   rQ   r   rU   )r   typetupler   r   r   )	rO   r^   	attn_maskrw   rQ   	sa_output
sa_weights
ffn_outputoutputs	            r$   r_   zFlaxTransformerBlock.__call__   s     NN/' # 
 
	  	%$-!Izz	??e++++!!I&&y='@AA	 XXi}XEE
++J,BCC
 	, ]V+Fr&   N)FTr   rg   r&   r$   r   r     s}         {E39"""	O 	O 	O #("   	
      r&   r   c            	       d    e Zd ZU eed<   ej        Zej        ed<   d Z	 	 	 	 dde	de	de	d	e	fd
Z
dS )FlaxTransformerr4   r5   c                 \      fdt           j        j                  D              _        d S )Nc                 b    g | ]+}t          j        t          |          j                   ,S ))namer5   )r   r4   strr5   ).0r!   rO   s     r$   
<listcomp>z)FlaxTransformer.setup.<locals>.<listcomp>E  s@     
 
 
QR 3q66LLL
 
 
r&   )ranger4   n_layerslayersrN   s   `r$   rP   zFlaxTransformer.setupD  s@    
 
 
 
V[\`\g\pVqVq
 
 
r&   FTrw   output_hidden_statesrQ   return_dictc                 R   |rdnd }|rdnd }| j         D ]\}	|r||fz   } |	||||          }
|
d         }|r$t          |
          dk    sJ |
d         }||fz   }Gt          |
          dk    sJ ]|r||fz   }|st          d |||fD                       S t          |||          S )	Nrg   )r^   r   rw   rQ   rz   r   r   r   c              3      K   | ]}||V  	d S Nrg   )r   r   s     r$   	<genexpr>z+FlaxTransformer.__call__.<locals>.<genexpr>m  s(      hhqZ[ZgZgZgZgZghhr&   )last_hidden_stater^   
attentions)r   lenr   r   )rO   r^   attention_maskrw   r   rQ   r   all_hidden_statesall_attentionslayer_modulelayer_outputsr   s               r$   r_   zFlaxTransformer.__call__I  s3    #7@BBD0:d K 	/ 	/L# I$58H$H!(L+("3+	  M *"-M  /=))Q....*1-
!/:-!?=))Q.....   	E 1]4D D 	ihh]NDU$Vhhhhhh"+;LYg
 
 
 	
r&   NFFTFr   rg   r&   r$   r   r   @  s         {E39"""
 
 
 #(%*"!'
 '
  	'

 #'
 '
 '
 '
 '
 '
 '
 '
r&   r   c            	       d    e Zd ZU eed<   ej        Zej        ed<   d Z	 	 	 	 dde	de	de	d	e	fd
Z
dS )FlaxTransformerEncoderr4   r5   c                 F    t          | j        | j                  | _        d S Nr   )r   r4   r5   layerrN   s    r$   rP   zFlaxTransformerEncoder.setupw  s    $T[
CCC


r&   FTrw   r   rQ   r   c                 8    |                      ||||||          S )N)r^   r   rw   r   rQ   r   )r   )rO   r^   r   rw   r   rQ   r   s          r$   r_   zFlaxTransformerEncoder.__call__z  s1     zz')/!5'#  
 
 	
r&   Nr   r   rg   r&   r$   r   r   s  s         {E39"""D D D #(%*"!
 
  	

 #
 
 
 
 
 
 
 
r&   r   c                       e Zd ZU eed<   ej        Zej        ed<   ej	        j
        j        Zedej        f         ed<   d Zd ZdS )FlaxDistilBertLMDecoderr4   r5   .	bias_initc                 ^    |                      d| j        | j        j        f          | _        d S )Nbias)paramr   r4   rA   r   rN   s    r$   rP   zFlaxDistilBertLMDecoder.setup  s'    JJvt~8N7PQQ			r&   c                     t          j        || j                  }t          j        || j                  }t          j        |||j        dz
  fdfdf          }t          j        | j        | j                  }||z   }|S )Nr   )r   )rg   rg   )r,   asarrayr5   r
   dot_generalndimr   )rO   inputskernelyr   s        r$   r_   z FlaxDistilBertLMDecoder.__call__  sq    VTZ00VTZ00OFFv{Q.@$-G,RSS{49dj11Hr&   N)ra   rb   rc   r   re   r,   r   r5   rC   r?   rD   zerosr   r   r   ndarrayrP   r_   rg   r&   r$   r   r     sz         {E39"""+.6+>+DIxRZ(DDDR R R    r&   r   c                   d    e Zd ZU dZeZdZdZej	        e
d<   ddej        dfded	ed
edej        def
 fdZddej        j        d	ededefdZ ee                    d                    	 	 	 	 	 	 	 	 ddee         dej        j        dedee         dee         dee         fd            Z xZS )FlaxDistilBertPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    
distilbertNmodule_class)r   r   r   Tr4   input_shapeseedr5   _do_initc                 v     | j         d||d|}t                                          ||||||           d S )Nr4   r5   )r   r   r5   r   rg   )r   super__init__)	rO   r4   r   r   r5   r   kwargsmodule	__class__s	           r$   r   z&FlaxDistilBertPreTrainedModel.__init__  sQ     #"H&HHHH[tSXcklllllr&   rngparamsreturnc                    t          j        |d          }t          j        |          }t          j                            |          \  }}||d}| j                            |||d          d         }	||t          t          |	                    }	t          t          |                    }| j
        D ]}
|	|
         ||
<   t                      | _
        t          t          |                    S |	S )NrS   r   )r   rM   F)r   r   )r,   r   	ones_likerC   randomsplitr   initr   r   _missing_keyssetr   r	   )rO   r   r   r   rX   r   
params_rngdropout_rngrngsrandom_paramsmissing_keys              r$   init_weightsz*FlaxDistilBertPreTrainedModel.init_weights  s    Ik666	y11"%*"2"23"7"7
K$==((y.V[(\\]ef(-)@)@AAM!(6"2"233F#1 A A&3K&@{##!$D.00111  r&   zbatch_size, sequence_lengthFr   trainrw   r   r   c
           
      H   ||n| j         j        }||n| j         j        }|	|	n| j         j        }	|t	          j        |          }i }
|||
d<   | j                            d|p| j        it	          j	        |d          t	          j	        |d          | |||	|
          S )NrM   r   rS   r   )r   )
r4   rw   r   r   r,   r   r   applyr   r-   )rO   rX   r   	head_maskr   r   r   rw   r   r   r   s              r$   r_   z&FlaxDistilBertPreTrainedModel.__call__  s     2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+BY! ]955N ")DO{  v,-Iit,,,InD111I  ! 	
 	
 		
r&   r   )NNNNFNNN)ra   rb   rc   rd   r   config_classbase_model_prefixr   r?   Modulere   r,   r   r   intr5   rf   r   rC   r   PRNGKeyr   r   r   DISTILBERT_INPUTS_DOCSTRINGformatr   dictr_   __classcell__)r   s   @r$   r   r     s         
 $L$"L")"""
 $;
m 
m 
m 
m 	
m
 y
m 
m 
m 
m 
m 
m 
m! !
 2 ! !PZ !fp ! ! ! !( +*+F+M+MNk+l+lmm !%*.,0/3&*#
 #

 #
 Z'#
 #
 $D>#
 'tn#
 d^#
 #
 #
 nm#
 #
 #
 #
 #
r&   r   c            	       d    e Zd ZU eed<   ej        Zej        ed<   d Z	 	 	 	 dde	de	de	d	e	fd
Z
dS )FlaxDistilBertModuler4   r5   c                     t          | j        | j                  | _        t	          | j        | j                  | _        d S r   )r3   r4   r5   
embeddingsr   transformerrN   s    r$   rP   zFlaxDistilBertModule.setup  s9    (DJGGG1$+TZPPPr&   TFrQ   rw   r   r   c                     ||n| j         j        }||n| j         j        }||n| j         j        }|                     ||          }|                     ||||||          S )NrU   )r^   r   rQ   rw   r   r   )r4   rw   r   r   r  r  )rO   rX   r   rQ   rw   r   r   input_embedss           r$   r_   zFlaxDistilBertModule.__call__  s     2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+BYyNN&)'/!5#   
 
 	
r&   NTFFTr   rg   r&   r$   r  r    s         {E39"""Q Q Q #"'%* 
 
 	

  
 #
 
 
 
 
 
 
r&   r  zdThe bare DistilBert Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd ZeZdS )FlaxDistilBertModelN)ra   rb   rc   r  r   rg   r&   r$   r  r    s        
 (LLLr&   r  c            	       d    e Zd ZU eed<   ej        Zej        ed<   d Z	 	 	 	 dde	de	de	d	e	fd
Z
dS )FlaxDistilBertForMaskedLMModuler4   r5   c                 B   t          | j        | j                  | _        t	          j        | j        j        | j        t          j        j        	                    | j        j
                            | _        t	          j        d| j                  | _        | j        j        r"t          | j        | j                  | _        d S t	          j        | j        j        | j        t          j        j        	                    | j        j
                            | _        d S )Nr   r7   rm   r:   r;   )r  r4   r5   r   r?   rr   rB   rC   rD   rE   rF   vocab_transformrK   vocab_layer_normtie_word_embeddingsr   vocab_projectorrA   rN   s    r$   rP   z%FlaxDistilBertForMaskedLMModule.setup   s    .t{$*MMM!xKO*+22$+:W2XX 
  
  

 !#U$* M M M;* 
	#:j$ $ $D   
 $&8&jF/66dk>[6\\$ $ $D   r&   TFrQ   rw   r   r   c                    ||n| j         j        }|                     ||||||          }|d         }|                     |          }	t	          | j         j                 |	          }	|                     |	          }	| j         j        r@| j        j        d         d         d         d         }
| 	                    |	|
j
                  }	n| 	                    |	          }	|s|	f|dd          z   }|S t          |	|j        |j                  S )	N)rX   r   rw   r   rQ   r   r   r   r  rG   	embeddingr   logitsr^   r   )r4   use_return_dictr   r  r   r   r  r  	variablesr  Tr   r^   r   )rO   rX   r   rQ   rw   r   r   dlbrt_outputr^   prediction_logitsshared_embeddingr   s               r$   r_   z(FlaxDistilBertForMaskedLMModule.__call__4  s9    &1%<kk$+B])/!5'# ' 
 
 %Q 00??"4;#9:;LMM 112CDD;* 	H#8B<PQbcdop $ 4 45FHXHZ [ [ $ 4 45F G G 	')L,<<FM!$&4#.
 
 
 	
r&   Nr  r   rg   r&   r$   r  r    s         {E39"""  0 #"'%* &
 &
 	&

  &
 #&
 &
 &
 &
 &
 &
 &
r&   r  z8DistilBert Model with a `language modeling` head on top.c                       e Zd ZeZdS )FlaxDistilBertForMaskedLMN)ra   rb   rc   r  r   rg   r&   r$   r'  r'  ]  s        2LLLr&   r'  c            	       d    e Zd ZU eed<   ej        Zej        ed<   d Z	 	 	 	 dde	de	de	d	e	fd
Z
dS )-FlaxDistilBertForSequenceClassificationModuler4   r5   c                    t          | j        | j                  | _        t	          j        | j        j        | j        t          j        j        	                    | j        j
                            | _        t	          j        | j        j                  | _        t	          j        | j        j        | j                  | _        d S )Nr   r7   rm   r=   r   )r  r4   r5   r   r?   rr   rB   rC   rD   rE   rF   pre_classifierrL   seq_classif_dropoutrM   
num_labels
classifierrN   s    r$   rP   z3FlaxDistilBertForSequenceClassificationModule.setupi  s    .dkTTT hKO*+22$+:W2XX
 
 

 zt{'FGGG(K"*
 
 
r&   TFrQ   rw   r   r   c                    ||n| j         j        }|                     ||||||          }|d         }|d d df         }	|                     |	          }	t	          d         |	          }	|                     |	|          }	|                     |	          }
|s|
f|dd          z   S t          |
|j        |j	                  S )NrQ   rw   r   r   r   relurU   r   r  )
r4   r   r   r+  r   rM   r.  r   r^   r   )rO   rX   r   rQ   rw   r   r   distilbert_outputhidden_statepooled_outputr  s              r$   r_   z6FlaxDistilBertForSequenceClassificationModule.__call__v  s     &1%<kk$+B] OO'/!5# , 
 
 )+$QQQT*++M::v}55]-PP// 	590444++9(3
 
 
 	
r&   Nr  r   rg   r&   r$   r)  r)  e  s         {E39"""
 
 
" #"'%* !
 !
 	!

  !
 #!
 !
 !
 !
 !
 !
 !
r&   r)  z
    DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                       e Zd ZeZdS )'FlaxDistilBertForSequenceClassificationN)ra   rb   rc   r)  r   rg   r&   r$   r6  r6    s         ALLLr&   r6  c            	       d    e Zd ZU eed<   ej        Zej        ed<   d Z	 	 	 	 dde	de	de	d	e	fd
Z
dS )%FlaxDistilBertForMultipleChoiceModuler4   r5   c                 ~   t          | j        | j                  | _        t	          j        | j        j        | j        t          j        j        	                    | j        j
                            | _        t	          j        | j        j                  | _        t	          j        d| j                  | _        d S )Nr   r7   rm   r=   r   r   )r  r4   r5   r   r?   rr   rB   rC   rD   rE   rF   r+  rL   r,  rM   r.  rN   s    r$   rP   z+FlaxDistilBertForMultipleChoiceModule.setup  s    .dkTTT hKO*+22$+:W2XX
 
 

 zt{'FGGG(*
 
 
r&   TFrQ   rw   r   r   c                 ^   ||n| j         j        }|j        d         }|!|                    d|j        d                   nd }|!|                    d|j        d                   nd }|                     ||||||          }|d         }	|	d d df         }
|                     |
          }
t          d         |
          }
|                     |
|          }
|                     |
          }|                    d|          }|s|f|dd          z   S t          ||j
        |j                  S )	Nr   rz   r0  r   r1  rU   r   r  )r4   r   rT   r{   r   r+  r   rM   r.  r   r^   r   )rO   rX   r   rQ   rw   r   r   num_choicesoutputsr3  r4  r  reshaped_logitss                r$   r_   z.FlaxDistilBertForMultipleChoiceModule.__call__  s\    &1%<kk$+B]oa(BKBWI%%b)/"*=>>>]a	Q_Qk//N4H4LMMMqu //'/!5# " 
 
 qz$QQQT*++M::v}55]-PP// ..[99 	4#%33,"!/)
 
 
 	
r&   Nr  r   rg   r&   r$   r8  r8    s         {E39"""
 
 
" #"'%* (
 (
 	(

  (
 #(
 (
 (
 (
 (
 (
 (
r&   r8  z
    DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
    a softmax) e.g. for RocStories/SWAG tasks.
    c                       e Zd ZeZdS )FlaxDistilBertForMultipleChoiceN)ra   rb   rc   r8  r   rg   r&   r$   r?  r?    s         9LLLr&   r?  z(batch_size, num_choices, sequence_lengthc            	       d    e Zd ZU eed<   ej        Zej        ed<   d Z	 	 	 	 dde	de	de	d	e	fd
Z
dS )*FlaxDistilBertForTokenClassificationModuler4   r5   c                     t          | j        | j                  | _        t	          j        | j        j                  | _        t	          j        | j        j        | j                  | _	        d S )Nr   r=   r   )
r  r4   r5   r   r?   rL   rM   rr   r-  r.  rN   s    r$   rP   z0FlaxDistilBertForTokenClassificationModule.setup  sT    .dkTTTzt{':;;;(4;#9LLLr&   TFrQ   rw   r   r   c                    ||n| j         j        }|                     ||||||          }|d         }|                     ||          }|                     |          }	|s|	f|dd          z   S t          |	|j        |j                  S )Nr0  r   rU   r   r  )r4   r   r   rM   r.  r   r^   r   )
rO   rX   r   rQ   rw   r   r   r<  r^   r  s
             r$   r_   z3FlaxDistilBertForTokenClassificationModule.__call__  s     &1%<kk$+B]//'/!5# " 
 
  
]-PP// 	+9wqrr{**(!/)
 
 
 	
r&   Nr  r   rg   r&   r$   rA  rA    s         {E39"""M M M #"'%* 
 
 	

  
 #
 
 
 
 
 
 
r&   rA  z
    DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
    for Named-Entity-Recognition (NER) tasks.
    c                       e Zd ZeZdS )$FlaxDistilBertForTokenClassificationN)ra   rb   rc   rA  r   rg   r&   r$   rE  rE  *  s         >LLLr&   rE  c            	       d    e Zd ZU eed<   ej        Zej        ed<   d Z	 	 	 	 dde	de	de	d	e	fd
Z
dS )(FlaxDistilBertForQuestionAnsweringModuler4   r5   c                    t          | j        | j                  | _        t	          j        | j        j        | j                  | _        | j        j        dk    sJ t	          j        | j        j	                  | _
        d S )Nr   r   r   r=   )r  r4   r5   r   r?   rr   r-  
qa_outputsrL   
qa_dropoutrM   rN   s    r$   rP   z.FlaxDistilBertForQuestionAnsweringModule.setupA  sj    .dkTTT(4;#9LLL{%****zt{'=>>>r&   TFrQ   rw   r   r   c                    ||n| j         j        }|                     ||||||          }|d         }|                     ||          }|                     |          }	t          j        |	| j         j        d          \  }
}|
                    d          }
|                    d          }|s|
|f|dd          z   S t          |
||j
        |j                  S )Nr0  r   rU   rz   r   r   )start_logits
end_logitsr^   r   )r4   r   r   rM   rI  r,   r   r-  squeezer   r^   r   )rO   rX   r   rQ   rw   r   r   r2  r^   r  rL  rM  s               r$   r_   z1FlaxDistilBertForQuestionAnsweringModule.__call__G  s	    &1%<kk$+B] !OO'/!5# , 
 
 *!,]-PP//#&9VT[5KRT#U#U#U j#++B//''++
 	F *-0A!""0EEE/%!+9(3	
 
 
 	
r&   Nr  r   rg   r&   r$   rG  rG  =  s         {E39"""? ? ? #"'%* %
 %
 	%

  %
 #%
 %
 %
 %
 %
 %
 %
r&   rG  z
    DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
    linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       e Zd ZeZdS )"FlaxDistilBertForQuestionAnsweringN)ra   rb   rc   rG  r   rg   r&   r$   rP  rP  o  s         <LLLr&   rP  )r'  r?  rP  r6  rE  r  r   )Fr   typingr   r   
flax.linenlinenr?   rC   	jax.numpynumpyr,   r   flax.core.frozen_dictr   r   r   flax.traverse_utilr   r	   r
   modeling_flax_outputsr   r   r   r   r   r   modeling_flax_utilsr   r   r   r   utilsr   r   r   configuration_distilbertr   
get_loggerra   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCFLAX_DISTILBERT_START_DOCSTRINGr  r%   r1   r  r3   ri   r   r   r   r   r   r   r  r  r  r'  r)  r6  r8  r?  r  rA  rE  rG  rP  __all__rg   r&   r$   <module>rb     s     % % % % % % % %       



           > > > > > > > > > > ; ; ; ; ; ; ; ;                      w v v v v v v v v v v v Y Y Y Y Y Y Y Y Y Y 6 6 6 6 6 6 
	H	%	%/ $# . 6  
# # #* * * * *RY * * *ZP P P P P P P Pf    bi   :, , , , ,29 , , ,^0
 0
 0
 0
 0
bi 0
 0
 0
f
 
 
 
 
RY 
 
 
4    bi   "N
 N
 N
 N
 N
$7 N
 N
 N
b
 
 
 
 
29 
 
 
D j# ( ( ( ( (7 ( (	 (  02Et_ ] ] ]>
 >
 >
 >
 >
bi >
 >
 >
B TVuvv3 3 3 3 3 = 3 3 wv3  68KM_ap q q q2
 2
 2
 2
 2
BI 2
 2
 2
j  $ A A A A A.K A A A  + 	  9
 9
 9
 9
 9
BI 9
 9
 9
x  $ 9 9 9 9 9&C 9 9 9  #%@%G%GHr%s%s    #!	  (
 (
 (
 (
 (
 (
 (
 (
V  $ > > > > >+H > > >  (	  /
 /
 /
 /
 /
ry /
 /
 /
d  $ < < < < <)F < < <  &$	    r&   