
     `i&                    ~   d Z ddlZddlmZmZ ddlZddlmZ ddlmZm	Z	m
Z
 ddlmZmZ ddlmZmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZmZmZmZmZm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z, ddl-m.Z.  e*j/        e0          Z1 G d dej2                  Z3 G d dej2                  Z4 G d de4          Z5 G d dej2                  Z6e4e5dZ7 G d dej2                  Z8 G d dej2                  Z9 G d  d!ej2                  Z: G d" d#e          Z; G d$ d%ej2                  Z< G d& d'ej2                  Z=e) G d( d)e#                      Z> G d* d+ej2                  Z? G d, d-ej2                  Z@e) G d. d/e>                      ZAe) G d0 d1e>                      ZB e)d23           G d4 d5e>                      ZCe) G d6 d7e>                      ZDe) G d8 d9e>                      ZEe) G d: d;e>                      ZF e)d<3           G d= d>e>e                      ZGdAd?ZHg d@ZIdS )BzPyTorch CamemBERT model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FNgelu)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)#_prepare_4d_attention_mask_for_sdpa*_prepare_4d_causal_attention_mask_for_sdpa)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging)deprecate_kwarg   )CamembertConfigc                   2     e Zd ZdZ fdZ	 ddZd Z xZS )CamembertEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                 l   t                                                       t          j        |j        |j        |j                  | _        t          j        |j        |j                  | _	        t          j        |j
        |j                  | _        t          j        |j        |j                  | _        t          j        |j                  | _        t#          |dd          | _        |                     dt)          j        |j                                      d          d           |                     d	t)          j        | j                                        t(          j        
          d           |j        | _        t          j        |j        |j        | j                  | _	        d S )N)padding_idxepsposition_embedding_typeabsoluteposition_ids)r"   F)
persistenttoken_type_idsdtype)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutgetattrr*   register_buffertorcharangeexpandzerosr,   sizelongr'   selfconfig	__class__s     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/camembert/modeling_camembert.pyr3   zCamembertEmbeddings.__init__9   s}   !|F,=v?Q_e_rsss#%<0NPVPb#c#c %'\&2H&J\%]%]" f&8f>STTTz&"<=='.v7PR\']']$EL)GHHOOPWXXej 	 	
 	
 	
 	ek$*;*@*@*B*B%*UUUbg 	 	
 	
 	

 ".#%<*F,>DL\$
 $
 $
       Nr   c                    |.|t          || j        |          }n|                     |          }||                                }n|                                d d         }|d         }|mt	          | d          r2| j        d d d |f         }|                    |d         |          }	|	}n+t          j        |t          j	        | j
        j                  }||                     |          }|                     |          }
||
z   }| j        dk    r|                     |          }||z  }|                     |          }|                     |          }|S )Nr-   r"   r/   r   r1   devicer+   )"create_position_ids_from_input_idsr'   &create_position_ids_from_inputs_embedsrH   hasattrr/   rF   rD   rG   rI   r,   rR   r8   r<   r*   r:   r=   rA   )rK   	input_idsr/   r,   inputs_embedspast_key_values_lengthinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr<   
embeddingsr:   s                rN   forwardzCamembertEmbeddings.forwardR   s{    $A)TM]_uvv#JJ=YY #..**KK',,..ss3K ^

 !t-.. m*.*=aaa*n*M'3J3Q3QR]^_R`bl3m3m0!A!&[
SWSdSk!l!l!l  00;;M $ : :> J J"%::
':55"&":":<"H"H--J^^J//
\\*--
rO   c                    |                                 dd         }|d         }t          j        | j        dz   || j        z   dz   t          j        |j                  }|                    d                              |          S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr-   r"   rQ   r   )rH   rD   rE   r'   rI   rR   	unsqueezerF   )rK   rW   rY   sequence_lengthr,   s        rN   rT   z:CamembertEmbeddings.create_position_ids_from_inputs_embedsz   s     $((**3B3/%a.|q /D4D"Dq"HPUPZcpcw
 
 
 %%a((//<<<rO   )NNNNr   )__name__
__module____qualname____doc__r3   r^   rT   __classcell__rM   s   @rN   r%   r%   3   sm         

 
 
 
 
4 rs& & & &P= = = = = = =rO   r%   c                       e Zd Zd fd	Z eddd          	 	 	 	 	 	 ddej        d	eej                 d
eej                 deej                 dee	         dee
         deej                 deej                 fd            Z xZS )CamembertSelfAttentionNc                 R   t                                                       |j        |j        z  dk    r0t	          |d          s t          d|j         d|j         d          |j        | _        t          |j        |j        z            | _        | j        | j        z  | _        t          j
        |j        | j                  | _        t          j
        |j        | j                  | _        t          j
        |j        | j                  | _        t          j        |j                  | _        |pt#          |dd          | _        | j        dk    s| j        d	k    r6|j        | _        t          j        d
|j        z  dz
  | j                  | _        |j        | _        || _        d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r*   r+   relative_keyrelative_key_query   r"   )r2   r3   r6   num_attention_headsrU   
ValueErrorintattention_head_sizeall_head_sizer   Linearquerykeyvaluer?   attention_probs_dropout_probrA   rB   r*   r9   r4   distance_embedding
is_decoder	layer_idxrK   rL   r*   r|   rM   s       rN   r3   zCamembertSelfAttention.__init__   s    ::a??PVXhHiHi?8F$6 8 8 48 8 8  
 $*#= #&v'9F<V'V#W#W !58PPYv143EFF
9V/1CDDYv143EFF
z&"EFF'> (
'-zC
 C
$ '>99T=Y]q=q=q+1+ID(&(l1v7U3UXY3Y[_[s&t&tD# +"rO   past_key_valuepast_key_values4.58new_nameversionFhidden_statesattention_mask	head_maskencoder_hidden_statesoutput_attentionscache_positionreturnc                    |j         \  }}	}
|                     |          }|                    |d| j        | j                                      dd          }d}|d u}|Ht          |t                    r1|j        	                    | j
                  }|r|j        }n
|j        }n|}|r|n|}|r3|1|r/|j        | j
                 j        }|j        | j
                 j        }n|                     |          }|                    |d| j        | j                                      dd          }|                     |          }|                    |d| j        | j                                      dd          }|N|s|nd }|                    ||| j
        d|i          \  }}|r$t          |t                    rd|j        | j
        <   t'          j        ||                    dd                    }| j        dk    s| j        d	k    rt|j         d         |j         d         }}|>t'          j        |dz
  t&          j        |j        
                              dd          }n:t'          j        |t&          j        |j        
                              dd          }t'          j        |t&          j        |j        
                              dd          }||z
  }|                     || j        z   dz
            }|                    |j                  }| j        dk    rt'          j        d||          }||z   }n?| j        d	k    r4t'          j        d||          }t'          j        d||          }||z   |z   }|t?          j         | j                  z  }|||z   }tB          j"        #                    |d          }| $                    |          }|||z  }t'          j        ||          }|%                    dddd          &                                }|'                                d d         | j(        fz   }|                    |          }||fS )Nr-   r"   ro   Fr   Trm   rn   rQ   r0   zbhld,lrd->bhlrzbhrd,lrd->bhlrdimr   r	   ))shaperv   viewrp   rs   	transpose
isinstancer   
is_updatedgetr|   cross_attention_cacheself_attention_cachelayerskeysvaluesrw   rx   updaterD   matmulr*   tensorrI   rR   rE   rz   r9   tor1   einsummathsqrtr   
functionalsoftmaxrA   permute
contiguousrH   rt   )rK   r   r   r   r   r   r   r   
batch_sizerZ   _query_layerr   is_cross_attentioncurr_past_key_valuecurrent_states	key_layervalue_layerattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapes                                  rN   r^   zCamembertSelfAttention.forward   s    %2$7!
Jjj//!&&z2t7OQUQijjttq
 
 
2$>&/+>?? 6,7;;DNKK
% O*9*O''*9*N''&5#2DW..- 	F/"=*"=+24>BGI-4T^DKKK00I!z2t7OQUQijjtt1 I **^44K%**B 8$:R i1oo  *7I!St)<)C)C{DN=M~<^* *&	; & F*_FY*Z*Z FAEO.t~> !<Y5H5HR5P5PQQ'>99T=Y]q=q=q'2'8';Y_Q=O*L*!&j1nEJWdWk!l!l!l!q!q" " "'l%*UbUi!j!j!j!o!oprtu!v!v"\*EJ}OcdddiijkmoppN%6H#'#:#:8dFb;bef;f#g#g #7#:#:AR#:#S#S +~==+0<8H+Wk+l+l(#36N#N  -1EEE16>NP[]q1r1r./4|<LiYm/n/n,#36T#TWs#s +di8P.Q.QQ%/.@ -//0@b/II ,,77  -	9O_kBB%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S%**+BCCo--rO   NNNNNNFNrb   rc   rd   r3   r!   rD   Tensorr   FloatTensorr   booltupler^   rf   rg   s   @rN   ri   ri      s       # # # # # #6 _%0A6RRR 7;15=A+/,115e. e.|e. !!23e. E-.	e.
  ((9:e. "%e. $D>e. !.e. 
u|	e. e. e. SRe. e. e. e. e.rO   ri   c                       e Zd Zd fd	Z eddd          	 	 	 	 	 	 ddej        d	eej                 d
eej                 deej                 dee	         dee
         deej                 deej                 f fd            Z xZS )CamembertSdpaSelfAttentionNc                 h    t                                          |||           |j        | _        d S Nr*   r|   )r2   r3   ry   dropout_probr}   s       rN   r3   z#CamembertSdpaSelfAttention.__init__  s5    9P\efff"?rO   r~   r   r   r   Fr   r   r   r   r   r   r   c           	      P   | j         dk    s|s|At                              d           t                                          |||||||          S |                                \  }}	}
|                     |                              |d| j        | j	                  
                    dd          }d}|d u}|r|n|}|Ht          |t                    r1|j                            | j                  }|r|j        }n
|j        }n|}|r|n|}|r3|1|r/|j        | j                 j        }|j        | j                 j        }n|                     |                              |d| j        | j	                  
                    dd          }|                     |                              |d| j        | j	                  
                    dd          }|N|s|nd }|                    ||| j        d|i          \  }}|r$t          |t                    rd|j        | j        <   | j        o| o	|d u o|	dk    }t2          j        j                            ||||| j        r| j        nd	|
          }|
                    dd          }|                    ||	| j                   }|d fS )Nr+   a  CamembertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r-   r"   ro   Fr   T        )	attn_mask	dropout_p	is_causal)!r*   loggerwarning_oncer2   r^   rH   rv   r   rp   rs   r   r   r   r   r   r|   r   r   r   r   r   rw   rx   r   r{   rD   r   r   scaled_dot_product_attentiontrainingr   reshapert   )rK   r   r   r   r   r   r   r   bsztgt_lenr   r   r   r   r   r   r   r   r   attn_outputrM   s                       rN   r^   z"CamembertSdpaSelfAttention.forward  s    ':559J5iNcH   77??%!   (,,..Wa JJ}%%**3D4LdNfggqqrsuvww 	 
2$>2DW..-&/+>?? 6,7;;DNKK
% O*9*O''*9*N''&5#2DW..- 	F/"=*"=+24>BGI-4T^DKKK ((c2t79QRR1a  

>**c2t79QRR1a  *7I!St)<)C)C{DN=M~<^* *&	; & F*_FY*Z*Z FAEO.t~> Oi,>(>i>UYCYi^ehi^i	h)FF$+/=Ad''c G 
 
 "++Aq11!))#w8JKKD  rO   r   r   r   rg   s   @rN   r   r     s+       @ @ @ @ @ @
 _%0A6RRR 2615=A+/,115^! ^!|^! !.^! E-.	^!
  ((9:^! "%^! $D>^! !.^! 
u|	^! ^! ^! ^! ^! SR^! ^! ^! ^! ^!rO   r   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )CamembertSelfOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j	                  | _
        d S Nr(   )r2   r3   r   ru   r6   denser=   r>   r?   r@   rA   rJ   s     rN   r3   zCamembertSelfOutput.__init__}  sf    Yv163EFF
f&8f>STTTz&"<==rO   r   input_tensorr   c                     |                      |          }|                     |          }|                     ||z             }|S Nr   rA   r=   rK   r   r   s      rN   r^   zCamembertSelfOutput.forward  @    

=11]33}|'CDDrO   rb   rc   rd   r3   rD   r   r^   rf   rg   s   @rN   r   r   |  i        > > > > >U\  RWR^        rO   r   )eagersdpac                       e Zd Zd fd	Zd Z eddd          	 	 	 	 	 	 dd	ej        d
eej	                 deej	                 deej	                 dee
         dee         deej                 deej                 fd            Z xZS )CamembertAttentionNc                     t                                                       t          |j                 |||          | _        t          |          | _        t                      | _        d S r   )	r2   r3    CAMEMBERT_SELF_ATTENTION_CLASSES_attn_implementationrK   r   outputsetpruned_headsr}   s       rN   r3   zCamembertAttention.__init__  sc    4V5PQ$;
 
 
	
 *&11EErO   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r"   r   )lenr   rK   rp   rs   r   r   rv   rw   rx   r   r   rt   union)rK   headsindexs      rN   prune_headszCamembertAttention.prune_heads  s    u::??F7490$)2OQUQb
 
u
 -TY_eDD	*49=%@@	,TY_eDD	.t{/@%QOOO )-	(EE

(R	%"&)"?$)B_"_	 -33E::rO   r~   r   r   r   Fr   r   r   r   r   r   r   c           	          |                      |||||||          }|                     |d         |          }	|	f|dd          z   }
|
S )Nr   r   r   r   r   r   r   r"   )rK   r   )rK   r   r   r   r   r   r   r   self_outputsattention_outputoutputss              rN   r^   zCamembertAttention.forward  sf     yy)"7+/) ! 
 
  ;;|AFF#%QRR(88rO   r   r   )rb   rc   rd   r3   r   r!   rD   r   r   r   r   r   r   r^   rf   rg   s   @rN   r   r     s       " " " " " "; ; ;$ _%0A6RRR 7;15=A+/,115 | !!23 E-.	
  ((9: "% $D> !. 
u|	   SR    rO   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )CamembertIntermediatec                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S r   )r2   r3   r   ru   r6   intermediate_sizer   r   
hidden_actstrr
   intermediate_act_fnrJ   s     rN   r3   zCamembertIntermediate.__init__  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$rO   r   r   c                 Z    |                      |          }|                     |          }|S r   )r   r   )rK   r   s     rN   r^   zCamembertIntermediate.forward  s,    

=1100??rO   r   rg   s   @rN   r   r     s^        9 9 9 9 9U\ el        rO   r   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )CamembertOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j	        |j
                  | _        d S r   )r2   r3   r   ru   r   r6   r   r=   r>   r?   r@   rA   rJ   s     rN   r3   zCamembertOutput.__init__  sf    Yv79KLL
f&8f>STTTz&"<==rO   r   r   r   c                     |                      |          }|                     |          }|                     ||z             }|S r   r   r   s      rN   r^   zCamembertOutput.forward  r   rO   r   rg   s   @rN   r   r     r   rO   r   c                   0    e Zd Zd fd	Z eddd          	 	 	 	 	 	 	 ddej        d	eej                 d
eej                 deej                 deej                 dee	         dee
         deej                 deej                 fd            Zd Z xZS )CamembertLayerNc                    t                                                       |j        | _        d| _        t	          ||          | _        |j        | _        |j        | _        | j        r0| j        st          |  d          t	          |d|          | _	        t          |          | _        t          |          | _        d S )Nr"   r|   z> should be used as a decoder model if cross attention is addedr+   r   )r2   r3   chunk_size_feed_forwardseq_len_dimr   	attentionr{   add_cross_attentionrq   crossattentionr   intermediater   r   rK   rL   r|   rM   s      rN   r3   zCamembertLayer.__init__  s    '-'E$+FiHHH +#)#= # 	v? j D!h!h!hiii"4VU_kt"u"u"uD1&99%f--rO   r~   r   r   r   Fr   r   r   r   encoder_attention_maskr   r   r   c	           	      h   |                      ||||||          }	|	d         }
|	dd          }| j        rV|Tt          | d          st          d|  d          |                     |
||||||          }|d         }
||dd          z   }t          | j        | j        | j        |
          }|f|z   }|S )N)r   r   r   r   r   r   r"   r
  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	r  r{   rU   rq   r
  r   feed_forward_chunkr  r  )rK   r   r   r   r   r  r   r   r   self_attention_outputsr   r   cross_attention_outputslayer_outputs                 rN   r^   zCamembertLayer.forward  s!    "&)/+) "0 "
 "
 2!4(,? 	<4@4!122  Dd D D D  
 '+&9&9 5#&; /"3- ': ' '#  7q9 7 ;;G0#T%A4CSUe
 
  /G+rO   c                 \    |                      |          }|                     ||          }|S r   )r  r   )rK   r   intermediate_outputr  s       rN   r  z!CamembertLayer.feed_forward_chunk'  s2    "//0@AA{{#68HIIrO   r   )NNNNNFN)rb   rc   rd   r3   r!   rD   r   r   r   r   r   r   r^   r  rf   rg   s   @rN   r  r    s.       . . . . . . _%0A6RRR 7;15=A>B+/,115. .|. !!23. E-.	.
  ((9:. !)): ;. "%. $D>. !.. 
u|	. . . SR.`      rO   r  c                   H    e Zd Zd fd	Z	 	 	 	 	 	 	 	 	 	 ddej        deej                 deej                 deej                 d	eej                 d
ee         dee	         dee	         dee	         dee	         deej                 de
eej                 ef         fdZ xZS )CamembertEncoderNc                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 2    g | ]}t          |           S )r  )r  ).0irL   s     rN   
<listcomp>z-CamembertEncoder.__init__.<locals>.<listcomp>2  s&    #q#q#qAN6Q$G$G$G#q#q#qrO   F)	r2   r3   rL   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr  s    ` rN   r3   zCamembertEncoder.__init__/  sb    ]#q#q#q#qQVW]WoQpQp#q#q#qrr
&+###rO   FTr   r   r   r   r  r   	use_cacher   output_hidden_statesreturn_dictr   r   c                    |	rdnd }|rdnd }|r| j         j        rdnd }| j        r%| j        r|rt                              d           d}|rD| j         j        r8|6t          t          | j                   t          | j                             }|rO| j         j        rCt          |t                    r.t                              d           t          j        |          }t          | j                  D ]Z\  }}|	r||fz   }|||         nd } |||||||||          }|d         }|r$||d         fz   }| j         j        r||d	         fz   }[|	r||fz   }|
st          d
 |||||fD                       S t          |||||          S )N zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)rL   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.)r  r   r   r   r   r"   ro   c              3      K   | ]}||V  	d S r   r%  )r  vs     rN   	<genexpr>z+CamembertEncoder.forward.<locals>.<genexpr>t  s4       
 
 =  !===
 
rO   )last_hidden_stater   r   
attentionscross_attentions)rL   r	  r   r   r   r   r{   r   r   r   r   from_legacy_cache	enumerater  r   )rK   r   r   r   r   r  r   r!  r   r"  r#  r   all_hidden_statesall_self_attentionsall_cross_attentionsr  layer_modulelayer_head_masklayer_outputss                      rN   r^   zCamembertEncoder.forward5  sc    #7@BBD$5?bb4%6d4;;Zdrr`d& 	"4= 	" "##p   "	 	v/ 	vO4K1,dk2R2R2RT`hlhsTtTtTtuuO 	U/ 	UJPU4V4V 	U\  
 2COTTO(44 	V 	VOA|# I$58H$H!.7.CillO(L%'= /"3-	 	 	M *!,M  V&9]1=M<O&O#;2 V+?=QRCSBU+U( 	E 1]4D D 	 
 
 "#%'(
 
 
 
 
 
 9+++*1
 
 
 	
rO   r   )
NNNNNNFFTN)rb   rc   rd   r3   rD   r   r   r   r   r   r   r   r   r^   rf   rg   s   @rN   r  r  .  sP       , , , , , , 7;15=A>B+/$(,1/4&*15P
 P
|P
 !!23P
 E-.	P

  ((9:P
 !)): ;P
 "%P
 D>P
 $D>P
 'tnP
 d^P
 !.P
 
uU\"$MM	NP
 P
 P
 P
 P
 P
 P
 P
rO   r  c                   B     e Zd Z fdZdej        dej        fdZ xZS )CamembertPoolerc                     t                                                       t          j        |j        |j                  | _        t          j                    | _        d S r   )r2   r3   r   ru   r6   r   Tanh
activationrJ   s     rN   r3   zCamembertPooler.__init__  sC    Yv163EFF
'))rO   r   r   c                 r    |d d df         }|                      |          }|                     |          }|S Nr   )r   r8  )rK   r   first_token_tensorpooled_outputs       rN   r^   zCamembertPooler.forward  s@     +111a40

#56666rO   r   rg   s   @rN   r5  r5    s^        $ $ $ $ $
U\ el        rO   r5  c                   ,    e Zd ZU eed<   dZdZdZd ZdS )CamembertPreTrainedModelrL   robertaTc                    t          |t          j                  rT|j        j                            d| j        j                   |j         |j        j        	                                 dS dS t          |t          j
                  r_|j        j                            d| j        j                   |j        +|j        j        |j                 	                                 dS dS t          |t          j                  r?|j        j        	                                 |j        j                            d           dS t          |t                    r |j        j        	                                 dS dS )zInitialize the weightsr   )meanstdNg      ?)r   r   ru   weightdatanormal_rL   initializer_rangebiaszero_r4   r'   r=   fill_CamembertLMHead)rK   modules     rN   _init_weightsz&CamembertPreTrainedModel._init_weights  sX   fbi(( 	% M&&CT[5R&SSS{& &&((((( '&-- 	%M&&CT[5R&SSS!-"6#56<<>>>>> .--- 	%K""$$$M$$S)))))00 	%K""$$$$$	% 	%rO   N)	rb   rc   rd   r#   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_sdparL  r%  rO   rN   r>  r>    sB         !&*#N% % % % %rO   r>  c                   (     e Zd ZdZ fdZd Z xZS )CamembertClassificationHeadz-Head for sentence-level classification tasks.c                 4   t                                                       t          j        |j        |j                  | _        |j        |j        n|j        }t          j        |          | _	        t          j        |j        |j
                  | _        d S r   )r2   r3   r   ru   r6   r   classifier_dropoutr@   r?   rA   
num_labelsout_projrK   rL   rT  rM   s      rN   r3   z$CamembertClassificationHead.__init__  s    Yv163EFF
)/)B)NF%%TZTn 	 z"455	&"4f6GHHrO   c                     |d d dd d f         }|                      |          }|                     |          }t          j        |          }|                      |          }|                     |          }|S r:  )rA   r   rD   tanhrV  rK   featureskwargsxs       rN   r^   z#CamembertClassificationHead.forward  sj    QQQ111WLLOOJJqMMJqMMLLOOMM!rO   )rb   rc   rd   re   r3   r^   rf   rg   s   @rN   rR  rR    sR        77I I I I I      rO   rR  c                   .     e Zd ZdZ fdZd Zd Z xZS )rJ  z,Camembert Head for masked language modeling.c                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j        |j	                  | _
        t          j        t          j        |j	                            | _        | j        | j
        _        d S r   )r2   r3   r   ru   r6   r   r=   r>   
layer_normr5   decoder	ParameterrD   rG   rG  rJ   s     rN   r3   zCamembertLMHead.__init__  s    Yv163EFF
,v'9v?TUUUy!3V5FGGLV->!?!?@@	 IrO   c                     |                      |          }t          |          }|                     |          }|                     |          }|S r   )r   r   r`  ra  rZ  s       rN   r^   zCamembertLMHead.forward  sE    JJx  GGOOA LLOOrO   c                     | j         j        j        j        dk    r| j        | j         _        d S | j         j        | _        d S )Nmeta)ra  rG  rR   typerK   s    rN   _tie_weightszCamembertLMHead._tie_weights  s<     <#(F22 $	DL)DIIIrO   )rb   rc   rd   re   r3   r^   rh  rf   rg   s   @rN   rJ  rJ    s\        66& & & & &  * * * * * * *rO   rJ  c            "           e Zd ZdZg Zd fd	Zd Zd Zd Ze		 	 	 	 	 	 	 	 	 	 	 	 	 	 dde
ej                 d	e
ej                 d
e
ej                 de
ej                 de
ej                 de
ej                 de
ej                 de
ej                 de
e         de
e         de
e         de
e         de
e         de
ej                 deeej                 ef         fd            Z xZS )CamembertModela1  

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set to
    `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762

    Tc                 0   t                                          |           || _        t          |          | _        t          |          | _        |rt          |          nd| _        |j	        | _
        |j        | _        |                                  dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)r2   r3   rL   r%   r]   r  encoderr5  poolerr   attn_implementationr*   	post_init)rK   rL   add_pooling_layerrM   s      rN   r3   zCamembertModel.__init__  s    
 	   -f55'//1BLof---#)#> '-'E$ 	rO   c                     | j         j        S r   r]   r8   rg  s    rN   get_input_embeddingsz#CamembertModel.get_input_embeddings  s    ..rO   c                     || j         _        d S r   rr  )rK   rx   s     rN   set_input_embeddingsz#CamembertModel.set_input_embeddings  s    */'''rO   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrl  r  r  r   )rK   heads_to_pruner  r   s       rN   _prune_headszCamembertModel._prune_heads  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	CrO   NrV   r   r/   r,   r   rW   r   r  r   r!  r   r"  r#  r   r   c                    ||n| j         j        }||n| j         j        }||n| j         j        }| j         j        r|
|
n| j         j        }
nd}
||t          d          |+|                     ||           |                                }n.||                                d d         }nt          d          |\  }}||j	        n|j	        }d}|	Bt          |	t                    s|	d         d         j        d         n|	                                }|gt          | j        d          r1| j        j        d d d |f         }|                    ||          }|}n!t%          j        |t$          j        |          }|                     |||||	          }|t%          j        |||z   f|
          }| j        dk    o| j        dk    o|d u o| }|rO|                                dk    r7| j         j        rt3          ||||          }n.t5          ||j        |          }n|                     ||          }| j         j        r~|||                                \  }}}||f}|t%          j        ||
          }|r0|                                dk    rt5          ||j        |          }n|                     |          }nd }|                     || j         j                  }|                      ||||||	|
||||          }|d         }| j!        | !                    |          nd } |s|| f|dd          z   S tE          || |j#        |j$        |j%        |j&                  S )NFzDYou cannot specify both input_ids and inputs_embeds at the same timer-   z5You have to specify either input_ids or inputs_embedsr   r   r/   rQ   )rV   r,   r/   rW   rX   )rR   r   r+   ro   )r   )
r   r   r   r  r   r!  r   r"  r#  r   r"   )r)  pooler_outputr   r   r*  r+  )'rL   r   r"  use_return_dictr{   r!  rq   %warn_if_padding_and_no_attention_maskrH   rR   r   r   r   get_seq_lengthrU   r]   r/   rF   rD   rG   rI   onesrn  r*   r   r   r   r1   get_extended_attention_maskinvert_attention_maskget_head_maskr  rl  rm  r   r   r   r*  r+  )!rK   rV   r   r/   r,   r   rW   r   r  r   r!  r   r"  r#  r   rY   r   rZ   rR   rX   r[   r\   embedding_outputuse_sdpa_attention_masksextended_attention_maskencoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskencoder_outputssequence_outputr<  s!                                    rN   r^   zCamembertModel.forward  ss   & 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B];! 	%.%:		@UIII ]%>cddd"66y.QQQ#..**KK&',,..ss3KKTUUU!,
J%.%:!!@T!"& "/5996"1%+B//$3355 # !t(899 [*./*HKZK*X'3J3Q3QR\^h3i3i0!A!&[
SY!Z!Z!Z??%)'#9 + 
 
 !"ZZBX5X(YbhiiiN $. &,
:&T!& &%	 	! $ 	d(:(:(<(<(A(A {% 
*T"$*	+ +'' +N"$4$:J+ + +'' '+&F&F~Wb&c&c# ;! 	3&;&G=R=W=W=Y=Y: 7$68O#P %-).4HQW)X)X)X&' e,B,F,F,H,HA,M,M 3V*,<,BJ3 3 3// 372L2LMc2d2d//.2+ &&y$+2OPP	,,2"7#B+/!5#) ' 
 
 *!,8<8OO444UY 	J#]3oabb6III;-'+;)7&1,=
 
 
 	
rO   )TNNNNNNNNNNNNNN)rb   rc   rd   re   _no_split_modulesr3   rs  ru  ry  r   r   rD   r   r   r   r   r   r   r^   rf   rg   s   @rN   rj  rj    s              &/ / /0 0 0C C C  -11515/3,0048<9=+/$(,0/3&*15S
 S
EL)S
 !.S
 !.	S

 u|,S
 EL)S
  -S
  (5S
 !) 6S
 "%S
 D>S
 $D>S
 'tnS
 d^S
 !.S
  
uU\"$PP	Q!S
 S
 S
 ^S
 S
 S
 S
 S
rO   rj  c                       e Zd ZddgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j                 d	ee	j
                 d
ee	j
                 dee	j                 dee	j                 dee	j                 dee	j                 dee	j
                 dee         dee         dee         deee	j                 ef         fd            Z xZS )CamembertForMaskedLMlm_head.decoder.weightlm_head.decoder.biasc                    t                                          |           |j        rt                              d           t          |d          | _        t          |          | _        | 	                                 d S )NzpIf you want to use `CamembertForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Frp  
r2   r3   r{   r   warningrj  r?  rJ  lm_headro  rJ   s     rN   r3   zCamembertForMaskedLM.__init__  s~        	NN1  
 &fFFF&v.. 	rO   c                     | j         j        S r   r  ra  rg  s    rN   get_output_embeddingsz*CamembertForMaskedLM.get_output_embeddings      |##rO   c                     || j         _        d S r   r  rK   new_embeddingss     rN   set_output_embeddingsz*CamembertForMaskedLM.set_output_embeddings      -rO   NrV   r   r/   r,   r   rW   r   r  labelsr   r"  r#  r   c                    ||n| j         j        }|                     |||||||||
||          }|d         }|                     |          }d}|	e|	                    |j                  }	t                      } ||                    d| j         j                  |	                    d                    }|s|f|dd         z   }||f|z   n|S t          |||j
        |j                  S )a  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        N)
r   r/   r,   r   rW   r   r  r   r"  r#  r   r-   ro   losslogitsr   r*  )rL   r|  r?  r  r   rR   r   r   r5   r   r   r*  )rK   rV   r   r/   r,   r   rW   r   r  r  r   r"  r#  r   r  prediction_scoresmasked_lm_lossloss_fctr   s                      rN   r^   zCamembertForMaskedLM.forward  s/   > &1%<kk$+B],,))%'"7#9/!5#  
 
 "!* LL99YY0788F'))H%X&7&<&<RAW&X&XZ`ZeZefhZiZijjN 	Z')GABBK7F3A3M^%..SYY$!/)	
 
 
 	
rO   )NNNNNNNNNNNN)rb   rc   rd   _tied_weights_keysr3   r  r  r   r   rD   
LongTensorr   r   r   r   r   r   r^   rf   rg   s   @rN   r  r    s        34JK    $ $ $. . .  156:59371559=A>B-1,0/3&*@
 @
E,-@
 !!23@
 !!12	@

 u/0@
 E-.@
   12@
  ((9:@
 !)): ;@
 )*@
 $D>@
 'tn@
 d^@
 
uU\"N2	3@
 @
 @
 ^@
 @
 @
 @
 @
rO   r  z
    CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    )custom_introc                   \    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
ee	         dee	         dee	         de
eej                 ef         fd            Z xZS )"CamembertForSequenceClassificationc                     t                                          |           |j        | _        || _        t	          |d          | _        t          |          | _        |                                  d S NFr  )	r2   r3   rU  rL   rj  r?  rR  
classifierro  rJ   s     rN   r3   z+CamembertForSequenceClassification.__init__  sg        +%fFFF5f== 	rO   NrV   r   r/   r,   r   rW   r  r   r"  r#  r   c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }d}|t|                    |j                  }| j         j        f| j        dk    rd| j         _        nN| j        dk    r7|j        t          j
        k    s|j        t          j        k    rd| j         _        nd| j         _        | j         j        dk    rWt                      }| j        dk    r1 ||                                |                                          }n |||          }n| j         j        dk    rGt                      } ||                    d| j                  |                    d                    }n*| j         j        dk    rt!                      } |||          }|
s|f|d	d         z   }||f|z   n|S t#          |||j        |j        
          S )a  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r/   r,   r   rW   r   r"  r#  r   r"   
regressionsingle_label_classificationmulti_label_classificationr-   ro   r  )rL   r|  r?  r  r   rR   problem_typerU  r1   rD   rI   rr   r   squeezer   r   r   r   r   r*  rK   rV   r   r/   r,   r   rW   r  r   r"  r#  r   r  r  r  r  r   s                    rN   r^   z*CamembertForSequenceClassification.forward(  s   : &1%<kk$+B],,))%'/!5#  

 

 "!*11YYv}--F{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB @ @&++b//RR)-III,..x// 	FY,F)-)9TGf$$vE'!/)	
 
 
 	
rO   
NNNNNNNNNN)rb   rc   rd   r3   r   r   rD   r  r   r   r   r   r   r   r^   rf   rg   s   @rN   r  r    sS       	 	 	 	 	  156:59371559-1,0/3&*N
 N
E,-N
 !!23N
 !!12	N

 u/0N
 E-.N
   12N
 )*N
 $D>N
 'tnN
 d^N
 
uU\"$<<	=N
 N
 N
 ^N
 N
 N
 N
 N
rO   r  c                   \    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
ee	         dee	         dee	         de
eej                 ef         fd            Z xZS )CamembertForMultipleChoicec                    t                                          |           t          |          | _        t	          j        |j                  | _        t	          j        |j	        d          | _
        |                                  d S )Nr"   )r2   r3   rj  r?  r   r?   r@   rA   ru   r6   r  ro  rJ   s     rN   r3   z#CamembertForMultipleChoice.__init__}  sl       %f--z&"<==)F$6:: 	rO   NrV   r/   r   r  r,   r   rW   r   r"  r#  r   c                    |
|
n| j         j        }
||j        d         n|j        d         }|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|=|                    d|                    d          |                    d                    nd}|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }|                    d|          }d}|4|                    |j	                  }t                      } |||          }|
s|f|dd         z   }||f|z   n|S t          |||j        |j                  S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        Nr"   r-   r   )r,   r/   r   r   rW   r   r"  r#  ro   r  )rL   r|  r   r   rH   r?  rA   r  r   rR   r   r   r   r*  )rK   rV   r/   r   r  r,   r   rW   r   r"  r#  num_choicesflat_input_idsflat_position_idsflat_token_type_idsflat_attention_maskflat_inputs_embedsr   r<  r  reshaped_logitsr  r  r   s                           rN   r^   z"CamembertForMultipleChoice.forward  sB   Z &1%<kk$+B],5,Aioa((}GZ[\G]CLCXINN2,>,>???^bLXLdL--b,2C2CB2G2GHHHjnR`Rln11"n6I6I"6M6MNNNrvR`Rln11"n6I6I"6M6MNNNrv ( r=#5#5b#9#9=;M;Mb;Q;QRRR 	 ,,*..,/!5#  

 

  
]33// ++b+66YY566F'))H8OV44D 	F%''!""+5F)-)9TGf$$vE("!/)	
 
 
 	
rO   r  )rb   rc   rd   r3   r   r   rD   r  r   r   r   r   r   r   r^   rf   rg   s   @rN   r  r  z  sS             15596:-1371559,0/3&*Z
 Z
E,-Z
 !!12Z
 !!23	Z

 )*Z
 u/0Z
 E-.Z
   12Z
 $D>Z
 'tnZ
 d^Z
 
uU\"$==	>Z
 Z
 Z
 ^Z
 Z
 Z
 Z
 Z
rO   r  c                   \    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
ee	         dee	         dee	         de
eej                 ef         fd            Z xZS )CamembertForTokenClassificationc                 Z   t                                          |           |j        | _        t          |d          | _        |j        |j        n|j        }t          j        |          | _	        t          j
        |j        |j                  | _        |                                  d S r  )r2   r3   rU  rj  r?  rT  r@   r   r?   rA   ru   r6   r  ro  rW  s      rN   r3   z(CamembertForTokenClassification.__init__  s        +%fFFF)/)B)NF%%TZTn 	 z"455)F$68IJJ 	rO   NrV   r   r/   r,   r   rW   r  r   r"  r#  r   c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }d}|`|                    |j                  }t                      } ||                    d| j	                  |                    d                    }|
s|f|dd         z   }||f|z   n|S t          |||j        |j                  S )a-  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r-   ro   r  )rL   r|  r?  rA   r  r   rR   r   r   rU  r   r   r*  r  s                    rN   r^   z'CamembertForTokenClassification.forward  s(   6 &1%<kk$+B],,))%'/!5#  

 

 "!*,,7711YYv}--F'))H8FKKDO<<fkk"ooNND 	FY,F)-)9TGf$$vE$!/)	
 
 
 	
rO   r  )rb   rc   rd   r3   r   r   rD   r  r   r   r   r   r   r   r^   rf   rg   s   @rN   r  r    s?             156:59371559-1,0/3&*=
 =
E,-=
 !!23=
 !!12	=

 u/0=
 E-.=
   12=
 )*=
 $D>=
 'tn=
 d^=
 
uU\"$99	:=
 =
 =
 ^=
 =
 =
 =
 =
rO   r  c                   x    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
eej                 dee	         dee	         dee	         de
eej                 ef         fd            Z xZS )CamembertForQuestionAnsweringc                     t                                          |           |j        | _        t          |d          | _        t          j        |j        |j                  | _        | 	                                 d S r  )
r2   r3   rU  rj  r?  r   ru   r6   
qa_outputsro  rJ   s     rN   r3   z&CamembertForQuestionAnswering.__init__:  sj        +%fFFF)F$68IJJ 	rO   NrV   r   r/   r,   r   rW   start_positionsend_positionsr   r"  r#  r   c                    ||n| j         j        }|                     |||||||	|
|	  	        }|d         }|                     |          }|                    dd          \  }}|                    d                                          }|                    d                                          }d}||t          |                                          dk    r|                    d          }t          |                                          dk    r|                    d          }|                    d          }|	                    d|          }|	                    d|          }t          |          } |||          } |||          }||z   dz  }|s||f|dd         z   }||f|z   n|S t          ||||j        |j        	          S )
a[  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        Nr  r   r"   r-   r   )ignore_indexro   )r  start_logits
end_logitsr   r*  )rL   r|  r?  r  splitr  r   r   rH   clampr   r   r   r*  )rK   rV   r   r/   r,   r   rW   r  r  r   r"  r#  r   r  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                          rN   r^   z%CamembertForQuestionAnswering.forwardD  s   4 &1%<kk$+B],,))%'/!5#  

 

 "!*11#)<<r<#:#: j#++B//::<<''++6688

&=+D?''))**Q.."1"9"9""="==%%''((1,, - 5 5b 9 9(--a00M-33A}EEO)//=AAM']CCCH!,@@Jx
M::H$x/14J 	R"J/'!""+=F/9/EZMF**6Q+%!!/)
 
 
 	
rO   )NNNNNNNNNNN)rb   rc   rd   r3   r   r   rD   r  r   r   r   r   r   r   r^   rf   rg   s   @rN   r  r  7  si             156:593715596:48,0/3&*I
 I
E,-I
 !!23I
 !!12	I

 u/0I
 E-.I
   12I
 "%"23I
   01I
 $D>I
 'tnI
 d^I
 
uU\"$@@	AI
 I
 I
 ^I
 I
 I
 I
 I
rO   r  zU
    CamemBERT Model with a `language modeling` head on top for CLM fine-tuning.
    c            "           e Zd ZddgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j                 d	ee	j
                 d
ee	j
                 dee	j                 dee	j                 dee	j                 dee	j                 dee	j
                 dee         dee         dee         dee         dee         deee	j                 ef         fd            Z xZS )CamembertForCausalLMr  r  c                    t                                          |           |j        st                              d           t          |d          | _        t          |          | _        | 	                                 d S )NzQIf you want to use `CamembertLMHeadModel` as a standalone, add `is_decoder=True.`Fr  r  rJ   s     rN   r3   zCamembertForCausalLM.__init__  su         	pNNnooo%fFFF&v.. 	rO   c                     | j         j        S r   r  rg  s    rN   r  z*CamembertForCausalLM.get_output_embeddings  r  rO   c                     || j         _        d S r   r  r  s     rN   r  z*CamembertForCausalLM.set_output_embeddings  r  rO   NrV   r   r/   r,   r   rW   r   r  r  r   r!  r   r"  r#  r   c                    ||n| j         j        }|	d}|                     |||||||||
||||          }|d         }|                     |          }d}|	5|	                    |j                  }	 | j        ||	fd| j         j        i|}|s|f|dd         z   }||f|z   n|S t          |||j	        |j
        |j        |j                  S )aq  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, CamembertForCausalLM, AutoConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
        >>> config = AutoConfig.from_pretrained("almanach/camembert-base")
        >>> config.is_decoder = True
        >>> model = CamembertForCausalLM.from_pretrained("almanach/camembert-base", config=config)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```NF)r   r/   r,   r   rW   r   r  r   r!  r   r"  r#  r   r5   ro   )r  r  r   r   r*  r+  )rL   r|  r?  r  r   rR   loss_functionr5   r   r   r   r*  r+  )rK   rV   r   r/   r,   r   rW   r   r  r  r   r!  r   r"  r#  r\  r   r  r  lm_lossr   s                        rN   r^   zCamembertForCausalLM.forward  sC   d &1%<kk$+B]I,,))%'"7#9+/!5#  
 
  "!* LL99YY0788F(d(!   ;1 	 G  	L')GABBK7F,3,?WJ''VK0$#3!/)$5
 
 
 	
rO   r  )rb   rc   rd   r  r3   r  r  r   r   rD   r  r   r   r   r   r   r   r   r^   rf   rg   s   @rN   r  r    s        34JK
 
 
 
 
$ $ $. . .  156:59371559=A>B-1+/$(,0/3&*^
 ^
E,-^
 !!23^
 !!12	^

 u/0^
 E-.^
   12^
  ((9:^
 !)): ;^
 )*^
 "%^
 D>^
 $D>^
 'tn^
 d^^
" 
uU\"$EE	F#^
 ^
 ^
 ^^
 ^
 ^
 ^
 ^
rO   r  c                     |                      |                                          }t          j        |d                              |          |z   |z  }|                                |z   S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r"   r   )nerr   rD   cumsumtype_asrI   )rV   r'   rX   maskincremental_indicess        rN   rS   rS     sg     <<$$((**D <!444<<TBBE[[_cc##%%33rO   )r  r  r  r  r  r  rj  r>  )r   )Jre   r   typingr   r   rD   r   torch.nnr   r   r   activationsr
   r   cache_utilsr   r   r   
generationr   modeling_attn_mask_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r    utils.deprecationr!   configuration_camembertr#   
get_loggerrb   r   Moduler%   ri   r   r   r   r   r   r   r  r  r5  r>  rR  rJ  rj  r  r  r  r  r  r  rS   __all__r%  rO   rN   <module>r     sc       " " " " " " " "        A A A A A A A A A A ' ' ' ' ' ' ' ' C C C C C C C C C C ) ) ) ) ) ) w w w w w w w w 9 9 9 9 9 9	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 . - - - - - l l l l l l l l l l , , , , , , , , 0 0 0 0 0 0 4 4 4 4 4 4 
	H	%	%V= V= V= V= V=") V= V= V=tB. B. B. B. B.RY B. B. B.Le! e! e! e! e!!7 e! e! e!R    ")    $&$ $  3 3 3 3 3 3 3 3n    BI        bi   C C C C C/ C C CNW
 W
 W
 W
 W
ry W
 W
 W
v    bi    % % % % % % % %6    ")   .* * * * *bi * * *> I
 I
 I
 I
 I
- I
 I
 I
X Y
 Y
 Y
 Y
 Y
3 Y
 Y
 Y
x   [
 [
 [
 [
 [
)A [
 [
 [
| f
 f
 f
 f
 f
!9 f
 f
 f
R M
 M
 M
 M
 M
&> M
 M
 M
` U
 U
 U
 U
 U
$< U
 U
 U
p   t
 t
 t
 t
 t
3_ t
 t
 t
p4 4 4 4 	 	 	rO   